Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 // *******************************************************************************
     20 // * @file
     21 // *  ihevc_itrans_recon_32x32.s
     22 // *
     23 // * @brief
     24 // *  contains function definitions for single stage  inverse transform
     25 // *
     26 // * @author
     27 // * anand s
     28 // *
     29 // * @par list of functions:
     30 // *  - ihevc_itrans_recon_32x32()
     31 // *
     32 // * @remarks
     33 // *  the input buffer is being corrupted
     34 // *
     35 // *******************************************************************************
     36 //*/
     37 
     38 ///**
     39 // *******************************************************************************
     40 // *
     41 // * @brief
     42 // *  this function performs inverse transform and reconstruction for a 32x32
     43 // * input block
     44 // *
     45 // * @par description:
     46 // *  performs inverse transform and adds the prediction  data and clips output
     47 // * to 8 bit
     48 // *
     49 // * @param[in] pi2_src
     50 // *  input 32x32 coefficients
     51 // *
     52 // * @param[in] pi2_tmp
     53 // *  temporary 32x32 buffer for storing inverse
     54 // *
     55 // *  transform
     56 // *  1st stage output
     57 // *
     58 // * @param[in] pu1_pred
     59 // *  prediction 32x32 block
     60 // *
     61 // * @param[out] pu1_dst
     62 // *  output 32x32 block
     63 // *
     64 // * @param[in] src_strd
     65 // *  input stride
     66 // *
     67 // * @param[in] pred_strd
     68 // *  prediction stride
     69 // *
     70 // * @param[in] dst_strd
     71 // *  output stride
     72 // *
     73 // * @param[in] x12 (zero_cols)
     74 // *  zero columns in pi2_src
     75 // *
     76 // * @param[in] x11 (zero_rows)
     77 // *  zero rows in pi2_src
     78 // *
     79 // * @returns  void
     80 // *
     81 // * @remarks
     82 // *  none
     83 // *
     84 // *******************************************************************************
     85 // */
     86 
     87 //void ihevc_itrans_recon_32x32(word16 *pi2_src,
     88 //                            word16 *pi2_tmp,
     89 //                            uword8 *pu1_pred,
     90 //                            uword8 *pu1_dst,
     91 //                            word32 src_strd,
     92 //                            word32 pred_strd,
     93 //                            word32 dst_strd,
     94 //                            word32 x12,
     95 //                            word32 x11)
     96 
     97 //**************variables vs registers*************************
     98 //    x0 => *pi2_src
     99 //    x1 => *pi2_tmp
    100 //    x2 => *pu1_pred
    101 //    x3 => *pu1_dst
    102 //    x4 => src_strd
    103 //    x5 => pred_strd
    104 //    x6 => dst_strd
    105 //    x7 => zero_cols (copied to x12)
    106 //    [sp] => zero_rows (loaded into w11)
    107 
    108 
    109 //d0[0]=    64        d2[0]=83
    110 //d0[1]= 90        d2[1]=82
    111 //d0[2]= 90        d2[2]=80
    112 //d0[3]= 90        d2[3]=78
    113 //d1[0]= 89         d3[0]=75
    114 //d1[1]= 88        d3[1]=73
    115 //d1[2]= 87        d3[2]=70
    116 //d1[3]= 85        d3[3]=67
    117 
    118 //d4[0]=    64        d6[0]=36
    119 //d4[1]= 61        d6[1]=31
    120 //d4[2]= 57        d6[2]=25
    121 //d4[3]= 54        d6[3]=22
    122 //d5[0]= 50         d7[0]=18
    123 //d5[1]= 46        d7[1]=13
    124 //d5[2]= 43        d7[2]=9
    125 //d5[3]= 38        d7[3]=4
    126 
    127 .text
    128 .align 4 // on ARM targets the operand is a power-of-two exponent, i.e. 16-byte alignment
    129 .include "ihevc_neon_macros.s" // expected to supply push_v_regs used in the prologue below
    130 
    131 
    132 
    133 
    134 .set shift_stage1_idct ,   7 // right-shift used by the sqrshrn narrowing at shift1/shift2
    135 .set shift_stage2_idct ,   12 // presumably the second-stage shift; not referenced in the part of the routine shown here
    136 
    137 //#define zero_cols      x12
    138 //#define zero_rows     x11
    139 
    140 .globl ihevc_itrans_recon_32x32_av8 // export the entry point
    141 
    142 .extern g_ai2_ihevc_trans_32_transpose // coefficient table, reached through the GOT and loaded into v0-v7
    143 
    144 x5_addr: .word 0xfffff000 // loaded into w5; unsigned threshold compared against x12 (zero_cols) and x11 (zero_rows)
    145 x9_addr: .word 0xffff0000 // loaded into w7 (not x9, despite the label); threshold compared against x12 and x11
    146 
    147 .type ihevc_itrans_recon_32x32_av8, %function
    148 
    149 ihevc_itrans_recon_32x32_av8:
    150 
    151     ldr         w11, [sp]
    152 
    153 // stmfd sp!,{x0-x12,x14}
    154     push_v_regs
    155     stp         x19, x20,[sp,#-16]!
    156     stp         x0, x1,[sp,#-16]!
    157     stp         x5, x6,[sp,#-16]!
    158 
    159 //ldr            x8,[sp,#56]     @ prediction stride
    160 //ldr            x7,[sp,#64]     @ destination stride
    161     mov         x6, x4 // src stride
    162     mov         x12, x7
    163     lsl         x6, x6, #1                  // x sizeof(word16)
    164     add         x10,x6,x6, lsl #1           // 3 rows
    165 
    166 
    167     mov         x8,x0
    168 
    169     adrp        x14, :got:g_ai2_ihevc_trans_32_transpose
    170     ldr         x14, [x14, #:got_lo12:g_ai2_ihevc_trans_32_transpose]
    171 
    172     ld1         {v0.4h, v1.4h, v2.4h, v3.4h},[x14],#32
    173     ld1         {v4.4h, v5.4h, v6.4h, v7.4h},[x14],#32
    174 
    175 //registers which are free
    176 //  x10,x9,x11,x12
    177     mov         x9,#0xffffff00
    178     mov         x10,#0xfffffff0
    179     ldr         w5, x5_addr
    180     ldr         w7, x9_addr
    181     cmp         x12,x10
    182     mov         x20,#1
    183     csel        x14, x20, x14,hs
    184     bhs         stage1
    185 
    186 
    187     cmp         x12,x9
    188     mov         x20,#2
    189     csel        x14, x20, x14,hs
    190     bhs         stage1
    191 
    192     cmp         x12,x5
    193     mov         x20,#3
    194     csel        x14, x20, x14,hs
    195     bhs         stage1
    196 
    197     cmp         x12,x7
    198     mov         x20,#4
    199     csel        x14, x20, x14,hs
    200 
    201     mov         x14,#8
    202     b           stage1
    203 //.ltorg
    204 
    205 
    206 dct_stage1:
    207     add         x8,x8,#8
    208     mov         x0,x8
    209 
    210 stage1:
    211     ld1         {v10.4h},[x0],x6
    212     ld1         {v8.4h},[x0],x6
    213     ld1         {v11.4h},[x0],x6
    214     ld1         {v9.4h},[x0],x6
    215 
    216     smull       v24.4s, v8.4h, v0.h[1]     //// y1 * cos1(part of b0)
    217     smull       v26.4s, v8.4h, v0.h[3]     //// y1 * cos3(part of b1)
    218     smull       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
    219     smull       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
    220 
    221     smlal       v24.4s, v9.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    222     smlal       v26.4s, v9.4h, v2.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
    223     smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
    224     smlal       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    225 
    226 
    227 
    228 
    229 
    230     smull       v20.4s, v10.4h, v0.h[0]
    231     smlal       v20.4s, v11.4h, v0.h[2]
    232 
    233 
    234     smull       v22.4s, v10.4h, v0.h[0]
    235     smlal       v22.4s, v11.4h, v1.h[2]
    236 
    237     smull       v16.4s, v10.4h, v0.h[0]
    238     smlal       v16.4s, v11.4h, v2.h[2]
    239 
    240     smull       v18.4s, v10.4h, v0.h[0]
    241     smlal       v18.4s, v11.4h, v3.h[2]
    242     cmp         x11,x10
    243     bhs         shift1
    244 
    245     ld1         {v12.4h},[x0],x6
    246     ld1         {v14.4h},[x0],x6
    247     ld1         {v13.4h},[x0],x6
    248     ld1         {v15.4h},[x0],x6
    249 
    250 
    251 
    252 
    253 
    254 
    255 
    256     smlal       v24.4s, v14.4h, v1.h[1]
    257     smlal       v26.4s, v14.4h, v3.h[3]
    258     smlal       v28.4s, v14.4h, v6.h[1]
    259     smlsl       v30.4s, v14.4h, v7.h[1]
    260 
    261 
    262     smlal       v24.4s, v15.4h, v1.h[3]
    263     smlal       v26.4s, v15.4h, v5.h[1]
    264     smlsl       v28.4s, v15.4h, v7.h[1]
    265     smlsl       v30.4s, v15.4h, v3.h[3]
    266 
    267 
    268     smlal       v20.4s, v12.4h, v1.h[0]
    269     smlal       v20.4s, v13.4h, v1.h[2]
    270     smlal       v22.4s, v12.4h, v3.h[0]
    271     smlal       v22.4s, v13.4h, v4.h[2]
    272     smlal       v16.4s, v12.4h, v5.h[0]
    273     smlal       v16.4s, v13.4h, v7.h[2]
    274     smlal       v18.4s, v12.4h, v7.h[0]
    275     smlsl       v18.4s, v13.4h, v5.h[2]
    276 
    277     cmp         x11,x9
    278     bhs         shift1
    279 
    280     ld1         {v10.4h},[x0],x6
    281     ld1         {v8.4h},[x0],x6
    282     ld1         {v11.4h},[x0],x6
    283     ld1         {v9.4h},[x0],x6
    284 
    285 
    286     smlal       v24.4s, v8.4h, v2.h[1]     //// y1 * cos1(part of b0)
    287     smlal       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
    288     smlsl       v28.4s, v8.4h, v4.h[3]     //// y1 * sin3(part of b2)
    289     smlsl       v30.4s, v8.4h, v0.h[1]     //// y1 * sin1(part of b3)
    290 
    291     smlal       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    292     smlsl       v26.4s, v9.4h, v7.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    293     smlsl       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    294     smlsl       v30.4s, v9.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    295 
    296 
    297 
    298 
    299 
    300     smlal       v20.4s, v10.4h, v2.h[0]
    301     smlal       v20.4s, v11.4h, v2.h[2]
    302 
    303 
    304     smlal       v22.4s, v10.4h, v6.h[0]
    305     smlal       v22.4s, v11.4h, v7.h[2]
    306 
    307     smlsl       v16.4s, v10.4h, v6.h[0]
    308     smlsl       v16.4s, v11.4h, v3.h[2]
    309 
    310     smlsl       v18.4s, v10.4h, v2.h[0]
    311     smlsl       v18.4s, v11.4h, v1.h[2]
    312 
    313     cmp         x11,x5
    314     bhs         shift1
    315 
    316 
    317     ld1         {v12.4h},[x0],x6
    318     ld1         {v14.4h},[x0],x6
    319     ld1         {v13.4h},[x0],x6
    320     ld1         {v15.4h},[x0],x6
    321 
    322 
    323 
    324 
    325 
    326 
    327 
    328 
    329 
    330     smlal       v24.4s, v14.4h, v3.h[1]
    331     smlsl       v26.4s, v14.4h, v6.h[1]
    332     smlsl       v28.4s, v14.4h, v0.h[1]
    333     smlsl       v30.4s, v14.4h, v6.h[3]
    334 
    335 
    336     smlal       v24.4s, v15.4h, v3.h[3]
    337     smlsl       v26.4s, v15.4h, v4.h[3]
    338     smlsl       v28.4s, v15.4h, v2.h[3]
    339     smlal       v30.4s, v15.4h, v5.h[3]
    340 
    341 
    342     smlal       v20.4s, v12.4h, v3.h[0]
    343     smlal       v20.4s, v13.4h, v3.h[2]
    344     smlsl       v22.4s, v12.4h, v7.h[0]
    345     smlsl       v22.4s, v13.4h, v5.h[2]
    346     smlsl       v16.4s, v12.4h, v1.h[0]
    347     smlsl       v16.4s, v13.4h, v1.h[2]
    348     smlsl       v18.4s, v12.4h, v5.h[0]
    349     smlal       v18.4s, v13.4h, v7.h[2]
    350 
    351     cmp         x11,x7
    352     bhs         shift1
    353 
    354 
    355     ld1         {v10.4h},[x0],x6
    356     ld1         {v8.4h},[x0],x6
    357     ld1         {v11.4h},[x0],x6
    358     ld1         {v9.4h},[x0],x6
    359 
    360 
    361 
    362     smlal       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
    363     smlsl       v26.4s, v8.4h, v3.h[1]     //// y1 * cos3(part of b1)
    364     smlsl       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
    365     smlal       v30.4s, v8.4h, v2.h[1]     //// y1 * sin1(part of b3)
    366 
    367     smlal       v24.4s, v9.4h, v4.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    368     smlsl       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    369     smlsl       v28.4s, v9.4h, v7.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
    370     smlal       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    371 
    372 
    373 
    374 
    375 
    376     smlal       v20.4s, v10.4h, v0.h[0]
    377     smlal       v20.4s, v11.4h, v4.h[2]
    378 
    379 
    380     smlsl       v22.4s, v10.4h, v0.h[0]
    381     smlsl       v22.4s, v11.4h, v2.h[2]
    382 
    383     smlsl       v16.4s, v10.4h, v0.h[0]
    384     smlsl       v16.4s, v11.4h, v6.h[2]
    385 
    386     smlal       v18.4s, v10.4h, v0.h[0]
    387     smlal       v18.4s, v11.4h, v0.h[2]
    388 
    389 
    390 
    391     ld1         {v12.4h},[x0],x6
    392     ld1         {v14.4h},[x0],x6
    393     ld1         {v13.4h},[x0],x6
    394     ld1         {v15.4h},[x0],x6
    395 
    396 
    397 
    398 
    399     smlal       v24.4s, v14.4h, v5.h[1]
    400     smlsl       v26.4s, v14.4h, v0.h[2]
    401     smlal       v28.4s, v14.4h, v5.h[3]
    402     smlal       v30.4s, v14.4h, v4.h[3]
    403 
    404 
    405     smlal       v24.4s, v15.4h, v5.h[3]
    406     smlsl       v26.4s, v15.4h, v1.h[1]
    407     smlal       v28.4s, v15.4h, v3.h[1]
    408     smlsl       v30.4s, v15.4h, v7.h[3]
    409 
    410 
    411     smlal       v20.4s, v12.4h, v5.h[0]
    412     smlal       v20.4s, v13.4h, v5.h[2]
    413     smlsl       v22.4s, v12.4h, v1.h[0]
    414     smlsl       v22.4s, v13.4h, v0.h[2]
    415     smlal       v16.4s, v12.4h, v7.h[0]
    416     smlal       v16.4s, v13.4h, v4.h[2]
    417     smlal       v18.4s, v12.4h, v3.h[0]
    418     smlal       v18.4s, v13.4h, v6.h[2]
    419 
    420 
    421     ld1         {v10.4h},[x0],x6
    422     ld1         {v8.4h},[x0],x6
    423     ld1         {v11.4h},[x0],x6
    424     ld1         {v9.4h},[x0],x6
    425 
    426 
    427 
    428 
    429 
    430 
    431 
    432     smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
    433     smlsl       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
    434     smlal       v28.4s, v8.4h, v0.h[1]     //// y1 * sin3(part of b2)
    435     smlsl       v30.4s, v8.4h, v4.h[1]     //// y1 * sin1(part of b3)
    436 
    437     smlal       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    438     smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
    439     smlal       v28.4s, v9.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
    440     smlsl       v30.4s, v9.4h, v0.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    441 
    442 
    443 
    444 
    445 
    446     smlal       v20.4s, v10.4h, v6.h[0]
    447     smlal       v20.4s, v11.4h, v6.h[2]
    448 
    449 
    450     smlsl       v22.4s, v10.4h, v2.h[0]
    451     smlsl       v22.4s, v11.4h, v3.h[2]
    452 
    453     smlal       v16.4s, v10.4h, v2.h[0]
    454     smlal       v16.4s, v11.4h, v0.h[2]
    455 
    456     smlsl       v18.4s, v10.4h, v6.h[0]
    457     smlsl       v18.4s, v11.4h, v2.h[2]
    458 
    459     ld1         {v12.4h},[x0],x6
    460     ld1         {v14.4h},[x0],x6
    461     ld1         {v13.4h},[x0],x6
    462     ld1         {v15.4h},[x0],x6
    463 
    464 
    465     smlal       v24.4s, v14.4h, v7.h[1]
    466     smlsl       v26.4s, v14.4h, v5.h[3]
    467     smlal       v28.4s, v14.4h, v4.h[1]
    468     smlsl       v30.4s, v14.4h, v2.h[3]
    469 
    470 
    471     smlal       v24.4s, v15.4h, v7.h[3]
    472     smlsl       v26.4s, v15.4h, v7.h[1]
    473     smlal       v28.4s, v15.4h, v6.h[3]
    474     smlsl       v30.4s, v15.4h, v6.h[1]
    475 
    476 
    477     smlal       v20.4s, v12.4h, v7.h[0]
    478     smlal       v20.4s, v13.4h, v7.h[2]
    479     smlsl       v22.4s, v12.4h, v5.h[0]
    480     smlsl       v22.4s, v13.4h, v6.h[2]
    481     smlal       v16.4s, v12.4h, v3.h[0]
    482     smlal       v16.4s, v13.4h, v5.h[2]
    483     smlsl       v18.4s, v12.4h, v1.h[0]
    484     smlsl       v18.4s, v13.4h, v4.h[2]
    485 
    486 
    487 
    488 shift1:
    489     add         v8.4s,  v20.4s ,  v24.4s
    490     sub         v10.4s,  v20.4s ,  v24.4s
    491 
    492     add         v12.4s,  v22.4s ,  v26.4s
    493     sub         v24.4s,  v22.4s ,  v26.4s
    494 
    495     add         v14.4s,  v16.4s ,  v28.4s
    496     sub         v26.4s,  v16.4s ,  v28.4s
    497 
    498 
    499     add         v16.4s,  v18.4s ,  v30.4s
    500     sub         v28.4s,  v18.4s ,  v30.4s
    501 
    502 
    503     sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    504     sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    505     sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    506     sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    507     sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    508     sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    509     sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    510     sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
    511 
    512 
    513     // registers used q15,q14,q6,q7
    514 
    515     umov        x15,v24.d[0]
    516     umov        x16,v25.d[0]
    517     umov        x19,v26.d[0]
    518     umov        x20,v27.d[0]
    519 
    520     trn1        v24.4h, v30.4h, v12.4h
    521     trn2        v25.4h, v30.4h, v12.4h
    522     trn1        v26.4h, v31.4h, v13.4h
    523     trn2        v27.4h, v31.4h, v13.4h
    524 
    525     trn1        v30.2s, v24.2s, v26.2s
    526     trn2        v31.2s, v24.2s, v26.2s
    527     trn1        v12.2s, v25.2s, v27.2s
    528     trn2        v13.2s, v25.2s, v27.2s
    529 
    530     trn1        v24.4h, v14.4h, v18.4h
    531     trn2        v25.4h, v14.4h, v18.4h
    532     trn1        v26.4h, v15.4h, v19.4h
    533     trn2        v27.4h, v15.4h, v19.4h
    534 
    535     trn1        v14.2s, v24.2s, v26.2s
    536     trn2        v15.2s, v24.2s, v26.2s
    537     trn1        v18.2s, v25.2s, v27.2s
    538     trn2        v19.2s, v25.2s, v27.2s
    539 
    540     mov         v24.d[0],x15
    541     mov         v25.d[0],x16
    542     mov         v26.d[0],x19
    543     mov         v27.d[0],x20
    544 
    545 // d30 =x0 1- 4 values
    546 // d31 =x2 1- 4 values
    547 // d12=x1 1- 4 values
    548 // d13=x3 1- 4 values
    549 // d14 =x0 28-31 values
    550 // d15 =x2 28- 31 values
    551 // d18=x1 28- 31 values
    552 // d19=x3 28- 31 values
    553 
    554 
    555 
    556     st1         { v30.4h, v31.4h},[x1],#16
    557     st1         { v12.4h, v13.4h},[x1],#16
    558     add         x1,x1,#192
    559     st1         { v14.4h, v15.4h},[x1],#16
    560     st1         { v18.4h, v19.4h},[x1],#16
    561     sub         x1,x1,#224
    562 
    563     mov         x0,x8
    564 
    565 
    566 
    567 
    568 
    569     ld1         {v10.4h},[x0],x6
    570     ld1         {v8.4h},[x0],x6
    571     ld1         {v11.4h},[x0],x6
    572     ld1         {v9.4h},[x0],x6
    573 
    574 
    575 
    576 
    577     smull       v24.4s, v8.4h, v2.h[1]     //// y1 * cos1(part of b0)
    578     smull       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
    579     smull       v28.4s, v8.4h, v3.h[1]     //// y1 * sin3(part of b2)
    580     smull       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
    581 
    582     smlal       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    583     smlsl       v26.4s, v9.4h, v7.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    584     smlsl       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    585     smlsl       v30.4s, v9.4h, v4.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
    586 
    587 
    588 
    589 
    590 
    591     smull       v20.4s, v10.4h, v0.h[0]
    592     smlal       v20.4s, v11.4h, v4.h[2]
    593 
    594 
    595     smull       v22.4s, v10.4h, v0.h[0]
    596     smlal       v22.4s, v11.4h, v5.h[2]
    597 
    598     smull       v16.4s, v10.4h, v0.h[0]
    599     smlal       v16.4s, v11.4h, v6.h[2]
    600 
    601     smull       v18.4s, v10.4h, v0.h[0]
    602     smlal       v18.4s, v11.4h, v7.h[2]
    603     cmp         x11,x10
    604     bhs         shift2
    605 
    606     ld1         {v12.4h},[x0],x6
    607     ld1         {v14.4h},[x0],x6
    608     ld1         {v13.4h},[x0],x6
    609     ld1         {v15.4h},[x0],x6
    610 
    611 
    612     smlsl       v24.4s, v14.4h, v4.h[3]
    613     smlsl       v26.4s, v14.4h, v2.h[1]
    614     smlsl       v28.4s, v14.4h, v0.h[1]
    615     smlsl       v30.4s, v14.4h, v2.h[3]
    616 
    617 
    618     smlsl       v24.4s, v15.4h, v0.h[3]
    619     smlsl       v26.4s, v15.4h, v3.h[1]
    620     smlsl       v28.4s, v15.4h, v6.h[3]
    621     smlal       v30.4s, v15.4h, v5.h[3]
    622 
    623 
    624     smlsl       v20.4s, v12.4h, v7.h[0]
    625     smlsl       v20.4s, v13.4h, v2.h[2]
    626     smlsl       v22.4s, v12.4h, v5.h[0]
    627     smlsl       v22.4s, v13.4h, v0.h[2]
    628     smlsl       v16.4s, v12.4h, v3.h[0]
    629     smlsl       v16.4s, v13.4h, v3.h[2]
    630     smlsl       v18.4s, v12.4h, v1.h[0]
    631     smlsl       v18.4s, v13.4h, v6.h[2]
    632 
    633     cmp         x11,x9
    634     bhs         shift2
    635 
    636 
    637     ld1         {v10.4h},[x0],x6
    638     ld1         {v8.4h},[x0],x6
    639     ld1         {v11.4h},[x0],x6
    640     ld1         {v9.4h},[x0],x6
    641 
    642 
    643 
    644 
    645 
    646 
    647 
    648     smlsl       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
    649     smlal       v26.4s, v8.4h, v7.h[1]     //// y1 * cos3(part of b1)
    650     smlal       v28.4s, v8.4h, v2.h[3]     //// y1 * sin3(part of b2)
    651     smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
    652 
    653     smlal       v24.4s, v9.4h, v7.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
    654     smlal       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    655     smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
    656     smlsl       v30.4s, v9.4h, v6.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
    657 
    658 
    659 
    660 
    661 
    662     smlsl       v20.4s, v10.4h, v2.h[0]
    663     smlsl       v20.4s, v11.4h, v6.h[2]
    664 
    665 
    666     smlsl       v22.4s, v10.4h, v6.h[0]
    667     smlal       v22.4s, v11.4h, v4.h[2]
    668 
    669     smlal       v16.4s, v10.4h, v6.h[0]
    670     smlal       v16.4s, v11.4h, v0.h[2]
    671 
    672     smlal       v18.4s, v10.4h, v2.h[0]
    673     smlal       v18.4s, v11.4h, v5.h[2]
    674 
    675     cmp         x11,x5
    676     bhs         shift2
    677 
    678 
    679     ld1         {v12.4h},[x0],x6
    680     ld1         {v14.4h},[x0],x6
    681     ld1         {v13.4h},[x0],x6
    682     ld1         {v15.4h},[x0],x6
    683 
    684 
    685 
    686 
    687 
    688     smlal       v24.4s, v14.4h, v2.h[3]
    689     smlal       v26.4s, v14.4h, v3.h[3]
    690     smlsl       v28.4s, v14.4h, v5.h[3]
    691     smlsl       v30.4s, v14.4h, v0.h[3]
    692 
    693 
    694     smlal       v24.4s, v15.4h, v1.h[3]
    695     smlsl       v26.4s, v15.4h, v6.h[3]
    696     smlsl       v28.4s, v15.4h, v0.h[3]
    697     smlal       v30.4s, v15.4h, v7.h[3]
    698 
    699 
    700     smlal       v20.4s, v12.4h, v5.h[0]
    701     smlal       v20.4s, v13.4h, v0.h[2]
    702     smlal       v22.4s, v12.4h, v1.h[0]
    703     smlal       v22.4s, v13.4h, v6.h[2]
    704     smlal       v16.4s, v12.4h, v7.h[0]
    705     smlsl       v16.4s, v13.4h, v2.h[2]
    706     smlsl       v18.4s, v12.4h, v3.h[0]
    707     smlsl       v18.4s, v13.4h, v4.h[2]
    708 
    709 
    710     cmp         x11,x7
    711     bhs         shift2
    712 
    713 
    714     ld1         {v10.4h},[x0],x6
    715     ld1         {v8.4h},[x0],x6
    716     ld1         {v11.4h},[x0],x6
    717     ld1         {v9.4h},[x0],x6
    718 
    719 
    720 
    721 
    722 
    723 
    724 
    725     smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
    726     smlsl       v26.4s, v8.4h, v1.h[1]     //// y1 * cos3(part of b1)
    727     smlsl       v28.4s, v8.4h, v7.h[1]     //// y1 * sin3(part of b2)
    728     smlal       v30.4s, v8.4h, v0.h[3]     //// y1 * sin1(part of b3)
    729 
    730     smlsl       v24.4s, v9.4h, v5.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
    731     smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
    732     smlal       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    733     smlal       v30.4s, v9.4h, v7.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    734 
    735 
    736 
    737 
    738 
    739     smlal       v20.4s, v10.4h, v0.h[0]
    740     smlsl       v20.4s, v11.4h, v7.h[2]
    741 
    742 
    743     smlsl       v22.4s, v10.4h, v0.h[0]
    744     smlsl       v22.4s, v11.4h, v1.h[2]
    745 
    746     smlsl       v16.4s, v10.4h, v0.h[0]
    747     smlal       v16.4s, v11.4h, v5.h[2]
    748 
    749     smlal       v18.4s, v10.4h, v0.h[0]
    750     smlal       v18.4s, v11.4h, v3.h[2]
    751 
    752 
    753 
    754     ld1         {v12.4h},[x0],x6
    755     ld1         {v14.4h},[x0],x6
    756     ld1         {v13.4h},[x0],x6
    757     ld1         {v15.4h},[x0],x6
    758 
    759 
    760     smlsl       v24.4s, v14.4h, v0.h[1]
    761     smlal       v26.4s, v14.4h, v6.h[1]
    762     smlal       v28.4s, v14.4h, v4.h[1]
    763     smlsl       v30.4s, v14.4h, v1.h[1]
    764 
    765 
    766     smlsl       v24.4s, v15.4h, v3.h[3]
    767     smlal       v26.4s, v15.4h, v0.h[1]
    768     smlsl       v28.4s, v15.4h, v5.h[1]
    769     smlsl       v30.4s, v15.4h, v6.h[1]
    770 
    771 
    772     smlsl       v20.4s, v12.4h, v3.h[0]
    773     smlsl       v20.4s, v13.4h, v1.h[2]
    774     smlsl       v22.4s, v12.4h, v7.h[0]
    775     smlal       v22.4s, v13.4h, v3.h[2]
    776     smlal       v16.4s, v12.4h, v1.h[0]
    777     smlal       v16.4s, v13.4h, v7.h[2]
    778     smlsl       v18.4s, v12.4h, v5.h[0]
    779     smlsl       v18.4s, v13.4h, v2.h[2]
    780 
    781     ld1         {v10.4h},[x0],x6
    782     ld1         {v8.4h},[x0],x6
    783     ld1         {v11.4h},[x0],x6
    784     ld1         {v9.4h},[x0],x6
    785 
    786 
    787 
    788 
    789     smlal       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
    790     smlal       v26.4s, v8.4h, v4.h[3]     //// y1 * cos3(part of b1)
    791     smlsl       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
    792     smlal       v30.4s, v8.4h, v2.h[1]     //// y1 * sin1(part of b3)
    793 
    794     smlal       v24.4s, v9.4h, v3.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
    795     smlsl       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    796     smlsl       v28.4s, v9.4h, v7.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
    797     smlal       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    798 
    799 
    800 
    801 
    802 
    803     smlsl       v20.4s, v10.4h, v6.h[0]
    804     smlal       v20.4s, v11.4h, v5.h[2]
    805 
    806 
    807     smlal       v22.4s, v10.4h, v2.h[0]
    808     smlal       v22.4s, v11.4h, v7.h[2]
    809 
    810     smlsl       v16.4s, v10.4h, v2.h[0]
    811     smlsl       v16.4s, v11.4h, v4.h[2]
    812 
    813     smlal       v18.4s, v10.4h, v6.h[0]
    814     smlal       v18.4s, v11.4h, v1.h[2]
    815 
    816 
    817     ld1         {v12.4h},[x0],x6
    818     ld1         {v14.4h},[x0],x6
    819     ld1         {v13.4h},[x0],x6
    820     ld1         {v15.4h},[x0],x6
    821 
    822 
    823 
    824 
    825 
    826     smlal       v24.4s, v14.4h, v1.h[1]
    827     smlsl       v26.4s, v14.4h, v0.h[3]
    828     smlal       v28.4s, v14.4h, v1.h[3]
    829     smlsl       v30.4s, v14.4h, v3.h[1]
    830 
    831 
    832     smlal       v24.4s, v15.4h, v5.h[3]
    833     smlsl       v26.4s, v15.4h, v5.h[1]
    834     smlal       v28.4s, v15.4h, v4.h[3]
    835     smlsl       v30.4s, v15.4h, v4.h[1]
    836 
    837 
    838     smlal       v20.4s, v12.4h, v1.h[0]
    839     smlal       v20.4s, v13.4h, v3.h[2]
    840     smlsl       v22.4s, v12.4h, v3.h[0]
    841     smlsl       v22.4s, v13.4h, v2.h[2]
    842     smlal       v16.4s, v12.4h, v5.h[0]
    843     smlal       v16.4s, v13.4h, v1.h[2]
    844     smlsl       v18.4s, v12.4h, v7.h[0]
    845     smlsl       v18.4s, v13.4h, v0.h[2]
    846 
    847 shift2:
    848     add         v8.4s,  v20.4s ,  v24.4s
    849     sub         v10.4s,  v20.4s ,  v24.4s
    850 
    851     add         v12.4s,  v22.4s ,  v26.4s
    852     sub         v24.4s,  v22.4s ,  v26.4s
    853 
    854     add         v14.4s,  v16.4s ,  v28.4s
    855     sub         v26.4s,  v16.4s ,  v28.4s
    856 
    857 
    858     add         v16.4s,  v18.4s ,  v30.4s
    859     sub         v28.4s,  v18.4s ,  v30.4s
    860 
    861 
    862     sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    863     sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    864     sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    865     sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    866     sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    867     sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    868     sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    869     sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
    870 
    871     umov        x15,v24.d[0]
    872     umov        x16,v25.d[0]
    873     umov        x19,v26.d[0]
    874     umov        x20,v27.d[0]
    875 
    876     trn1        v24.4h, v30.4h, v12.4h
    877     trn2        v25.4h, v30.4h, v12.4h
    878     trn1        v26.4h, v31.4h, v13.4h
    879     trn2        v27.4h, v31.4h, v13.4h
    880 
    881     trn1        v30.2s, v24.2s, v26.2s
    882     trn2        v31.2s, v24.2s, v26.2s
    883     trn1        v12.2s, v25.2s, v27.2s
    884     trn2        v13.2s, v25.2s, v27.2s
    885 
    886     trn1        v24.4h, v14.4h, v18.4h
    887     trn2        v25.4h, v14.4h, v18.4h
    888     trn1        v26.4h, v15.4h, v19.4h
    889     trn2        v27.4h, v15.4h, v19.4h
    890 
    891     trn1        v14.2s, v24.2s, v26.2s
    892     trn2        v15.2s, v24.2s, v26.2s
    893     trn1        v18.2s, v25.2s, v27.2s
    894     trn2        v19.2s, v25.2s, v27.2s
    895 
    896     mov         v24.d[0],x15
    897     mov         v25.d[0],x16
    898     mov         v26.d[0],x19
    899     mov         v27.d[0],x20
    900 
    901     st1         { v30.4h, v31.4h},[x1],#16
    902     st1         { v12.4h, v13.4h},[x1],#16
    903     add         x1,x1,#128
    904     st1         { v14.4h, v15.4h},[x1],#16
    905     st1         { v18.4h, v19.4h},[x1],#16
    906     sub         x1,x1,#160
    907     mov         x0,x8
    908 
    909 
    910 
    911     ld1         {v10.4h},[x0],x6
    912     ld1         {v8.4h},[x0],x6
    913     ld1         {v11.4h},[x0],x6
    914     ld1         {v9.4h},[x0],x6
    915 
    916 
    917     smull       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
    918     smull       v26.4s, v8.4h, v4.h[3]     //// y1 * cos3(part of b1)
    919     smull       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
    920     smull       v30.4s, v8.4h, v5.h[3]     //// y1 * sin1(part of b3)
    921 
    922     smlsl       v24.4s, v9.4h, v3.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
    923     smlsl       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    924     smlsl       v28.4s, v9.4h, v0.h[2]     //// y1 * sin3 - y3 * cos1(part of b2)
    925     smlsl       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    926 
    927 
    928 
    929 
    930 
    931     smull       v20.4s, v10.4h, v0.h[0]
    932     smlsl       v20.4s, v11.4h, v7.h[2]
    933 
    934 
    935     smull       v22.4s, v10.4h, v0.h[0]
    936     smlsl       v22.4s, v11.4h, v6.h[2]
    937 
    938     smull       v16.4s, v10.4h, v0.h[0]
    939     smlsl       v16.4s, v11.4h, v5.h[2]
    940 
    941     smull       v18.4s, v10.4h, v0.h[0]
    942     smlsl       v18.4s, v11.4h, v4.h[2]
    943 
    944     cmp         x11,x10
    945     bhs         shift3
    946 
    947     ld1         {v12.4h},[x0],x6
    948     ld1         {v14.4h},[x0],x6
    949     ld1         {v13.4h},[x0],x6
    950     ld1         {v15.4h},[x0],x6
    951 
    952 
    953 
    954 
    955     smlsl       v24.4s, v14.4h, v5.h[1]
    956     smlsl       v26.4s, v14.4h, v7.h[3]
    957     smlal       v28.4s, v14.4h, v5.h[3]
    958     smlal       v30.4s, v14.4h, v3.h[1]
    959 
    960 
    961     smlal       v24.4s, v15.4h, v2.h[1]
    962     smlal       v26.4s, v15.4h, v1.h[1]
    963     smlal       v28.4s, v15.4h, v4.h[3]
    964     smlsl       v30.4s, v15.4h, v7.h[3]
    965 
    966 
    967     smlsl       v20.4s, v12.4h, v1.h[0]
    968     smlal       v20.4s, v13.4h, v6.h[2]
    969     smlsl       v22.4s, v12.4h, v3.h[0]
    970     smlal       v22.4s, v13.4h, v3.h[2]
    971     smlsl       v16.4s, v12.4h, v5.h[0]
    972     smlal       v16.4s, v13.4h, v0.h[2]
    973     smlsl       v18.4s, v12.4h, v7.h[0]
    974     smlal       v18.4s, v13.4h, v2.h[2]
    975 
    976     cmp         x11,x9
    977     bhs         shift3
    978 
    979     ld1         {v10.4h},[x0],x6
    980     ld1         {v8.4h},[x0],x6
    981     ld1         {v11.4h},[x0],x6
    982     ld1         {v9.4h},[x0],x6
    983 
    984     smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
    985     smlsl       v26.4s, v8.4h, v5.h[1]     //// y1 * cos3(part of b1)
    986     smlsl       v28.4s, v8.4h, v0.h[3]     //// y1 * sin3(part of b2)
    987     smlsl       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
    988 
    989     smlsl       v24.4s, v9.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
    990     smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
    991     smlal       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    992     smlal       v30.4s, v9.4h, v0.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    993 
    994 
    995 
    996 
    997 
    998     smlal       v20.4s, v10.4h, v2.h[0]
    999     smlsl       v20.4s, v11.4h, v5.h[2]
   1000 
   1001 
   1002     smlal       v22.4s, v10.4h, v6.h[0]
   1003     smlsl       v22.4s, v11.4h, v0.h[2]
   1004 
   1005     smlsl       v16.4s, v10.4h, v6.h[0]
   1006     smlsl       v16.4s, v11.4h, v4.h[2]
   1007 
   1008     smlsl       v18.4s, v10.4h, v2.h[0]
   1009     smlal       v18.4s, v11.4h, v6.h[2]
   1010 
   1011     cmp         x11,x5
   1012     bhs         shift3
   1013 
   1014 
   1015     ld1         {v12.4h},[x0],x6
   1016     ld1         {v14.4h},[x0],x6
   1017     ld1         {v13.4h},[x0],x6
   1018     ld1         {v15.4h},[x0],x6
   1019 
   1020 
   1021 
   1022 
   1023 
   1024 
   1025     smlsl       v24.4s, v14.4h, v7.h[1]
   1026     smlal       v26.4s, v14.4h, v2.h[1]
   1027     smlal       v28.4s, v14.4h, v4.h[1]
   1028     smlsl       v30.4s, v14.4h, v5.h[1]
   1029 
   1030 
   1031     smlal       v24.4s, v15.4h, v0.h[3]
   1032     smlal       v26.4s, v15.4h, v7.h[1]
   1033     smlsl       v28.4s, v15.4h, v1.h[1]
   1034     smlsl       v30.4s, v15.4h, v6.h[1]
   1035 
   1036 
   1037     smlsl       v20.4s, v12.4h, v3.h[0]
   1038     smlal       v20.4s, v13.4h, v4.h[2]
   1039     smlal       v22.4s, v12.4h, v7.h[0]
   1040     smlal       v22.4s, v13.4h, v2.h[2]
   1041     smlal       v16.4s, v12.4h, v1.h[0]
   1042     smlsl       v16.4s, v13.4h, v6.h[2]
   1043     smlal       v18.4s, v12.4h, v5.h[0]
   1044     smlsl       v18.4s, v13.4h, v0.h[2]
   1045 
   1046 
   1047     cmp         x11,x7
   1048     bhs         shift3
   1049 
   1050 
   1051     ld1         {v10.4h},[x0],x6
   1052     ld1         {v8.4h},[x0],x6
   1053     ld1         {v11.4h},[x0],x6
   1054     ld1         {v9.4h},[x0],x6
   1055 
   1056 
   1057     smlsl       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
   1058     smlsl       v26.4s, v8.4h, v0.h[1]     //// y1 * cos3(part of b1)
   1059     smlal       v28.4s, v8.4h, v6.h[3]     //// y1 * sin3(part of b2)
   1060     smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
   1061 
   1062     smlsl       v24.4s, v9.4h, v0.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
   1063     smlal       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
   1064     smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
   1065     smlsl       v30.4s, v9.4h, v2.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
   1066 
   1067 
   1068 
   1069 
   1070 
   1071     smlal       v20.4s, v10.4h, v0.h[0]
   1072     smlsl       v20.4s, v11.4h, v3.h[2]
   1073 
   1074 
   1075     smlsl       v22.4s, v10.4h, v0.h[0]
   1076     smlsl       v22.4s, v11.4h, v5.h[2]
   1077 
   1078     smlsl       v16.4s, v10.4h, v0.h[0]
   1079     smlal       v16.4s, v11.4h, v1.h[2]
   1080 
   1081     smlal       v18.4s, v10.4h, v0.h[0]
   1082     smlal       v18.4s, v11.4h, v7.h[2]
   1083 
   1084 
   1085     ld1         {v12.4h},[x0],x6
   1086     ld1         {v14.4h},[x0],x6
   1087     ld1         {v13.4h},[x0],x6
   1088     ld1         {v15.4h},[x0],x6
   1089 
   1090 
   1091 
   1092     smlal       v24.4s, v14.4h, v6.h[3]
   1093     smlal       v26.4s, v14.4h, v3.h[3]
   1094     smlsl       v28.4s, v14.4h, v1.h[3]
   1095     smlal       v30.4s, v14.4h, v7.h[1]
   1096 
   1097 
   1098     smlal       v24.4s, v15.4h, v1.h[3]
   1099     smlsl       v26.4s, v15.4h, v2.h[3]
   1100     smlal       v28.4s, v15.4h, v7.h[1]
   1101     smlal       v30.4s, v15.4h, v4.h[1]
   1102 
   1103 
   1104     smlsl       v20.4s, v12.4h, v5.h[0]
   1105     smlal       v20.4s, v13.4h, v2.h[2]
   1106     smlal       v22.4s, v12.4h, v1.h[0]
   1107     smlsl       v22.4s, v13.4h, v7.h[2]
   1108     smlsl       v16.4s, v12.4h, v7.h[0]
   1109     smlsl       v16.4s, v13.4h, v3.h[2]
   1110     smlsl       v18.4s, v12.4h, v3.h[0]
   1111     smlal       v18.4s, v13.4h, v1.h[2]
   1112 
   1113 
   1114 
   1115     ld1         {v10.4h},[x0],x6
   1116     ld1         {v8.4h},[x0],x6
   1117     ld1         {v11.4h},[x0],x6
   1118     ld1         {v9.4h},[x0],x6
   1119 
   1120 
   1121 
   1122 
   1123     smlsl       v24.4s, v8.4h, v5.h[3]     //// y1 * cos1(part of b0)
   1124     smlsl       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
   1125     smlal       v28.4s, v8.4h, v3.h[1]     //// y1 * sin3(part of b2)
   1126     smlsl       v30.4s, v8.4h, v0.h[1]     //// y1 * sin1(part of b3)
   1127 
   1128     smlsl       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
   1129     smlal       v26.4s, v9.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
   1130     smlsl       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
   1131     smlal       v30.4s, v9.4h, v4.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
   1132 
   1133 
   1134 
   1135 
   1136 
   1137     smlal       v20.4s, v10.4h, v6.h[0]
   1138     smlsl       v20.4s, v11.4h, v1.h[2]
   1139 
   1140 
   1141     smlsl       v22.4s, v10.4h, v2.h[0]
   1142     smlal       v22.4s, v11.4h, v4.h[2]
   1143 
   1144     smlal       v16.4s, v10.4h, v2.h[0]
   1145     smlsl       v16.4s, v11.4h, v7.h[2]
   1146 
   1147     smlsl       v18.4s, v10.4h, v6.h[0]
   1148     smlsl       v18.4s, v11.4h, v5.h[2]
   1149 
   1150 
   1151     ld1         {v12.4h},[x0],x6
   1152     ld1         {v14.4h},[x0],x6
   1153     ld1         {v13.4h},[x0],x6
   1154     ld1         {v15.4h},[x0],x6
   1155 
   1156     smlal       v24.4s, v14.4h, v4.h[3]
   1157     smlsl       v26.4s, v14.4h, v6.h[1]
   1158     smlal       v28.4s, v14.4h, v7.h[3]
   1159     smlal       v30.4s, v14.4h, v6.h[3]
   1160 
   1161 
   1162     smlal       v24.4s, v15.4h, v3.h[3]
   1163     smlsl       v26.4s, v15.4h, v3.h[1]
   1164     smlal       v28.4s, v15.4h, v2.h[3]
   1165     smlsl       v30.4s, v15.4h, v2.h[1]
   1166 
   1167 
   1168     smlsl       v20.4s, v12.4h, v7.h[0]
   1169     smlal       v20.4s, v13.4h, v0.h[2]
   1170     smlal       v22.4s, v12.4h, v5.h[0]
   1171     smlsl       v22.4s, v13.4h, v1.h[2]
   1172     smlsl       v16.4s, v12.4h, v3.h[0]
   1173     smlal       v16.4s, v13.4h, v2.h[2]
   1174     smlal       v18.4s, v12.4h, v1.h[0]
   1175     smlsl       v18.4s, v13.4h, v3.h[2]
   1176 
   1177 shift3:                         // finish the group accumulated above: butterfly, narrow, transpose, store
        // Reached by fall-through or from the "bhs shift3" early exits.
        // Inputs are eight 4x32-bit accumulators, named here as in the
        // existing trailing comments:
        //   a0..a3 = v20, v22, v16, v18        b0..b3 = v24, v26, v28, v30
        // Each add/sub pair forms a_i + b_i and a_i - b_i.  Destinations reuse
        // accumulator registers whose old value has just been consumed
        // (v24, v26, v16, v28), so the order of these eight instructions
        // must not be changed.
   1178     add         v8.4s,  v20.4s ,  v24.4s    // v8  = a0 + b0
   1179     sub         v10.4s,  v20.4s ,  v24.4s   // v10 = a0 - b0 (last read of b0 in v24)
   1180 
   1181     add         v12.4s,  v22.4s ,  v26.4s   // v12 = a1 + b1
   1182     sub         v24.4s,  v22.4s ,  v26.4s   // v24 = a1 - b1 (v24 is free again)
   1183 
   1184     add         v14.4s,  v16.4s ,  v28.4s   // v14 = a2 + b2
   1185     sub         v26.4s,  v16.4s ,  v28.4s   // v26 = a2 - b2 (v26 is free again)
   1186 
   1187 
   1188     add         v16.4s,  v18.4s ,  v30.4s   // v16 = a3 + b3 (a2 in v16 is dead by now)
   1189     sub         v28.4s,  v18.4s ,  v30.4s   // v28 = a3 - b3
   1190 
   1191 
        // sqrshrn: signed saturating rounding shift right by shift_stage1_idct,
        // narrowing every 32-bit lane to 16 bits.
   1192     sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
   1193     sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
   1194     sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
   1195     sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
   1196     sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
   1197     sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
   1198     sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
   1199     sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
   1200 
        // v24-v27 are used as scratch by the transposes below, so their low
        // 64 bits are parked in general-purpose registers and put back afterwards.
        // NOTE(review): x19/x20 are callee-saved under AAPCS64 - confirm that the
        // function prologue (outside this view) preserves them.
        // NOTE(review): v24 and v26 are fully rewritten by the smull instructions
        // that follow this block, so the save/restore looks necessary only if
        // v25/v27 are live elsewhere - confirm before simplifying.
   1201     umov        x15,v24.d[0]
   1202     umov        x16,v25.d[0]
   1203     umov        x19,v26.d[0]
   1204     umov        x20,v27.d[0]
   1205 
        // 4x4 halfword transpose of the a+b results, taking v30, v12, v31, v13
        // as rows 0..3.  First interleave 16-bit lanes of row pairs ...
   1206     trn1        v24.4h, v30.4h, v12.4h
   1207     trn2        v25.4h, v30.4h, v12.4h
   1208     trn1        v26.4h, v31.4h, v13.4h
   1209     trn2        v27.4h, v31.4h, v13.4h
   1210 
        // ... then interleave 32-bit lanes.  Result:
        // v30 = column 0, v12 = column 1, v31 = column 2, v13 = column 3.
   1211     trn1        v30.2s, v24.2s, v26.2s
   1212     trn2        v31.2s, v24.2s, v26.2s
   1213     trn1        v12.2s, v25.2s, v27.2s
   1214     trn2        v13.2s, v25.2s, v27.2s
   1215 
        // Same transpose for the a-b results, taking v14, v18, v15, v19 as rows 0..3.
   1216     trn1        v24.4h, v14.4h, v18.4h
   1217     trn2        v25.4h, v14.4h, v18.4h
   1218     trn1        v26.4h, v15.4h, v19.4h
   1219     trn2        v27.4h, v15.4h, v19.4h
   1220 
        // Result: v14 = column 0, v18 = column 1, v15 = column 2, v19 = column 3.
   1221     trn1        v14.2s, v24.2s, v26.2s
   1222     trn2        v15.2s, v24.2s, v26.2s
   1223     trn1        v18.2s, v25.2s, v27.2s
   1224     trn2        v19.2s, v25.2s, v27.2s
   1225 
        // Put back the parked low halves of v24-v27.
   1226     mov         v24.d[0],x15
   1227     mov         v25.d[0],x16
   1228     mov         v26.d[0],x19
   1229     mov         v27.d[0],x20
        // Store through x1 with post-increment.  Columns are written in the
        // order 0, 2, 1, 3 (even pair first, then odd pair).
   1230     st1         { v30.4h, v31.4h},[x1],#16  // a+b tile: columns 0 and 2
   1231     st1         { v12.4h, v13.4h},[x1],#16  // a+b tile: columns 1 and 3
   1232     add         x1,x1,#64                   // leave a 64-byte gap before the a-b tile
   1233     st1         { v14.4h, v15.4h},[x1],#16  // a-b tile: columns 0 and 2
   1234     st1         { v18.4h, v19.4h},[x1],#16  // a-b tile: columns 1 and 3
   1235     sub         x1,x1,#96                   // net effect of this block on x1: +32 bytes
   1236 
   1237     mov         x0,x8
   1238 
   1239 
   1240 
   1241     ld1         {v10.4h},[x0],x6
   1242     ld1         {v8.4h},[x0],x6
   1243     ld1         {v11.4h},[x0],x6
   1244     ld1         {v9.4h},[x0],x6
   1245 
   1246 
   1247     smull       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
   1248     smull       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
   1249     smull       v28.4s, v8.4h, v7.h[1]     //// y1 * sin3(part of b2)
   1250     smull       v30.4s, v8.4h, v7.h[3]     //// y1 * sin1(part of b3)
   1251 
   1252     smlsl       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
   1253     smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
   1254     smlsl       v28.4s, v9.4h, v5.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
   1255     smlsl       v30.4s, v9.4h, v7.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   1256 
   1257 
   1258 
   1259 
   1260 
   1261     smull       v20.4s, v10.4h, v0.h[0]
   1262     smlsl       v20.4s, v11.4h, v3.h[2]
   1263 
   1264 
   1265     smull       v22.4s, v10.4h, v0.h[0]
   1266     smlsl       v22.4s, v11.4h, v2.h[2]
   1267 
   1268     smull       v16.4s, v10.4h, v0.h[0]
   1269     smlsl       v16.4s, v11.4h, v1.h[2]
   1270 
   1271     smull       v18.4s, v10.4h, v0.h[0]
   1272     smlsl       v18.4s, v11.4h, v0.h[2]
   1273 
   1274     cmp         x11,x10
   1275     bhs         shift4
   1276 
   1277     ld1         {v12.4h},[x0],x6
   1278     ld1         {v14.4h},[x0],x6
   1279     ld1         {v13.4h},[x0],x6
   1280     ld1         {v15.4h},[x0],x6
   1281 
   1282 
   1283 
   1284 
   1285 
   1286 
   1287     smlal       v24.4s, v14.4h, v0.h[1]
   1288     smlal       v26.4s, v14.4h, v1.h[3]
   1289     smlal       v28.4s, v14.4h, v4.h[1]
   1290     smlal       v30.4s, v14.4h, v6.h[3]
   1291 
   1292 
   1293     smlsl       v24.4s, v15.4h, v4.h[1]
   1294     smlsl       v26.4s, v15.4h, v0.h[3]
   1295     smlsl       v28.4s, v15.4h, v2.h[3]
   1296     smlsl       v30.4s, v15.4h, v6.h[1]
   1297 
   1298 
   1299     smlal       v20.4s, v12.4h, v7.h[0]
   1300     smlal       v20.4s, v13.4h, v5.h[2]
   1301     smlal       v22.4s, v12.4h, v5.h[0]
   1302     smlsl       v22.4s, v13.4h, v7.h[2]
   1303     smlal       v16.4s, v12.4h, v3.h[0]
   1304     smlsl       v16.4s, v13.4h, v4.h[2]
   1305     smlal       v18.4s, v12.4h, v1.h[0]
   1306     smlsl       v18.4s, v13.4h, v1.h[2]
   1307 
   1308     cmp         x11,x9
   1309     bhs         shift4
   1310 
   1311     ld1         {v10.4h},[x0],x6
   1312     ld1         {v8.4h},[x0],x6
   1313     ld1         {v11.4h},[x0],x6
   1314     ld1         {v9.4h},[x0],x6
   1315 
   1316 
   1317 
   1318     smlal       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
   1319     smlal       v26.4s, v8.4h, v3.h[1]     //// y1 * cos3(part of b1)
   1320     smlal       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
   1321     smlal       v30.4s, v8.4h, v5.h[3]     //// y1 * sin1(part of b3)
   1322 
   1323     smlal       v24.4s, v9.4h, v4.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
   1324     smlsl       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
   1325     smlsl       v28.4s, v9.4h, v0.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
   1326     smlsl       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   1327 
   1328 
   1329 
   1330 
   1331 
   1332     smlsl       v20.4s, v10.4h, v2.h[0]
   1333     smlal       v20.4s, v11.4h, v1.h[2]
   1334 
   1335 
   1336     smlsl       v22.4s, v10.4h, v6.h[0]
   1337     smlal       v22.4s, v11.4h, v3.h[2]
   1338 
   1339     smlal       v16.4s, v10.4h, v6.h[0]
   1340     smlsl       v16.4s, v11.4h, v7.h[2]
   1341 
   1342     smlal       v18.4s, v10.4h, v2.h[0]
   1343     smlsl       v18.4s, v11.4h, v2.h[2]
   1344 
   1345     cmp         x11,x5
   1346     bhs         shift4
   1347 
   1348 
   1349     ld1         {v12.4h},[x0],x6
   1350     ld1         {v14.4h},[x0],x6
   1351     ld1         {v13.4h},[x0],x6
   1352     ld1         {v15.4h},[x0],x6
   1353 
   1354 
   1355 
   1356 
   1357 
   1358 
   1359     smlsl       v24.4s, v14.4h, v1.h[1]
   1360     smlsl       v26.4s, v14.4h, v7.h[3]
   1361     smlal       v28.4s, v14.4h, v1.h[3]
   1362     smlal       v30.4s, v14.4h, v4.h[3]
   1363 
   1364 
   1365     smlal       v24.4s, v15.4h, v2.h[1]
   1366     smlal       v26.4s, v15.4h, v5.h[1]
   1367     smlsl       v28.4s, v15.4h, v3.h[1]
   1368     smlsl       v30.4s, v15.4h, v4.h[1]
   1369 
   1370 
   1371     smlsl       v20.4s, v12.4h, v5.h[0]
   1372     smlsl       v20.4s, v13.4h, v7.h[2]
   1373     smlsl       v22.4s, v12.4h, v1.h[0]
   1374     smlal       v22.4s, v13.4h, v1.h[2]
   1375     smlsl       v16.4s, v12.4h, v7.h[0]
   1376     smlal       v16.4s, v13.4h, v5.h[2]
   1377     smlal       v18.4s, v12.4h, v3.h[0]
   1378     smlsl       v18.4s, v13.4h, v3.h[2]
   1379 
   1380     cmp         x11,x7
   1381     bhs         shift4
   1382 
   1383 
   1384     ld1         {v10.4h},[x0],x6
   1385     ld1         {v8.4h},[x0],x6
   1386     ld1         {v11.4h},[x0],x6
   1387     ld1         {v9.4h},[x0],x6
   1388 
   1389 
   1390     smlsl       v24.4s, v8.4h, v5.h[3]     //// y1 * cos1(part of b0)
   1391     smlsl       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
   1392     smlal       v28.4s, v8.4h, v4.h[3]     //// y1 * sin3(part of b2)
   1393     smlal       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
   1394 
   1395     smlsl       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
   1396     smlal       v26.4s, v9.4h, v0.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
   1397     smlsl       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
   1398     smlsl       v30.4s, v9.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   1399 
   1400 
   1401 
   1402 
   1403 
   1404     smlal       v20.4s, v10.4h, v0.h[0]
   1405     smlsl       v20.4s, v11.4h, v0.h[2]
   1406 
   1407 
   1408     smlsl       v22.4s, v10.4h, v0.h[0]
   1409     smlal       v22.4s, v11.4h, v6.h[2]
   1410 
   1411     smlsl       v16.4s, v10.4h, v0.h[0]
   1412     smlal       v16.4s, v11.4h, v2.h[2]
   1413 
   1414     smlal       v18.4s, v10.4h, v0.h[0]
   1415     smlsl       v18.4s, v11.4h, v4.h[2]
   1416 
   1417 
   1418 
   1419 
   1420     ld1         {v12.4h},[x0],x6
   1421     ld1         {v14.4h},[x0],x6
   1422     ld1         {v13.4h},[x0],x6
   1423     ld1         {v15.4h},[x0],x6
   1424 
   1425 
   1426 
   1427 
   1428 
   1429 
   1430     smlal       v24.4s, v14.4h, v3.h[1]
   1431     smlsl       v26.4s, v14.4h, v2.h[1]
   1432     smlal       v28.4s, v14.4h, v7.h[3]
   1433     smlal       v30.4s, v14.4h, v2.h[3]
   1434 
   1435 
   1436     smlsl       v24.4s, v15.4h, v0.h[3]
   1437     smlal       v26.4s, v15.4h, v4.h[3]
   1438     smlal       v28.4s, v15.4h, v6.h[3]
   1439     smlsl       v30.4s, v15.4h, v2.h[1]
   1440 
   1441 
   1442     smlal       v20.4s, v12.4h, v3.h[0]
   1443     smlsl       v20.4s, v13.4h, v6.h[2]
   1444     smlal       v22.4s, v12.4h, v7.h[0]
   1445     smlsl       v22.4s, v13.4h, v4.h[2]
   1446     smlsl       v16.4s, v12.4h, v1.h[0]
   1447     smlal       v16.4s, v13.4h, v0.h[2]
   1448     smlal       v18.4s, v12.4h, v5.h[0]
   1449     smlsl       v18.4s, v13.4h, v5.h[2]
   1450 
   1451 
   1452     ld1         {v10.4h},[x0],x6
   1453     ld1         {v8.4h},[x0],x6
   1454     ld1         {v11.4h},[x0],x6
   1455     ld1         {v9.4h},[x0],x6
   1456 
   1457 
   1458 
   1459 
   1460 
   1461     smlal       v24.4s, v8.4h, v3.h[3]     //// y1 * cos1(part of b0)
   1462     smlsl       v26.4s, v8.4h, v7.h[1]     //// y1 * cos3(part of b1)
   1463     smlsl       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
   1464     smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
   1465 
   1466     smlsl       v24.4s, v9.4h, v7.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
   1467     smlsl       v26.4s, v9.4h, v6.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
   1468     smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
   1469     smlsl       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   1470 
   1471 
   1472 
   1473 
   1474 
   1475     smlsl       v20.4s, v10.4h, v6.h[0]
   1476     smlal       v20.4s, v11.4h, v2.h[2]
   1477 
   1478 
   1479     smlal       v22.4s, v10.4h, v2.h[0]
   1480     smlsl       v22.4s, v11.4h, v0.h[2]
   1481 
   1482     smlsl       v16.4s, v10.4h, v2.h[0]
   1483     smlal       v16.4s, v11.4h, v3.h[2]
   1484 
   1485     smlal       v18.4s, v10.4h, v6.h[0]
   1486     smlsl       v18.4s, v11.4h, v6.h[2]
   1487 
   1488 
   1489     ld1         {v12.4h},[x0],x6
   1490     ld1         {v14.4h},[x0],x6
   1491     ld1         {v13.4h},[x0],x6
   1492     ld1         {v15.4h},[x0],x6
   1493 
   1494 
   1495 
   1496 
   1497     smlsl       v24.4s, v14.4h, v5.h[1]
   1498     smlal       v26.4s, v14.4h, v3.h[3]
   1499     smlsl       v28.4s, v14.4h, v2.h[1]
   1500     smlal       v30.4s, v14.4h, v0.h[3]
   1501 
   1502 
   1503     smlal       v24.4s, v15.4h, v1.h[3]
   1504     smlsl       v26.4s, v15.4h, v1.h[1]
   1505     smlal       v28.4s, v15.4h, v0.h[3]
   1506     smlsl       v30.4s, v15.4h, v0.h[1]
   1507 
   1508 
   1509     smlsl       v20.4s, v12.4h, v1.h[0]
   1510     smlal       v20.4s, v13.4h, v4.h[2]
   1511     smlal       v22.4s, v12.4h, v3.h[0]
   1512     smlsl       v22.4s, v13.4h, v5.h[2]
   1513     smlsl       v16.4s, v12.4h, v5.h[0]
   1514     smlal       v16.4s, v13.4h, v6.h[2]
   1515     smlal       v18.4s, v12.4h, v7.h[0]
   1516     smlsl       v18.4s, v13.4h, v7.h[2]
   1517 
   1518 shift4:                         // finish the last group of this iteration, then loop
        // Reached by fall-through or from the "bhs shift4" early exits.
        // Inputs are eight 4x32-bit accumulators, named here as in the
        // existing trailing comments:
        //   a0..a3 = v20, v22, v16, v18        b0..b3 = v24, v26, v28, v30
        // Each add/sub pair forms a_i + b_i and a_i - b_i.  Destinations reuse
        // accumulator registers whose old value has just been consumed, so
        // the order of these eight instructions must not be changed.
   1519     add         v8.4s,  v20.4s ,  v24.4s    // v8  = a0 + b0
   1520     sub         v10.4s,  v20.4s ,  v24.4s   // v10 = a0 - b0 (last read of b0 in v24)
   1521 
   1522     add         v12.4s,  v22.4s ,  v26.4s   // v12 = a1 + b1
   1523     sub         v24.4s,  v22.4s ,  v26.4s   // v24 = a1 - b1 (v24 is free again)
   1524 
   1525     add         v14.4s,  v16.4s ,  v28.4s   // v14 = a2 + b2
   1526     sub         v26.4s,  v16.4s ,  v28.4s   // v26 = a2 - b2 (v26 is free again)
   1527 
   1528 
   1529     add         v16.4s,  v18.4s ,  v30.4s   // v16 = a3 + b3 (a2 in v16 is dead by now)
   1530     sub         v28.4s,  v18.4s ,  v30.4s   // v28 = a3 - b3
   1531 
   1532 
        // sqrshrn: signed saturating rounding shift right by shift_stage1_idct,
        // narrowing every 32-bit lane to 16 bits.
   1533     sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
   1534     sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
   1535     sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
   1536     sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
   1537     sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
   1538     sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
   1539     sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
   1540     sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
   1541 
        // v24-v27 are used as scratch by the transposes below, so their low
        // 64 bits are parked in general-purpose registers and put back afterwards.
        // NOTE(review): x19/x20 are callee-saved under AAPCS64 - confirm that the
        // function prologue (outside this view) preserves them.
   1542     umov        x15,v24.d[0]
   1543     umov        x16,v25.d[0]
   1544     umov        x19,v26.d[0]
   1545     umov        x20,v27.d[0]
   1546 
        // 4x4 halfword transpose of the a+b results, taking v30, v12, v31, v13
        // as rows 0..3.  First interleave 16-bit lanes of row pairs ...
   1547     trn1        v24.4h, v30.4h, v12.4h
   1548     trn2        v25.4h, v30.4h, v12.4h
   1549     trn1        v26.4h, v31.4h, v13.4h
   1550     trn2        v27.4h, v31.4h, v13.4h
   1551 
        // ... then interleave 32-bit lanes.  Result:
        // v30 = column 0, v12 = column 1, v31 = column 2, v13 = column 3.
   1552     trn1        v30.2s, v24.2s, v26.2s
   1553     trn2        v31.2s, v24.2s, v26.2s
   1554     trn1        v12.2s, v25.2s, v27.2s
   1555     trn2        v13.2s, v25.2s, v27.2s
   1556 
        // Same transpose for the a-b results, taking v14, v18, v15, v19 as rows 0..3.
   1557     trn1        v24.4h, v14.4h, v18.4h
   1558     trn2        v25.4h, v14.4h, v18.4h
   1559     trn1        v26.4h, v15.4h, v19.4h
   1560     trn2        v27.4h, v15.4h, v19.4h
   1561 
        // Result: v14 = column 0, v18 = column 1, v15 = column 2, v19 = column 3.
   1562     trn1        v14.2s, v24.2s, v26.2s
   1563     trn2        v15.2s, v24.2s, v26.2s
   1564     trn1        v18.2s, v25.2s, v27.2s
   1565     trn2        v19.2s, v25.2s, v27.2s
   1566 
        // Put back the parked low halves of v24-v27.
   1567     mov         v24.d[0],x15
   1568     mov         v25.d[0],x16
   1569     mov         v26.d[0],x19
   1570     mov         v27.d[0],x20
   1571 
        // Unlike the shift3 block, both tiles are stored back to back
        // (64 contiguous bytes), columns in the order 0, 2, 1, 3.
   1572     st1         { v30.4h, v31.4h},[x1],#16  // a+b tile: columns 0 and 2
   1573     st1         { v12.4h, v13.4h},[x1],#16  // a+b tile: columns 1 and 3
   1574     st1         { v14.4h, v15.4h},[x1],#16  // a-b tile: columns 0 and 2
   1575     st1         { v18.4h, v19.4h},[x1],#16  // a-b tile: columns 1 and 3
   1576 
   1577     add         x1,x1,#96                   // net effect of this block on x1: +160 bytes
   1578 
   1579     subs        x14,x14,#1                  // x14 counts remaining dct_stage1 iterations
   1580     bne         dct_stage1                  // repeat until x14 reaches zero, else fall into second_stage_dct
   1581 second_stage_dct:               // set up registers for the stage2 loop, then jump into it
        // Entered by fall-through once the dct_stage1 loop counter reaches zero.
   1582 //    mov        x0,x1
        // Pop two register pairs (32 bytes in total) from the stack; the matching
        // pushes are outside this view.  The register notes after this block
        // describe x8/x7 as the prediction/destination strides and x1 as the
        // temp buffer.
   1583     ldp         x8, x7,[sp],#16
   1584     ldp         x0, x1,[sp],#16
   1585 
   1586 //    add x4,x2,x8, lsl #1    @ x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
   1587 //    add x5,x8,x8, lsl #1    @
   1588 //    sub   x0,x0,#512
        // x11, x5, x6 and x9 are thresholds: stage2 compares x12 against each of
        // them (unsigned, "bhs") and branches to stage2_shift1 to skip the
        // remaining loads and multiply-accumulates when x12 is >= the threshold.
   1589     mov         x11,#0xfffffff0
   1590     mov         x5, #0xffffff00
   1591     ldr         w6, x5_addr                 // 32-bit literal at x5_addr (defined elsewhere); upper half of x6 becomes 0
   1592     ldr         w9, x9_addr                 // 32-bit literal at x9_addr (defined elsewhere); upper half of x9 becomes 0
   1593 //    sub         x1,x1,#2048
   1594     mov         x4,x1                       // x4 keeps the read base; dct_stage2 advances it by 32 and reloads x1 from it
   1595     mov         x10,#240                    // post-increment for the 2nd ld1 of each pair in stage2 (16 + 240 = 256 bytes per pair)
   1596     mov         x14,#8                      // loop counter per the register notes; its decrement is outside this view
   1597     b           stage2                      // enter the loop body directly, skipping the x4 += 32 done at dct_stage2
   1598 
   1599 // registers free :
   1600 
   1601 // arm registers used
   1602 // x8 : prediction stride
   1603 // x7 : destination stride
   1604 // x1: temp buffer
   1605 // x2 : pred buffer
   1606 // x3 : destination buffer
   1607 // x14 : loop counter
   1608 //x0 : scratch buffer
   1609 //x10 : used as stride
   1610 // x4 : used to store the initial address
   1611 //x12 : zero cols
   1612 // x11 : 0xfffffff0
   1613 // x5 : 0xffffff00
   1614 dct_stage2:
   1615     add         x4,x4,#32
   1616     mov         x1,x4
   1617 stage2:
   1618     ld1         {v10.4h, v11.4h},[x1],#16
   1619     ld1         {v8.4h, v9.4h},[x1],x10
   1620 
   1621     smull       v24.4s, v8.4h, v0.h[1]     //// y1 * cos1(part of b0)
   1622     smull       v26.4s, v8.4h, v0.h[3]     //// y1 * cos3(part of b1)
   1623     smull       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
   1624     smull       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
   1625 
   1626     smlal       v24.4s, v9.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
   1627     smlal       v26.4s, v9.4h, v2.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
   1628     smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
   1629     smlal       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   1630 
   1631 
   1632 
   1633     smull       v20.4s, v10.4h, v0.h[0]
   1634     smlal       v20.4s, v11.4h, v0.h[2]
   1635 
   1636 
   1637     smull       v22.4s, v10.4h, v0.h[0]
   1638     smlal       v22.4s, v11.4h, v1.h[2]
   1639 
   1640     smull       v16.4s, v10.4h, v0.h[0]
   1641     smlal       v16.4s, v11.4h, v2.h[2]
   1642 
   1643     smull       v18.4s, v10.4h, v0.h[0]
   1644     smlal       v18.4s, v11.4h, v3.h[2]
   1645     cmp         x12,x11
   1646     bhs         stage2_shift1
   1647 
   1648     ld1         {v12.4h, v13.4h},[x1],#16
   1649     ld1         {v14.4h, v15.4h},[x1],x10
   1650 
   1651 
   1652 
   1653 
   1654 
   1655 
   1656     smlal       v24.4s, v14.4h, v1.h[1]
   1657     smlal       v26.4s, v14.4h, v3.h[3]
   1658     smlal       v28.4s, v14.4h, v6.h[1]
   1659     smlsl       v30.4s, v14.4h, v7.h[1]
   1660 
   1661 
   1662     smlal       v24.4s, v15.4h, v1.h[3]
   1663     smlal       v26.4s, v15.4h, v5.h[1]
   1664     smlsl       v28.4s, v15.4h, v7.h[1]
   1665     smlsl       v30.4s, v15.4h, v3.h[3]
   1666 
   1667 
   1668     smlal       v20.4s, v12.4h, v1.h[0]
   1669     smlal       v20.4s, v13.4h, v1.h[2]
   1670     smlal       v22.4s, v12.4h, v3.h[0]
   1671     smlal       v22.4s, v13.4h, v4.h[2]
   1672     smlal       v16.4s, v12.4h, v5.h[0]
   1673     smlal       v16.4s, v13.4h, v7.h[2]
   1674     smlal       v18.4s, v12.4h, v7.h[0]
   1675     smlsl       v18.4s, v13.4h, v5.h[2]
   1676     cmp         x12,x5
   1677     bhs         stage2_shift1
   1678 
   1679     ld1         {v10.4h, v11.4h},[x1],#16
   1680     ld1         {v8.4h, v9.4h},[x1],x10
   1681 
   1682     smlal       v24.4s, v8.4h, v2.h[1]     //// y1 * cos1(part of b0)
   1683     smlal       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
   1684     smlsl       v28.4s, v8.4h, v4.h[3]     //// y1 * sin3(part of b2)
   1685     smlsl       v30.4s, v8.4h, v0.h[1]     //// y1 * sin1(part of b3)
   1686 
   1687     smlal       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
   1688     smlsl       v26.4s, v9.4h, v7.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
   1689     smlsl       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
   1690     smlsl       v30.4s, v9.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   1691 
   1692 
   1693 
   1694 
   1695 
   1696     smlal       v20.4s, v10.4h, v2.h[0]
   1697     smlal       v20.4s, v11.4h, v2.h[2]
   1698 
   1699 
   1700     smlal       v22.4s, v10.4h, v6.h[0]
   1701     smlal       v22.4s, v11.4h, v7.h[2]
   1702 
   1703     smlsl       v16.4s, v10.4h, v6.h[0]
   1704     smlsl       v16.4s, v11.4h, v3.h[2]
   1705 
   1706     smlsl       v18.4s, v10.4h, v2.h[0]
   1707     smlsl       v18.4s, v11.4h, v1.h[2]
   1708 
   1709     cmp         x12,x6
   1710     bhs         stage2_shift1
   1711 
   1712 
   1713     ld1         {v12.4h, v13.4h},[x1],#16
   1714     ld1         {v14.4h, v15.4h},[x1],x10
   1715 
   1716 
   1717 
   1718 
   1719 
   1720     smlal       v24.4s, v14.4h, v3.h[1]
   1721     smlsl       v26.4s, v14.4h, v6.h[1]
   1722     smlsl       v28.4s, v14.4h, v0.h[1]
   1723     smlsl       v30.4s, v14.4h, v6.h[3]
   1724 
   1725 
   1726     smlal       v24.4s, v15.4h, v3.h[3]
   1727     smlsl       v26.4s, v15.4h, v4.h[3]
   1728     smlsl       v28.4s, v15.4h, v2.h[3]
   1729     smlal       v30.4s, v15.4h, v5.h[3]
   1730 
   1731 
   1732     smlal       v20.4s, v12.4h, v3.h[0]
   1733     smlal       v20.4s, v13.4h, v3.h[2]
   1734     smlsl       v22.4s, v12.4h, v7.h[0]
   1735     smlsl       v22.4s, v13.4h, v5.h[2]
   1736     smlsl       v16.4s, v12.4h, v1.h[0]
   1737     smlsl       v16.4s, v13.4h, v1.h[2]
   1738     smlsl       v18.4s, v12.4h, v5.h[0]
   1739     smlal       v18.4s, v13.4h, v7.h[2]
   1740 
   1741     cmp         x12,x9
   1742     bhs         stage2_shift1
   1743 
   1744 
   1745     ld1         {v10.4h, v11.4h},[x1],#16
   1746     ld1         {v8.4h, v9.4h},[x1],x10
   1747 
   1748 
   1749     smlal       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
   1750     smlsl       v26.4s, v8.4h, v3.h[1]     //// y1 * cos3(part of b1)
   1751     smlsl       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
   1752     smlal       v30.4s, v8.4h, v2.h[1]     //// y1 * sin1(part of b3)
   1753 
   1754     smlal       v24.4s, v9.4h, v4.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
   1755     smlsl       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
   1756     smlsl       v28.4s, v9.4h, v7.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
   1757     smlal       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   1758 
   1759 
   1760 
   1761 
   1762 
   1763     smlal       v20.4s, v10.4h, v0.h[0]
   1764     smlal       v20.4s, v11.4h, v4.h[2]
   1765 
   1766 
   1767     smlsl       v22.4s, v10.4h, v0.h[0]
   1768     smlsl       v22.4s, v11.4h, v2.h[2]
   1769 
   1770     smlsl       v16.4s, v10.4h, v0.h[0]
   1771     smlsl       v16.4s, v11.4h, v6.h[2]
   1772 
   1773     smlal       v18.4s, v10.4h, v0.h[0]
   1774     smlal       v18.4s, v11.4h, v0.h[2]
   1775 
   1776     ld1         {v12.4h, v13.4h},[x1],#16
   1777     ld1         {v14.4h, v15.4h},[x1],x10
   1778 
   1779 
   1780 
   1781 
   1782 
   1783     smlal       v24.4s, v14.4h, v5.h[1]
   1784     smlsl       v26.4s, v14.4h, v0.h[2]
   1785     smlal       v28.4s, v14.4h, v5.h[3]
   1786     smlal       v30.4s, v14.4h, v4.h[3]
   1787 
   1788 
   1789     smlal       v24.4s, v15.4h, v5.h[3]
   1790     smlsl       v26.4s, v15.4h, v1.h[1]
   1791     smlal       v28.4s, v15.4h, v3.h[1]
   1792     smlsl       v30.4s, v15.4h, v7.h[3]
   1793 
   1794 
   1795     smlal       v20.4s, v12.4h, v5.h[0]
   1796     smlal       v20.4s, v13.4h, v5.h[2]
   1797     smlsl       v22.4s, v12.4h, v1.h[0]
   1798     smlsl       v22.4s, v13.4h, v0.h[2]
   1799     smlal       v16.4s, v12.4h, v7.h[0]
   1800     smlal       v16.4s, v13.4h, v4.h[2]
   1801     smlal       v18.4s, v12.4h, v3.h[0]
   1802     smlal       v18.4s, v13.4h, v6.h[2]
   1803 
   1804 
   1805     ld1         {v10.4h, v11.4h},[x1],#16
   1806     ld1         {v8.4h, v9.4h},[x1],x10
   1807 
   1808 
   1809 
   1810 
   1811     smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
   1812     smlsl       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
   1813     smlal       v28.4s, v8.4h, v0.h[1]     //// y1 * sin3(part of b2)
   1814     smlsl       v30.4s, v8.4h, v4.h[1]     //// y1 * sin1(part of b3)
   1815 
   1816     smlal       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
   1817     smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
   1818     smlal       v28.4s, v9.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
   1819     smlsl       v30.4s, v9.4h, v0.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   1820 
   1821 
   1822 
   1823 
   1824 
   1825     smlal       v20.4s, v10.4h, v6.h[0]
   1826     smlal       v20.4s, v11.4h, v6.h[2]
   1827 
   1828 
   1829     smlsl       v22.4s, v10.4h, v2.h[0]
   1830     smlsl       v22.4s, v11.4h, v3.h[2]
   1831 
   1832     smlal       v16.4s, v10.4h, v2.h[0]
   1833     smlal       v16.4s, v11.4h, v0.h[2]
   1834 
   1835     smlsl       v18.4s, v10.4h, v6.h[0]
   1836     smlsl       v18.4s, v11.4h, v2.h[2]
   1837 
   1838     ld1         {v12.4h, v13.4h},[x1],#16
   1839     ld1         {v14.4h, v15.4h},[x1],x10
   1840 
   1841     smlal       v24.4s, v14.4h, v7.h[1]
   1842     smlsl       v26.4s, v14.4h, v5.h[3]
   1843     smlal       v28.4s, v14.4h, v4.h[1]
   1844     smlsl       v30.4s, v14.4h, v2.h[3]
   1845 
   1846 
   1847     smlal       v24.4s, v15.4h, v7.h[3]
   1848     smlsl       v26.4s, v15.4h, v7.h[1]
   1849     smlal       v28.4s, v15.4h, v6.h[3]
   1850     smlsl       v30.4s, v15.4h, v6.h[1]
   1851 
   1852 
   1853     smlal       v20.4s, v12.4h, v7.h[0]
   1854     smlal       v20.4s, v13.4h, v7.h[2]
   1855     smlsl       v22.4s, v12.4h, v5.h[0]
   1856     smlsl       v22.4s, v13.4h, v6.h[2]
   1857     smlal       v16.4s, v12.4h, v3.h[0]
   1858     smlal       v16.4s, v13.4h, v5.h[2]
   1859     smlsl       v18.4s, v12.4h, v1.h[0]
   1860     smlsl       v18.4s, v13.4h, v4.h[2]
   1861 
   1862 stage2_shift1:    //// pass tail: a +/- b butterflies, rounding narrow, two 4x4 transposes, 64-byte store at x0
   1863     add         v8.4s,  v20.4s ,  v24.4s    //// a0 + b0
   1864     sub         v10.4s,  v20.4s ,  v24.4s    //// a0 - b0
   1865 
   1866     add         v12.4s,  v22.4s ,  v26.4s    //// a1 + b1
   1867     sub         v24.4s,  v22.4s ,  v26.4s    //// a1 - b1 (v24 reused, b0 already consumed)
   1868 
   1869     add         v14.4s,  v16.4s ,  v28.4s    //// a2 + b2
   1870     sub         v26.4s,  v16.4s ,  v28.4s    //// a2 - b2 (v26 reused, b1 already consumed)
   1871 
   1872 
   1873     add         v16.4s,  v18.4s ,  v30.4s    //// a3 + b3 (v16 reused, a2 already consumed)
   1874     sub         v28.4s,  v18.4s ,  v30.4s    //// a3 - b3 (v28 reused, b2 already consumed)
   1875 
   1876 // sqrshrn = signed saturating rounding shift right + narrow to 16 bit; x0..x7 are local names for the 8 results
   1877     sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = sat16((a0 + b0 + rnd) >> shift_stage2_idct)
   1878     sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = sat16((a0 - b0 + rnd) >> shift_stage2_idct)
   1879     sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = sat16((a2 + b2 + rnd) >> shift_stage2_idct)
   1880     sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = sat16((a2 - b2 + rnd) >> shift_stage2_idct)
   1881     sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = sat16((a1 + b1 + rnd) >> shift_stage2_idct)
   1882     sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = sat16((a1 - b1 + rnd) >> shift_stage2_idct)
   1883     sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = sat16((a3 + b3 + rnd) >> shift_stage2_idct)
   1884     sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = sat16((a3 - b3 + rnd) >> shift_stage2_idct)
   1885 // v24-v27 are borrowed as transpose scratch: park their low 64 bits in x15/x16/x19/x20, restore them afterwards.
   1886 // NOTE(review): x19/x20 are callee-saved under AAPCS64 - presumably preserved by the prologue (not visible here); confirm.
   1887     umov        x15,v24.d[0]
   1888     umov        x16,v25.d[0]
   1889     umov        x19,v26.d[0]
   1890     umov        x20,v27.d[0]
   1891 // 4x4 halfword transpose of x0..x3 (v30,v12,v31,v13): interleave 16-bit elements, then 32-bit elements
   1892     trn1        v24.4h, v30.4h, v12.4h
   1893     trn2        v25.4h, v30.4h, v12.4h
   1894     trn1        v26.4h, v31.4h, v13.4h
   1895     trn2        v27.4h, v31.4h, v13.4h
   1896 // result: v30,v12,v31,v13 = lanes 0,1,2,3 of (x0,x1,x2,x3)
   1897     trn1        v30.2s, v24.2s, v26.2s
   1898     trn2        v31.2s, v24.2s, v26.2s
   1899     trn1        v12.2s, v25.2s, v27.2s
   1900     trn2        v13.2s, v25.2s, v27.2s
   1901 // same transpose for x4..x7 (v14,v18,v15,v19)
   1902     trn1        v24.4h, v14.4h, v18.4h
   1903     trn2        v25.4h, v14.4h, v18.4h
   1904     trn1        v26.4h, v15.4h, v19.4h
   1905     trn2        v27.4h, v15.4h, v19.4h
   1906 // result: v14,v18,v15,v19 = lanes 0,1,2,3 of (x4,x5,x6,x7)
   1907     trn1        v14.2s, v24.2s, v26.2s
   1908     trn2        v15.2s, v24.2s, v26.2s
   1909     trn1        v18.2s, v25.2s, v27.2s
   1910     trn2        v19.2s, v25.2s, v27.2s
   1911 // put back the parked low 64 bits of v24-v27
   1912     mov         v24.d[0],x15
   1913     mov         v25.d[0],x16
   1914     mov         v26.d[0],x19
   1915     mov         v27.d[0],x20
   1916 // four 16-byte stores; x0 is post-incremented by 64 bytes in total
   1917     st1         { v30.4h, v31.4h},[x0],#16    //// lanes 0 and 2 of x0..x3
   1918     st1         { v12.4h, v13.4h},[x0],#16    //// lanes 1 and 3 of x0..x3
   1919     st1         { v14.4h, v15.4h},[x0],#16    //// lanes 0 and 2 of x4..x7
   1920     st1         { v18.4h, v19.4h},[x0],#16    //// lanes 1 and 3 of x4..x7
   1921 // Next pass: rewind the input pointer and rebuild a0..a3 (v20,v22,v16,v18) and b0..b3 (v24,v26,v28,v30).
   1922     mov         x1,x4                      //// x1 = address kept in x4
   1923 // Input is consumed in groups of four 4-halfword vectors (two ld1 pairs); within a
   1924 // group x1 advances by 16 bytes after the first pair and by x10 after the second.
   1925 // The first pair of a group feeds a0..a3, the second pair feeds b0..b3. Multipliers
   1926 // are lanes of v0-v7, set up outside this section (presumably the coefficient table).
   1927 // After groups 1-4 the code leaves early for stage2_shift2 when x12 >= x11/x5/x6/x9
   1928 // (unsigned). NOTE(review): x12 and the limits come from outside this view - confirm meaning.
   1929     ld1         {v10.4h, v11.4h},[x1],#16
   1930     ld1         {v8.4h, v9.4h},[x1],x10
   1931 
   1932 // group 1: smull opens fresh accumulators (nothing is carried over from the previous pass)
   1933     smull       v24.4s, v8.4h, v2.h[1]     //// b0  = v8 * coeff
   1934     smull       v26.4s, v8.4h, v2.h[3]     //// b1  = v8 * coeff
   1935     smull       v28.4s, v8.4h, v3.h[1]     //// b2  = v8 * coeff
   1936     smull       v30.4s, v8.4h, v3.h[3]     //// b3  = v8 * coeff
   1937 
   1938     smlal       v24.4s, v9.4h, v6.h[3]     //// b0 += v9 * coeff
   1939     smlsl       v26.4s, v9.4h, v7.h[3]     //// b1 -= v9 * coeff
   1940     smlsl       v28.4s, v9.4h, v6.h[1]     //// b2 -= v9 * coeff
   1941     smlsl       v30.4s, v9.4h, v4.h[3]     //// b3 -= v9 * coeff
   1942 
   1943 
   1944 
   1945 
   1946 // a0..a3 all start from v10 * v0.h[0]; v11 then adds a different coefficient lane to each
   1947     smull       v20.4s, v10.4h, v0.h[0]
   1948     smlal       v20.4s, v11.4h, v4.h[2]
   1949 
   1950 
   1951     smull       v22.4s, v10.4h, v0.h[0]
   1952     smlal       v22.4s, v11.4h, v5.h[2]
   1953 
   1954     smull       v16.4s, v10.4h, v0.h[0]
   1955     smlal       v16.4s, v11.4h, v6.h[2]
   1956 
   1957     smull       v18.4s, v10.4h, v0.h[0]
   1958     smlal       v18.4s, v11.4h, v7.h[2]
   1959 
   1960     cmp         x12,x11
   1961     bhs         stage2_shift2              //// x12 >= x11 (unsigned): skip groups 2-8
   1962 
   1963     ld1         {v12.4h, v13.4h},[x1],#16
   1964     ld1         {v14.4h, v15.4h},[x1],x10
   1965 
   1966 // group 2: v14/v15 -> b0..b3, v12/v13 -> a0..a3
   1967     smlsl       v24.4s, v14.4h, v4.h[3]
   1968     smlsl       v26.4s, v14.4h, v2.h[1]
   1969     smlsl       v28.4s, v14.4h, v0.h[1]
   1970     smlsl       v30.4s, v14.4h, v2.h[3]
   1971 
   1972 
   1973     smlsl       v24.4s, v15.4h, v0.h[3]
   1974     smlsl       v26.4s, v15.4h, v3.h[1]
   1975     smlsl       v28.4s, v15.4h, v6.h[3]
   1976     smlal       v30.4s, v15.4h, v5.h[3]
   1977 
   1978 
   1979     smlsl       v20.4s, v12.4h, v7.h[0]
   1980     smlsl       v20.4s, v13.4h, v2.h[2]
   1981     smlsl       v22.4s, v12.4h, v5.h[0]
   1982     smlsl       v22.4s, v13.4h, v0.h[2]
   1983     smlsl       v16.4s, v12.4h, v3.h[0]
   1984     smlsl       v16.4s, v13.4h, v3.h[2]
   1985     smlsl       v18.4s, v12.4h, v1.h[0]
   1986     smlsl       v18.4s, v13.4h, v6.h[2]
   1987 
   1988     cmp         x12,x5
   1989     bhs         stage2_shift2              //// x12 >= x5 (unsigned): skip groups 3-8
   1990 
   1991     ld1         {v10.4h, v11.4h},[x1],#16
   1992     ld1         {v8.4h, v9.4h},[x1],x10
   1993 
   1994 
   1995 
   1996 
   1997 // group 3: v8/v9 -> b0..b3, v10/v11 -> a0..a3
   1998     smlsl       v24.4s, v8.4h, v4.h[1]     //// b0 -= v8 * coeff
   1999     smlal       v26.4s, v8.4h, v7.h[1]     //// b1 += v8 * coeff
   2000     smlal       v28.4s, v8.4h, v2.h[3]     //// b2 += v8 * coeff
   2001     smlal       v30.4s, v8.4h, v1.h[3]     //// b3 += v8 * coeff
   2002 
   2003     smlal       v24.4s, v9.4h, v7.h[1]     //// b0 += v9 * coeff
   2004     smlal       v26.4s, v9.4h, v1.h[3]     //// b1 += v9 * coeff
   2005     smlal       v28.4s, v9.4h, v3.h[3]     //// b2 += v9 * coeff
   2006     smlsl       v30.4s, v9.4h, v6.h[3]     //// b3 -= v9 * coeff
   2007 
   2008 
   2009 
   2010 
   2011 
   2012     smlsl       v20.4s, v10.4h, v2.h[0]
   2013     smlsl       v20.4s, v11.4h, v6.h[2]
   2014 
   2015 
   2016     smlsl       v22.4s, v10.4h, v6.h[0]
   2017     smlal       v22.4s, v11.4h, v4.h[2]
   2018 
   2019     smlal       v16.4s, v10.4h, v6.h[0]
   2020     smlal       v16.4s, v11.4h, v0.h[2]
   2021 
   2022     smlal       v18.4s, v10.4h, v2.h[0]
   2023     smlal       v18.4s, v11.4h, v5.h[2]
   2024 
   2025     cmp         x12,x6
   2026     bhs         stage2_shift2              //// x12 >= x6 (unsigned): skip groups 4-8
   2027 
   2028 
   2029     ld1         {v12.4h, v13.4h},[x1],#16
   2030     ld1         {v14.4h, v15.4h},[x1],x10
   2031 
   2032 
   2033 
   2034 
   2035 
   2036 // group 4: v14/v15 -> b0..b3, v12/v13 -> a0..a3
   2037     smlal       v24.4s, v14.4h, v2.h[3]
   2038     smlal       v26.4s, v14.4h, v3.h[3]
   2039     smlsl       v28.4s, v14.4h, v5.h[3]
   2040     smlsl       v30.4s, v14.4h, v0.h[3]
   2041 
   2042 
   2043     smlal       v24.4s, v15.4h, v1.h[3]
   2044     smlsl       v26.4s, v15.4h, v6.h[3]
   2045     smlsl       v28.4s, v15.4h, v0.h[3]
   2046     smlal       v30.4s, v15.4h, v7.h[3]
   2047 
   2048 
   2049     smlal       v20.4s, v12.4h, v5.h[0]
   2050     smlal       v20.4s, v13.4h, v0.h[2]
   2051     smlal       v22.4s, v12.4h, v1.h[0]
   2052     smlal       v22.4s, v13.4h, v6.h[2]
   2053     smlal       v16.4s, v12.4h, v7.h[0]
   2054     smlsl       v16.4s, v13.4h, v2.h[2]
   2055     smlsl       v18.4s, v12.4h, v3.h[0]
   2056     smlsl       v18.4s, v13.4h, v4.h[2]
   2057 
   2058     cmp         x12,x9
   2059     bhs         stage2_shift2              //// x12 >= x9 (unsigned): skip groups 5-8
   2060 
   2061 
   2062     ld1         {v10.4h, v11.4h},[x1],#16
   2063     ld1         {v8.4h, v9.4h},[x1],x10
   2064 
   2065 
   2066 // group 5: v8/v9 -> b0..b3, v10/v11 -> a0..a3 (no further early exits: groups 5-8 always run together)
   2067     smlal       v24.4s, v8.4h, v6.h[1]     //// b0 += v8 * coeff
   2068     smlsl       v26.4s, v8.4h, v1.h[1]     //// b1 -= v8 * coeff
   2069     smlsl       v28.4s, v8.4h, v7.h[1]     //// b2 -= v8 * coeff
   2070     smlal       v30.4s, v8.4h, v0.h[3]     //// b3 += v8 * coeff
   2071 
   2072     smlsl       v24.4s, v9.4h, v5.h[1]     //// b0 -= v9 * coeff
   2073     smlsl       v26.4s, v9.4h, v4.h[1]     //// b1 -= v9 * coeff
   2074     smlal       v28.4s, v9.4h, v2.h[1]     //// b2 += v9 * coeff
   2075     smlal       v30.4s, v9.4h, v7.h[1]     //// b3 += v9 * coeff
   2076 
   2077 
   2078 
   2079 
   2080 
   2081     smlal       v20.4s, v10.4h, v0.h[0]
   2082     smlsl       v20.4s, v11.4h, v7.h[2]
   2083 
   2084 
   2085     smlsl       v22.4s, v10.4h, v0.h[0]
   2086     smlsl       v22.4s, v11.4h, v1.h[2]
   2087 
   2088     smlsl       v16.4s, v10.4h, v0.h[0]
   2089     smlal       v16.4s, v11.4h, v5.h[2]
   2090 
   2091     smlal       v18.4s, v10.4h, v0.h[0]
   2092     smlal       v18.4s, v11.4h, v3.h[2]
   2093 
   2094     ld1         {v12.4h, v13.4h},[x1],#16
   2095     ld1         {v14.4h, v15.4h},[x1],x10
   2096 
   2097 
   2098 
   2099 // group 6: v14/v15 -> b0..b3, v12/v13 -> a0..a3
   2100     smlsl       v24.4s, v14.4h, v0.h[1]
   2101     smlal       v26.4s, v14.4h, v6.h[1]
   2102     smlal       v28.4s, v14.4h, v4.h[1]
   2103     smlsl       v30.4s, v14.4h, v1.h[1]
   2104 
   2105 
   2106     smlsl       v24.4s, v15.4h, v3.h[3]
   2107     smlal       v26.4s, v15.4h, v0.h[1]
   2108     smlsl       v28.4s, v15.4h, v5.h[1]
   2109     smlsl       v30.4s, v15.4h, v6.h[1]
   2110 
   2111 
   2112     smlsl       v20.4s, v12.4h, v3.h[0]
   2113     smlsl       v20.4s, v13.4h, v1.h[2]
   2114     smlsl       v22.4s, v12.4h, v7.h[0]
   2115     smlal       v22.4s, v13.4h, v3.h[2]
   2116     smlal       v16.4s, v12.4h, v1.h[0]
   2117     smlal       v16.4s, v13.4h, v7.h[2]
   2118     smlsl       v18.4s, v12.4h, v5.h[0]
   2119     smlsl       v18.4s, v13.4h, v2.h[2]
   2120 
   2121 
   2122     ld1         {v10.4h, v11.4h},[x1],#16
   2123     ld1         {v8.4h, v9.4h},[x1],x10
   2124 
   2125 // group 7: v8/v9 -> b0..b3, v10/v11 -> a0..a3
   2126     smlal       v24.4s, v8.4h, v7.h[3]     //// b0 += v8 * coeff
   2127     smlal       v26.4s, v8.4h, v4.h[3]     //// b1 += v8 * coeff
   2128     smlsl       v28.4s, v8.4h, v1.h[1]     //// b2 -= v8 * coeff
   2129     smlal       v30.4s, v8.4h, v2.h[1]     //// b3 += v8 * coeff
   2130 
   2131     smlal       v24.4s, v9.4h, v3.h[1]     //// b0 += v9 * coeff
   2132     smlsl       v26.4s, v9.4h, v5.h[3]     //// b1 -= v9 * coeff
   2133     smlsl       v28.4s, v9.4h, v7.h[3]     //// b2 -= v9 * coeff
   2134     smlal       v30.4s, v9.4h, v5.h[1]     //// b3 += v9 * coeff
   2135 
   2136 
   2137 
   2138 
   2139 
   2140     smlsl       v20.4s, v10.4h, v6.h[0]
   2141     smlal       v20.4s, v11.4h, v5.h[2]
   2142 
   2143 
   2144     smlal       v22.4s, v10.4h, v2.h[0]
   2145     smlal       v22.4s, v11.4h, v7.h[2]
   2146 
   2147     smlsl       v16.4s, v10.4h, v2.h[0]
   2148     smlsl       v16.4s, v11.4h, v4.h[2]
   2149 
   2150     smlal       v18.4s, v10.4h, v6.h[0]
   2151     smlal       v18.4s, v11.4h, v1.h[2]
   2152 
   2153 
   2154     ld1         {v12.4h, v13.4h},[x1],#16
   2155     ld1         {v14.4h, v15.4h},[x1],x10
   2156 
   2157 
   2158 // group 8 (last): v14/v15 -> b0..b3, v12/v13 -> a0..a3; execution then falls through to stage2_shift2
   2159     smlal       v24.4s, v14.4h, v1.h[1]
   2160     smlsl       v26.4s, v14.4h, v0.h[3]
   2161     smlal       v28.4s, v14.4h, v1.h[3]
   2162     smlsl       v30.4s, v14.4h, v3.h[1]
   2163 
   2164 
   2165     smlal       v24.4s, v15.4h, v5.h[3]
   2166     smlsl       v26.4s, v15.4h, v5.h[1]
   2167     smlal       v28.4s, v15.4h, v4.h[3]
   2168     smlsl       v30.4s, v15.4h, v4.h[1]
   2169 
   2170 
   2171     smlal       v20.4s, v12.4h, v1.h[0]
   2172     smlal       v20.4s, v13.4h, v3.h[2]
   2173     smlsl       v22.4s, v12.4h, v3.h[0]
   2174     smlsl       v22.4s, v13.4h, v2.h[2]
   2175     smlal       v16.4s, v12.4h, v5.h[0]
   2176     smlal       v16.4s, v13.4h, v1.h[2]
   2177     smlsl       v18.4s, v12.4h, v7.h[0]
   2178     smlsl       v18.4s, v13.4h, v0.h[2]
   2179 
   2180 stage2_shift2:    //// pass tail: a +/- b butterflies, rounding narrow, two 4x4 transposes, 64-byte store at x0
   2181     add         v8.4s,  v20.4s ,  v24.4s    //// a0 + b0
   2182     sub         v10.4s,  v20.4s ,  v24.4s    //// a0 - b0
   2183 
   2184     add         v12.4s,  v22.4s ,  v26.4s    //// a1 + b1
   2185     sub         v24.4s,  v22.4s ,  v26.4s    //// a1 - b1 (v24 reused, b0 already consumed)
   2186 
   2187     add         v14.4s,  v16.4s ,  v28.4s    //// a2 + b2
   2188     sub         v26.4s,  v16.4s ,  v28.4s    //// a2 - b2 (v26 reused, b1 already consumed)
   2189 
   2190 
   2191     add         v16.4s,  v18.4s ,  v30.4s    //// a3 + b3 (v16 reused, a2 already consumed)
   2192     sub         v28.4s,  v18.4s ,  v30.4s    //// a3 - b3 (v28 reused, b2 already consumed)
   2193 
   2194 // sqrshrn = signed saturating rounding shift right + narrow to 16 bit; x0..x7 are local names for the 8 results
   2195     sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = sat16((a0 + b0 + rnd) >> shift_stage2_idct)
   2196     sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = sat16((a0 - b0 + rnd) >> shift_stage2_idct)
   2197     sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = sat16((a2 + b2 + rnd) >> shift_stage2_idct)
   2198     sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = sat16((a2 - b2 + rnd) >> shift_stage2_idct)
   2199     sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = sat16((a1 + b1 + rnd) >> shift_stage2_idct)
   2200     sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = sat16((a1 - b1 + rnd) >> shift_stage2_idct)
   2201     sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = sat16((a3 + b3 + rnd) >> shift_stage2_idct)
   2202     sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = sat16((a3 - b3 + rnd) >> shift_stage2_idct)
   2203 // v24-v27 are borrowed as transpose scratch: park their low 64 bits in x15/x16/x19/x20, restore them afterwards.
   2204     umov        x15,v24.d[0]
   2205     umov        x16,v25.d[0]
   2206     umov        x19,v26.d[0]                //// NOTE(review): x19/x20 are callee-saved (AAPCS64) - presumably saved by the prologue; confirm
   2207     umov        x20,v27.d[0]
   2208 // 4x4 halfword transpose of x0..x3 (v30,v12,v31,v13): interleave 16-bit elements, then 32-bit elements
   2209     trn1        v24.4h, v30.4h, v12.4h
   2210     trn2        v25.4h, v30.4h, v12.4h
   2211     trn1        v26.4h, v31.4h, v13.4h
   2212     trn2        v27.4h, v31.4h, v13.4h
   2213 // result: v30,v12,v31,v13 = lanes 0,1,2,3 of (x0,x1,x2,x3)
   2214     trn1        v30.2s, v24.2s, v26.2s
   2215     trn2        v31.2s, v24.2s, v26.2s
   2216     trn1        v12.2s, v25.2s, v27.2s
   2217     trn2        v13.2s, v25.2s, v27.2s
   2218 // same transpose for x4..x7 (v14,v18,v15,v19)
   2219     trn1        v24.4h, v14.4h, v18.4h
   2220     trn2        v25.4h, v14.4h, v18.4h
   2221     trn1        v26.4h, v15.4h, v19.4h
   2222     trn2        v27.4h, v15.4h, v19.4h
   2223 // result: v14,v18,v15,v19 = lanes 0,1,2,3 of (x4,x5,x6,x7)
   2224     trn1        v14.2s, v24.2s, v26.2s
   2225     trn2        v15.2s, v24.2s, v26.2s
   2226     trn1        v18.2s, v25.2s, v27.2s
   2227     trn2        v19.2s, v25.2s, v27.2s
   2228 // put back the parked low 64 bits of v24-v27
   2229     mov         v24.d[0],x15
   2230     mov         v25.d[0],x16
   2231     mov         v26.d[0],x19
   2232     mov         v27.d[0],x20
   2233 // four 16-byte stores; x0 is post-incremented by 64 bytes in total
   2234     st1         { v30.4h, v31.4h},[x0],#16    //// lanes 0 and 2 of x0..x3
   2235     st1         { v12.4h, v13.4h},[x0],#16    //// lanes 1 and 3 of x0..x3
   2236     st1         { v14.4h, v15.4h},[x0],#16    //// lanes 0 and 2 of x4..x7
   2237     st1         { v18.4h, v19.4h},[x0],#16    //// lanes 1 and 3 of x4..x7
   2238 // Next pass: rewind the input pointer and rebuild a0..a3 (v20,v22,v16,v18) and b0..b3 (v24,v26,v28,v30).
   2239 // Input is consumed in groups of four 4-halfword vectors (two ld1 pairs per group).
   2240     mov         x1,x4                      //// x1 = address kept in x4
   2241 // Within a group x1 advances by 16 bytes after the first pair and by x10 after the second.
   2242 // The first pair feeds a0..a3, the second pair feeds b0..b3; multipliers are lanes of v0-v7
   2243 // (set up outside this section). After groups 1-4 the code leaves early for stage2_shift3 when
   2244 // x12 >= x11/x5/x6/x9 (unsigned). NOTE(review): meaning of x12 and the limits is not visible here - confirm.
   2245     ld1         {v10.4h, v11.4h},[x1],#16
   2246     ld1         {v8.4h, v9.4h},[x1],x10
   2247 // group 1: smull opens fresh accumulators (nothing is carried over from the previous pass)
   2248     smull       v24.4s, v8.4h, v4.h[1]     //// b0  = v8 * coeff
   2249     smull       v26.4s, v8.4h, v4.h[3]     //// b1  = v8 * coeff
   2250     smull       v28.4s, v8.4h, v5.h[1]     //// b2  = v8 * coeff
   2251     smull       v30.4s, v8.4h, v5.h[3]     //// b3  = v8 * coeff
   2252 
   2253     smlsl       v24.4s, v9.4h, v3.h[1]     //// b0 -= v9 * coeff
   2254     smlsl       v26.4s, v9.4h, v1.h[3]     //// b1 -= v9 * coeff
   2255     smlsl       v28.4s, v9.4h, v0.h[2]     //// b2 -= v9 * coeff
   2256     smlsl       v30.4s, v9.4h, v1.h[1]     //// b3 -= v9 * coeff
   2257 
   2258 
   2259 
   2260 
   2261 // a0..a3 all start from v10 * v0.h[0]; v11 then subtracts a different coefficient lane from each
   2262     smull       v20.4s, v10.4h, v0.h[0]
   2263     smlsl       v20.4s, v11.4h, v7.h[2]
   2264 
   2265 
   2266     smull       v22.4s, v10.4h, v0.h[0]
   2267     smlsl       v22.4s, v11.4h, v6.h[2]
   2268 
   2269     smull       v16.4s, v10.4h, v0.h[0]
   2270     smlsl       v16.4s, v11.4h, v5.h[2]
   2271 
   2272     smull       v18.4s, v10.4h, v0.h[0]
   2273     smlsl       v18.4s, v11.4h, v4.h[2]
   2274 
   2275     cmp         x12,x11
   2276     bhs         stage2_shift3              //// x12 >= x11 (unsigned): skip groups 2-8
   2277 
   2278     ld1         {v12.4h, v13.4h},[x1],#16
   2279     ld1         {v14.4h, v15.4h},[x1],x10
   2280 // group 2: v14/v15 -> b0..b3, v12/v13 -> a0..a3
   2281     smlsl       v24.4s, v14.4h, v5.h[1]
   2282     smlsl       v26.4s, v14.4h, v7.h[3]
   2283     smlal       v28.4s, v14.4h, v5.h[3]
   2284     smlal       v30.4s, v14.4h, v3.h[1]
   2285 
   2286 
   2287     smlal       v24.4s, v15.4h, v2.h[1]
   2288     smlal       v26.4s, v15.4h, v1.h[1]
   2289     smlal       v28.4s, v15.4h, v4.h[3]
   2290     smlsl       v30.4s, v15.4h, v7.h[3]
   2291 
   2292 
   2293     smlsl       v20.4s, v12.4h, v1.h[0]
   2294     smlal       v20.4s, v13.4h, v6.h[2]
   2295     smlsl       v22.4s, v12.4h, v3.h[0]
   2296     smlal       v22.4s, v13.4h, v3.h[2]
   2297     smlsl       v16.4s, v12.4h, v5.h[0]
   2298     smlal       v16.4s, v13.4h, v0.h[2]
   2299     smlsl       v18.4s, v12.4h, v7.h[0]
   2300     smlal       v18.4s, v13.4h, v2.h[2]
   2301 
   2302     cmp         x12,x5
   2303     bhs         stage2_shift3              //// x12 >= x5 (unsigned): skip groups 3-8
   2304 
   2305     ld1         {v10.4h, v11.4h},[x1],#16
   2306     ld1         {v8.4h, v9.4h},[x1],x10
   2307 
   2308 
   2309 // group 3: v8/v9 -> b0..b3, v10/v11 -> a0..a3
   2310     smlal       v24.4s, v8.4h, v6.h[1]     //// b0 += v8 * coeff
   2311     smlsl       v26.4s, v8.4h, v5.h[1]     //// b1 -= v8 * coeff
   2312     smlsl       v28.4s, v8.4h, v0.h[3]     //// b2 -= v8 * coeff
   2313     smlsl       v30.4s, v8.4h, v3.h[3]     //// b3 -= v8 * coeff
   2314 
   2315     smlsl       v24.4s, v9.4h, v1.h[1]     //// b0 -= v9 * coeff
   2316     smlsl       v26.4s, v9.4h, v4.h[1]     //// b1 -= v9 * coeff
   2317     smlal       v28.4s, v9.4h, v6.h[1]     //// b2 += v9 * coeff
   2318     smlal       v30.4s, v9.4h, v0.h[1]     //// b3 += v9 * coeff
   2319 
   2320 
   2321 
   2322 
   2323 
   2324     smlal       v20.4s, v10.4h, v2.h[0]
   2325     smlsl       v20.4s, v11.4h, v5.h[2]
   2326 
   2327 
   2328     smlal       v22.4s, v10.4h, v6.h[0]
   2329     smlsl       v22.4s, v11.4h, v0.h[2]
   2330 
   2331     smlsl       v16.4s, v10.4h, v6.h[0]
   2332     smlsl       v16.4s, v11.4h, v4.h[2]
   2333 
   2334     smlsl       v18.4s, v10.4h, v2.h[0]
   2335     smlal       v18.4s, v11.4h, v6.h[2]
   2336 
   2337     cmp         x12,x6
   2338     bhs         stage2_shift3              //// x12 >= x6 (unsigned): skip groups 4-8
   2339 
   2340     ld1         {v12.4h, v13.4h},[x1],#16
   2341     ld1         {v14.4h, v15.4h},[x1],x10
   2342 
   2343 
   2344 
   2345 
   2346 // group 4: v14/v15 -> b0..b3, v12/v13 -> a0..a3
   2347     smlsl       v24.4s, v14.4h, v7.h[1]
   2348     smlal       v26.4s, v14.4h, v2.h[1]
   2349     smlal       v28.4s, v14.4h, v4.h[1]
   2350     smlsl       v30.4s, v14.4h, v5.h[1]
   2351 
   2352 
   2353     smlal       v24.4s, v15.4h, v0.h[3]
   2354     smlal       v26.4s, v15.4h, v7.h[1]
   2355     smlsl       v28.4s, v15.4h, v1.h[1]
   2356     smlsl       v30.4s, v15.4h, v6.h[1]
   2357 
   2358 
   2359     smlsl       v20.4s, v12.4h, v3.h[0]
   2360     smlal       v20.4s, v13.4h, v4.h[2]
   2361     smlal       v22.4s, v12.4h, v7.h[0]
   2362     smlal       v22.4s, v13.4h, v2.h[2]
   2363     smlal       v16.4s, v12.4h, v1.h[0]
   2364     smlsl       v16.4s, v13.4h, v6.h[2]
   2365     smlal       v18.4s, v12.4h, v5.h[0]
   2366     smlsl       v18.4s, v13.4h, v0.h[2]
   2367 
   2368     cmp         x12,x9
   2369     bhs         stage2_shift3              //// x12 >= x9 (unsigned): skip groups 5-8
   2370 
   2371 
   2372     ld1         {v10.4h, v11.4h},[x1],#16
   2373     ld1         {v8.4h, v9.4h},[x1],x10
   2374 
   2375 // group 5: v8/v9 -> b0..b3, v10/v11 -> a0..a3 (no further early exits: groups 5-8 always run together)
   2376     smlsl       v24.4s, v8.4h, v7.h[3]     //// b0 -= v8 * coeff
   2377     smlsl       v26.4s, v8.4h, v0.h[1]     //// b1 -= v8 * coeff
   2378     smlal       v28.4s, v8.4h, v6.h[3]     //// b2 += v8 * coeff
   2379     smlal       v30.4s, v8.4h, v1.h[3]     //// b3 += v8 * coeff
   2380 
   2381     smlsl       v24.4s, v9.4h, v0.h[1]     //// b0 -= v9 * coeff
   2382     smlal       v26.4s, v9.4h, v5.h[3]     //// b1 += v9 * coeff
   2383     smlal       v28.4s, v9.4h, v3.h[3]     //// b2 += v9 * coeff
   2384     smlsl       v30.4s, v9.4h, v2.h[3]     //// b3 -= v9 * coeff
   2385 
   2386 
   2387 
   2388 
   2389 
   2390     smlal       v20.4s, v10.4h, v0.h[0]
   2391     smlsl       v20.4s, v11.4h, v3.h[2]
   2392 
   2393 
   2394     smlsl       v22.4s, v10.4h, v0.h[0]
   2395     smlsl       v22.4s, v11.4h, v5.h[2]
   2396 
   2397     smlsl       v16.4s, v10.4h, v0.h[0]
   2398     smlal       v16.4s, v11.4h, v1.h[2]
   2399 
   2400     smlal       v18.4s, v10.4h, v0.h[0]
   2401     smlal       v18.4s, v11.4h, v7.h[2]
   2402 
   2403     ld1         {v12.4h, v13.4h},[x1],#16
   2404     ld1         {v14.4h, v15.4h},[x1],x10
   2405 
   2406 
   2407 
   2408 // group 6: v14/v15 -> b0..b3, v12/v13 -> a0..a3
   2409     smlal       v24.4s, v14.4h, v6.h[3]
   2410     smlal       v26.4s, v14.4h, v3.h[3]
   2411     smlsl       v28.4s, v14.4h, v1.h[3]
   2412     smlal       v30.4s, v14.4h, v7.h[1]
   2413 
   2414 
   2415     smlal       v24.4s, v15.4h, v1.h[3]
   2416     smlsl       v26.4s, v15.4h, v2.h[3]
   2417     smlal       v28.4s, v15.4h, v7.h[1]
   2418     smlal       v30.4s, v15.4h, v4.h[1]
   2419 
   2420 
   2421     smlsl       v20.4s, v12.4h, v5.h[0]
   2422     smlal       v20.4s, v13.4h, v2.h[2]
   2423     smlal       v22.4s, v12.4h, v1.h[0]
   2424     smlsl       v22.4s, v13.4h, v7.h[2]
   2425     smlsl       v16.4s, v12.4h, v7.h[0]
   2426     smlsl       v16.4s, v13.4h, v3.h[2]
   2427     smlsl       v18.4s, v12.4h, v3.h[0]
   2428     smlal       v18.4s, v13.4h, v1.h[2]
   2429 
   2430 
   2431     ld1         {v10.4h, v11.4h},[x1],#16
   2432     ld1         {v8.4h, v9.4h},[x1],x10
   2433 
   2434 // group 7: v8/v9 -> b0..b3, v10/v11 -> a0..a3
   2435     smlsl       v24.4s, v8.4h, v5.h[3]     //// b0 -= v8 * coeff
   2436     smlsl       v26.4s, v8.4h, v6.h[3]     //// b1 -= v8 * coeff
   2437     smlal       v28.4s, v8.4h, v3.h[1]     //// b2 += v8 * coeff
   2438     smlsl       v30.4s, v8.4h, v0.h[1]     //// b3 -= v8 * coeff
   2439 
   2440     smlsl       v24.4s, v9.4h, v2.h[3]     //// b0 -= v9 * coeff
   2441     smlal       v26.4s, v9.4h, v0.h[1]     //// b1 += v9 * coeff
   2442     smlsl       v28.4s, v9.4h, v2.h[1]     //// b2 -= v9 * coeff
   2443     smlal       v30.4s, v9.4h, v4.h[3]     //// b3 += v9 * coeff
   2444 
   2445 
   2446 
   2447 
   2448 
   2449     smlal       v20.4s, v10.4h, v6.h[0]
   2450     smlsl       v20.4s, v11.4h, v1.h[2]
   2451 
   2452 
   2453     smlsl       v22.4s, v10.4h, v2.h[0]
   2454     smlal       v22.4s, v11.4h, v4.h[2]
   2455 
   2456     smlal       v16.4s, v10.4h, v2.h[0]
   2457     smlsl       v16.4s, v11.4h, v7.h[2]
   2458 
   2459     smlsl       v18.4s, v10.4h, v6.h[0]
   2460     smlsl       v18.4s, v11.4h, v5.h[2]
   2461 
   2462     ld1         {v12.4h, v13.4h},[x1],#16
   2463     ld1         {v14.4h, v15.4h},[x1],x10
   2464 
   2465 
   2466 // group 8 (last): v14/v15 -> b0..b3, v12/v13 -> a0..a3; execution then falls through to stage2_shift3
   2467     smlal       v24.4s, v14.4h, v4.h[3]
   2468     smlsl       v26.4s, v14.4h, v6.h[1]
   2469     smlal       v28.4s, v14.4h, v7.h[3]
   2470     smlal       v30.4s, v14.4h, v6.h[3]
   2471 
   2472 
   2473     smlal       v24.4s, v15.4h, v3.h[3]
   2474     smlsl       v26.4s, v15.4h, v3.h[1]
   2475     smlal       v28.4s, v15.4h, v2.h[3]
   2476     smlsl       v30.4s, v15.4h, v2.h[1]
   2477 
   2478 
   2479     smlsl       v20.4s, v12.4h, v7.h[0]
   2480     smlal       v20.4s, v13.4h, v0.h[2]
   2481     smlal       v22.4s, v12.4h, v5.h[0]
   2482     smlsl       v22.4s, v13.4h, v1.h[2]
   2483     smlsl       v16.4s, v12.4h, v3.h[0]
   2484     smlal       v16.4s, v13.4h, v2.h[2]
   2485     smlal       v18.4s, v12.4h, v1.h[0]
   2486     smlsl       v18.4s, v13.4h, v3.h[2]
   2487 
   2488 stage2_shift3:    //// pass tail: a +/- b butterflies, rounding narrow, two 4x4 transposes, 64-byte store at x0
   2489     add         v8.4s,  v20.4s ,  v24.4s    //// a0 + b0
   2490     sub         v10.4s,  v20.4s ,  v24.4s    //// a0 - b0
   2491 
   2492     add         v12.4s,  v22.4s ,  v26.4s    //// a1 + b1
   2493     sub         v24.4s,  v22.4s ,  v26.4s    //// a1 - b1 (v24 reused, b0 already consumed)
   2494 
   2495     add         v14.4s,  v16.4s ,  v28.4s    //// a2 + b2
   2496     sub         v26.4s,  v16.4s ,  v28.4s    //// a2 - b2 (v26 reused, b1 already consumed)
   2497 
   2498 
   2499     add         v16.4s,  v18.4s ,  v30.4s    //// a3 + b3 (v16 reused, a2 already consumed)
   2500     sub         v28.4s,  v18.4s ,  v30.4s    //// a3 - b3 (v28 reused, b2 already consumed)
   2501 
   2502 // sqrshrn = signed saturating rounding shift right + narrow to 16 bit; x0..x7 are local names for the 8 results
   2503     sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = sat16((a0 + b0 + rnd) >> shift_stage2_idct)
   2504     sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = sat16((a0 - b0 + rnd) >> shift_stage2_idct)
   2505     sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = sat16((a2 + b2 + rnd) >> shift_stage2_idct)
   2506     sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = sat16((a2 - b2 + rnd) >> shift_stage2_idct)
   2507     sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = sat16((a1 + b1 + rnd) >> shift_stage2_idct)
   2508     sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = sat16((a1 - b1 + rnd) >> shift_stage2_idct)
   2509     sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = sat16((a3 + b3 + rnd) >> shift_stage2_idct)
   2510     sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = sat16((a3 - b3 + rnd) >> shift_stage2_idct)
   2511 // v24-v27 are borrowed as transpose scratch: park their low 64 bits in x15/x16/x19/x20, restore them afterwards.
   2512     umov        x15,v24.d[0]
   2513     umov        x16,v25.d[0]
   2514     umov        x19,v26.d[0]                //// NOTE(review): x19/x20 are callee-saved (AAPCS64) - presumably saved by the prologue; confirm
   2515     umov        x20,v27.d[0]
   2516 // 4x4 halfword transpose of x0..x3 (v30,v12,v31,v13): interleave 16-bit elements, then 32-bit elements
   2517     trn1        v24.4h, v30.4h, v12.4h
   2518     trn2        v25.4h, v30.4h, v12.4h
   2519     trn1        v26.4h, v31.4h, v13.4h
   2520     trn2        v27.4h, v31.4h, v13.4h
   2521 // result: v30,v12,v31,v13 = lanes 0,1,2,3 of (x0,x1,x2,x3)
   2522     trn1        v30.2s, v24.2s, v26.2s
   2523     trn2        v31.2s, v24.2s, v26.2s
   2524     trn1        v12.2s, v25.2s, v27.2s
   2525     trn2        v13.2s, v25.2s, v27.2s
   2526 // same transpose for x4..x7 (v14,v18,v15,v19)
   2527     trn1        v24.4h, v14.4h, v18.4h
   2528     trn2        v25.4h, v14.4h, v18.4h
   2529     trn1        v26.4h, v15.4h, v19.4h
   2530     trn2        v27.4h, v15.4h, v19.4h
   2531 // result: v14,v18,v15,v19 = lanes 0,1,2,3 of (x4,x5,x6,x7)
   2532     trn1        v14.2s, v24.2s, v26.2s
   2533     trn2        v15.2s, v24.2s, v26.2s
   2534     trn1        v18.2s, v25.2s, v27.2s
   2535     trn2        v19.2s, v25.2s, v27.2s
   2536 // put back the parked low 64 bits of v24-v27
   2537     mov         v24.d[0],x15
   2538     mov         v25.d[0],x16
   2539     mov         v26.d[0],x19
   2540     mov         v27.d[0],x20
   2541 // four 16-byte stores; x0 is post-incremented by 64 bytes in total
   2542     st1         { v30.4h, v31.4h},[x0],#16    //// lanes 0 and 2 of x0..x3
   2543     st1         { v12.4h, v13.4h},[x0],#16    //// lanes 1 and 3 of x0..x3
   2544     st1         { v14.4h, v15.4h},[x0],#16    //// lanes 0 and 2 of x4..x7
   2545     st1         { v18.4h, v19.4h},[x0],#16    //// lanes 1 and 3 of x4..x7
   2546 
   2547 
   2548 
   2549     mov         x1,x4
   2550 
   2551 
   2552 
   2553 
   2554     ld1         {v10.4h, v11.4h},[x1],#16
   2555     ld1         {v8.4h, v9.4h},[x1],x10
   2556 
   2557 
   2558     smull       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
   2559     smull       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
   2560     smull       v28.4s, v8.4h, v7.h[1]     //// y1 * sin3(part of b2)
   2561     smull       v30.4s, v8.4h, v7.h[3]     //// y1 * sin1(part of b3)
   2562 
   2563     smlsl       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
   2564     smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
   2565     smlsl       v28.4s, v9.4h, v5.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
   2566     smlsl       v30.4s, v9.4h, v7.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   2567 
   2568 
   2569 
   2570 
   2571 
   2572     smull       v20.4s, v10.4h, v0.h[0]
   2573     smlsl       v20.4s, v11.4h, v3.h[2]
   2574 
   2575 
   2576     smull       v22.4s, v10.4h, v0.h[0]
   2577     smlsl       v22.4s, v11.4h, v2.h[2]
   2578 
   2579     smull       v16.4s, v10.4h, v0.h[0]
   2580     smlsl       v16.4s, v11.4h, v1.h[2]
   2581 
   2582     smull       v18.4s, v10.4h, v0.h[0]
   2583     smlsl       v18.4s, v11.4h, v0.h[2]
   2584 
   2585     cmp         x12,x11
   2586     bhs         stage2_shift4
   2587     ld1         {v12.4h, v13.4h},[x1],#16
   2588     ld1         {v14.4h, v15.4h},[x1],x10
   2589 
   2590 
   2591 
   2592 
   2593 
   2594 
   2595     smlal       v24.4s, v14.4h, v0.h[1]
   2596     smlal       v26.4s, v14.4h, v1.h[3]
   2597     smlal       v28.4s, v14.4h, v4.h[1]
   2598     smlal       v30.4s, v14.4h, v6.h[3]
   2599 
   2600 
   2601     smlsl       v24.4s, v15.4h, v4.h[1]
   2602     smlsl       v26.4s, v15.4h, v0.h[3]
   2603     smlsl       v28.4s, v15.4h, v2.h[3]
   2604     smlsl       v30.4s, v15.4h, v6.h[1]
   2605 
   2606 
   2607     smlal       v20.4s, v12.4h, v7.h[0]
   2608     smlal       v20.4s, v13.4h, v5.h[2]
   2609     smlal       v22.4s, v12.4h, v5.h[0]
   2610     smlsl       v22.4s, v13.4h, v7.h[2]
   2611     smlal       v16.4s, v12.4h, v3.h[0]
   2612     smlsl       v16.4s, v13.4h, v4.h[2]
   2613     smlal       v18.4s, v12.4h, v1.h[0]
   2614     smlsl       v18.4s, v13.4h, v1.h[2]
   2615 
   2616     cmp         x12,x5
   2617     bhs         stage2_shift4
   2618 
   2619     ld1         {v10.4h, v11.4h},[x1],#16
   2620     ld1         {v8.4h, v9.4h},[x1],x10
   2621 
   2622 
   2623 
   2624     smlal       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
   2625     smlal       v26.4s, v8.4h, v3.h[1]     //// y1 * cos3(part of b1)
   2626     smlal       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
   2627     smlal       v30.4s, v8.4h, v5.h[3]     //// y1 * sin1(part of b3)
   2628 
   2629     smlal       v24.4s, v9.4h, v4.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
   2630     smlsl       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
   2631     smlsl       v28.4s, v9.4h, v0.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
   2632     smlsl       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   2633 
   2634 
   2635 
   2636 
   2637 
   2638     smlsl       v20.4s, v10.4h, v2.h[0]
   2639     smlal       v20.4s, v11.4h, v1.h[2]
   2640 
   2641 
   2642     smlsl       v22.4s, v10.4h, v6.h[0]
   2643     smlal       v22.4s, v11.4h, v3.h[2]
   2644 
   2645     smlal       v16.4s, v10.4h, v6.h[0]
   2646     smlsl       v16.4s, v11.4h, v7.h[2]
   2647 
   2648     smlal       v18.4s, v10.4h, v2.h[0]
   2649     smlsl       v18.4s, v11.4h, v2.h[2]
   2650 
   2651     cmp         x12,x6
   2652     bhs         stage2_shift4
   2653 
   2654 
   2655     ld1         {v12.4h, v13.4h},[x1],#16
   2656     ld1         {v14.4h, v15.4h},[x1],x10
   2657 
   2658 
   2659 
   2660 
   2661 
   2662 
   2663     smlsl       v24.4s, v14.4h, v1.h[1]
   2664     smlsl       v26.4s, v14.4h, v7.h[3]
   2665     smlal       v28.4s, v14.4h, v1.h[3]
   2666     smlal       v30.4s, v14.4h, v4.h[3]
   2667 
   2668 
   2669     smlal       v24.4s, v15.4h, v2.h[1]
   2670     smlal       v26.4s, v15.4h, v5.h[1]
   2671     smlsl       v28.4s, v15.4h, v3.h[1]
   2672     smlsl       v30.4s, v15.4h, v4.h[1]
   2673 
   2674 
   2675     smlsl       v20.4s, v12.4h, v5.h[0]
   2676     smlsl       v20.4s, v13.4h, v7.h[2]
   2677     smlsl       v22.4s, v12.4h, v1.h[0]
   2678     smlal       v22.4s, v13.4h, v1.h[2]
   2679     smlsl       v16.4s, v12.4h, v7.h[0]
   2680     smlal       v16.4s, v13.4h, v5.h[2]
   2681     smlal       v18.4s, v12.4h, v3.h[0]
   2682     smlsl       v18.4s, v13.4h, v3.h[2]
   2683 
   2684     cmp         x12,x9
   2685     bhs         stage2_shift4
   2686 
   2687 
   2688     ld1         {v10.4h, v11.4h},[x1],#16
   2689     ld1         {v8.4h, v9.4h},[x1],x10
   2690 
   2691 
   2692     smlsl       v24.4s, v8.4h, v5.h[3]     //// y1 * cos1(part of b0)
   2693     smlsl       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
   2694     smlal       v28.4s, v8.4h, v4.h[3]     //// y1 * sin3(part of b2)
   2695     smlal       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
   2696 
   2697     smlsl       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
   2698     smlal       v26.4s, v9.4h, v0.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
   2699     smlsl       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
   2700     smlsl       v30.4s, v9.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   2701 
   2702 
   2703 
   2704 
   2705 
   2706     smlal       v20.4s, v10.4h, v0.h[0]
   2707     smlsl       v20.4s, v11.4h, v0.h[2]
   2708 
   2709 
   2710     smlsl       v22.4s, v10.4h, v0.h[0]
   2711     smlal       v22.4s, v11.4h, v6.h[2]
   2712 
   2713     smlsl       v16.4s, v10.4h, v0.h[0]
   2714     smlal       v16.4s, v11.4h, v2.h[2]
   2715 
   2716     smlal       v18.4s, v10.4h, v0.h[0]
   2717     smlsl       v18.4s, v11.4h, v4.h[2]
   2718 
   2719     ld1         {v12.4h, v13.4h},[x1],#16
   2720     ld1         {v14.4h, v15.4h},[x1],x10
   2721 
   2722 
   2723 
   2724 
   2725     smlal       v24.4s, v14.4h, v3.h[1]
   2726     smlsl       v26.4s, v14.4h, v2.h[1]
   2727     smlal       v28.4s, v14.4h, v7.h[3]
   2728     smlal       v30.4s, v14.4h, v2.h[3]
   2729 
   2730 
   2731     smlsl       v24.4s, v15.4h, v0.h[3]
   2732     smlal       v26.4s, v15.4h, v4.h[3]
   2733     smlal       v28.4s, v15.4h, v6.h[3]
   2734     smlsl       v30.4s, v15.4h, v2.h[1]
   2735 
   2736 
   2737     smlal       v20.4s, v12.4h, v3.h[0]
   2738     smlsl       v20.4s, v13.4h, v6.h[2]
   2739     smlal       v22.4s, v12.4h, v7.h[0]
   2740     smlsl       v22.4s, v13.4h, v4.h[2]
   2741     smlsl       v16.4s, v12.4h, v1.h[0]
   2742     smlal       v16.4s, v13.4h, v0.h[2]
   2743     smlal       v18.4s, v12.4h, v5.h[0]
   2744     smlsl       v18.4s, v13.4h, v5.h[2]
   2745 
   2746 
   2747     ld1         {v10.4h, v11.4h},[x1],#16
   2748     ld1         {v8.4h, v9.4h},[x1],x10
   2749 
   2750 
   2751 
   2752 
   2753     smlal       v24.4s, v8.4h, v3.h[3]     //// y1 * cos1(part of b0)
   2754     smlsl       v26.4s, v8.4h, v7.h[1]     //// y1 * cos3(part of b1)
   2755     smlsl       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
   2756     smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
   2757 
   2758     smlsl       v24.4s, v9.4h, v7.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
   2759     smlsl       v26.4s, v9.4h, v6.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
   2760     smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
   2761     smlsl       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
   2762 
   2763 
   2764 
   2765 
   2766 
   2767     smlsl       v20.4s, v10.4h, v6.h[0]
   2768     smlal       v20.4s, v11.4h, v2.h[2]
   2769 
   2770 
   2771     smlal       v22.4s, v10.4h, v2.h[0]
   2772     smlsl       v22.4s, v11.4h, v0.h[2]
   2773 
   2774     smlsl       v16.4s, v10.4h, v2.h[0]
   2775     smlal       v16.4s, v11.4h, v3.h[2]
   2776 
   2777     smlal       v18.4s, v10.4h, v6.h[0]
   2778     smlsl       v18.4s, v11.4h, v6.h[2]
   2779 
   2780 
   2781     ld1         {v12.4h, v13.4h},[x1],#16
   2782     ld1         {v14.4h, v15.4h},[x1],x10
   2783 
   2784 
   2785 
   2786     smlsl       v24.4s, v14.4h, v5.h[1]
   2787     smlal       v26.4s, v14.4h, v3.h[3]
   2788     smlsl       v28.4s, v14.4h, v2.h[1]
   2789     smlal       v30.4s, v14.4h, v0.h[3]
   2790 
   2791 
   2792     smlal       v24.4s, v15.4h, v1.h[3]
   2793     smlsl       v26.4s, v15.4h, v1.h[1]
   2794     smlal       v28.4s, v15.4h, v0.h[3]
   2795     smlsl       v30.4s, v15.4h, v0.h[1]
   2796 
   2797 
   2798     smlsl       v20.4s, v12.4h, v1.h[0]
   2799     smlal       v20.4s, v13.4h, v4.h[2]
   2800     smlal       v22.4s, v12.4h, v3.h[0]
   2801     smlsl       v22.4s, v13.4h, v5.h[2]
   2802     smlsl       v16.4s, v12.4h, v5.h[0]
   2803     smlal       v16.4s, v13.4h, v6.h[2]
   2804     smlal       v18.4s, v12.4h, v7.h[0]
   2805     smlsl       v18.4s, v13.4h, v7.h[2]
   2806 
   2807 stage2_shift4:
            // Shared tail for this group of stage-2 results; the "bhs stage2_shift4"
            // early exits above branch here as well as the fall-through path.
            // Forms sums and differences of the 32-bit accumulator pairs
            // (v20,v24) (v22,v26) (v16,v28) (v18,v30), narrows them to 16 bits,
            // transposes two 4x4 tiles and writes 64 bytes at x0.
            // Statement order matters: v12, v14, v16, v24 and v26 are read and
            // then overwritten within this sequence.
   2808     add         v8.4s,  v20.4s ,  v24.4s
   2809     sub         v10.4s,  v20.4s ,  v24.4s
   2810 
   2811     add         v12.4s,  v22.4s ,  v26.4s
   2812     sub         v24.4s,  v22.4s ,  v26.4s
   2813 
   2814     add         v14.4s,  v16.4s ,  v28.4s
   2815     sub         v26.4s,  v16.4s ,  v28.4s
   2816 
   2817 
   2818     add         v16.4s,  v18.4s ,  v30.4s
   2819     sub         v28.4s,  v18.4s ,  v30.4s
   2820 
   2821 
            // sqrshrn: rounding, saturating right shift by shift_stage2_idct that
            // narrows each signed 32-bit lane to 16 bits.
            // Sums end up in v30/v12/v31/v13, differences in v19/v15/v18/v14.
   2822     sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
   2823     sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
   2824     sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
   2825     sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
   2826     sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
   2827     sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
   2828     sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
   2829     sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
   2830 
   2831 
   2832 
            // Park the low 64 bits of v24-v27 in x15/x16/x19/x20: v24-v27 are used
            // as transpose scratch below, and only these low halves are put back.
   2833     umov        x15,v24.d[0]
   2834     umov        x16,v25.d[0]
   2835     umov        x19,v26.d[0]
   2836     umov        x20,v27.d[0]
   2837 
            // 4x4 transpose of 16-bit lanes, tile 1 (the sums).
            // In: rows v30, v12, v31, v13.
            // Out: v30 = column 0, v12 = column 1, v31 = column 2, v13 = column 3.
   2838     trn1        v24.4h, v30.4h, v12.4h
   2839     trn2        v25.4h, v30.4h, v12.4h
   2840     trn1        v26.4h, v31.4h, v13.4h
   2841     trn2        v27.4h, v31.4h, v13.4h
   2842 
   2843     trn1        v30.2s, v24.2s, v26.2s
   2844     trn2        v31.2s, v24.2s, v26.2s
   2845     trn1        v12.2s, v25.2s, v27.2s
   2846     trn2        v13.2s, v25.2s, v27.2s
   2847 
            // 4x4 transpose of 16-bit lanes, tile 2 (the differences).
            // In: rows v14, v18, v15, v19.
            // Out: v14 = column 0, v18 = column 1, v15 = column 2, v19 = column 3.
   2848     trn1        v24.4h, v14.4h, v18.4h
   2849     trn2        v25.4h, v14.4h, v18.4h
   2850     trn1        v26.4h, v15.4h, v19.4h
   2851     trn2        v27.4h, v15.4h, v19.4h
   2852 
   2853     trn1        v14.2s, v24.2s, v26.2s
   2854     trn2        v15.2s, v24.2s, v26.2s
   2855     trn1        v18.2s, v25.2s, v27.2s
   2856     trn2        v19.2s, v25.2s, v27.2s
   2857 
            // Put back the low 64 bits of v24-v27 saved before the transposes.
   2858     mov         v24.d[0],x15
   2859     mov         v25.d[0],x16
   2860     mov         v26.d[0],x19
   2861     mov         v27.d[0],x20
   2862 
            // Four 16-byte stores, x0 post-incremented by 16 each time.
            // Within each tile the columns go out in the order 0, 2, 1, 3.
   2863     st1         { v30.4h, v31.4h},[x0],#16
   2864     st1         { v12.4h, v13.4h},[x0],#16
   2865     st1         { v14.4h, v15.4h},[x0],#16
   2866     st1         { v18.4h, v19.4h},[x0],#16
   2867 
   2868 
   2869 
   2870 
            // Step x0 back 256 bytes for the loads that follow. The four stores just
            // above account for 64 of those bytes.
            // NOTE(review): the remaining 192 bytes are presumably the earlier st1
            // groups of this pass - confirm against the code above this label.
   2871     sub         x0,x0,#256
   2872 prediction_buffer:
            // Reconstruction step: reads 16-bit values back from [x0], adds 8-bit
            // data loaded from [x2] (row step x8), saturates to unsigned 8 bits and
            // stores the result to [x3] (row step x7).
            // Runs twice per pass: 4 rows x 16 bytes at x2/x3, then 4 rows x 16 bytes
            // at x2+16/x3+16. On exit x0 is unchanged and x2/x3 have each advanced by
            // four row steps.
            // NOTE(review): x2/x3 are presumably the prediction and destination
            // pointers and x8/x7 their strides - confirm against the function prologue.
   2873 
   2874 
            // First half: 32 bytes each at offsets 0, 64, 128 and 192 from x0
            // (x0 ends this group at +224).
   2875     ld1         {v12.8h},[x0],#16
   2876     ld1         {v14.8h},[x0],#16
   2877 
   2878     add         x0,x0,#32
   2879 
   2880     ld1         {v16.8h},[x0],#16
   2881     ld1         {v18.8h},[x0],#16
   2882     add         x0,x0,#32
   2883 
   2884     ld1         {v20.8h},[x0],#16
   2885     ld1         {v22.8h},[x0],#16
   2886 
   2887 
   2888     add         x0,x0,#32
   2889 
   2890     ld1         {v24.8h},[x0],#16
   2891     ld1         {v26.8h},[x0],#16
   2892 
   2893 
   2894 
   2895 
   2896 
            // NOTE(review): the dN names below appear to denote 64-bit register halves
            // as just loaded (d12/d13 = low/high half of v12, d14/d15 of v14, ...) -
            // confirm.
   2897 // d12 =x0 1- 4 values
   2898 // d13 =x2 1- 4 values
   2899 // d14=x1 1- 4 values
   2900 // d15=x3 1- 4 values
   2901 
   2902 // d16 =x0 5- 8 values
   2903 // d17 =x2 5- 8 values
   2904 // d18=x1 5- 8 values
   2905 // d19=x3 5- 8 values
   2906 
   2907 // d20 =x0 9- 12 values
   2908 // d21 =x2 9- 12 values
   2909 // d22=x1 9- 12 values
   2910 // d23=x3 9- 12 values
   2911 
   2912 // d24 =x0 13-16 values
   2913 // d25 =x2 13- 16 values
   2914 // d26=x1 13- 16 values
   2915 // d27=x3 13- 16 values
   2916 
            // Each three-instruction group below exchanges the upper 64 bits of one
            // register with the lower 64 bits of another, using the low half of
            // v13/v21/v15/v23 as the temporary.
   2917     // swapping v12 upper and v16 lower 64bits
   2918     mov         v13.d[0], v12.d[1]
   2919     mov         v12.d[1], v16.d[0]
   2920     mov         v16.d[0], v13.d[0]
   2921     // swapping v20 upper and v24 lower 64bits
   2922     mov         v21.d[0], v20.d[1]
   2923     mov         v20.d[1], v24.d[0]
   2924     mov         v24.d[0], v21.d[0]
   2925     // swapping v14 upper and v18 lower 64bits
   2926     mov         v15.d[0], v14.d[1]
   2927     mov         v14.d[1], v18.d[0]
   2928     mov         v18.d[0], v15.d[0]
   2929     // swapping v22 upper and v26 lower 64bits
   2930     mov         v23.d[0], v22.d[1]
   2931     mov         v22.d[1], v26.d[0]
   2932     mov         v26.d[0], v23.d[0]
   2933 
   2934 
            // Four rows of 16 bytes from [x2], stepping by x8 after each row.
   2935     ld1         {v8.8b, v9.8b},[x2],x8
   2936     ld1         {v10.8b, v11.8b},[x2],x8
   2937     ld1         {v28.8b, v29.8b},[x2],x8
   2938     ld1         {v30.8b, v31.8b},[x2],x8
   2939 
   2940 
            // Zero-extend each loaded byte to 16 bits and add it to the 16-bit lanes:
            // v12/v20 take row 0, v14/v22 row 1, v16/v24 row 2, v18/v26 row 3.
   2941     uaddw       v12.8h,  v12.8h ,  v8.8b
   2942     uaddw       v20.8h,  v20.8h ,  v9.8b
   2943     uaddw       v14.8h,  v14.8h ,  v10.8b
   2944     uaddw       v22.8h,  v22.8h ,  v11.8b
   2945     uaddw       v16.8h,  v16.8h ,  v28.8b
   2946     uaddw       v24.8h,  v24.8h ,  v29.8b
   2947     uaddw       v18.8h,  v18.8h ,  v30.8b
   2948     uaddw       v26.8h,  v26.8h ,  v31.8b
   2949     sub         x2,x2,x8,lsl #2 // x2 back by the four row steps just taken
   2950     add         x2,x2,#16 // then forward 16 bytes for the second half
            // sqxtun: saturate signed 16-bit lanes to unsigned 8 bits (clamps to 0..255).
   2951     sqxtun      v12.8b, v12.8h
   2952     sqxtun      v13.8b, v20.8h
   2953     sqxtun      v20.8b, v14.8h
   2954     sqxtun      v21.8b, v22.8h
   2955     sqxtun      v14.8b, v16.8h
   2956     sqxtun      v15.8b, v24.8h
   2957     sqxtun      v22.8b, v18.8h
   2958     sqxtun      v23.8b, v26.8h
   2959 
   2960 
            // Four rows of 16 bytes to [x3], stepping by x7 after each row.
   2961     st1         {v12.8b, v13.8b},[x3],x7
   2962     st1         {v20.8b, v21.8b},[x3],x7
   2963     st1         {v14.8b, v15.8b},[x3],x7
   2964     st1         {v22.8b, v23.8b},[x3],x7
   2965 
   2966 
   2967     sub         x3,x3,x7,lsl #2 // x3 back by the four row steps just taken
   2968     add         x3,x3,#16 // then forward 16 bytes for the second half
   2969 
            // Second half: 32 bytes each at offsets 224, 160, 96 and 32 from the x0
            // value at prediction_buffer entry.
   2970     ld1         {v12.8h},[x0],#16
   2971     ld1         {v14.8h},[x0],#16
   2972 
   2973     sub         x0,x0,#96
   2974 
   2975     ld1         {v16.8h},[x0],#16
   2976     ld1         {v18.8h},[x0],#16
   2977     sub         x0,x0,#96
   2978 
   2979     ld1         {v20.8h},[x0],#16
   2980     ld1         {v22.8h},[x0],#16
   2981 
   2982 
   2983     sub         x0,x0,#96
   2984 
   2985     ld1         {v24.8h},[x0],#16
   2986     ld1         {v26.8h},[x0],#16
   2987 
   2988 
   2989     sub         x0,x0,#64 // x0 is now back at its value on entry to prediction_buffer
   2990 
   2991 
   2992     // swapping v12 upper and v16 lower 64bits
   2993     mov         v13.d[0], v12.d[1]
   2994     mov         v12.d[1], v16.d[0]
   2995     mov         v16.d[0], v13.d[0]
   2996     // swapping v20 upper and v24 lower 64bits
   2997     mov         v21.d[0], v20.d[1]
   2998     mov         v20.d[1], v24.d[0]
   2999     mov         v24.d[0], v21.d[0]
   3000     // swapping v14 upper and v18 lower 64bits
   3001     mov         v15.d[0], v14.d[1]
   3002     mov         v14.d[1], v18.d[0]
   3003     mov         v18.d[0], v15.d[0]
   3004     // swapping v22 upper and v26 lower 64bits
   3005     mov         v23.d[0], v22.d[1]
   3006     mov         v22.d[1], v26.d[0]
   3007     mov         v26.d[0], v23.d[0]
   3008 
   3009 
            // Same add / saturate / store sequence as the first half, at x2+16 and x3+16.
   3010     ld1         {v8.8b, v9.8b},[x2],x8
   3011     ld1         {v10.8b, v11.8b},[x2],x8
   3012     ld1         {v28.8b, v29.8b},[x2],x8
   3013     ld1         {v30.8b, v31.8b},[x2],x8
   3014 
   3015 
   3016     uaddw       v12.8h,  v12.8h ,  v8.8b
   3017     uaddw       v20.8h,  v20.8h ,  v9.8b
   3018     uaddw       v14.8h,  v14.8h ,  v10.8b
   3019     uaddw       v22.8h,  v22.8h ,  v11.8b
   3020     uaddw       v16.8h,  v16.8h ,  v28.8b
   3021     uaddw       v24.8h,  v24.8h ,  v29.8b
   3022     uaddw       v18.8h,  v18.8h ,  v30.8b
   3023     uaddw       v26.8h,  v26.8h ,  v31.8b
   3024     sub         x2,x2,#16 // undo the +16: x2 has advanced by four row steps (4 * x8) overall
   3025 
   3026     sqxtun      v12.8b, v12.8h
   3027     sqxtun      v13.8b, v20.8h
   3028     sqxtun      v20.8b, v14.8h
   3029     sqxtun      v21.8b, v22.8h
   3030     sqxtun      v14.8b, v16.8h
   3031     sqxtun      v15.8b, v24.8h
   3032     sqxtun      v22.8b, v18.8h
   3033     sqxtun      v23.8b, v26.8h
   3034 
   3035 
   3036     st1         {v12.8b, v13.8b},[x3],x7
   3037     st1         {v20.8b, v21.8b},[x3],x7
   3038     st1         {v14.8b, v15.8b},[x3],x7
   3039     st1         {v22.8b, v23.8b},[x3],x7
   3040 
   3041     sub         x3,x3,#16 // undo the +16: x3 has advanced by four row steps (4 * x7) overall
   3042 
            // x14 is the pass counter: decrement it and branch back to dct_stage2
            // while it is non-zero.
   3043     subs        x14,x14,#1
   3044     bne         dct_stage2
            // Epilogue: reload x19/x20 from the stack and release their 16 bytes.
            // NOTE(review): pop_v_regs is a macro defined outside this section;
            // presumably it restores the vector registers saved on entry - confirm.
   3045     // ldmfd sp!,{x0-x12,pc}
   3046     ldp         x19, x20,[sp],#16
   3047     pop_v_regs
   3048     ret
   3049 
   3050 
   3051 
   3052 
   3053 
   3054