// Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 // *******************************************************************************
     20 // * @file
      21 // *  ihevc_itrans_recon_16x16_neon.s
     22 // *
     23 // * @brief
     24 // *  contains function definitions for single stage  inverse transform
     25 // *
     26 // * @author
     27 // * anand s
     28 // *
     29 // * @par list of functions:
     30 // *  - ihevc_itrans_recon_16x16()
     31 // *
     32 // * @remarks
     33 // *  none
     34 // *
     35 // *******************************************************************************
     36 //*/
     37 
     38 ///**
     39 // *******************************************************************************
     40 // *
     41 // * @brief
      42 // *  this function performs inverse transform  and reconstruction for 16x16
      43 // * input block
     44 // *
     45 // * @par description:
     46 // *  performs inverse transform and adds the prediction  data and clips output
     47 // * to 8 bit
     48 // *
     49 // * @param[in] pi2_src
     50 // *  input 16x16 coefficients
     51 // *
     52 // * @param[in] pi2_tmp
      53 // *  temporary 16x16 buffer for storing inverse transform
      54 // *  1st stage output
     57 // *
     58 // * @param[in] pu1_pred
     59 // *  prediction 16x16 block
     60 // *
     61 // * @param[out] pu1_dst
      62 // *  output 16x16 block
     63 // *
     64 // * @param[in] src_strd
     65 // *  input stride
     66 // *
     67 // * @param[in] pred_strd
     68 // *  prediction stride
     69 // *
     70 // * @param[in] dst_strd
     71 // *  output stride
     72 // *
     73 // * @param[in] shift
     74 // *  output shift
     75 // *
     76 // * @param[in] x12
     77 // *  zero columns in pi2_src
     78 // *
     79 // * @returns  void
     80 // *
     81 // * @remarks
     82 // *  none
     83 // *
     84 // *******************************************************************************
     85 // */
     86 
     87 //void ihevc_itrans_recon_16x16(word16 *pi2_src,
     88 //                            word16 *pi2_tmp,
     89 //                            uword8 *pu1_pred,
     90 //                            uword8 *pu1_dst,
     91 //                            word32 src_strd,
     92 //                            word32 pred_strd,
     93 //                            word32 dst_strd,
     94 //                            word32 x12
     95 //                             word32    x11                )
     96 
     97 //**************variables vs registers*************************
     98 //    x0 => *pi2_src
     99 //    x1 => *pi2_tmp
    100 //    x2 => *pu1_pred
    101 //    x3 => *pu1_dst
    102 //    src_strd
    103 //    pred_strd
    104 //    dst_strd
    105 //    x12
    106 //    x11
    107 
    108 .text
    109 .align 4
    110 
    111 .include "ihevc_neon_macros.s"
    112 
    113 
    114 
    115 
    116 .set shift_stage1_idct ,   7
    117 .set shift_stage2_idct ,   12
    118 //#define zero_cols         x12
    119 //#define zero_rows         x11
    120 .globl ihevc_itrans_recon_16x16_av8
    121 
    122 .extern g_ai2_ihevc_trans_16_transpose
    123 
    124 .type ihevc_itrans_recon_16x16_av8, %function
    125 
    126 ihevc_itrans_recon_16x16_av8:
    127 
    128     ldr         w11, [sp]
    129     // stmfd sp!,{x4-x12,x14}
    130     push_v_regs
    131     stp         x19, x20,[sp,#-16]!
    132     stp         x5, x6,[sp,#-16]!
    133 //    add             sp,sp,#40
    134 
    135 
    136 
    137 //    ldr            x8,[sp,#4]     @ prediction stride
    138 //    ldr            x7,[sp,#8]     @ destination stride
    139     mov         x6, x4 // src stride
    140     mov         x12, x7
    141 
    142 
    143 
    144     adrp        x14, :got:g_ai2_ihevc_trans_16_transpose
    145     ldr         x14, [x14, #:got_lo12:g_ai2_ihevc_trans_16_transpose]
    146     ld1         {v0.4h, v1.4h, v2.4h, v3.4h},[x14] ////d0,d1 are used for storing the constant data
    147     mov         x7,#0xffff
    148     and         x12,x12,x7
    149     and         x11,x11,x7
    150     lsl         x6, x6, #1                  // x sizeof(word16)
    151     add         x9,x0,x6, lsl #1            // 2 rows
    152 
    153     add         x10,x6,x6, lsl #1           // 3 rows
    154     add         x5,x6,x6,lsl #2
    155     mov         x7,#0xfff0
    156 
    157     cmp         x12,x7
    158     bge         zero_12cols_decision
    159 
    160     mov         x19,#0xff00
    161     cmp         x12,x19
    162     bge         zero_8cols_decision
    163 
    164 
    165 
    166 
    167     mov         x14,#4
    168     cmp         x11,x7
    169     sub         x20,x6,#0
    170     neg         x20, x20
    171     csel        x10,x20,x10,ge
    172 
    173     mov         x19,#0xff00
    174     cmp         x11,x19
    175     csel        x8, x5, x8,ge
    176     sub         x20,x8,#0
    177     neg         x20, x20
    178     csel        x8,x20,x8,ge
    179     csel        x8, x10, x8,lt
    180     add         x5,x5,x6,lsl #3
    181     sub         x20,x5,#0
    182     neg         x5, x20
    183 
    184     b           first_stage_top_four_bottom_four
    185 
    186 zero_12cols_decision:
    187     mov         x14,#1
    188     mov         x19,#0xff00
    189     cmp         x11,x19
    190     csel        x8, x5, x8,ge
    191     csel        x8, x10, x8,lt
    192     add         x5,x5,x6,lsl #3
    193     sub         x20,x5,#0
    194     neg         x5, x20
    195 
    196     b           first_stage_top_four_bottom_four
    197 
    198 zero_8cols_decision:
    199     mov         x14,#2
    200     mov         x8,x5
    201     sub         x20,x8,#0
    202     neg         x8, x20
    203     mov         x19,#0xff00
    204     cmp         x11,x19
    205     csel        x8, x10, x8,lt
    206     add         x5,x5,x6,lsl #3
    207     sub         x20,x5,#0
    208     neg         x5, x20
    209     cmp         x11,x7
    210     sub         x20,x6,#0
    211     neg         x20, x20
    212     csel        x10,x20,x10,ge
    213 
    214 
    215     b           first_stage_top_four_bottom_four
    216 
    217 
    218 //d0[0]=    64        d2[0]=64
    219 //d0[1]= 90        d2[1]=57
    220 //d0[2]= 89        d2[2]=50
    221 //d0[3]= 87        d2[3]=43
    222 //d1[0]= 83         d3[0]=36
    223 //d1[1]= 80        d3[1]=25
    224 //d1[2]= 75        d3[2]=18
    225 //d1[3]= 70        d3[3]=9
    226 
    227 
    228 
    229 first_stage:
    230     add         x0,x0,#8
    231     add         x9,x9,#8
    232 
    233 first_stage_top_four_bottom_four:
    234 
    235     ld1         {v10.4h},[x0],x6
    236     ld1         {v11.4h},[x9],x6
    237     ld1         {v6.4h},[x0],x10
    238     ld1         {v7.4h},[x9],x10
    239     cmp         x11,x7
    240     bge         skip_load4rows
    241 
    242     ld1         {v4.4h},[x0],x6
    243     ld1         {v5.4h},[x9],x6
    244     ld1         {v8.4h},[x0],x8
    245     ld1         {v9.4h},[x9],x8
    246 
    247 // registers used: q0,q1,q3,q5,q2,q4
    248 
    249 // d10 =x0
    250 //d6= x1
    251 //d11=x2
    252 //d7=x3
    253 
    254 skip_load4rows:
    255     smull       v24.4s, v6.4h, v0.h[1]      //// y1 * cos1(part of b0)
    256     smull       v26.4s, v6.4h, v0.h[3]      //// y1 * cos3(part of b1)
    257     smull       v28.4s, v6.4h, v1.h[1]      //// y1 * sin3(part of b2)
    258     smull       v30.4s, v6.4h, v1.h[3]      //// y1 * sin1(part of b3)
    259 
    260     smlal       v24.4s, v7.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    261     smlal       v26.4s, v7.4h, v2.h[1]      //// y1 * cos3 - y3 * sin1(part of b1)
    262     smlal       v28.4s, v7.4h, v3.h[3]      //// y1 * sin3 - y3 * cos1(part of b2)
    263     smlsl       v30.4s, v7.4h, v2.h[3]      //// y1 * sin1 - y3 * sin3(part of b3)
    264 
    265 
    266 
    267 
    268 
    269 
    270     smull       v12.4s, v10.4h, v0.h[0]
    271     smlal       v12.4s, v11.4h, v0.h[2]
    272     smull       v14.4s, v10.4h, v0.h[0]
    273     smlal       v14.4s, v11.4h, v1.h[2]
    274     smull       v16.4s, v10.4h, v0.h[0]
    275     smlal       v16.4s, v11.4h, v2.h[2]
    276     smull       v18.4s, v10.4h, v0.h[0]
    277     smlal       v18.4s, v11.4h, v3.h[2]
    278 
    279     bge         skip_last12rows_kernel1
    280 
    281 
    282     smlal       v24.4s, v8.4h, v1.h[1]
    283     smlal       v26.4s, v8.4h, v3.h[3]
    284     smlsl       v28.4s, v8.4h, v1.h[3]
    285     smlsl       v30.4s, v8.4h, v0.h[3]
    286 
    287 
    288     smlal       v24.4s, v9.4h, v1.h[3]
    289     smlsl       v26.4s, v9.4h, v2.h[3]
    290     smlsl       v28.4s, v9.4h, v0.h[3]
    291     smlal       v30.4s, v9.4h, v3.h[3]
    292 
    293 
    294 
    295 
    296 
    297     smlal       v12.4s, v4.4h, v1.h[0]
    298     smlal       v12.4s, v5.4h, v1.h[2]
    299     smlal       v14.4s, v4.4h, v3.h[0]
    300     smlsl       v14.4s, v5.4h, v3.h[2]
    301     smlsl       v16.4s, v4.4h, v3.h[0]
    302     smlsl       v16.4s, v5.4h, v0.h[2]
    303     smlsl       v18.4s, v4.4h, v1.h[0]
    304     smlsl       v18.4s, v5.4h, v2.h[2]
    305 
    306 //d0[0]=    64        d2[0]=64
    307 //d0[1]= 90        d2[1]=57
    308 //d0[2]= 89        d2[2]=50
    309 //d0[3]= 87        d2[3]=43
    310 //d1[0]= 83         d3[0]=36
    311 //d1[1]= 80        d3[1]=25
    312 //d1[2]= 75        d3[2]=18
    313 //d1[3]= 70        d3[3]=9
    314     mov         x19,#0xff00
    315     cmp         x11,x19
    316     bge         skip_last12rows_kernel1
    317 
    318 
    319     ld1         {v10.4h},[x0],x6
    320     ld1         {v11.4h},[x9],x6
    321     ld1         {v6.4h},[x0],x10
    322     ld1         {v7.4h},[x9],x10
    323     ld1         {v4.4h},[x0],x6
    324     ld1         {v5.4h},[x9],x6
    325     ld1         {v8.4h},[x0],x5
    326     ld1         {v9.4h},[x9],x5
    327 
    328 
    329 
    330 
    331     smlal       v24.4s, v6.4h, v2.h[1]      //// y1 * cos1(part of b0)
    332     smlsl       v26.4s, v6.4h, v1.h[1]      //// y1 * cos3(part of b1)
    333     smlsl       v28.4s, v6.4h, v3.h[1]      //// y1 * sin3(part of b2)
    334     smlal       v30.4s, v6.4h, v0.h[1]      //// y1 * sin1(part of b3)
    335 
    336     smlal       v24.4s, v7.4h, v2.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    337     smlsl       v26.4s, v7.4h, v0.h[1]      //// y1 * cos3 - y3 * sin1(part of b1)
    338     smlal       v28.4s, v7.4h, v2.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
    339     smlal       v30.4s, v7.4h, v3.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)
    340 
    341 
    342 
    343     smlal       v24.4s, v8.4h, v3.h[1]
    344     smlsl       v26.4s, v8.4h, v1.h[3]
    345     smlal       v28.4s, v8.4h, v0.h[1]
    346     smlsl       v30.4s, v8.4h, v1.h[1]
    347 
    348 
    349     smlal       v24.4s, v9.4h, v3.h[3]
    350     smlsl       v26.4s, v9.4h, v3.h[1]
    351     smlal       v28.4s, v9.4h, v2.h[3]
    352     smlsl       v30.4s, v9.4h, v2.h[1]
    353 
    354 
    355 
    356 
    357 
    358     smlal       v12.4s, v10.4h, v0.h[0]
    359     smlal       v12.4s, v11.4h, v2.h[2]
    360     smlal       v12.4s, v4.4h, v3.h[0]
    361     smlal       v12.4s, v5.4h, v3.h[2]
    362 
    363 
    364 
    365 
    366     smlsl       v14.4s, v10.4h, v0.h[0]
    367     smlsl       v14.4s, v11.4h, v0.h[2]
    368     smlsl       v14.4s, v4.4h, v1.h[0]
    369     smlsl       v14.4s, v5.4h, v2.h[2]
    370 
    371 
    372     smlsl       v16.4s, v10.4h, v0.h[0]
    373     smlal       v16.4s, v11.4h, v3.h[2]
    374     smlal       v16.4s, v4.4h, v1.h[0]
    375     smlal       v16.4s, v5.4h, v1.h[2]
    376 
    377 
    378     smlal       v18.4s, v10.4h, v0.h[0]
    379     smlal       v18.4s, v11.4h, v1.h[2]
    380     smlsl       v18.4s, v4.4h, v3.h[0]
    381     smlsl       v18.4s, v5.4h, v0.h[2]
    382 
    383 skip_last12rows_kernel1:
    384     add         v20.4s,  v12.4s ,  v24.4s
    385     sub         v22.4s,  v12.4s ,  v24.4s
    386 
    387     add         v12.4s,  v14.4s ,  v26.4s
    388     sub         v24.4s,  v14.4s ,  v26.4s
    389 
    390     add         v14.4s,  v16.4s ,  v28.4s
    391     sub         v26.4s,  v16.4s ,  v28.4s
    392 
    393 
    394     add         v16.4s,  v18.4s ,  v30.4s
    395     sub         v28.4s,  v18.4s ,  v30.4s
    396 
    397 
    398 
    399 
    400 
    401 
    402 
    403     sqrshrn     v30.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    404     sqrshrn     v19.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    405     sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    406     sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    407     sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    408     sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    409     sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    410     sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
    411 
    412     st1         {v30.4h, v31.4h},[x1],#16
    413     st1         {v18.4h, v19.4h},[x1],#16
    414     sub         x1,x1,#32
    415 
    416     bge         skip_stage1_kernel_load
    417 
    418 first_stage_middle_eight:
    419 
    420 
    421 
    422     ld1         {v10.4h},[x0],x6
    423     ld1         {v11.4h},[x9],x6
    424     ld1         {v6.4h},[x0],x10
    425     ld1         {v7.4h},[x9],x10
    426     ld1         {v4.4h},[x0],x6
    427     ld1         {v5.4h},[x9],x6
    428     ld1         {v8.4h},[x0],x8
    429     ld1         {v9.4h},[x9],x8
    430 
    431 
    432 skip_stage1_kernel_load:
    433     smull       v24.4s, v6.4h, v2.h[1]     //// y1 * cos1(part of b0)
    434     smull       v26.4s, v6.4h, v2.h[3]     //// y1 * cos3(part of b1)
    435     smull       v28.4s, v6.4h, v3.h[1]     //// y1 * sin3(part of b2)
    436     smull       v30.4s, v6.4h, v3.h[3]     //// y1 * sin1(part of b3)
    437 
    438     smlsl       v24.4s, v7.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
    439     smlsl       v26.4s, v7.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
    440     smlsl       v28.4s, v7.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
    441     smlsl       v30.4s, v7.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    442 
    443 
    444 
    445 
    446 
    447 
    448     smull       v22.4s, v10.4h, v0.h[0]
    449     smlsl       v22.4s, v11.4h, v3.h[2]
    450     smull       v20.4s, v10.4h, v0.h[0]
    451     smlsl       v20.4s, v11.4h, v2.h[2]
    452     smull       v16.4s, v10.4h, v0.h[0]
    453     smlsl       v16.4s, v11.4h, v1.h[2]
    454     smull       v18.4s, v10.4h, v0.h[0]
    455     smlsl       v18.4s, v11.4h, v0.h[2]
    456 
    457 
    458     cmp         x11,x7
    459     bge         skip_last12rows_kernel2
    460 
    461     smlsl       v24.4s, v8.4h, v3.h[1]
    462     smlal       v26.4s, v8.4h, v2.h[1]
    463     smlal       v28.4s, v8.4h, v0.h[1]
    464     smlal       v30.4s, v8.4h, v2.h[3]
    465 
    466 
    467     smlal       v24.4s, v9.4h, v0.h[1]
    468     smlal       v26.4s, v9.4h, v3.h[1]
    469     smlsl       v28.4s, v9.4h, v1.h[1]
    470     smlsl       v30.4s, v9.4h, v2.h[1]
    471 
    472 
    473 
    474     smlsl       v22.4s, v4.4h, v1.h[0]
    475     smlal       v22.4s, v5.4h, v2.h[2]
    476     smlsl       v20.4s, v4.4h, v3.h[0]
    477     smlal       v20.4s, v5.4h, v0.h[2]
    478     smlal       v16.4s, v4.4h, v3.h[0]
    479     smlal       v16.4s, v5.4h, v3.h[2]
    480     smlal       v18.4s, v4.4h, v1.h[0]
    481     smlsl       v18.4s, v5.4h, v1.h[2]
    482 
    483 //d0[0]=    64        d2[0]=64
    484 //d0[1]= 90        d2[1]=57
    485 //d0[2]= 89        d2[2]=50
    486 //d0[3]= 87        d2[3]=43
    487 //d1[0]= 83         d3[0]=36
    488 //d1[1]= 80        d3[1]=25
    489 //d1[2]= 75        d3[2]=18
    490 //d1[3]= 70        d3[3]=9
    491     mov         x19,#0xff00
    492     cmp         x11,x19
    493     bge         skip_last12rows_kernel2
    494 
    495     ld1         {v10.4h},[x0],x6
    496     ld1         {v11.4h},[x9],x6
    497     ld1         {v6.4h},[x0],x10
    498     ld1         {v7.4h},[x9],x10
    499     ld1         {v4.4h},[x0],x6
    500     ld1         {v5.4h},[x9],x6
    501     ld1         {v8.4h},[x0],x5
    502     ld1         {v9.4h},[x9],x5
    503 
    504 
    505     smlsl       v24.4s, v6.4h, v3.h[3]     //// y1 * cos1(part of b0)
    506     smlsl       v26.4s, v6.4h, v0.h[3]     //// y1 * cos3(part of b1)
    507     smlal       v28.4s, v6.4h, v2.h[3]     //// y1 * sin3(part of b2)
    508     smlal       v30.4s, v6.4h, v1.h[3]     //// y1 * sin1(part of b3)
    509 
    510     smlsl       v24.4s, v7.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    511     smlal       v26.4s, v7.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    512     smlal       v28.4s, v7.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
    513     smlsl       v30.4s, v7.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    514 
    515 
    516     smlal       v24.4s, v8.4h, v2.h[3]
    517     smlal       v26.4s, v8.4h, v3.h[3]
    518     smlsl       v28.4s, v8.4h, v2.h[1]
    519     smlal       v30.4s, v8.4h, v0.h[3]
    520 
    521 
    522     smlal       v24.4s, v9.4h, v1.h[3]
    523     smlsl       v26.4s, v9.4h, v1.h[1]
    524     smlal       v28.4s, v9.4h, v0.h[3]
    525     smlsl       v30.4s, v9.4h, v0.h[1]
    526 
    527 
    528 
    529 
    530     smlal       v22.4s, v10.4h, v0.h[0]
    531     smlsl       v22.4s, v11.4h, v1.h[2]
    532     smlsl       v22.4s, v4.4h, v3.h[0]
    533     smlal       v22.4s, v5.4h, v0.h[2]
    534 
    535 
    536 
    537     smlsl       v20.4s, v10.4h, v0.h[0]
    538     smlsl       v20.4s, v11.4h, v3.h[2]
    539     smlal       v20.4s, v4.4h, v1.h[0]
    540     smlsl       v20.4s, v5.4h, v1.h[2]
    541 
    542 
    543     smlsl       v16.4s, v10.4h, v0.h[0]
    544     smlal       v16.4s, v11.4h, v0.h[2]
    545     smlsl       v16.4s, v4.4h, v1.h[0]
    546     smlal       v16.4s, v5.4h, v2.h[2]
    547 
    548 
    549 
    550     smlal       v18.4s, v10.4h, v0.h[0]
    551     smlsl       v18.4s, v11.4h, v2.h[2]
    552     smlal       v18.4s, v4.4h, v3.h[0]
    553     smlsl       v18.4s, v5.4h, v3.h[2]
    554 
    555 skip_last12rows_kernel2:
    556 
    557     add         v4.4s,  v22.4s ,  v24.4s
    558     sub         v22.4s,  v22.4s ,  v24.4s
    559 
    560     add         v6.4s,  v20.4s ,  v26.4s
    561     sub         v24.4s,  v20.4s ,  v26.4s
    562 
    563     add         v10.4s,  v16.4s ,  v28.4s
    564     sub         v26.4s,  v16.4s ,  v28.4s
    565 
    566 
    567     add         v16.4s,  v18.4s ,  v30.4s
    568     sub         v28.4s,  v18.4s ,  v30.4s
    569 
    570 
    571     sqrshrn     v18.4h, v4.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    572     sqrshrn     v31.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    573     sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    574     sqrshrn     v30.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    575     sqrshrn     v20.4h, v6.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    576     sqrshrn     v23.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    577     sqrshrn     v21.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    578     sqrshrn     v22.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
    579 
    580 
    581     // registers used:    {q2,q4,q6,q7}, {q9,q15,q10,q11}
    582 
    583 
    584 
    585 
    586 
    587 
    588     ld1         {v4.4h, v5.4h},[x1],#16
    589     ld1         {v8.4h, v9.4h},[x1],#16
    590     sub         x1,x1,#32
    591 
    592 //d4=x0
    593 //d12=x1
    594 //d5=x2
    595 //d13=x3
    596 
    597 //d18=x4
    598 //d20=x5
    599 //d19=x6
    600 //d21=x7
    601 
    602 //d22=x8
    603 //d30=x9
    604 //d23=x10
    605 //d31=x11
    606 
    607 //d14=x12
    608 //d8=x13
    609 //d15=x14
    610 //d9=x15
    611 
    612     umov        x15,v26.d[0]
    613     umov        x16,v27.d[0]
    614     umov        x19,v28.d[0]
    615     umov        x20,v29.d[0]
    616 
    617     trn1        v26.4h, v4.4h, v12.4h
    618     trn2        v27.4h, v4.4h, v12.4h
    619     trn1        v28.4h, v5.4h, v13.4h
    620     trn2        v29.4h, v5.4h, v13.4h
    621 
    622     trn1        v4.2s, v26.2s, v28.2s
    623     trn2        v5.2s, v26.2s, v28.2s
    624     trn1        v12.2s, v27.2s, v29.2s
    625     trn2        v13.2s, v27.2s, v29.2s
    626 
    627     trn1        v26.4h, v18.4h, v20.4h
    628     trn2        v27.4h, v18.4h, v20.4h
    629     trn1        v28.4h, v19.4h, v21.4h
    630     trn2        v29.4h, v19.4h, v21.4h
    631 
    632     trn1        v18.2s, v26.2s, v28.2s
    633     trn2        v19.2s, v26.2s, v28.2s
    634     trn1        v20.2s, v27.2s, v29.2s
    635     trn2        v21.2s, v27.2s, v29.2s
    636 
    637     trn1        v26.4h, v22.4h, v30.4h
    638     trn2        v27.4h, v22.4h, v30.4h
    639     trn1        v28.4h, v23.4h, v31.4h
    640     trn2        v29.4h, v23.4h, v31.4h
    641 
    642     trn1        v22.2s, v26.2s, v28.2s
    643     trn2        v23.2s, v26.2s, v28.2s
    644     trn1        v30.2s, v27.2s, v29.2s
    645     trn2        v31.2s, v27.2s, v29.2s
    646 
    647     trn1        v26.4h, v14.4h, v8.4h
    648     trn2        v27.4h, v14.4h, v8.4h
    649     trn1        v28.4h, v15.4h, v9.4h
    650     trn2        v29.4h, v15.4h, v9.4h
    651 
    652     trn1        v14.2s, v26.2s, v28.2s
    653     trn2        v15.2s, v26.2s, v28.2s
    654     trn1        v8.2s, v27.2s, v29.2s
    655     trn2        v9.2s, v27.2s, v29.2s
    656 
    657     mov         v26.d[0],x15
    658     mov         v27.d[0],x16
    659     mov         v28.d[0],x19
    660     mov         v29.d[0],x20
    661 
    662 // d4 =x0 1- 4 values
    663 // d5 =x2 1- 4 values
    664 // d12=x1 1- 4 values
    665 // d13=x3 1- 4 values
    666 
    667 // d18 =x0 5- 8 values
    668 // d19 =x2 5- 8 values
    669 // d20=x1 5- 8 values
    670 // d21=x3 5- 8 values
    671 
    672 // d22 =x0 9- 12 values
    673 // d23 =x2 9- 12 values
    674 // d30=x1 9- 12 values
    675 // d31=x3 9- 12 values
    676 
    677 // d14 =x0 13-16 values
    678 // d15 =x2 13- 16 values
    679 // d8=x1 13- 16 values
    680 // d9=x3 13- 16 values
    681 
    682 
    683     st1         { v4.4h, v5.4h},[x1],#16
    684     st1         { v12.4h, v13.4h},[x1],#16
    685 
    686     st1         { v18.4h, v19.4h},[x1],#16
    687     st1         { v20.4h, v21.4h},[x1],#16
    688     st1         { v22.4h, v23.4h},[x1],#16
    689     st1         { v30.4h, v31.4h},[x1],#16
    690     st1         { v14.4h, v15.4h},[x1],#16
    691     st1         { v8.4h, v9.4h},[x1],#16
    692 
    693 
    694     subs        x14,x14,#1
    695     bne         first_stage
    696 
    697 
    698 
    699 
    700 
    701 
    702 
    703 
    704 
    705 
    706     mov         x6,x7
    707 
    708     ldp         x8, x7,[sp],#16
    709 
    710     mov         x10,#16
    711 
    712     cmp         x12,x6
    713     sub         x20,x1,#128
    714     csel        x1, x20, x1,ge
    715     bge         label1
    716 
    717     mov         x19,#0xff00
    718     cmp         x12,x19
    719     sub         x20,x1,#256
    720     csel        x1, x20, x1,ge
    721     bge         label_2
    722 
    723     sub         x1,x1,#512
    724     sub         x20,x10,#0
    725     neg         x10, x20
    726 
    727 label_2:
    728     add         x9,x1,#128
    729     add         x11,x9,#128
    730     add         x0,x11,#128
    731 
    732 
    733 
    734 label1:
    735 //    mov   x6,x1
    736 
    737 
    738     mov         x14,#4
    739     add         x4,x2,x8, lsl #1            // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
    740     add         x5,x8,x8, lsl #1            //
    741 //    add x0,x3,x7, lsl #1    @ x0 points to 3rd row of dest data
    742 //    add x10,x7,x7, lsl #1    @
    743 
    744 
    745 
    746 
    747 second_stage:
    748     ld1         {v10.4h, v11.4h},[x1],#16
    749     ld1         {v6.4h, v7.4h},[x1],x10
    750     cmp         x12,x6
    751     bge         second_stage_process
    752     ld1         {v4.4h, v5.4h},[x9],#16
    753     ld1         {v8.4h, v9.4h},[x9],x10
    754 
    755 second_stage_process:
    756 
    757 
    758     smull       v24.4s, v6.4h, v0.h[1]     //// y1 * cos1(part of b0)
    759     smull       v26.4s, v6.4h, v0.h[3]     //// y1 * cos3(part of b1)
    760     smull       v28.4s, v6.4h, v1.h[1]     //// y1 * sin3(part of b2)
    761     smull       v30.4s, v6.4h, v1.h[3]     //// y1 * sin1(part of b3)
    762 
    763     smlal       v24.4s, v7.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    764     smlal       v26.4s, v7.4h, v2.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
    765     smlal       v28.4s, v7.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
    766     smlsl       v30.4s, v7.4h, v2.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
    767 
    768 
    769     smull       v12.4s, v10.4h, v0.h[0]
    770     smlal       v12.4s, v11.4h, v0.h[2]
    771     smull       v14.4s, v10.4h, v0.h[0]
    772     smlal       v14.4s, v11.4h, v1.h[2]
    773     smull       v16.4s, v10.4h, v0.h[0]
    774     smlal       v16.4s, v11.4h, v2.h[2]
    775     smull       v18.4s, v10.4h, v0.h[0]
    776     smlal       v18.4s, v11.4h, v3.h[2]
    777 
    778     bge         skip_last8rows_stage2_kernel1
    779 
    780     smlal       v24.4s, v8.4h, v1.h[1]
    781     smlal       v26.4s, v8.4h, v3.h[3]
    782     smlsl       v28.4s, v8.4h, v1.h[3]
    783     smlsl       v30.4s, v8.4h, v0.h[3]
    784 
    785 
    786     smlal       v24.4s, v9.4h, v1.h[3]
    787     smlsl       v26.4s, v9.4h, v2.h[3]
    788     smlsl       v28.4s, v9.4h, v0.h[3]
    789     smlal       v30.4s, v9.4h, v3.h[3]
    790 
    791 
    792     smlal       v12.4s, v4.4h, v1.h[0]
    793     smlal       v12.4s, v5.4h, v1.h[2]
    794     smlal       v14.4s, v4.4h, v3.h[0]
    795     smlsl       v14.4s, v5.4h, v3.h[2]
    796     smlsl       v16.4s, v4.4h, v3.h[0]
    797     smlsl       v16.4s, v5.4h, v0.h[2]
    798     smlsl       v18.4s, v4.4h, v1.h[0]
    799     smlsl       v18.4s, v5.4h, v2.h[2]
    800 
    801     mov         x19,#0xff00
    802     cmp         x12,x19
    803     bge         skip_last8rows_stage2_kernel1
    804 
    805 
    806     ld1         {v10.4h, v11.4h},[x11],#16
    807     ld1         {v6.4h, v7.4h},[x11],x10
    808     ld1         {v4.4h, v5.4h},[x0],#16
    809     ld1         {v8.4h, v9.4h},[x0],x10
    810 
    811 
    812 
    813 
    814 
    815     smlal       v24.4s, v6.4h, v2.h[1]     //// y1 * cos1(part of b0)
    816     smlsl       v26.4s, v6.4h, v1.h[1]     //// y1 * cos3(part of b1)
    817     smlsl       v28.4s, v6.4h, v3.h[1]     //// y1 * sin3(part of b2)
    818     smlal       v30.4s, v6.4h, v0.h[1]     //// y1 * sin1(part of b3)
    819 
    820     smlal       v24.4s, v7.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    821     smlsl       v26.4s, v7.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
    822     smlal       v28.4s, v7.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    823     smlal       v30.4s, v7.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    824 
    825 
    826 
    827     smlal       v24.4s, v8.4h, v3.h[1]
    828     smlsl       v26.4s, v8.4h, v1.h[3]
    829     smlal       v28.4s, v8.4h, v0.h[1]
    830     smlsl       v30.4s, v8.4h, v1.h[1]
    831 
    832 
    833     smlal       v24.4s, v9.4h, v3.h[3]
    834     smlsl       v26.4s, v9.4h, v3.h[1]
    835     smlal       v28.4s, v9.4h, v2.h[3]
    836     smlsl       v30.4s, v9.4h, v2.h[1]
    837 
    838 
    839 
    840 
    841 
    842     smlal       v12.4s, v10.4h, v0.h[0]
    843     smlal       v12.4s, v11.4h, v2.h[2]
    844     smlal       v12.4s, v4.4h, v3.h[0]
    845     smlal       v12.4s, v5.4h, v3.h[2]
    846 
    847 
    848 
    849 
    850     smlsl       v14.4s, v10.4h, v0.h[0]
    851     smlsl       v14.4s, v11.4h, v0.h[2]
    852     smlsl       v14.4s, v4.4h, v1.h[0]
    853     smlsl       v14.4s, v5.4h, v2.h[2]
    854 
    855 
    856     smlsl       v16.4s, v10.4h, v0.h[0]
    857     smlal       v16.4s, v11.4h, v3.h[2]
    858     smlal       v16.4s, v4.4h, v1.h[0]
    859     smlal       v16.4s, v5.4h, v1.h[2]
    860 
    861 
    862     smlal       v18.4s, v10.4h, v0.h[0]
    863     smlal       v18.4s, v11.4h, v1.h[2]
    864     smlsl       v18.4s, v4.4h, v3.h[0]
    865     smlsl       v18.4s, v5.4h, v0.h[2]
    866 
    867 
    868 
    869 
    870 
    871 
skip_last8rows_stage2_kernel1:

    // Stage-2 butterfly for the first kernel: combine the even-part
    // accumulators a0..a3 (v12, v14, v16, v18) with the odd-part
    // accumulators b0..b3 (v24, v26, v28, v30) to form x0..x7.
    add         v20.4s,  v12.4s ,  v24.4s  //// a0 + b0
    sub         v22.4s,  v12.4s ,  v24.4s  //// a0 - b0

    add         v12.4s,  v14.4s ,  v26.4s  //// a1 + b1
    sub         v24.4s,  v14.4s ,  v26.4s  //// a1 - b1

    add         v14.4s,  v16.4s ,  v28.4s  //// a2 + b2
    sub         v26.4s,  v16.4s ,  v28.4s  //// a2 - b2


    add         v16.4s,  v18.4s ,  v30.4s  //// a3 + b3
    sub         v28.4s,  v18.4s ,  v30.4s  //// a3 - b3



    // Round, shift right by shift_stage2_idct, and saturate-narrow each
    // 32-bit lane to 16 bits.  (The inherited ">> 7(shift_stage1_idct)"
    // comments were stale copies from stage 1 - the operand below is the
    // stage-2 shift.)
    sqrshrn     v30.4h, v20.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> shift_stage2_idct
    sqrshrn     v19.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> shift_stage2_idct
    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> shift_stage2_idct
    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> shift_stage2_idct
    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> shift_stage2_idct
    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> shift_stage2_idct
    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> shift_stage2_idct
    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> shift_stage2_idct
    903 
    // Flags here come from a row-count compare made before this excerpt
    // (TODO confirm against the full file): when the remaining rows are
    // all zero the reload below is skipped and the registers are reused.
    bge         skip_stage2_kernel_load

    //q2,q4,q6,q7 is used (legacy AArch32 q-register note from the port)
    // Load the next set of transformed coefficients for the second
    // kernel pass: even rows from x1, odd rows from x9.
    ld1         {v10.4h, v11.4h},[x1],#16
    ld1         {v6.4h, v7.4h},[x1],#16
    ld1         {v4.4h, v5.4h},[x9],#16
    ld1         {v8.4h, v9.4h},[x9],#16
skip_stage2_kernel_load:
    // Write the eight narrowed results of kernel 1 back to the scratch
    // buffer at [x1-32]; the trailing sub leaves x1 32 bytes below its
    // value before the first store.
    sub         x1,x1,#32
    st1         {v30.4h, v31.4h},[x1],#16
    st1         {v18.4h, v19.4h},[x1],#16
    sub         x1,x1,#32
    916 
    // Second stage-2 kernel, first terms.
    // Odd part: b0..b3 (v24, v26, v28, v30) start from the odd-row
    // inputs v6/v7 multiplied by cosine table entries in v2/v3.
    smull       v24.4s, v6.4h, v2.h[1]     //// y1 * cos1(part of b0)
    smull       v26.4s, v6.4h, v2.h[3]     //// y1 * cos3(part of b1)
    smull       v28.4s, v6.4h, v3.h[1]     //// y1 * sin3(part of b2)
    smull       v30.4s, v6.4h, v3.h[3]     //// y1 * sin1(part of b3)

    smlsl       v24.4s, v7.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v7.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v7.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v7.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)


    // Even part: a0..a3 (v22, v20, v16, v18) start from the even-row
    // inputs v10/v11.  All four share the y0 * cos(0) term.
    smull       v22.4s, v10.4h, v0.h[0]    // a0
    smlsl       v22.4s, v11.4h, v3.h[2]
    smull       v20.4s, v10.4h, v0.h[0]    // a1
    smlsl       v20.4s, v11.4h, v2.h[2]
    smull       v16.4s, v10.4h, v0.h[0]    // a2
    smlsl       v16.4s, v11.4h, v1.h[2]
    smull       v18.4s, v10.4h, v0.h[0]    // a3
    smlsl       v18.4s, v11.4h, v0.h[2]
    936 
    937 
    938 
    // Guard: when the significant-row count says the middle rows are all
    // zero, skip their contribution entirely.  x12 appears to track the
    // processed-row position and x6 a row-count threshold - both are set
    // up before this excerpt (TODO confirm).
    cmp         x12,x6
    bge         skip_last8rows_stage2_kernel2


    // Middle-row contribution to the odd part b0..b3 (inputs v8/v9).
    smlsl       v24.4s, v8.4h, v3.h[1]
    smlal       v26.4s, v8.4h, v2.h[1]
    smlal       v28.4s, v8.4h, v0.h[1]
    smlal       v30.4s, v8.4h, v2.h[3]


    smlal       v24.4s, v9.4h, v0.h[1]
    smlal       v26.4s, v9.4h, v3.h[1]
    smlsl       v28.4s, v9.4h, v1.h[1]
    smlsl       v30.4s, v9.4h, v2.h[1]


    // Middle-row contribution to the even part a0..a3 (inputs v4/v5).
    smlsl       v22.4s, v4.4h, v1.h[0]
    smlal       v22.4s, v5.4h, v2.h[2]
    smlsl       v20.4s, v4.4h, v3.h[0]
    smlal       v20.4s, v5.4h, v0.h[2]
    smlal       v16.4s, v4.4h, v3.h[0]
    smlal       v16.4s, v5.4h, v3.h[2]
    smlal       v18.4s, v4.4h, v1.h[0]
    smlsl       v18.4s, v5.4h, v1.h[2]
    // Second guard: compare against the 0xff00 sentinel threshold before
    // touching the last block of rows.
    mov         x19,#0xff00
    cmp         x12,x19
    bge         skip_last8rows_stage2_kernel2
    967 
    // Last-row block: load the remaining coefficients (even rows from
    // x11, odd rows from x0) and fold them into all eight accumulators.
    ld1         {v10.4h, v11.4h},[x11],#16
    ld1         {v6.4h, v7.4h},[x11],#16
    ld1         {v4.4h, v5.4h},[x0],#16
    ld1         {v8.4h, v9.4h},[x0],#16

    // Odd part b0..b3: contributions of the last odd rows (v6..v9).
    smlsl       v24.4s, v6.4h, v3.h[3]     //// y1 * cos1(part of b0)
    smlsl       v26.4s, v6.4h, v0.h[3]     //// y1 * cos3(part of b1)
    smlal       v28.4s, v6.4h, v2.h[3]     //// y1 * sin3(part of b2)
    smlal       v30.4s, v6.4h, v1.h[3]     //// y1 * sin1(part of b3)

    smlsl       v24.4s, v7.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    smlal       v26.4s, v7.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    smlal       v28.4s, v7.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v7.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)


    smlal       v24.4s, v8.4h, v2.h[3]
    smlal       v26.4s, v8.4h, v3.h[3]
    smlsl       v28.4s, v8.4h, v2.h[1]
    smlal       v30.4s, v8.4h, v0.h[3]


    smlal       v24.4s, v9.4h, v1.h[3]
    smlsl       v26.4s, v9.4h, v1.h[1]
    smlal       v28.4s, v9.4h, v0.h[3]
    smlsl       v30.4s, v9.4h, v0.h[1]



    // Even part a0..a3: contributions of the last even rows (v10/v11
    // and v4/v5).
    smlal       v22.4s, v10.4h, v0.h[0]    // a0
    smlsl       v22.4s, v11.4h, v1.h[2]
    smlsl       v22.4s, v4.4h, v3.h[0]
    smlal       v22.4s, v5.4h, v0.h[2]



    smlsl       v20.4s, v10.4h, v0.h[0]    // a1
    smlsl       v20.4s, v11.4h, v3.h[2]
    smlal       v20.4s, v4.4h, v1.h[0]
    smlsl       v20.4s, v5.4h, v1.h[2]


    smlsl       v16.4s, v10.4h, v0.h[0]    // a2
    smlal       v16.4s, v11.4h, v0.h[2]
    smlsl       v16.4s, v4.4h, v1.h[0]
    smlal       v16.4s, v5.4h, v2.h[2]



    smlal       v18.4s, v10.4h, v0.h[0]    // a3
    smlsl       v18.4s, v11.4h, v2.h[2]
    smlal       v18.4s, v4.4h, v3.h[0]
    smlsl       v18.4s, v5.4h, v3.h[2]
   1022 
   1023 
skip_last8rows_stage2_kernel2:

    // Stage-2 butterfly for the second kernel: even part a0..a3
    // (v22, v20, v16, v18) +/- odd part b0..b3 (v24, v26, v28, v30).
    add         v4.4s,  v22.4s ,  v24.4s   //// a0 + b0
    sub         v22.4s,  v22.4s ,  v24.4s  //// a0 - b0

    add         v6.4s,  v20.4s ,  v26.4s   //// a1 + b1
    sub         v24.4s,  v20.4s ,  v26.4s  //// a1 - b1

    add         v10.4s,  v16.4s ,  v28.4s  //// a2 + b2
    sub         v26.4s,  v16.4s ,  v28.4s  //// a2 - b2


    add         v16.4s,  v18.4s ,  v30.4s  //// a3 + b3
    sub         v28.4s,  v18.4s ,  v30.4s  //// a3 - b3


    // Round, shift by shift_stage2_idct and saturate-narrow to 16 bits.
    // (The inherited ">> 7(shift_stage1_idct)" comments were stale stage-1
    // copies - the operand is the stage-2 shift.)
    sqrshrn     v18.4h, v4.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> shift_stage2_idct
    sqrshrn     v31.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> shift_stage2_idct
    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> shift_stage2_idct
    sqrshrn     v30.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> shift_stage2_idct
    sqrshrn     v20.4h, v6.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> shift_stage2_idct
    sqrshrn     v23.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> shift_stage2_idct
    sqrshrn     v21.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> shift_stage2_idct
    sqrshrn     v22.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> shift_stage2_idct
   1050 
    // Reload the kernel-1 results that were written back to the scratch
    // buffer at x1 earlier.
    ld1         {v4.4h, v5.4h},[x1],#16
    ld1         {v8.4h, v9.4h},[x1],#16



    // registers used:    {q2,q4,q6,q7}, {q9,q15,q10,q11}
    // NOTE(review): the dNN names below are the AArch32 register names
    // carried over from the original 32-bit port; here each dNN maps to
    // the low 64 bits of the corresponding vNN.

//d4=x0
//d12=x1
//d5=x2
//d13=x3

//d18=x4
//d20=x5
//d19=x6
//d21=x7

//d22=x8
//d30=x9
//d23=x10
//d31=x11

//d14=x12
//d8=x13
//d15=x14
//d9=x15

    // Spill the low halves of v26-v29 to GPRs: the transpose below needs
    // four vector temporaries and every other vector register is live.
    umov        x15,v26.d[0]
    umov        x16,v27.d[0]
    umov        x19,v28.d[0]
    umov        x20,v29.d[0]

    // Four 4x4 16-bit transposes; each trn1/trn2 pair on .4h followed by
    // a trn1/trn2 pair on .2s transposes one 4x4 quad in place.
    trn1        v26.4h, v4.4h, v12.4h
    trn2        v27.4h, v4.4h, v12.4h
    trn1        v28.4h, v5.4h, v13.4h
    trn2        v29.4h, v5.4h, v13.4h

    trn1        v4.2s, v26.2s, v28.2s
    trn2        v5.2s, v26.2s, v28.2s
    trn1        v12.2s, v27.2s, v29.2s
    trn2        v13.2s, v27.2s, v29.2s

    trn1        v26.4h, v18.4h, v20.4h
    trn2        v27.4h, v18.4h, v20.4h
    trn1        v28.4h, v19.4h, v21.4h
    trn2        v29.4h, v19.4h, v21.4h

    trn1        v18.2s, v26.2s, v28.2s
    trn2        v19.2s, v26.2s, v28.2s
    trn1        v20.2s, v27.2s, v29.2s
    trn2        v21.2s, v27.2s, v29.2s

    trn1        v26.4h, v22.4h, v30.4h
    trn2        v27.4h, v22.4h, v30.4h
    trn1        v28.4h, v23.4h, v31.4h
    trn2        v29.4h, v23.4h, v31.4h

    trn1        v22.2s, v26.2s, v28.2s
    trn2        v23.2s, v26.2s, v28.2s
    trn1        v30.2s, v27.2s, v29.2s
    trn2        v31.2s, v27.2s, v29.2s

    trn1        v26.4h, v14.4h, v8.4h
    trn2        v27.4h, v14.4h, v8.4h
    trn1        v28.4h, v15.4h, v9.4h
    trn2        v29.4h, v15.4h, v9.4h

    trn1        v14.2s, v26.2s, v28.2s
    trn2        v15.2s, v26.2s, v28.2s
    trn1        v8.2s, v27.2s, v29.2s
    trn2        v9.2s, v27.2s, v29.2s

    // Restore the spilled v26-v29 low halves.
    mov         v26.d[0],x15
    mov         v27.d[0],x16
    mov         v28.d[0],x19
    mov         v29.d[0],x20
   1127 
// Post-transpose layout (dNN = low 64 bits of vNN, legacy AArch32 names):
// d4 =x0 1- 4 values
// d5 =x2 1- 4 values
// d12=x1 1- 4 values
// d13=x3 1- 4 values

// d18 =x0 5- 8 values
// d19 =x2 5- 8 values
// d20=x1 5- 8 values
// d21=x3 5- 8 values

// d22 =x0 9- 12 values
// d23 =x2 9- 12 values
// d30=x1 9- 12 values
// d31=x3 9- 12 values

// d14 =x0 13-16 values
// d15 =x2 13- 16 values
// d8=x1 13- 16 values
// d9=x3 13- 16 values

    // Exchange 64-bit halves so that each row's values end up adjacent;
    // each triple uses the destination's unused d[1] lane as scratch.
    // swapping v5 and v18 low halves (the old comment said "v15" - the
    // code clearly exchanges v5.d[0] with v18.d[0])
    mov         v5.d[1],v5.d[0]
    mov         v5.d[0],v18.d[0]
    mov         v18.d[0],v5.d[1]
    // swapping v23 and v14 low halves
    mov         v23.d[1],v23.d[0]
    mov         v23.d[0],v14.d[0]
    mov         v14.d[0],v23.d[1]
    // swapping v13 and v20 low halves
    mov         v13.d[1],v13.d[0]
    mov         v13.d[0],v20.d[0]
    mov         v20.d[0],v13.d[1]
    // swapping v31 and v8 low halves
    mov         v31.d[1],v31.d[0]
    mov         v31.d[0],v8.d[0]
    mov         v8.d[0],v31.d[1]
   1164 
// Final layout before reconstruction (qNN = full 128-bit vNN):
// q2: x0 1-8 values
// q11: x0 9-16 values
// q9 : x2 1-8 values
// q7 : x2 9-16 values
// q6 : x1 1- 8 values
// q10: x3 1-8 values
// q15: x1 9-16 values
// q4:  x3 9-16 values


//    registers free: q8,q14,q12,q13

    // Load four rows of 8-bit prediction pixels: two rows from x2
    // (strides x8 then x5) and two from x4.
    ld1         {v16.8b, v17.8b},[x2],x8
    ld1         {v28.8b, v29.8b},[x2],x5
    ld1         {v24.8b, v25.8b},[x4],x8
    ld1         {v26.8b, v27.8b},[x4],x5

    // Glue the paired 64-bit halves into full 128-bit row vectors so the
    // widening adds below can process 8 lanes at a time.
    mov         v4.d[1] ,v5.d[0]
    mov         v22.d[1] ,v23.d[0]
    mov         v12.d[1] ,v13.d[0]
    mov         v30.d[1] ,v31.d[0]
    mov         v18.d[1] ,v19.d[0]
    mov         v14.d[1] ,v15.d[0]
    mov         v20.d[1] ,v21.d[0]
    mov         v8.d[1] ,v9.d[0]

    // residual(16-bit) += prediction(8-bit, zero-widened)
    uaddw       v4.8h,  v4.8h ,  v16.8b
    uaddw       v22.8h,  v22.8h ,  v17.8b
    uaddw       v12.8h,  v12.8h ,  v28.8b
    uaddw       v30.8h,  v30.8h ,  v29.8b
    uaddw       v18.8h,  v18.8h ,  v24.8b
    uaddw       v14.8h,  v14.8h ,  v25.8b
    uaddw       v20.8h,  v20.8h ,  v26.8b
    uaddw       v8.8h,  v8.8h ,  v27.8b


    // Saturate each 16-bit sum into an unsigned 8-bit pixel (clip to
    // [0, 255]).
    sqxtun      v16.8b, v4.8h
    sqxtun      v17.8b, v22.8h
    sqxtun      v28.8b, v12.8h
    sqxtun      v29.8b, v30.8h
    sqxtun      v24.8b, v18.8h
    sqxtun      v25.8b, v14.8h
    sqxtun      v26.8b, v20.8h
    sqxtun      v27.8b, v8.8h


    // Store four reconstructed 16-pixel rows to the destination at x3
    // (row stride x7).
    st1         {v16.8b, v17.8b},[x3],x7
    st1         {v28.8b, v29.8b},[x3],x7
    st1         {v24.8b, v25.8b},[x3],x7
    st1         {v26.8b, v27.8b},[x3],x7

    // One fewer column block left; loop back to second_stage (label
    // defined before this excerpt) until all blocks are done.
    subs        x14,x14,#1



    bne         second_stage


//    sub         sp,sp,#40
    // ldmfd sp!,{x4-x12,pc}  (original AArch32 epilogue, kept for reference)
    // Epilogue: restore the callee-saved GPR pair, then pop_v_regs - a
    // macro defined earlier in the file, presumably restoring the
    // callee-saved v8-v15 saved by a matching push macro (confirm there).
    ldp         x19, x20,[sp],#16
    pop_v_regs
    ret
   1230 
   1231 
   1232 
   1233 
   1234 
   1235 
   1236 
   1237 
   1238 
   1239 
   1240 
   1241