Home | History | Annotate | Download | only in armv8
      1 ///******************************************************************************
      2 // *
      3 // * Copyright (C) 2018 The Android Open Source Project
      4 // *
      5 // * Licensed under the Apache License, Version 2.0 (the "License");
      6 // * you may not use this file except in compliance with the License.
      7 // * You may obtain a copy of the License at:
      8 // *
      9 // * http://www.apache.org/licenses/LICENSE-2.0
     10 // *
     11 // * Unless required by applicable law or agreed to in writing, software
     12 // * distributed under the License is distributed on an "AS IS" BASIS,
     13 // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 // * See the License for the specific language governing permissions and
     15 // * limitations under the License.
     16 // *
     17 // *****************************************************************************
     18 // * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 
     21 
     22 .macro push_v_regs
        // Save the callee-saved registers this file uses: q8-q15 (full 128 bits,
        // although AAPCS64 only requires the low 64 bits) and x21-x24.
        // A single pre-indexed store allocates the whole 160-byte frame; the
        // remaining pairs are stored at fixed offsets from the new sp.
        // Frame layout (byte offsets from sp after the macro):
        //   0: x23,x24   16: x21,x22   32: q14,q15
        //  64: q12,q13   96: q10,q11  128: q8,q9
     23     stp             x23, x24, [sp, #-160]!
     24     stp             x21, x22, [sp, #16]
     25     stp             q14, q15, [sp, #32]
     26     stp             q12, q13, [sp, #64]
     27     stp             q10, q11, [sp, #96]
     28     stp             q8, q9, [sp, #128]
     29 .endm
     30 .macro pop_v_regs
        // Restore the registers saved by push_v_regs and release the 160-byte
        // frame. Pairs are reloaded from fixed offsets; the last load is
        // post-indexed so that it also moves sp back to its value at entry.
        // Expected layout (byte offsets from sp before the macro):
        //   0: x23,x24   16: x21,x22   32: q14,q15
        //  64: q12,q13   96: q10,q11  128: q8,q9
     31     ldp             q8, q9, [sp, #128]
     32     ldp             q10, q11, [sp, #96]
     33     ldp             q12, q13, [sp, #64]
     34     ldp             q14, q15, [sp, #32]
     35     ldp             x21, x22, [sp, #16]
     36     ldp             x23, x24, [sp], #160
     37 .endm
     38 .macro swp reg1, reg2
        // Exchange the contents of \reg1 and \reg2 (in this file: the two 64-bit
        // halves of a vector register). x16 is the temporary and is left holding
        // the previous value of \reg1.
     39     mov             x16, \reg1          // tmp   = reg1
     40     mov             \reg1, \reg2        // reg1  = reg2
     41     mov             \reg2, x16          // reg2  = tmp
     42 .endm
     43 .text
     44 .global ixheaacd_post_twiddle_armv8
        //-----------------------------------------------------------------------
        // ixheaacd_post_twiddle_armv8
        //
        // Registers on entry, as consumed by the code below:
        //   x0 : output buffer of 32-bit words; filled forwards from x0 and
        //        backwards from x0 + 4*count - 4
        //   x1 : input buffer of 32-bit words; read forwards from x1 and
        //        backwards from x1 + 4*count
        //   x2 : base address of a table block; the 32-bit coefficient words
        //        used here start 7500 bytes in (presumably a field offset in
        //        a tables struct - TODO confirm against the C declaration)
        //   w3 : element count, consumed as a 32-bit value
        //
        // Notation used in the comments:
        //   x * c   : ((lo16(x) * c) >> 16) + hi16(x) * c, built from an unsigned
        //             UMULL on the low halfwords, USHR #16 and a signed SMLAL on
        //             the high halfwords
        //   Fe, Fo  : even / odd indexed words of the forward input block
        //   Be, Bo  : even / odd indexed words of the backward input block
        //   Tlo,Thi : low / high halfword of four table words (v8 / v9);
        //             rTlo, rThi are the same with lane order reversed (v12 / v13)
        //   k       : the constant replicated in v10.4h
        //
        // Per group of eight forward and eight backward input words:
        //   P = -(Fo*Thi + Fe*Tlo)          Q = Fo*Tlo - Fe*Thi
        //   R =   Bo*rThi - Be*rTlo         S = -(Be*rThi + Bo*rTlo)
        //   forward  output : ST2 { reverse(R + S*k), P - Q*k }
        //   backward output : ST2 { reverse(Q + P*k), S - R*k }
        //
        // Modifies x0-x12, x16, flags, v0-v7 and v16-v31. x21-x24 and q8-q15
        // are saved and restored through push_v_regs / pop_v_regs.
        //
        // NOTE(review): there is no guard for small counts. CORE_LOOP is a
        // do-while driven by SUBS/BNE with x3 = ((count - 1) >> 4) - 2, so that
        // value has to be at least 1 on entry to the loop.
        //-----------------------------------------------------------------------
     45 ixheaacd_post_twiddle_armv8:
     46
     47
     48     push_v_regs
     49
     50 ARM_PROLOGUE:
            // Select constants from the count. The LDR/ADD between CMP and BLT do
            // not touch the flags.
     51     CMP             w3, #0x400
     52     LDR             x21, =7500
     53     ADD             x2, x2, x21
     54     BLT             NEXT
            // count >= 0x400: k = 50, -k = -50, table stride = 4 bytes
     55     MOV             w4, #50
     56     MOV             w5, #-50
     57     MOV             x6, #4
     58     dup             v10.4h, w4
     59     B               NEXT1
     60
     61 NEXT:
            // count < 0x400: k = 0x192, 0xfe6e = -0x192 as a 16-bit value,
            // table stride = 32 bytes
     62     MOV             w4, #0x192
     63     MOV             w5, #0xfe6e
     64     MOV             x6, #32
     65     dup             v10.4h, w4
     66
     67 NEXT1:
            // Scalar handling of the first input pair.
            // Split the first table word: w22 = low halfword << 16,
            // w21 = high halfword kept in the upper 16 bits.
     68     LDR             w9, [x2]
     69     LSL             W22, W9, #16
     70     AND             W21, W9, #0xFFFF0000
     71
            // w7 = in[0], w8 = in[1]; x1 now points at in[2]
     72     LDR             w7, [x1], #4
     73     LDR             w8, [x1], #4
     74
     75     ADD             x2, x2, x6
     76
     77
            // Four 32x32 products, each reduced to its upper 32 bits.
     78     SMULL           X11, w8, w21
     79     ASR             X11, x11, #32
     80     SMULL           X10, w8, w22
     81     ASR             X10, x10, #32
     82     SMULL           X12, w7, w21
     83     ASR             X12, x12, #32
     84     SMULL           X23, w7, w22
     85     ASR             X23, x23, #32
     86     ADD             w8, w11, w23
     87
     88
     89     SUB             w10, w10, w12
     90
            // w8 = -w8 (two's complement via MVN + 1)
     91     MVN             w8, w8
     92     ADD             w8, w8, #1
     93
     94
     95
            // Second stage with the constants moved to the upper halfword:
            //   w9  = w8  + ((w10 * (w5 << 16)) >> 32)
            //   w11 = w10 + ((w8  * (w4 << 16)) >> 32)
     96     LSL             w21, w5, #16
     97     LSL             w22, w4, #16
     98     SMULL           X23, w10, w21
     99     ASR             X23, x23, #32
    100     ADD             w9, w8, w23
    101     SMULL           X23, w8, w22
    102     ASR             X23, x23, #32
    103     ADD             w11, w10, w23
    104
            // x7 = &out[count - 1]. The count is only ever used as a 32-bit value
            // (CMP/SUB/ASR on w3) and AAPCS64 leaves the upper half of a register
            // carrying a 32-bit argument unspecified, so sign-extend w3 while
            // scaling it by 4 instead of shifting the whole of x3.
    105     SBFIZ           x7, x3, #2, #32
    106     ADD             x7, x0, x7
    107     SUB             x7, x7, #4
    108
            // last output word; x7 steps backwards
    109     STR             w11, [x7], #-4
    110
            // first output word; x0 steps forwards
    111     STR             w9, [x0], #4
    112
            // x5 = (x1 + 8) + 4*count - 40 = address of the last eight input words.
            // Same sign-extension of the 32-bit count as for x7.
    113     SBFIZ           x5, x3, #2, #32
    114     ADD             x5, x1, x5
    115     SUB             x5, x5, #40
    116
    117
            // w3 = (count - 1) >> 4; writing w3 also clears the upper half of x3
    118     SUB             w3, w3, #1
    119     ASR             w3, w3, #4
    120
    121
            // Move x7 down so that a 32-byte ST2 ends just below the word stored above.
    122     SUB             x7, x7, #28
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
            // x8 = -32: post-index step for the backward pointers x5 and x7
    135     MOV             x8, #-32
    136
    137 NEON_PROLOGUE:
    138
            // Backward block: v0/v1 = low/high halfwords of Be, v2/v3 of Bo
    139     LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
    140
            // Forward block: v4/v5 = low/high halfwords of Fe, v6/v7 of Fo
    141     LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32
            // Gather four table words x6 bytes apart: v8 = Tlo, v9 = Thi
    142     LD2             {v8.h, v9.h}[0], [x2], x6
    143     LD2             {v8.h, v9.h}[1], [x2], x6
    144     LD2             {v8.h, v9.h}[2], [x2], x6
    145     LD2             {v8.h, v9.h}[3], [x2], x6
    146
            // Lane-reversed copies for the backward block: v12 = rTlo, v13 = rThi
    147     rev64           v12.4h, v8.4h
    148     rev64           v13.4h, v9.4h
    149
            // v30 = Bo*rThi, v28 = Be*rThi, v26 = Bo*rTlo, v24 = Be*rTlo
    150     uMULL           v30.4s, v2.4h, v13.4h
    151     uMULL           v28.4s, v0.4h, v13.4h
    152     uMULL           v26.4s, v2.4h, v12.4h
    153     uMULL           v24.4s, v0.4h, v12.4h
    154
    155     ushR            v30.4s, v30.4s, #16
    156     ushR            v28.4s, v28.4s, #16
    157     ushR            v26.4s, v26.4s, #16
    158     ushR            v24.4s, v24.4s, #16
    159
    160     sMLAL           v30.4s, v3.4h, v13.4h
    161     sMLAL           v28.4s, v1.4h, v13.4h
    162     sMLAL           v26.4s, v3.4h, v12.4h
    163     sMLAL           v24.4s, v1.4h, v12.4h
    164
            // Start of v22 = Fo*Thi, v20 = Fe*Thi (interleaved with the backward math)
    165     uMULL           v22.4s, v6.4h, v9.4h
    166     uMULL           v20.4s, v4.4h, v9.4h
    167
            // S = v28 = -(Be*rThi + Bo*rTlo);  R = v30 = Bo*rThi - Be*rTlo
    168     ADD             v28.4s, v28.4s , v26.4s
    169     SUB             v30.4s, v30.4s , v24.4s
    170     NEG             v28.4s, v28.4s
    171
            // Start of v18 = Fo*Tlo, v16 = Fe*Tlo
    172     uMULL           v18.4s, v6.4h, v8.4h
    173     uMULL           v16.4s, v4.4h, v8.4h
    174
            // Split R into halfwords: v26 = low halves, v27 = high halves
    175     mov             v31.8b, v30.8b
    176     mov             v27.D[0], v30.D[1]
    177     ushR            v22.4s, v22.4s, #16
    178
            // Split S into halfwords: v24 = low halves, v25 = high halves
    179     mov             v24.8b, v28.8b
    180     mov             v25.D[0], v28.D[1]
    181     ushR            v20.4s, v20.4s, #16
    182
    183
    184     UZP1            v26.4h, v31.4h, v27.4h
    185     UZP2            v27.4h, v31.4h, v27.4h
    186     ushR            v18.4s, v18.4s, #16
    187
    188
    189     mov             v31.8B , v24.8B
    190     UZP1            v24.4h, v31.4h, v25.4h
    191     UZP2            v25.4h, v31.4h, v25.4h
    192     ushR            v16.4s, v16.4s, #16
    193
    194
    195     sMLAL           v22.4s, v7.4h, v9.4h
    196     sMLAL           v20.4s, v5.4h, v9.4h
    197     sMLAL           v18.4s, v7.4h, v8.4h
    198     sMLAL           v16.4s, v5.4h, v8.4h
    199
            // v8/v9 are free now: fetch the next four table words while
            // starting v0 = R*k and v2 = S*k.
    200     LD2             {v8.h, v9.h}[0], [x2], x6
    201     uMULL           v0.4s, v26.4h, v10.4h
    202
    203     LD2             {v8.h, v9.h}[1], [x2], x6
    204     uMULL           v2.4s, v24.4h, v10.4h
    205
    206
    207     LD2             {v8.h, v9.h}[2], [x2], x6
    208     ADD             v22.4s, v22.4s , v16.4s
    209
    210     LD2             {v8.h, v9.h}[3], [x2], x6
            // Q = v20 = Fo*Tlo - Fe*Thi
    211     SUB             v20.4s, v18.4s , v20.4s
    212
    213     rev64           v12.4h, v8.4h
    214     rev64           v13.4h, v9.4h
            // P = v22 = -(Fo*Thi + Fe*Tlo)
    215     NEG             v22.4s, v22.4s
    216
    217
            // Split P into v18 (low halves) / v19 (high halves)
    218     mov             v18.8b, v22.8b
    219     mov             v19.D[0], v22.D[1]
    220     ushR            v0.4s, v0.4s, #16
    221
            // Split Q into v16 (low halves) / v17 (high halves)
    222     mov             v16.16b, v20.16b
    223     mov             v17.D[0], v20.D[1]
    224     ushR            v2.4s, v2.4s, #16
    225
    226
    227     MOV             v31.8b, v18.8b
    228     UZP1            v18.4h, v31.4h, v19.4h
    229     UZP2            v19.4h, v31.4h, v19.4h
    230     sMLAL           v0.4s, v27.4h, v10.4h
    231
    232
    233     MOV             v31.8b, v16.8b
    234     UZP1            v16.4h, v31.4h, v17.4h
    235     UZP2            v17.4h, v31.4h, v17.4h
    236     sMLAL           v2.4s, v25.4h, v10.4h
    237
            // Start of v4 = P*k, v6 = Q*k
    238     uMULL           v4.4s, v18.4h, v10.4h
    239     uMULL           v6.4s, v16.4h, v10.4h
    240
            // v14 = R + S*k,  v26 = S - R*k
    241     NEG             v0.4s, v0.4s
    242     ADD             v14.4s, v30.4s , v2.4s
    243     ADD             v26.4s, v28.4s , v0.4s
    244
            // rev64 + half swap = reverse the four 32-bit lanes of v14
    245     rev64           v14.4s, v14.4s
    246     ushR            v4.4s, v4.4s, #16
    247
    248     swp             v14.D[0], v14.D[1]
    249     ushR            v6.4s, v6.4s, #16
    250
    251     sMLAL           v4.4s, v19.4h, v10.4h
            // Next backward block
    252     LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
    253     sMLAL           v6.4s, v17.4h, v10.4h
    254
    255
            // Loop count = ((count - 1) >> 4) - 2
    256     SUB             x3, x3, #2
    257
            // v24 = reverse(Q + P*k)
    258     ADD             v24.4s, v20.4s , v4.4s
    259
    260     rev64           v24.4s, v24.4s
    261     NEG             v16.4s, v6.4s
    262
            // Next forward block
    263     LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32
    264
    265     swp             v24.D[0], v24.D[1]
            // v16 = P - Q*k
    266     ADD             v16.4s, v22.4s , v16.4s
    267
    268
    269
    270 CORE_LOOP:
            // Steady state: store the results of the previous group while running
            // the same arithmetic as NEON_PROLOGUE on the blocks loaded at the end
            // of the previous pass.
    271     uMULL           v30.4s, v2.4h, v13.4h
            // ST2 needs consecutive registers, hence the copies v24 -> v25 and
            // v14 -> v15 before each store.
    272     MOV             v25.16B, v24.16B
    273     ST2             { v25.4s, v26.4s}, [x7], x8
    274     uMULL           v28.4s, v0.4h, v13.4h
    275
    276     uMULL           v26.4s, v2.4h, v12.4h
    277     MOV             v15.16B, v14.16B
    278     ST2             { v15.4s, v16.4s}, [x0], #32
    279     uMULL           v24.4s, v0.4h, v12.4h
    280
    281     ushR            v30.4s, v30.4s, #16
    282     ushR            v28.4s, v28.4s, #16
    283     ushR            v26.4s, v26.4s, #16
    284     ushR            v24.4s, v24.4s, #16
    285
    286     sMLAL           v30.4s, v3.4h, v13.4h
    287     sMLAL           v28.4s, v1.4h, v13.4h
    288     sMLAL           v26.4s, v3.4h, v12.4h
    289     sMLAL           v24.4s, v1.4h, v12.4h
    290
    291     uMULL           v22.4s, v6.4h, v9.4h
    292     uMULL           v20.4s, v4.4h, v9.4h
    293
    294
            // S = v28, R = v30
    295     ADD             v28.4s, v28.4s , v26.4s
    296     SUB             v30.4s, v30.4s , v24.4s
    297     NEG             v28.4s, v28.4s
    298
    299     uMULL           v18.4s, v6.4h, v8.4h
    300     uMULL           v16.4s, v4.4h, v8.4h
    301
    302
            // Halfword split of R (v26/v27) and S (v24/v25)
    303     mov             v26.8b, v30.8b
    304     mov             v27.D[0], v30.D[1]
    305     ushR            v22.4s, v22.4s, #16
    306
    307
    308     mov             v24.8b, v28.8b
    309     mov             v25.D[0], v28.D[1]
    310     ushR            v20.4s, v20.4s, #16
    311
    312
    313     MOV             v31.8b, v26.8b
    314     UZP1            v26.4h, v31.4h, v27.4h
    315     UZP2            v27.4h, v31.4h, v27.4h
    316     ushR            v18.4s, v18.4s, #16
    317
    318
    319     MOV             v31.8b, v24.8b
    320     UZP1            v24.4h, v31.4h, v25.4h
    321     UZP2            v25.4h, v31.4h, v25.4h
    322     ushR            v16.4s, v16.4s, #16
    323
    324
    325     sMLAL           v22.4s, v7.4h, v9.4h
    326     sMLAL           v20.4s, v5.4h, v9.4h
    327     sMLAL           v18.4s, v7.4h, v8.4h
    328     sMLAL           v16.4s, v5.4h, v8.4h
    329
            // Table words for the next group, interleaved with R*k / S*k
    330     LD2             {v8.h, v9.h}[0], [x2], x6
    331     uMULL           v0.4s, v26.4h, v10.4h
    332
    333     LD2             {v8.h, v9.h}[1], [x2], x6
    334     uMULL           v2.4s, v24.4h, v10.4h
    335
    336     LD2             {v8.h, v9.h}[2], [x2], x6
    337     ADD             v22.4s, v22.4s , v16.4s
    338
    339     LD2             {v8.h, v9.h}[3], [x2], x6
            // Q = v20
    340     SUB             v20.4s, v18.4s , v20.4s
    341
    342     rev64           v12.4h, v8.4h
    343     rev64           v13.4h, v9.4h
            // P = v22
    344     NEG             v22.4s, v22.4s
    345
            // Halfword split of P (v18/v19) and Q (v16/v17)
    346     mov             v18.8b, v22.8b
    347     mov             v19.D[0], v22.D[1]
    348     ushR            v0.4s, v0.4s, #16
    349
    350     mov             v16.8b, v20.8b
    351     mov             v17.D[0], v20.D[1]
    352     ushR            v2.4s, v2.4s, #16
    353
    354
    355     MOV             v31.8b, v18.8b
    356     UZP1            v18.4h, v31.4h, v19.4h
    357     UZP2            v19.4h, v31.4h, v19.4h
    358     sMLAL           v0.4s, v27.4h, v10.4h
    359
    360
    361     MOV             v31.8b, v16.8b
    362     UZP1            v16.4h, v31.4h, v17.4h
    363     UZP2            v17.4h, v31.4h, v17.4h
    364     sMLAL           v2.4s, v25.4h, v10.4h
    365
    366     uMULL           v4.4s, v18.4h, v10.4h
    367     uMULL           v6.4s, v16.4h, v10.4h
    368
            // v14 = R + S*k,  v26 = S - R*k
    369     NEG             v0.4s, v0.4s
    370     ADD             v14.4s, v30.4s , v2.4s
    371     ADD             v26.4s, v28.4s , v0.4s
    372
    373     rev64           v14.4s, v14.4s
    374     ushR            v4.4s, v4.4s, #16
    375
    376     swp             v14.D[0], v14.D[1]
    377     ushR            v6.4s, v6.4s, #16
    378
    379     sMLAL           v4.4s, v19.4h, v10.4h
            // Backward block for the next pass
    380     LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
    381     sMLAL           v6.4s, v17.4h, v10.4h
    382
    383
    384
    385
    386
            // v24 = reverse(Q + P*k)
    387     ADD             v24.4s, v20.4s , v4.4s
    388
    389     rev64           v24.4s, v24.4s
    390     NEG             v16.4s, v6.4s
    391
            // Forward block for the next pass
    392     LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32
    393
    394     swp             v24.D[0], v24.D[1]
            // v16 = P - Q*k
    395     ADD             v16.4s, v22.4s , v16.4s
    396
    397     SUBS            x3, x3, #1
    398
    399     BNE             CORE_LOOP
    400
    401
    402
    403
    404 NEON_EPILOGUE:
            // Drain the pipeline: store the last loop results and process the
            // blocks already loaded, without fetching further table words or
            // input inside this pass.
    405     uMULL           v30.4s, v2.4h, v13.4h
    406     MOV             v25.16B, v24.16B
    407     ST2             { v25.4s, v26.4s}, [x7], x8
    408     uMULL           v28.4s, v0.4h, v13.4h
    409
    410     uMULL           v26.4s, v2.4h, v12.4h
    411     MOV             v15.16B, v14.16B
    412     ST2             { v15.4s, v16.4s}, [x0], #32
    413     uMULL           v24.4s, v0.4h, v12.4h
    414
    415
    416
    417     ushR            v30.4s, v30.4s, #16
    418     ushR            v28.4s, v28.4s, #16
    419     ushR            v26.4s, v26.4s, #16
    420     ushR            v24.4s, v24.4s, #16
    421
    422     sMLAL           v30.4s, v3.4h, v13.4h
    423     sMLAL           v28.4s, v1.4h, v13.4h
    424     sMLAL           v26.4s, v3.4h, v12.4h
    425     sMLAL           v24.4s, v1.4h, v12.4h
    426
    427
    428     uMULL           v22.4s, v6.4h, v9.4h
    429     uMULL           v20.4s, v4.4h, v9.4h
    430
    431
            // S = v28, R = v30
    432     ADD             v28.4s, v28.4s , v26.4s
    433     SUB             v30.4s, v30.4s , v24.4s
    434     NEG             v28.4s, v28.4s
    435
    436     uMULL           v18.4s, v6.4h, v8.4h
    437     uMULL           v16.4s, v4.4h, v8.4h
    438
    439
    440     mov             v26.8b, v30.8b
    441     mov             v27.D[0], v30.D[1]
    442     ushR            v22.4s, v22.4s, #16
    443
    444     mov             v24.16b, v28.16b
    445     mov             v25.D[0], v28.D[1]
    446     ushR            v20.4s, v20.4s, #16
    447
    448
    449     mov             v31.8b, v26.8b
    450     UZP1            v26.4h, v31.4h, v27.4h
    451     UZP2            v27.4h, v31.4h, v27.4h
    452     ushR            v18.4s, v18.4s, #16
    453
    454
    455     mov             v31.8b, v24.8b
    456     UZP1            v24.4h, v31.4h, v25.4h
    457     UZP2            v25.4h, v31.4h, v25.4h
    458     ushR            v16.4s, v16.4s, #16
    459
    460
    461     sMLAL           v22.4s, v7.4h, v9.4h
    462     sMLAL           v20.4s, v5.4h, v9.4h
    463     sMLAL           v18.4s, v7.4h, v8.4h
    464     sMLAL           v16.4s, v5.4h, v8.4h
    465
    466
    467     uMULL           v0.4s, v26.4h, v10.4h
    468
    469
    470     uMULL           v2.4s, v24.4h, v10.4h
    471
    472
    473     ADD             v22.4s, v22.4s , v16.4s
    474
    475
            // Q = v20
    476     SUB             v20.4s, v18.4s , v20.4s
    477
    478
            // P = v22
    479     NEG             v22.4s, v22.4s
    480
    481
    482     mov             v18.16b, v22.16b
    483     ushR            v0.4s, v0.4s, #16
    484
    485     mov             v16.16b, v20.16b
    486     ushR            v2.4s, v2.4s, #16
    487
    488
            // Halfword split of P: here the upper half is taken from the v31 copy
    489     mov             v31.16b, v18.16b
    490     mov             v19.d[0], v31.d[1]
    491     UZP1            v18.4h, v31.4h, v19.4h
    492     UZP2            v19.4h, v31.4h, v19.4h
    493     sMLAL           v0.4s, v27.4h, v10.4h
    494
    495
            // Halfword split of Q, same pattern
    496     mov             v31.16b, v16.16b
    497     mov             v17.d[0], v31.d[1]
    498     UZP1            v16.4h, v31.4h, v17.4h
    499     UZP2            v17.4h, v31.4h, v17.4h
    500     sMLAL           v2.4s, v25.4h, v10.4h
    501
    502     uMULL           v4.4s, v18.4h, v10.4h
    503     uMULL           v6.4s, v16.4h, v10.4h
    504
            // v14 = R + S*k,  v26 = S - R*k
    505     NEG             v0.4s, v0.4s
    506     ADD             v14.4s, v30.4s , v2.4s
    507     ADD             v26.4s, v28.4s , v0.4s
    508
    509     rev64           v14.4s, v14.4s
    510     ushR            v4.4s, v4.4s, #16
    511
    512     swp             v14.D[0], v14.D[1]
    513     ushR            v6.4s, v6.4s, #16
    514
    515     sMLAL           v4.4s, v19.4h, v10.4h
    516
    517     sMLAL           v6.4s, v17.4h, v10.4h
    518
    519
    520
    521
            // v24 = reverse(Q + P*k)
    522     ADD             v24.4s, v20.4s , v4.4s
    523
    524     rev64           v24.4s, v24.4s
    525     NEG             v16.4s, v6.4s
    526
    527
    528
    529     swp             v24.D[0], v24.D[1]
            // v16 = P - Q*k
    530     ADD             v16.4s, v22.4s , v16.4s
    531
            // Store this group straight away (nothing left to overlap it with)
    532     MOV             v25.16B, v24.16B
    533     MOV             v15.16B, v14.16B
    534     ST2             { v15.4s, v16.4s}, [x0], #32
    535     ST2             { v25.4s, v26.4s}, [x7], x8
    536
    537
    538
    539
            // Final, partial group: a full backward block but only six forward
            // words are read.
    540     LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
    541
            // Clear v6/v7 so the lanes not loaded below are zero
    542     movi            v6.2s, #0x00000000
    543     movi            v7.2s, #0x00000000
    544
            // Four forward words de-interleaved into v4 (even) / v5 (odd),
            // then two more into lane 0 of v6 / v7
    545     LD2             {v4.2s, v5.2s}, [x1], #16
    546     LD2             {v6.s, v7.s}[0], [x1]
    547
    548     LD2             {v8.h, v9.h}[0], [x2], x6
    549     LD2             {v8.h, v9.h}[1], [x2], x6
    550     LD2             {v8.h, v9.h}[2], [x2], x6
    551     LD2             {v8.h, v9.h}[3], [x2], x6
    552
    553     rev64           v12.8h, v8.8h
    554     rev64           v13.8h, v9.8h
            // After the swap: v4:v5 hold the even words (+ zero padding) and
            // v6:v7 the odd words (+ zero padding)
    555     swp             v5.D[0], v6.D[0]
    556
    557
            // Rebuild the LD4-style layout: v4/v5 = low/high halfwords of the even
            // words, v6/v7 = low/high halfwords of the odd words
    558     MOV             v30.8B, V4.8B
    559     UZP1            v4.4h, v30.4h, v5.4h
    560     UZP2            v5.4h, v30.4h, v5.4h
    561     MOV             v30.8B, V6.8B
    562     UZP1            v6.4h, v30.4h, v7.4h
    563     UZP2            v7.4h, v30.4h, v7.4h
            // From here on: same arithmetic as NEON_PROLOGUE
    564     uMULL           v30.4s, v2.4h, v13.4h
    565     uMULL           v28.4s, v0.4h, v13.4h
    566
    567     uMULL           v26.4s, v2.4h, v12.4h
    568     uMULL           v24.4s, v0.4h, v12.4h
    569
    570     ushR            v30.4s, v30.4s, #16
    571     ushR            v28.4s, v28.4s, #16
    572     ushR            v26.4s, v26.4s, #16
    573     ushR            v24.4s, v24.4s, #16
    574
    575     sMLAL           v30.4s, v3.4h, v13.4h
    576     sMLAL           v28.4s, v1.4h, v13.4h
    577     sMLAL           v26.4s, v3.4h, v12.4h
    578     sMLAL           v24.4s, v1.4h, v12.4h
    579
    580     uMULL           v22.4s, v6.4h, v9.4h
    581     uMULL           v20.4s, v4.4h, v9.4h
    582
    583
            // S = v28, R = v30
    584     ADD             v28.4s, v28.4s , v26.4s
    585     SUB             v30.4s, v30.4s , v24.4s
    586     NEG             v28.4s, v28.4s
    587
    588     uMULL           v18.4s, v6.4h, v8.4h
    589     uMULL           v16.4s, v4.4h, v8.4h
    590
    591     mov             v26.8b, v30.8b
    592     mov             v27.D[0], v30.D[1]
    593     ushR            v22.4s, v22.4s, #16
    594
    595     mov             v24.16b, v28.16b
    596     mov             v25.D[0], v28.D[1]
    597     ushR            v20.4s, v20.4s, #16
    598
    599
    600     MOV             v31.8B, V26.8B
    601     UZP1            v26.4h, v31.4h, v27.4h
    602     UZP2            v27.4h, v31.4h, v27.4h
    603     ushr            v18.4s, v18.4s, #16
    604
    605     MOV             v31.8B, V24.8B
    606     UZP1            v24.4h, v31.4h, v25.4h
    607     UZP2            v25.4h, v31.4h, v25.4h
    608     ushR            v16.4s, v16.4s, #16
    609
    610     sMLAL           v22.4s, v7.4h, v9.4h
    611     sMLAL           v20.4s, v5.4h, v9.4h
    612     sMLAL           v18.4s, v7.4h, v8.4h
    613     sMLAL           v16.4s, v5.4h, v8.4h
    614
    615
    616     uMULL           v0.4s, v26.4h, v10.4h
    617
    618
    619     uMULL           v2.4s, v24.4h, v10.4h
    620
    621     ADD             v22.4s, v22.4s , v16.4s
    622
    623
            // Q = v20
    624     SUB             v20.4s, v18.4s , v20.4s
    625
    626
            // P = v22
    627     NEG             v22.4s, v22.4s
    628
    629
    630     mov             v18.8B, v22.8B
    631     mov             v19.D[0], v22.D[1]
    632     ushR            v0.4s, v0.4s, #16
    633
    634     mov             v16.16b, v20.16b
    635     mov             v17.D[0], v20.D[1]
    636     ushR            v2.4s, v2.4s, #16
    637
    638
    639     MOV             v31.8B, V18.8B
    640     UZP1            v18.4h, v31.4h, v19.4h
    641     UZP2            v19.4h, v31.4h, v19.4h
    642     sMLAL           v0.4s, v27.4h, v10.4h
    643
    644
    645     MOV             v31.8B, V16.8B
    646     UZP1            v16.4h, v31.4h, v17.4h
    647     UZP2            v17.4h, v31.4h, v17.4h
    648     sMLAL           v2.4s, v25.4h, v10.4h
    649
    650     uMULL           v4.4s, v18.4h, v10.4h
    651     uMULL           v6.4s, v16.4h, v10.4h
    652
            // v14 = R + S*k,  v26 = S - R*k
    653     NEG             v0.4s, v0.4s
    654     ADD             v14.4s, v30.4s , v2.4s
    655     ADD             v26.4s, v28.4s , v0.4s
    656
    657     rev64           v14.4s, v14.4s
    658     ushR            v4.4s, v4.4s, #16
    659
    660     swp             v14.D[0], v14.D[1]
    661     ushR            v6.4s, v6.4s, #16
    662
    663     sMLAL           v4.4s, v19.4h, v10.4h
    664
    665     sMLAL           v6.4s, v17.4h, v10.4h
    666
    667
    668
    669
            // v24 = reverse(Q + P*k)
    670     ADD             v24.4s, v20.4s , v4.4s
    671
    672     rev64           v24.4s, v24.4s
    673     NEG             v16.4s, v6.4s
    674
    675     swp             v24.D[0], v24.D[1]
            // v16 = P - Q*k
    676     ADD             v16.4s, v22.4s , v16.4s
    677
    678
            // Forward tail: seven words in total
            //   lanes 0-1 of v15/v16 interleaved (4 words), lane 2 of both
            //   (2 words), then lane 3 of v15 alone (1 word)
    679     MOV             v15.16B, v14.16B
    680     ST2             {v15.2s, v16.2s}, [x0], #16
    681
    682     ST2             {v15.s, v16.s}[2], [x0], #8
    683
    684     ST1             {v15.s}[3], [x0]
    685
            // Backward tail: seven words in total, starting one word above x7
            //   v26 lane 0 (1 word), lane 1 of v25/v26 (2 words), then the upper
            //   halves of v25/v26 interleaved (4 words)
    686     ADD             x7, x7, #4
    687
    688     ST1             {v26.s}[0], [x7], #4
    689     MOV             v25.16B, v24.16B
    690     ST2             {v25.s, v26.s}[1], [x7], #8
            // Move the upper halves into the consecutive pair v26/v27 for ST2
    691     MOV             v27.D[0], V26.d[1]
    692     mov             v26.d[0], v25.d[1]
    693     ST2             {v26.2s, v27.2s}, [x7]
    694
    695
    696
    697
    698
    699
    700     pop_v_regs
    701     ret
    702 
    703 
    704 
    705 
    706 
    707 
    708 
    709 
    710 
    711 
    712 
    713 
    714