Home | History | Annotate | Download | only in armv8
      // push_v_regs: spill q8-q15 and x8-x30 into a 320-byte frame below sp.
      //
      // Frame layout, as byte offsets from the sp this macro leaves behind:
      //     #0  x30,x29    #16  x28,x29    #32  x26,x27    #48  x24,x25
      //    #64  x22,x23    #80  x20,x21    #96  x18,x19   #112  x16,x17
      //   #128  x14,x15   #144  x12,x13   #160  x10,x11   #176  x8,x9
      //   #192  q14,q15   #224  q12,q13   #256  q10,q11   #288  q8,q9
      //
      // x8-x30 is 23 registers, so x29 is written twice to complete the
      // twelfth pair. 320 is a multiple of 16, so sp keeps whatever 16-byte
      // alignment it had on entry. The saved set is a superset of what
      // AAPCS64 makes callee-saved (x19-x28, x29/x30, low halves of v8-v15).
      // Undo with pop_v_regs.
      .macro push_v_regs
          sub             sp, sp, #320            // reserve the whole frame at once

          // SIMD registers, full 128 bits each
          stp             q8,  q9,  [sp, #288]
          stp             q10, q11, [sp, #256]
          stp             q12, q13, [sp, #224]
          stp             q14, q15, [sp, #192]

          // General-purpose registers
          stp             x8,  x9,  [sp, #176]
          stp             x10, x11, [sp, #160]
          stp             x12, x13, [sp, #144]
          stp             x14, x15, [sp, #128]
          stp             x16, x17, [sp, #112]
          stp             x18, x19, [sp, #96]
          stp             x20, x21, [sp, #80]
          stp             x22, x23, [sp, #64]
          stp             x24, x25, [sp, #48]
          stp             x26, x27, [sp, #32]
          stp             x28, x29, [sp, #16]
          stp             x30, x29, [sp]          // x29 again: fills the odd slot
      .endm
     21 
     // pop_v_regs: reload x8-x30 and q8-q15 from the 320-byte frame written by
     // push_v_regs, then release the frame.
     //
     // sp must hold the value push_v_regs left behind. Slots are read from the
     // lowest address upwards. x29 is loaded twice, first from the x30 pair and
     // then from the x28 pair; push_v_regs stored the same x29 in both slots.
     // On exit sp is 320 bytes higher than on entry to this macro.
     .macro pop_v_regs
         // General-purpose registers
         ldp             x30, x29, [sp]
         ldp             x28, x29, [sp, #16]
         ldp             x26, x27, [sp, #32]
         ldp             x24, x25, [sp, #48]
         ldp             x22, x23, [sp, #64]
         ldp             x20, x21, [sp, #80]
         ldp             x18, x19, [sp, #96]
         ldp             x16, x17, [sp, #112]
         ldp             x14, x15, [sp, #128]
         ldp             x12, x13, [sp, #144]
         ldp             x10, x11, [sp, #160]
         ldp             x8,  x9,  [sp, #176]

         // SIMD registers, full 128 bits each
         ldp             q14, q15, [sp, #192]
         ldp             q12, q13, [sp, #224]
         ldp             q10, q11, [sp, #256]
         ldp             q8,  q9,  [sp, #288]

         add             sp, sp, #320            // drop the frame in one step
     .endm
     42 
     43 
     44 .text
     45 .p2align 2
     46 .global ixheaacd_post_twid_overlap_add_armv8
     47 
     48 ixheaacd_post_twid_overlap_add_armv8:
     49 
     50     // STMFD sp!, {x4-x12}
     51     push_v_regs
     52     //stp x19, x20,[sp,#-16]!
     53     //VPUSH           {d8 - d15}
     54 
     55     //LDR w4,  [sp, #100]
     56     //sxtw x4,w4
     57     //LDR w5,  [sp, #104]
     58     //sxtw x5,w5
     59     //LDR w6,  [sp, #108]
     60     //sxtw x6,w6
     61     MOV             x16, x5
     62     MOV             x17, x7
     63     LSL             x9, x3, #2
     64     ASR             x9, x9, #1
     65     ADD             x6, x6, x9
     66     SUB             x6, x6, #4
     67 
     68     LDR             w8, =7500
     69     sxtw            x8, w8
     70     ADD             x2, x2, x8
     71 
     72 
     73 
     74     movi            v18.4h, #50
     75     sub             x20, x5, #15
     76     neg             x9, x20
     77     movi            v20.4s, #0x80, LSL #8
     78     dup             v16.4s, w5
     79     SUB             x5, x5, #16
     80     //STR w5,  [sp, #116]
     81     MOV             w25, w5
     82     sxtw            x25, w25
     83     MOV             x8, #1
     84     LSL             x8, x8, x9
     85     //STR w8,  [sp, #120]
     86     MOV             w26, w8
     87 
     88     //sxtw x8,w8
     89 
     90 
     91 ARM_PROLOGUE:
     92 
     93 
     94     LDR             w8, [x1], #4
     95     sxtw            x8, w8
     96     LDR             w9, [x1], #4
     97     sxtw            x9, w9
     98 
     99     LDR             w10, [x2], #4
    100     sxtw            x10, w10
    101 
    102     AND             w19, w10, 0xFFFF
    103     sxth            x19, w19
    104     ASR             w10, w10, #16
    105 //    SMULWT          x11, x8, x10
    106 //
    107 //    SMULWB          x12, x9, x10
    108 //    SMULWB          x5, x8, x10
    109 //    SMLAWT          x7, x9, x10, x5
    110 
    111     SMULL           x11, w8, w10
    112     ASR             x11, x11, #16
    113     SMULL           x12, w9, w19
    114     ASR             x12, x12, #16
    115     SMULL           x5, w8, w19
    116     ASR             x5, x5, #16
    117     SMULL           x7, w9, w10
    118     ASR             x7, x7, #16
    119     ADD             x7, x7, x5
    120 
    121     SUB             x8, x12, x11
    122     MVN             x5, x7
    123     ADD             x5, x5, #1
    124 
    125 
    126     MOV             x9, #50
    127     MOV             x12, #-50
    128     AND             w19, w9, 0xFFFF
    129     sxth            x19, w19
    130     SMULL           x10, w5, w19
    131     ASR             x10, x10, #16
    132     AND             w19, w12, 0xFFFF
    133     sxth            x19, w19
    134     SMULL           x11, w8, w19
    135     ASR             x11, x11, #16
    136 
    137     ADD             x8, x8, x10
    138     ADD             x5, x5, x11
    139 
    140     //LDR w11,  [sp, #104]
    141     MOV             w11, w16
    142     sxth            x11, w11
    143     LDR             w10, [x6], #-32
    144     sxtw            x10, w10
    145 
    146     AND             w19, w10, 0xFFFF
    147     sxth            x19, w19
    148     ASR             w20, w10, #16
    149 
    150     //SMULWB          x7, x8, x10
    151     SMULL           x7, w8, w19
    152     ASR             x7, x7, #16
    153     MVN             x8, x8
    154     ADD             x8, x8, #1
    155     //SMULWT          x12, x8, x10
    156     SMULL           x12, w8, w20
    157     ASR             x12, x12, #16
    158 
    159     CMP             x11, #0
    160     BLT             NEXT
    161 
    162     SUB             x9, x11, #16
    163     negs            x9, x9
    164 
    165 
    166 
    167 
    168     // LDR w8,  [sp, #120]
    169     //sxtw x8,w8
    170     MOV             v1.s[0], w26
    171     MOV             v2.s[0], w5
    172 
    173     //sQADD            w5, w5, w8
    174     //ASR             w5, w5, w9
    175 
    176     SQADD           v2.2s, v2.2s, v1.2s
    177     MOV             w5, v2.s[0]
    178     ASR             w5, w5, w9
    179 
    180     SUB             x9, x11, #31
    181     negs            x9, x9
    182     ASR             x20, x7, x9
    183     //MOV            x8, x20
    184     ADDS            x8, x20, #0
    185     BGE             NEXT2
    186     CMN             x8, #1
    187 NEXT2:
    188     LDR             x20, =0x80000000
    189     csel            x7, x20, x7, LT
    190     LDR             x20, =0x7fffffff
    191     csel            x7, x20, x7, GT
    192     LSL             x20, x7, x11
    193     csel            x7, x20, x7, EQ
    194 
    195     SUB             x9, x11, #31
    196     negs            x9, x9
    197     ASR             x20, x12, x9
    198     //MOV            x8, x20
    199     ADDS            x8, x20, #0
    200     BGE             NEXT3
    201     CMN             x8, #1
    202 NEXT3:
    203     LDR             x20, =0x80000000
    204     csel            x12, x20, x12, LT
    205     LDR             x20, =0x7fffffff
    206     csel            x12, x20, x12, GT
    207     LSL             x20, x12, x11
    208     csel            x12, x20, x12, EQ
    209 
    210     B               NEXT1
    211 NEXT:
    212     MVN             w11, w11
    213     ADD             w11, w11, #1
    214     ASR             w5, w5, w11
    215     MOV             w8, #0x8000
    216 
    217     MOV             v1.s[0], w8
    218     MOV             v2.s[0], w5
    219 
    220     //QADD            x5, x5, x8
    221 
    222     SQADD           v2.2s, v2.2s, v1.2s
    223     MOV             w5, v2.s[0]
    224 
    225     ASR             w5, w5, #16
    226     ASR             w7, w7, w11
    227     ASR             w12, w12, w11
    228 
    229 NEXT1:
    230     LDR             w9, [x4]
    231     sxtw            x9, w9
    232     MOV             w8, #0x8000
    233     //sxtw x8,w8
    234 
    235     STR             w5, [x4], #4
    236     sxtw            x5, w5
    237 
    238 
    239     ROR             w20, w10, #16
    240     //UXTH            x5, x10, ROR #16
    241     UXTH            w5, w20
    242     UXTH            w10, w10
    243 
    244 
    245     dup             v0.2s, w9
    246     dup             v2.2s, w10
    247     dup             v3.2s, w5
    248     //VZIP.32         D2, D3
    249     ZIP1            v28.2s, v2.2s, v3.2s
    250     ZIP2            v3.2s, v2.2s, v3.2s
    251     MOV             v2.8b, v28.8b
    252     sMULL           v0.2d, v2.2s, v0.2s
    253     Sqxtn           v8.2s, v0.2d
    254 
    255 
    256     dup             v0.2s, w12
    257     dup             v1.2s, w7
    258 
    259     //VZIP.32         D0, D1
    260 
    261     ZIP1            v28.2s, v0.2s, v1.2s
    262     ZIP2            v1.2s, v0.2s, v1.2s
    263     MOV             v0.8b, v28.8b
    264 
    265     SQSUB           v8.2s, v0.2s , v8.2s
    266 
    267 
    268     sQshL           v8.2s, v8.2s, #2
    269     dup             v0.2s, w8
    270     SQADD           v8.2s, v8.2s , v0.2s
    271     sshR            v8.2s, v8.2s, #16
    272 
    273 
    274 
    275     MOV             x7, x17
    276     //sxtw x7,w7
    277     LSL             x10, x7, #1
    278 
    279     ASR             x5, x3, #1
    280     //SMULBB          x5, x10, x5
    281     AND             w5, w5, 0xFFFF
    282     sxth            x5, w5
    283     AND             w19, w10, 0xFFFF
    284     sxth            x19, w19
    285     SMULL           x5, w19, w5
    286 
    287     ADD             x5, x5, x0
    288     SUB             x0, x5, x10
    289     MVN             x9, x10
    290     ADD             x9, x9, #1
    291 
    292     ST1             {v8.h}[2], [x0], x9
    293     ST1             {v8.h}[0], [x5], x10
    294 
    295 
    296     MOV             x8, x1
    297     LSL             x12, x3, #2
    298 
    299     ADD             x1, x1, x12
    300 
    301     SUB             x1, x1, #40
    302 
    303     MOV             x12, #-32
    304 
    305 
    306 
    307 PROLOGUE_NEON:
    308 
    309     ASR             x3, x3, #2
    310     SUB             x3, x3, #4
    311     ASR             x3, x3, #2
    312     SUB             x3, x3, #2
    313 
    314     LD2             { v0.4s, v1.4s}, [x1]
    315     MOV             v2.16b, v1.16b
    316     ADD             x1, x1, x12
    317 
    318     //VUZP.16         D0, D1
    319     UZP1            v28.8h, v0.8h, v0.8h
    320     UZP2            v29.8h, v0.8h, v0.8h
    321     MOV             v0.d[0], v28.d[0]
    322     MOV             v0.d[1], v29.d[0]
    323 
    324     //VUZP.16         D2, D3
    325 
    326     UZP1            v28.8h, v2.8h, v2.8h
    327     UZP2            v29.8h, v2.8h, v2.8h
    328     MOV             v2.d[0], v28.d[0]
    329     MOV             v2.d[1], v29.d[0]
    330 
    331 
    332     //rev64  v0.8h,  v0.8h
    333     rev64           v0.8h, v0.8h
    334     MOV             v1.d[0], v0.d[1]
    335     rev64           v2.8h, v2.8h
    336     MOV             v3.d[0], v2.d[1]
    337     LD2             {v8.4h, v9.4h}, [x2]
    338     ADD             x2, x2, #16
    339 
    340     LD2             { v4.4s, v5.4s}, [x8]
    341     MOV             v6.16b, v5.16b
    342     ADD             x8, x8, #32
    343     uMULL           v30.4s, v0.4h, v9.4h
    344 
    345 //    VUZP.16         D4, D5
    346 
    347     UZP1            v28.8h, v4.8h, v4.8h
    348     UZP2            v29.8h, v4.8h, v4.8h
    349     MOV             v4.d[0], v28.d[0]
    350     MOV             v5.d[0], v29.d[0]
    351 
    352     uMULL           v28.4s, v2.4h, v8.4h
    353 
    354 //    VUZP.16         D6, D7
    355     UZP1            v26.8h, v6.8h, v6.8h
    356     UZP2            v27.8h, v6.8h, v6.8h
    357     MOV             v6.d[0], v26.d[0]
    358     MOV             v7.d[0], v27.d[0]
    359 
    360     uMULL           v26.4s, v0.4h, v8.4h
    361 
    362 
    363     uMULL           v24.4s, v2.4h, v9.4h
    364 
    365     LD2             { v10.4s, v11.4s}, [x6]
    366     MOV             v12.16b, v11.16b
    367     ADD             x6, x6, x12
    368     ushR            v30.4s, v30.4s, #16
    369 
    370     //VUZP.16         D10, D11
    371 
    372     UZP1            v22.8h, v10.8h, v10.8h
    373     UZP2            v23.8h, v10.8h, v10.8h
    374     MOV             v10.d[0], v22.d[0]
    375     MOV             v10.d[1], v23.d[0]
    376 
    377     ushR            v28.4s, v28.4s, #16
    378 
    379     //VUZP.16         D12, D13
    380 
    381     UZP1            v22.8h, v12.8h, v12.8h
    382     UZP2            v23.8h, v12.8h, v12.8h
    383     MOV             v12.d[0], v22.d[0]
    384     MOV             v12.d[1], v23.d[0]
    385 
    386     sMLAL           v30.4s, v1.4h, v9.4h
    387 
    388     rev64           v10.8h, v10.8h
    389     MOV             v11.d[0], v10.d[1]
    390     sMLAL           v28.4s, v3.4h, v8.4h
    391 
    392     rev64           v12.8h, v12.8h
    393     MOV             v13.d[0], v12.d[1]
    394     ushR            v26.4s, v26.4s, #16
    395 
    396 
    397     ushR            v24.4s, v24.4s, #16
    398 
    399     sMLAL           v26.4s, v1.4h, v8.4h
    400     sMLAL           v24.4s, v3.4h, v9.4h
    401 
    402 
    403 
    404     ADD             v30.4s, v30.4s , v28.4s
    405     NEG             v30.4s, v30.4s
    406 
    407     uMULL           v22.4s, v4.4h, v8.4h
    408 
    409     SUB             v28.4s, v24.4s , v26.4s
    410 
    411 
    412     mov             v26.16b, v30.16b
    413     mov             v24.16b, v28.16b
    414 
    415 //    VUZP.16         D24, D25
    416 
    417     UZP1            v19.8h, v24.8h, v24.8h
    418     UZP2            v21.8h, v24.8h, v24.8h
    419     MOV             v24.d[0], v19.d[0]
    420     MOV             v25.d[0], v21.d[0]
    421 
    422 
    423 //    VUZP.16         D26, D27
    424 
    425     UZP1            v19.8h, v26.8h, v26.8h
    426     UZP2            v21.8h, v26.8h, v26.8h
    427     MOV             v26.d[0], v19.d[0]
    428     MOV             v27.d[0], v21.d[0]
    429 
    430     uMULL           v2.4s, v24.4h, v18.4h
    431 
    432     uMULL           v0.4s, v26.4h, v18.4h
    433 
    434     ushR            v22.4s, v22.4s, #16
    435     sMLAL           v22.4s, v5.4h, v8.4h
    436 
    437     ushR            v2.4s, v2.4s, #16
    438     ushR            v0.4s, v0.4s, #16
    439     sMLAL           v2.4s, v25.4h, v18.4h
    440     sMLAL           v0.4s, v27.4h, v18.4h
    441 
    442     uMULL           v24.4s, v4.4h, v9.4h
    443     uMULL           v26.4s, v6.4h, v8.4h
    444 
    445     NEG             v2.4s, v2.4s
    446     ADD             v28.4s, v28.4s , v0.4s
    447     ADD             v30.4s, v30.4s , v2.4s
    448 
    449     uMULL           v0.4s, v6.4h, v9.4h
    450     sshR            v24.4s, v24.4s, #16
    451     sMLAL           v24.4s, v5.4h, v9.4h
    452     sshR            v26.4s, v26.4s, #16
    453     sshR            v0.4s, v0.4s, #16
    454     sMLAL           v26.4s, v7.4h, v8.4h
    455     sMLAL           v0.4s, v7.4h, v9.4h
    456 
    457 
    458 
    459 
    460     ADD             v22.4s, v22.4s , v0.4s
    461     NEG             v22.4s, v22.4s
    462     SUB             v24.4s, v26.4s , v24.4s
    463 
    464 
    465 
    466     //LDR w11,  [sp, #120]
    467     //sxtw x11,w11
    468     MOV             w11, w26
    469     dup             v14.4s, w11
    470     SQADD           v28.4s, v28.4s , v14.4s
    471     //LDR w11,  [sp, #116]
    472     MOV             w11, w25
    473     //sxtw x11,w11
    474     dup             v0.4s, w11
    475     sQshL           v28.4s, v28.4s, v0.4s
    476 
    477     mov             v0.16b, v22.16b
    478     mov             v14.16b, v24.16b
    479 
    480 
    481 //    VUZP.16         D24, D25
    482 
    483     UZP1            v19.8h, v24.8h, v24.8h
    484     UZP2            v21.8h, v24.8h, v24.8h
    485     MOV             v24.d[0], v19.d[0]
    486     MOV             v25.d[0], v21.d[0]
    487 
    488 
    489 //    VUZP.16         D22, D23
    490 
    491     UZP1            v19.8h, v22.8h, v22.8h
    492     UZP2            v21.8h, v22.8h, v22.8h
    493     MOV             v22.d[0], v19.d[0]
    494     MOV             v23.d[0], v21.d[0]
    495 
    496     uMULL           v8.4s, v24.4h, v18.4h
    497     uMULL           v26.4s, v22.4h, v18.4h
    498 
    499     NEG             v2.4s, v30.4s
    500 //    VUZP.16         D30, D31
    501 
    502     UZP1            v19.8h, v30.8h, v30.8h
    503     UZP2            v21.8h, v30.8h, v30.8h
    504     MOV             v30.d[0], v19.d[0]
    505     MOV             v30.d[1], v21.d[0]
    506 
    507 //    VUZP.16         D2, D3
    508 
    509     UZP1            v19.8h, v2.8h, v2.8h
    510     UZP2            v21.8h, v2.8h, v2.8h
    511     MOV             v2.d[0], v19.d[0]
    512     MOV             v3.d[0], v21.d[0]
    513 
    514     uMULL           v4.4s, v30.4h, v12.4h
    515 
    516     uMULL           v6.4s, v2.4h, v13.4h
    517 
    518     ushR            v8.4s, v8.4s, #16
    519     ushR            v26.4s, v26.4s, #16
    520 
    521     sMLAL           v8.4s, v25.4h, v18.4h
    522     sMLAL           v26.4s, v23.4h, v18.4h
    523 
    524     ushR            v4.4s, v4.4s, #16
    525     ushR            v6.4s, v6.4s, #16
    526 
    527     MOV             v19.d[0], v30.d[1]
    528 
    529     sMLAL           v4.4s, v19.4h, v12.4h
    530     sMLAL           v6.4s, v3.4h, v13.4h
    531 
    532     NEG             v8.4s, v8.4s
    533     ADD             v14.4s, v14.4s , v26.4s
    534     ADD             v0.4s, v0.4s , v8.4s
    535 
    536     //LDR w11,  [sp, #120]
    537     //sxtw x11,w11
    538     MOV             w11, w26
    539     dup             v8.4s, w11
    540     SQADD           v0.4s, v0.4s , v8.4s
    541     //LDR w11,  [sp, #116]
    542     //sxtw x11,w11
    543     MOV             w11, w25
    544     dup             v26.4s, w11
    545     sQshL           v0.4s, v0.4s, v26.4s
    546 
    547     mov             v26.16b, v28.16b
    548 
    549     LD2             { v28.4s, v29.4s}, [x4]
    550     MOV             v30.16b, v29.16b
    551     MOV             v29.d[0], v28.d[1]
    552 //   VZIP.32         Q13, Q0
    553 
    554     ZIP1            v19.4s, v26.4s, v0.4s
    555     ZIP2            v0.4s, v26.4s, v0.4s
    556     MOV             v26.16b, v19.16b
    557 
    558     ST1             { v26.4s}, [x4], #16
    559     ST1             { v0.4s}, [x4], #16
    560 
    561     movi            v1.2s, #0
    562     //VADDL.S16       Q0, D13, D1
    563 
    564     SADDL           v0.4s, v13.4h, v1.4h
    565     MOV             v1.d[0], v0.d[1]
    566     sMULL           v26.2d, v28.2s, v0.2s
    567     Sqxtn           v8.2s, v26.2d
    568     sMULL           v26.2d, v29.2s, v1.2s
    569     Sqxtn           v9.2s, v26.2d
    570     MOV             v8.d[1], v9.d[0]
    571     movi            v1.2s, #0
    572 //    VADDL.S16       Q0, D12, D1
    573     SADDL           v0.4s, v12.4h, v1.4h
    574     MOV             v1.d[0], v0.d[1]
    575     sMULL           v24.2d, v28.2s, v0.2s
    576     Sqxtn           v26.2s, v24.2d
    577     sMULL           v24.2d, v29.2s, v1.2s
    578     Sqxtn           v27.2s, v24.2d
    579     MOV             v26.d[1], v27.d[0]
    580 
    581     sQshL           v4.4s, v4.4s, v16.4s
    582     sQshL           v6.4s, v6.4s, v16.4s
    583 
    584     SQSUB           v4.4s, v4.4s , v8.4s
    585     SQSUB           v6.4s, v6.4s , v26.4s
    586 
    587     NEG             v26.4s, v14.4s
    588 //    VUZP.16         D14, D15
    589 
    590 
    591     UZP1            v19.8h, v14.8h, v14.8h
    592     UZP2            v21.8h, v14.8h, v14.8h
    593     MOV             v14.d[0], v19.d[0]
    594     MOV             v15.d[0], v21.d[0]
    595 
    596 //    VUZP.16         D26, D27
    597 
    598 
    599     UZP1            v19.8h, v26.8h, v26.8h
    600     UZP2            v21.8h, v26.8h, v26.8h
    601     MOV             v26.d[0], v19.d[0]
    602     MOV             v27.d[0], v21.d[0]
    603 
    604 
    605     movi            v1.2s, #0
    606 //    VADDL.S16       Q0, D10, D1
    607     SADDL           v0.4s, v10.4h, v1.4h
    608     MOV             v1.d[0], v0.d[0]
    609     sMULL           v22.2d, v30.2s, v0.2s
    610     Sqxtn           v24.2s, v22.2d
    611     sMULL2          v22.2d, v30.4s, v0.4s
    612     Sqxtn           v25.2s, v22.2d
    613     MOV             v24.d[1], v25.d[0]
    614     movi            v1.2s, #0
    615 //    VADDL.S16       Q0, D11, D1
    616     SADDL           v0.4s, v11.4h, v1.4h
    617     MOV             v1.d[0], v0.d[1]
    618 
    619     sMULL           v8.2d, v30.2s, v0.2s
    620     Sqxtn           v22.2s, v8.2d
    621     sMULL2          v8.2d, v30.4s, v0.4s
    622     Sqxtn           v23.2s, v8.2d
    623     MOV             v22.d[1], v23.d[0]
    624     uMULL           v8.4s, v26.4h, v11.4h
    625     uMULL           v30.4s, v14.4h, v10.4h
    626 
    627     LD2             { v0.4s, v1.4s}, [x1]
    628     MOV             v2.16b, v1.16b
    629     ADD             x1, x1, x12
    630 
    631 //    VUZP.16         D0, D1
    632 
    633     UZP1            v19.8h, v0.8h, v0.8h
    634     UZP2            v21.8h, v0.8h, v0.8h
    635     MOV             v0.d[0], v19.d[0]
    636     MOV             v0.d[1], v21.d[0]
    637 
    638 //    VUZP.16         D2, D3
    639 
    640     UZP1            v19.8h, v2.8h, v2.8h
    641     UZP2            v21.8h, v2.8h, v2.8h
    642     MOV             v2.d[0], v19.d[0]
    643     MOV             v2.d[1], v21.d[0]
    644 
    645     ushR            v8.4s, v8.4s, #16
    646 
    647     rev64           v0.8h, v0.8h
    648     MOV             v1.d[0], v0.d[1]
    649     ushR            v30.4s, v30.4s, #16
    650 
    651     rev64           v2.8h, v2.8h
    652     MOV             v3.d[0], v2.d[1]
    653     sMLAL           v8.4s, v27.4h, v11.4h
    654 
    655     sMLAL           v30.4s, v15.4h, v10.4h
    656 
    657     LD2             { v10.4s, v11.4s}, [x6]
    658     ADD             x6, x6, x12
    659     MOV             v12.16b, v11.16b
    660     sQshL           v4.4s, v4.4s, #2
    661 
    662 //   VUZP.16         D10, D11
    663 
    664     UZP1            v19.8h, v10.8h, v10.8h
    665     UZP2            v21.8h, v10.8h, v10.8h
    666     MOV             v10.d[0], v19.d[0]
    667     MOV             v10.d[1], v21.d[0]
    668 
    669     sQshL           v6.4s, v6.4s, #2
    670 
    671 //    VUZP.16         D12, D13
    672 
    673     UZP1            v19.8h, v12.8h, v12.8h
    674     UZP2            v21.8h, v12.8h, v12.8h
    675     MOV             v12.d[0], v19.d[0]
    676     MOV             v12.d[1], v21.d[0]
    677 
    678     SQADD           v14.4s, v4.4s , v20.4s
    679 
    680     rev64           v10.8h, v10.8h
    681     MOV             v11.d[0], v10.d[1]
    682     SQADD           v6.4s, v6.4s , v20.4s
    683 
    684     rev64           v12.8h, v12.8h
    685     MOV             v13.d[0], v12.d[1]
    686     sshR            v14.4s, v14.4s, #16
    687 
    688 //    VUZP.16         D14, D15
    689 
    690     UZP1            v19.8h, v14.8h, v14.8h
    691     UZP2            v21.8h, v14.8h, v14.8h
    692     MOV             v14.d[0], v19.d[0]
    693     MOV             v15.d[0], v21.d[0]
    694 
    695     sshR            v6.4s, v6.4s, #16
    696 
    697 //    VUZP.16         D6, D7
    698 
    699     UZP1            v19.8h, v6.8h, v6.8h
    700     UZP2            v21.8h, v6.8h, v6.8h
    701     MOV             v6.d[0], v19.d[0]
    702     MOV             v7.d[0], v21.d[0]
    703 
    704     mov             v15.8b, v6.8b
    705     sQshL           v8.4s, v8.4s, v16.4s
    706 
    707     LD2             { v4.4s, v5.4s}, [x8]
    708     ADD             x8, x8, #32
    709     MOV             v6.16b, v5.16b
    710     sQshL           v30.4s, v30.4s, v16.4s
    711 
    712 //    VUZP.16         D4, D5
    713 
    714     UZP1            v19.8h, v4.8h, v4.8h
    715     UZP2            v21.8h, v4.8h, v4.8h
    716     MOV             v4.d[0], v19.d[0]
    717     MOV             v5.d[0], v21.d[0]
    718 
    719     SQSUB           v8.4s, v8.4s , v24.4s
    720 
    721 //    VUZP.16         D6, D7
    722 
    723     UZP1            v19.8h, v6.8h, v6.8h
    724     UZP2            v21.8h, v6.8h, v6.8h
    725     MOV             v6.d[0], v19.d[0]
    726     MOV             v7.d[0], v21.d[0]
    727 
    728     SQSUB           v22.4s, v30.4s , v22.4s
    729 
    730     sQshL           v30.4s, v8.4s, #2
    731 
    732     LD2             {v8.4h, v9.4h}, [x2]
    733     ADD             x2, x2, #16
    734     sQshL           v22.4s, v22.4s, #2
    735 
    736     SQADD           v30.4s, v30.4s , v20.4s
    737     SQADD           v22.4s, v22.4s , v20.4s
    738 
    739     sshR            v30.4s, v30.4s, #16
    740 
    741 //    VUZP.16         D30, D31
    742 
    743     UZP1            v19.8h, v30.8h, v30.8h
    744     UZP2            v21.8h, v30.8h, v30.8h
    745     MOV             v30.d[0], v19.d[0]
    746     MOV             v30.d[1], v21.d[0]
    747 
    748     sshR            v22.4s, v22.4s, #16
    749 
    750 
    751 //    VUZP.16         D22, D23
    752 
    753     UZP1            v19.8h, v22.8h, v22.8h
    754     UZP2            v21.8h, v22.8h, v22.8h
    755     MOV             v22.d[0], v19.d[0]
    756     MOV             v23.d[0], v21.d[0]
    757 
    758 
    759     mov             v23.8b, v30.8b
    760 
    761 CORE_LOOP:
    762     ST1             {v14.h}[0], [x0]
    763     ADD             x0, x0, x9
    764     uMULL           v30.4s, v0.4h, v9.4h
    765 
    766     ST1             {v22.h}[0], [x0]
    767     ADD             x0, x0, x9
    768     uMULL           v28.4s, v2.4h, v8.4h
    769 
    770     ST1             {v14.h}[1], [x0]
    771     ADD             x0, x0, x9
    772     uMULL           v26.4s, v0.4h, v8.4h
    773 
    774     ST1             {v22.h}[1], [x0]
    775     ADD             x0, x0, x9
    776     uMULL           v24.4s, v2.4h, v9.4h
    777 
    778     ST1             {v14.h}[2], [x0]
    779     ADD             x0, x0, x9
    780     ushR            v30.4s, v30.4s, #16
    781 
    782     ST1             {v22.h}[2], [x0]
    783     ADD             x0, x0, x9
    784     ushR            v28.4s, v28.4s, #16
    785 
    786     ST1             {v14.h}[3], [x0]
    787     ADD             x0, x0, x9
    788     sMLAL           v30.4s, v1.4h, v9.4h
    789 
    790     ST1             {v22.h}[3], [x0]
    791     ADD             x0, x0, x9
    792     sMLAL           v28.4s, v3.4h, v8.4h
    793 
    794     ST1             {v15.h}[0], [x5]
    795     ADD             x5, x5, x10
    796     ushR            v26.4s, v26.4s, #16
    797 
    798     ST1             {v23.h}[0], [x5]
    799     ADD             x5, x5, x10
    800     ushR            v24.4s, v24.4s, #16
    801 
    802     ST1             {v15.h}[1], [x5]
    803     ADD             x5, x5, x10
    804     sMLAL           v26.4s, v1.4h, v8.4h
    805 
    806     ST1             {v23.h}[1], [x5]
    807     ADD             x5, x5, x10
    808     sMLAL           v24.4s, v3.4h, v9.4h
    809 
    810     ST1             {v15.h}[2], [x5]
    811     ADD             x5, x5, x10
    812     ADD             v30.4s, v30.4s , v28.4s
    813 
    814     ST1             {v23.h}[2], [x5]
    815     ADD             x5, x5, x10
    816     NEG             v30.4s, v30.4s
    817 
    818     ST1             {v15.h}[3], [x5]
    819     ADD             x5, x5, x10
    820 
    821     ST1             {v23.h}[3], [x5]
    822     ADD             x5, x5, x10
    823     SUB             v28.4s, v24.4s , v26.4s
    824 
    825 
    826     mov             v26.16b, v30.16b
    827     uMULL           v22.4s, v4.4h, v8.4h
    828 
    829     mov             v24.16b, v28.16b
    830 
    831 //    VUZP.16         D24, D25
    832 
    833     UZP1            v19.8h, v24.8h, v24.8h
    834     UZP2            v21.8h, v24.8h, v24.8h
    835     MOV             v24.d[0], v19.d[0]
    836     MOV             v25.d[0], v21.d[0]
    837 
    838 
    839 //    VUZP.16         D26, D27
    840 
    841     UZP1            v19.8h, v26.8h, v26.8h
    842     UZP2            v21.8h, v26.8h, v26.8h
    843     MOV             v26.d[0], v19.d[0]
    844     MOV             v27.d[0], v21.d[0]
    845 
    846     uMULL           v2.4s, v24.4h, v18.4h
    847     uMULL           v0.4s, v26.4h, v18.4h
    848 
    849     ushR            v22.4s, v22.4s, #16
    850     sMLAL           v22.4s, v5.4h, v8.4h
    851 
    852     ushR            v2.4s, v2.4s, #16
    853     ushR            v0.4s, v0.4s, #16
    854     sMLAL           v2.4s, v25.4h, v18.4h
    855     sMLAL           v0.4s, v27.4h, v18.4h
    856 
    857     uMULL           v24.4s, v4.4h, v9.4h
    858     uMULL           v26.4s, v6.4h, v8.4h
    859 
    860     NEG             v2.4s, v2.4s
    861     ADD             v28.4s, v28.4s , v0.4s
    862     ADD             v30.4s, v30.4s , v2.4s
    863 
    864     uMULL           v0.4s, v6.4h, v9.4h
    865     sshR            v24.4s, v24.4s, #16
    866     sMLAL           v24.4s, v5.4h, v9.4h
    867     sshR            v26.4s, v26.4s, #16
    868     sshR            v0.4s, v0.4s, #16
    869     sMLAL           v26.4s, v7.4h, v8.4h
    870     sMLAL           v0.4s, v7.4h, v9.4h
    871 
    872 
    873 
    874     ADD             v22.4s, v22.4s , v0.4s
    875 
    876     NEG             v22.4s, v22.4s
    877     SUB             v24.4s, v26.4s , v24.4s
    878 
    879 
    880     //LDR w11,  [sp, #120]
    881     //sxtw x11,w11
    882     MOV             w11, w26
    883     dup             v14.4s, w11
    884     SQADD           v28.4s, v28.4s , v14.4s
    885     //LDR w11,  [sp, #116]
    886     //sxtw x11,w11
    887     MOV             w11, w25
    888     dup             v0.4s, w11
    889     sQshL           v28.4s, v28.4s, v0.4s
    890 
    891 
    892     mov             v0.16b, v22.16b
    893     mov             v14.16b, v24.16b
    894 
    895 //    VUZP.16         D24, D25
    896 
    897     UZP1            v19.8h, v24.8h, v24.8h
    898     UZP2            v21.8h, v24.8h, v24.8h
    899     MOV             v24.d[0], v19.d[0]
    900     MOV             v25.d[0], v21.d[0]
    901 
    902 
    903 //    VUZP.16         D22, D23
    904 
    905     UZP1            v19.8h, v22.8h, v22.8h
    906     UZP2            v21.8h, v22.8h, v22.8h
    907     MOV             v22.d[0], v19.d[0]
    908     MOV             v23.d[0], v21.d[0]
    909 
    910     uMULL           v8.4s, v24.4h, v18.4h
    911     uMULL           v26.4s, v22.4h, v18.4h
    912 
    913     NEG             v2.4s, v30.4s
    914 
    915 //    VUZP.16         D30, D31
    916 
    917     UZP1            v19.8h, v30.8h, v30.8h
    918     UZP2            v21.8h, v30.8h, v30.8h
    919     MOV             v30.d[0], v19.d[0]
    920     MOV             v30.d[1], v21.d[0]
    921 
    922 
    923 //    VUZP.16         D2, D3
    924 
    925     UZP1            v19.8h, v2.8h, v2.8h
    926     UZP2            v21.8h, v2.8h, v2.8h
    927     MOV             v2.d[0], v19.d[0]
    928     MOV             v3.d[0], v21.d[0]
    929 
    930     uMULL           v4.4s, v30.4h, v12.4h
    931     uMULL           v6.4s, v2.4h, v13.4h
    932 
    933     ushR            v8.4s, v8.4s, #16
    934     ushR            v26.4s, v26.4s, #16
    935 
    936     sMLAL           v8.4s, v25.4h, v18.4h
    937     sMLAL           v26.4s, v23.4h, v18.4h
    938 
    939     ushR            v4.4s, v4.4s, #16
    940     ushR            v6.4s, v6.4s, #16
    941 
    942     MOV             v19.d[0], v30.d[1]
    943 
    944     sMLAL           v4.4s, v19.4h, v12.4h
    945     sMLAL           v6.4s, v3.4h, v13.4h
    946 
    947     NEG             v8.4s, v8.4s
    948     ADD             v14.4s, v14.4s , v26.4s
    949     ADD             v0.4s, v0.4s , v8.4s
    950 
    951 
    952 
    953     //LDR w11,  [sp, #120]
    954     //sxtw x11,w11
    955     MOV             w11, w26
    956     dup             v8.4s, w11
    957     SQADD           v0.4s, v0.4s , v8.4s
    958     //LDR w11,  [sp, #116]
    959     //sxtw x11,w11
    960     MOV             w11, w25
    961     dup             v26.4s, w11
    962     sQshL           v0.4s, v0.4s, v26.4s
    963     mov             v26.16b, v28.16b
    964 
    965     LD2             { v28.4s, v29.4s}, [x4]
    966     MOV             v30.16b, v29.16b
    967     MOV             v29.d[0], v28.d[1]
    968 //    VZIP.32         Q13, Q0
    969 
    970     ZIP1            v19.4s, v26.4s, v0.4s
    971     ZIP2            v0.4s, v26.4s, v0.4s
    972     MOV             v26.16b, v19.16b
    973 
    974     ST1             { v26.4s}, [x4]
    975     ADD             x4, x4, #16
    976     ST1             { v0.4s}, [x4]
    977     ADD             x4, x4, #16
    978 
    979     movi            v1.2s, #0
    980 //    VADDL.S16       Q0, D13, D1
    981     SADDL           v0.4s, v13.4h, v1.4h
    982     MOV             v1.d[0], v0.d[1]
    983 
    984     sMULL           v26.2d, v28.2s, v0.2s
    985     Sqxtn           v8.2s, v26.2d
    986     sMULL           v26.2d, v29.2s, v1.2s
    987     Sqxtn           v9.2s, v26.2d
    988     MOV             v8.d[1], v9.d[0]
    989     movi            v1.2s, #0
    990     //VADDL.S16       Q0, D12, D1
    991     SADDL           v0.4s, v12.4h, v1.4h
    992     MOV             v1.d[0], v0.d[1]
    993 
    994     sMULL           v24.2d, v28.2s, v0.2s
    995     Sqxtn           v26.2s, v24.2d
    996     sMULL           v24.2d, v29.2s, v1.2s
    997     Sqxtn           v27.2s, v24.2d
    998     MOV             v26.d[1], v27.d[0]
    999     sQshL           v4.4s, v4.4s, v16.4s
   1000     sQshL           v6.4s, v6.4s, v16.4s
   1001 
   1002 
   1003 
   1004     SQSUB           v4.4s, v4.4s , v8.4s
   1005     SQSUB           v6.4s, v6.4s , v26.4s
   1006 
   1007     NEG             v26.4s, v14.4s
   1008 //    VUZP.16         D26, D27
   1009     UZP1            v19.8h, v26.8h, v26.8h
   1010     UZP2            v21.8h, v26.8h, v26.8h
   1011     MOV             v26.d[0], v19.d[0]
   1012     MOV             v27.d[0], v21.d[0]
   1013 
   1014     movi            v1.2s, #0
   1015     //VADDL.S16       Q0, D10, D1
   1016     SADDL           v0.4s, v10.4h, v1.4h
   1017     MOV             v1.d[0], v0.d[1]
   1018 
   1019     sMULL           v22.2d, v30.2s, v0.2s
   1020     Sqxtn           v24.2s, v22.2d
   1021     sMULL2          v22.2d, v30.4s, v0.4s
   1022     Sqxtn           v25.2s, v22.2d
   1023     MOV             v24.d[1], v25.d[0]
   1024     movi            v1.2s, #0
   1025     //VADDL.S16       Q0, D11, D1
   1026     SADDL           v0.4s, v11.4h, v1.4h
   1027 
   1028     sMULL           v8.2d, v30.2s, v0.2s
   1029     Sqxtn           v22.2s, v8.2d
   1030     sMULL2          v8.2d, v30.4s, v0.4s
   1031     Sqxtn           v23.2s, v8.2d
   1032     MOV             v22.d[1], v23.d[0]
   1033 
   1034 //    VUZP.16         D14, D15
   1035 
   1036     UZP1            v19.8h, v14.8h, v14.8h
   1037     UZP2            v21.8h, v14.8h, v14.8h
   1038     MOV             v14.d[0], v19.d[0]
   1039     MOV             v15.d[0], v21.d[0]
   1040 
   1041     uMULL           v8.4s, v26.4h, v11.4h
   1042     uMULL           v30.4s, v14.4h, v10.4h
   1043 
   1044 
   1045     LD2             { v0.4s, v1.4s}, [x1]
   1046     MOV             v2.16b, v1.16b
   1047     ADD             X1, X1, x12
   1048 
   1049 //    VUZP.16         D0, D1
   1050     UZP1            v19.8h, v0.8h, v0.8h
   1051     UZP2            v21.8h, v0.8h, v0.8h
   1052     MOV             v0.d[0], v19.d[0]
   1053     MOV             v0.d[1], v21.d[0]
   1054 
   1055 //    VUZP.16         D2, D3
   1056 
   1057     UZP1            v19.8h, v2.8h, v2.8h
   1058     UZP2            v21.8h, v2.8h, v2.8h
   1059     MOV             v2.d[0], v19.d[0]
   1060     MOV             v2.d[1], v21.d[0]
   1061 
   1062     ushR            v8.4s, v8.4s, #16
   1063 
   1064     rev64           v0.8h, v0.8h
   1065     MOV             v1.d[0], v0.d[1]
   1066     ushR            v30.4s, v30.4s, #16
   1067 
   1068     rev64           v2.8h, v2.8h
   1069     MOV             v3.d[0], v2.d[1]
   1070     sMLAL           v8.4s, v27.4h, v11.4h
   1071 
   1072     sMLAL           v30.4s, v15.4h, v10.4h
   1073 
   1074     LD2             { v10.4s, v11.4s}, [x6]
   1075     add             X6, x6, x12
   1076     MOV             v12.16b, v11.16b
   1077     sQshL           v4.4s, v4.4s, #2
   1078 
   1079     //VUZP.16         D10, D11
   1080 
   1081     UZP1            v19.8h, v10.8h, v10.8h
   1082     UZP2            v21.8h, v10.8h, v10.8h
   1083     MOV             v10.d[0], v19.d[0]
   1084     MOV             v10.d[1], v21.d[0]
   1085 
   1086     sQshL           v6.4s, v6.4s, #2
   1087 
   1088 //    VUZP.16         D12, D13
   1089 
   1090     UZP1            v19.8h, v12.8h, v12.8h
   1091     UZP2            v21.8h, v12.8h, v12.8h
   1092     MOV             v12.d[0], v19.d[0]
   1093     MOV             v12.d[1], v21.d[0]
   1094 
   1095 
   1096     SQADD           v14.4s, v4.4s , v20.4s
   1097 
   1098     rev64           v10.8h, v10.8h
   1099     MOV             v11.d[0], v10.d[1]
   1100     SQADD           v6.4s, v6.4s , v20.4s
   1101 
   1102     rev64           v12.8h, v12.8h
   1103     MOV             v13.d[0], v12.d[1]
   1104     sshR            v14.4s, v14.4s, #16
   1105 
   1106 //    VUZP.16         D14, D15
   1107 
   1108     UZP1            v19.8h, v14.8h, v14.8h
   1109     UZP2            v21.8h, v14.8h, v14.8h
   1110     MOV             v14.d[0], v19.d[0]
   1111     MOV             v15.d[0], v21.d[0]
   1112 
   1113 
   1114     sshR            v6.4s, v6.4s, #16
   1115 
   1116 //    VUZP.16         D6, D7
   1117 
   1118     UZP1            v19.8h, v6.8h, v6.8h
   1119     UZP2            v21.8h, v6.8h, v6.8h
   1120     MOV             v6.d[0], v19.d[0]
   1121     MOV             v7.d[0], v21.d[0]
   1122 
   1123 
   1124     mov             v15.8b, v6.8b
   1125     sQshL           v8.4s, v8.4s, v16.4s
   1126 
   1127     LD2             { v4.4s, v5.4s}, [x8]
   1128     ADD             x8, x8, #32
   1129     MOV             v6.16b, v5.16b
   1130 
   1131     sQshL           v30.4s, v30.4s, v16.4s
   1132 
   1133 //    VUZP.16         D4, D5
   1134 
   1135     UZP1            v19.8h, v4.8h, v4.8h
   1136     UZP2            v21.8h, v4.8h, v4.8h
   1137     MOV             v4.d[0], v19.d[0]
   1138     MOV             v5.d[0], v21.d[0]
   1139 
   1140 
   1141     SQSUB           v8.4s, v8.4s , v24.4s
   1142 
   1143 //    VUZP.16         D6, D7
   1144 
   1145     UZP1            v19.8h, v6.8h, v6.8h
   1146     UZP2            v21.8h, v6.8h, v6.8h
   1147     MOV             v6.d[0], v19.d[0]
   1148     MOV             v7.d[0], v21.d[0]
   1149 
   1150 
   1151     SQSUB           v22.4s, v30.4s , v22.4s
   1152 
   1153     sQshL           v30.4s, v8.4s, #2
   1154 
   1155     LD2             {v8.4h, v9.4h}, [x2]
   1156     ADD             x2, x2, #16
   1157     sQshL           v22.4s, v22.4s, #2
   1158 
   1159     SQADD           v30.4s, v30.4s , v20.4s
   1160     SQADD           v22.4s, v22.4s , v20.4s
   1161 
   1162     sshR            v30.4s, v30.4s, #16
   1163 
   1164 //   VUZP.16         D30, D31
   1165 
   1166     UZP1            v19.8h, v30.8h, v30.8h
   1167     UZP2            v21.8h, v30.8h, v30.8h
   1168     MOV             v30.d[0], v19.d[0]
   1169     MOV             v30.d[1], v21.d[0]
   1170 
   1171 
   1172     sshR            v22.4s, v22.4s, #16
   1173 
   1174 
   1175 //    VUZP.16         D22, D23
   1176     UZP1            v19.8h, v22.8h, v22.8h
   1177     UZP2            v21.8h, v22.8h, v22.8h
   1178     MOV             v22.d[0], v19.d[0]
   1179     MOV             v23.d[0], v21.d[0]
   1180 
   1181 
   1182     mov             v23.8b, v30.8b
   1183 
   1184     SUBS            x3, x3, #1
   1185     BNE             CORE_LOOP
   1186 
   1187 
   1188 
   1189 
   1190 
    1191 EPILOGUE:
    // Straight-line section reached by fall-through when the BNE CORE_LOOP
    // just above is not taken (other entry points, if any, are outside this
    // view). It does three things:
    //   1. stores the sixteen 16-bit results currently held in v14/v22
    //      (through x0, stepping by x9) and v15/v23 (through x5, stepping
    //      by x10);
    //   2. runs one more pass of the multiply/accumulate sequence on the
    //      operands already sitting in v0-v13 (no loads from x1/x2/x6/x8
    //      happen here; only the buffer at x4 is read and rewritten);
    //   3. stores that pass's results with the same 16-store pattern and
    //      falls through into ARM_EPILOGUE.
    //
    // Recurring idioms:
    //   * UZP1/UZP2 on .8h followed by two MOVs splits every 32-bit lane of
    //     a register into its low 16 bits (first MOV destination) and its
    //     high 16 bits (second MOV destination). The "VUZP.16 Dn, Dm"
    //     comments name the ARMv7 NEON instruction each group stands in for
    //     (presumably this file is a port of an ARMv7 version - confirm).
    //   * uMULL (low halves) + ushR #16 + sMLAL (high halves) forms a
    //     32-bit x 16-bit product: ((lo16 * c) >> 16) + hi16 * c.
    //   * v16, v18, v20, w25 and w26 are set up before this section and are
    //     only read here; their meaning cannot be confirmed from this view.
    1192 
    1193     ST1             {v14.h}[0], [x0]          // previous results: v14/v22 lanes alternate through x0
    1194     ADD             x0, x0, x9
    1195     uMULL           v30.4s, v0.4h, v9.4h      // v30 = lo16(v0/v1 pair) * v9, first half of 32x16 product
    1196 
    1197     ST1             {v22.h}[0], [x0]
    1198     ADD             x0, x0, x9
    1199     uMULL           v28.4s, v2.4h, v8.4h      // v28 = lo16(v2/v3 pair) * v8
    1200 
    1201     ST1             {v14.h}[1], [x0]
    1202     ADD             x0, x0, x9
    1203     uMULL           v26.4s, v0.4h, v8.4h      // v26 = lo16(v0/v1 pair) * v8
    1204 
    1205     ST1             {v22.h}[1], [x0]
    1206     ADD             x0, x0, x9
    1207     uMULL           v24.4s, v2.4h, v9.4h      // v24 = lo16(v2/v3 pair) * v9
    1208 
    1209     ST1             {v14.h}[2], [x0]
    1210     ADD             x0, x0, x9
    1211     ushR            v30.4s, v30.4s, #16
    1212 
    1213     ST1             {v22.h}[2], [x0]
    1214     ADD             x0, x0, x9
    1215     ushR            v28.4s, v28.4s, #16
    1216 
    1217     ST1             {v14.h}[3], [x0]
    1218     ADD             x0, x0, x9
    1219     sMLAL           v30.4s, v1.4h, v9.4h      // add hi16 * v9: v30 now holds the full 32x16 product
    1220 
    1221     ST1             {v22.h}[3], [x0]
    1222     ADD             x0, x0, x9
    1223     sMLAL           v28.4s, v3.4h, v8.4h
    1224 
    1225     ST1             {v15.h}[0], [x5]          // previous results: v15/v23 lanes alternate through x5
    1226     ADD             x5, x5, x10
    1227     ushR            v26.4s, v26.4s, #16
    1228 
    1229     ST1             {v23.h}[0], [x5]
    1230     ADD             x5, x5, x10
    1231     ushR            v24.4s, v24.4s, #16
    1232 
    1233     ST1             {v15.h}[1], [x5]
    1234     ADD             x5, x5, x10
    1235     sMLAL           v26.4s, v1.4h, v8.4h
    1236 
    1237     ST1             {v23.h}[1], [x5]
    1238     ADD             x5, x5, x10
    1239     sMLAL           v24.4s, v3.4h, v9.4h
    1240 
    1241     ST1             {v15.h}[2], [x5]
    1242     ADD             x5, x5, x10
    1243     ADD             v30.4s, v30.4s , v28.4s
    1244 
    1245     ST1             {v23.h}[2], [x5]
    1246     ADD             x5, x5, x10
    1247     NEG             v30.4s, v30.4s            // v30 = -(v0*v9 + v2*v8)
    1248 
    1249     ST1             {v15.h}[3], [x5]
    1250     ADD             x5, x5, x10
    1251 
    1252 
    1253     ST1             {v23.h}[3], [x5]
    1254     ADD             x5, x5, x10
    1255     SUB             v28.4s, v24.4s , v26.4s   // v28 = v2*v9 - v0*v8
    1256 
    1257 
    1258     uMULL           v22.4s, v4.4h, v8.4h      // start of v4/v5 * v8 (completed by ushR + sMLAL below)
    1259     mov             v26.16b, v30.16b          // working copies: v26 <- v30, v24 <- v28
    1260     mov             v24.16b, v28.16b
    1261 
    1262     mov             v26.16b, v30.16b          // repeats the two copies above; no effect on the result
    1263     mov             v24.16b, v28.16b
    1264 
    1265     //VUZP.16         D26, D27
    1266 
    1267     UZP1            v19.8h, v26.8h, v26.8h    // split v26 lanes: v26.4h = low halves, v27.4h = high halves
    1268     UZP2            v21.8h, v26.8h, v26.8h
    1269     MOV             v26.d[0], v19.d[0]
    1270     MOV             v27.d[0], v21.d[0]
    1271 
    1272 //    VUZP.16         D24, D25
    1273 
    1274     UZP1            v19.8h, v24.8h, v24.8h    // split v24 lanes: v24.4h = low halves, v25.4h = high halves
    1275     UZP2            v21.8h, v24.8h, v24.8h
    1276     MOV             v24.d[0], v19.d[0]
    1277     MOV             v25.d[0], v21.d[0]
    1278 
    1279     uMULL           v2.4s, v24.4h, v18.4h     // v2 = (copy of v28) * v18, 32x16 product
    1280     uMULL           v0.4s, v26.4h, v18.4h     // v0 = (copy of v30) * v18, 32x16 product
    1281 
    1282     ushR            v22.4s, v22.4s, #16
    1283     sMLAL           v22.4s, v5.4h, v8.4h      // v22 = (v4/v5 pair) * v8
    1284 
    1285     ushR            v2.4s, v2.4s, #16
    1286     ushR            v0.4s, v0.4s, #16
    1287     sMLAL           v2.4s, v25.4h, v18.4h
    1288     sMLAL           v0.4s, v27.4h, v18.4h
    1289 
    1290     uMULL           v24.4s, v4.4h, v9.4h      // v24 = (v4/v5 pair) * v9 (completed below)
    1291     uMULL           v26.4s, v6.4h, v8.4h      // v26 = (v6/v7 pair) * v8 (completed below)
    1292 
    1293     NEG             v2.4s, v2.4s
    1294     ADD             v28.4s, v28.4s , v0.4s    // v28 += (old v30 * v18)
    1295     ADD             v30.4s, v30.4s , v2.4s    // v30 -= (old v28 * v18)
    1296 
    1297     uMULL           v0.4s, v6.4h, v9.4h       // v0 = (v6/v7 pair) * v9 (completed below)
    1298     sshR            v24.4s, v24.4s, #16       // NOTE(review): arithmetic shift on a uMULL result here and on the
    1299     sMLAL           v24.4s, v5.4h, v9.4h
    1300     sshR            v26.4s, v26.4s, #16       // next two, while the other low-half products use ushR - confirm intended
    1301     sshR            v0.4s, v0.4s, #16
    1302     sMLAL           v26.4s, v7.4h, v8.4h
    1303     sMLAL           v0.4s, v7.4h, v9.4h
    1304 
    1305 
    1306 
    1307 
    1308 
    1309     ADD             v22.4s, v22.4s , v0.4s
    1310     NEG             v22.4s, v22.4s            // v22 = -(v4*v8 + v6*v9)
    1311     SUB             v24.4s, v26.4s , v24.4s   // v24 = v6*v8 - v4*v9
    1312 
    1313 
    1314 
    1315 
    1316     //LDR w11,  [sp, #120]
    1317     //sxtw x11,w11
    1318     MOV             w11, w26                  // w26 is used in place of the commented-out stack load above
    1319     dup             v14.4s, w11
    1320     SQADD           v28.4s, v28.4s , v14.4s   // saturating add of the w26 value to every lane
    1321     //LDR w11,  [sp, #116]
    1322     //sxtw x11,w11
    1323     MOV             w11, w25                  // w25 is used in place of the commented-out stack load above
    1324     dup             v0.4s, w11
    1325     sQshL           v28.4s, v28.4s, v0.4s     // saturating shift of every lane by the w25 count (register form)
    1326 
    1327 
    1328     mov             v0.16b, v22.16b           // keep full 32-bit copies before v22/v24 are split
    1329     mov             v14.16b, v24.16b
    1330 
    1331 
    1332 //    VUZP.16         D22, D23
    1333 
    1334     UZP1            v19.8h, v22.8h, v22.8h    // v22.4h = low halves, v23.4h = high halves
    1335     UZP2            v21.8h, v22.8h, v22.8h
    1336     MOV             v22.d[0], v19.d[0]
    1337     MOV             v23.d[0], v21.d[0]
    1338 
    1339 //    VUZP.16         D24, D25
    1340 
    1341     UZP1            v19.8h, v24.8h, v24.8h    // v24.4h = low halves, v25.4h = high halves
    1342     UZP2            v21.8h, v24.8h, v24.8h
    1343     MOV             v24.d[0], v19.d[0]
    1344     MOV             v25.d[0], v21.d[0]
    1345 
    1346     uMULL           v8.4s, v24.4h, v18.4h     // v8  = (v24/v25 pair) * v18 (completed below)
    1347     uMULL           v26.4s, v22.4h, v18.4h    // v26 = (v22/v23 pair) * v18 (completed below)
    1348 
    1349     NEG             v2.4s, v30.4s             // v2 = -v30, taken before v30 is split
    1350 
    1351 //    VUZP.16         D30, D31
    1352 
    1353     UZP1            v19.8h, v30.8h, v30.8h    // v30.d[0] = low halves, v30.d[1] = high halves
    1354     UZP2            v21.8h, v30.8h, v30.8h
    1355     MOV             v30.d[0], v19.d[0]
    1356     MOV             v30.d[1], v21.d[0]
    1357 
    1358 //    VUZP.16         D2, D3
    1359 
    1360     UZP1            v19.8h, v2.8h, v2.8h      // v2.4h = low halves, v3.4h = high halves
    1361     UZP2            v21.8h, v2.8h, v2.8h
    1362     MOV             v2.d[0], v19.d[0]
    1363     MOV             v3.d[0], v21.d[0]
    1364 
    1365     uMULL           v4.4s, v30.4h, v12.4h     // v4 = v30 * v12 (completed below)
    1366     uMULL           v6.4s, v2.4h, v13.4h      // v6 = (-v30) * v13 (completed below)
    1367 
    1368     ushR            v8.4s, v8.4s, #16
    1369     ushR            v26.4s, v26.4s, #16
    1370 
    1371     sMLAL           v8.4s, v25.4h, v18.4h
    1372     sMLAL           v26.4s, v23.4h, v18.4h
    1373 
    1374     ushR            v4.4s, v4.4s, #16
    1375     ushR            v6.4s, v6.4s, #16
    1376 
    1377     MOV             v19.d[0], v30.d[1]        // bring v30's high halves into a low D half for sMLAL
    1378 
    1379     sMLAL           v4.4s, v19.4h, v12.4h
    1380     sMLAL           v6.4s, v3.4h, v13.4h
    1381 
    1382     NEG             v8.4s, v8.4s
    1383     ADD             v14.4s, v14.4s , v26.4s   // v14 (copy of v24) += v22 * v18
    1384     ADD             v0.4s, v0.4s , v8.4s      // v0  (copy of v22) -= v24 * v18
    1385 
    1386     //LDR w11,  [sp, #120]
    1387     //sxtw x11,w11
    1388     MOV             w11, w26
    1389     dup             v8.4s, w11
    1390     SQADD           v0.4s, v0.4s , v8.4s      // same add/shift pair as applied to v28 above
    1391     //LDR w11,  [sp, #116]
    1392     //sxtw x11,w11
    1393     MOV             w11, w25
    1394     dup             v26.4s, w11
    1395     sQshL           v0.4s, v0.4s, v26.4s
    1396 
    1397 
    1398     mov             v26.16b, v28.16b
    1399 
    1400     LD2             { v28.4s, v29.4s}, [x4]   // read 8 words at x4: v28 = even-indexed words, v29 = odd-indexed words
    1401     MOV             v30.16b, v29.16b          // v30 = the odd-indexed words
    1402     MOV             v29.d[0], v28.d[1]        // v29.2s = upper two lanes of v28, for the 2-lane sMULLs below
    1403 //    VZIP.32         Q13, Q0
    1404 
    1405     ZIP1            v19.4s, v26.4s, v0.4s     // interleave v26 and v0 lane by lane
    1406     ZIP2            v0.4s, v26.4s, v0.4s
    1407     MOV             v26.16b, v19.16b
    1408 
    1409     ST1             { v26.4s}, [x4], #16      // overwrite the 32 bytes just read; x4 ends up advanced by 32
    1410     ST1             { v0.4s}, [x4], #16
    1411 
    1412     movi            v1.2s, #0
    1413 //    VADDL.S16       Q0, D13, D1
    1414     SADDL           v0.4s, v13.4h, v1.4h      // adding zero: v0 = v13.4h sign-extended to 32-bit lanes
    1415     MOV             v1.d[0], v0.d[1]
    1416 
    1417     sMULL           v26.2d, v28.2s, v0.2s     // 32x32->64 product, then saturating narrow back to 32 bits
    1418     Sqxtn           v8.2s, v26.2d
    1419     sMULL           v26.2d, v29.2s, v1.2s
    1420     Sqxtn           v9.2s, v26.2d
    1421     MOV             v8.d[1], v9.d[0]          // v8 = four narrowed products of v28 and sign-extended v13
    1422     movi            v1.2s, #0
    1423 //    VADDL.S16       Q0, D12, D1
    1424     SADDL           v0.4s, v12.4h, v1.4h      // v0 = v12.4h sign-extended to 32-bit lanes
    1425     MOV             v1.d[0], v0.d[1]
    1426 
    1427     sMULL           v24.2d, v28.2s, v0.2s
    1428     Sqxtn           v26.2s, v24.2d
    1429     sMULL           v24.2d, v29.2s, v1.2s
    1430     Sqxtn           v27.2s, v24.2d
    1431     MOV             v26.d[1], v27.d[0]        // v26 = four narrowed products of v28 and sign-extended v12
    1432 
    1433     sQshL           v4.4s, v4.4s, v16.4s      // saturating shift by the per-lane counts held in v16
    1434     sQshL           v6.4s, v6.4s, v16.4s
    1435 
    1436     SQSUB           v4.4s, v4.4s , v8.4s
    1437     SQSUB           v6.4s, v6.4s , v26.4s
    1438 
    1439     NEG             v26.4s, v14.4s            // v26 = -v14, taken before v14 is split
    1440 //    VUZP.16         D14, D15
    1441 
    1442     UZP1            v19.8h, v14.8h, v14.8h    // v14.4h = low halves, v15.4h = high halves
    1443     UZP2            v21.8h, v14.8h, v14.8h
    1444     MOV             v14.d[0], v19.d[0]
    1445     MOV             v15.d[0], v21.d[0]
    1446 
    1447 
    1448 //   VUZP.16         D26, D27
    1449 
    1450     UZP1            v19.8h, v26.8h, v26.8h    // v26.4h = low halves, v27.4h = high halves
    1451     UZP2            v21.8h, v26.8h, v26.8h
    1452     MOV             v26.d[0], v19.d[0]
    1453     MOV             v27.d[0], v21.d[0]
    1454 
    1455 
    1456     movi            v1.2s, #0
    1457     //VADDL.S16       Q0, D10, D1
    1458     SADDL           v0.4s, v10.4h, v1.4h      // v0 = v10.4h sign-extended to 32-bit lanes
    1459     MOV             v1.d[0], v0.d[1]
    1460 
    1461     sMULL           v22.2d, v30.2s, v0.2s     // v30 here holds the odd-indexed words read from x4
    1462     Sqxtn           v24.2s, v22.2d
    1463     sMULL2          v22.2d, v30.4s, v0.4s     // sMULL2 handles the upper two lanes
    1464     Sqxtn           v25.2s, v22.2d
    1465     MOV             v24.d[1], v25.d[0]        // v24 = four narrowed products of v30 and sign-extended v10
    1466     movi            v1.2s, #0
    1467     //VADDL.S16       Q0, D11, D1
    1468     SADDL           v0.4s, v11.4h, v1.4h      // v0 = v11.4h sign-extended to 32-bit lanes
    1469     MOV             v1.d[0], v0.d[1]
    1470 
    1471     sMULL           v8.2d, v30.2s, v0.2s
    1472     Sqxtn           v22.2s, v8.2d
    1473     sMULL2          v8.2d, v30.4s, v0.4s
    1474     Sqxtn           v23.2s, v8.2d
    1475     MOV             v22.d[1], v23.d[0]        // v22 = four narrowed products of v30 and sign-extended v11
    1476 
    1477     uMULL           v8.4s, v26.4h, v11.4h     // v8  = (-v14) * v11, 32x16 product (completed below)
    1478     uMULL           v30.4s, v14.4h, v10.4h    // v30 = v14 * v10, 32x16 product (completed below)
    1479 
    1480     ushR            v8.4s, v8.4s, #16
    1481 
    1482     ushR            v30.4s, v30.4s, #16
    1483 
    1484     sMLAL           v8.4s, v27.4h, v11.4h
    1485 
    1486     sMLAL           v30.4s, v15.4h, v10.4h
    1487 
    1488     sQshL           v4.4s, v4.4s, #2          // saturating left shift by 2
    1489 
    1490     sQshL           v6.4s, v6.4s, #2
    1491 
    1492     SQADD           v14.4s, v4.4s , v20.4s    // add v20 before the >>16 (presumably a rounding constant - confirm)
    1493 
    1494     SQADD           v6.4s, v6.4s , v20.4s
    1495 
    1496     sshR            v14.4s, v14.4s, #16       // keep the top 16 bits of each lane
    1497 
    1498 //    VUZP.16         D14, D15
    1499 
    1500     UZP1            v19.8h, v14.8h, v14.8h    // v14.4h = the four 16-bit results stored through x0 below
    1501     UZP2            v21.8h, v14.8h, v14.8h
    1502     MOV             v14.d[0], v19.d[0]
    1503     MOV             v15.d[0], v21.d[0]
    1504 
    1505     sshR            v6.4s, v6.4s, #16
    1506 
    1507 //    VUZP.16         D6, D7
    1508 
    1509     UZP1            v19.8h, v6.8h, v6.8h
    1510     UZP2            v21.8h, v6.8h, v6.8h
    1511     MOV             v6.d[0], v19.d[0]
    1512     MOV             v7.d[0], v21.d[0]
    1513 
    1514     mov             v15.8b, v6.8b             // v15.4h = low halves of v6; replaces the value written to v15 just above
    1515     sQshL           v8.4s, v8.4s, v16.4s
    1516 
    1517     sQshL           v30.4s, v30.4s, v16.4s
    1518 
    1519     SQSUB           v8.4s, v8.4s , v24.4s
    1520 
    1521     SQSUB           v22.4s, v30.4s , v22.4s
    1522 
    1523     sQshL           v30.4s, v8.4s, #2
    1524 
    1525     sQshL           v22.4s, v22.4s, #2
    1526 
    1527     SQADD           v30.4s, v30.4s , v20.4s
    1528     SQADD           v22.4s, v22.4s , v20.4s
    1529 
    1530     sshR            v30.4s, v30.4s, #16
    1531 
    1532     //VUZP.16         D30, D31
    1533 
    1534     UZP1            v19.8h, v30.8h, v30.8h
    1535     UZP2            v21.8h, v30.8h, v30.8h
    1536     MOV             v30.d[0], v19.d[0]
    1537     MOV             v30.d[1], v21.d[0]
    1538 
    1539     sshR            v22.4s, v22.4s, #16
    1540 
    1541 //    VUZP.16         D22, D23
    1542     UZP1            v19.8h, v22.8h, v22.8h    // v22.4h = the four 16-bit results stored through x0 below
    1543     UZP2            v21.8h, v22.8h, v22.8h
    1544     MOV             v22.d[0], v19.d[0]
    1545     MOV             v23.d[0], v21.d[0]
    1546 
    1547     mov             v23.8b, v30.8b            // v23.4h = low halves of v30; replaces the value written to v23 just above
    1548 
    1549 
    1550 
    1551 
    1552     ST1             {v14.h}[0], [x0]          // final results: same store pattern as at the top of this section
    1553     ADD             x0, x0, x9
    1554     ST1             {v22.h}[0], [x0]
    1555     ADD             x0, x0, x9
    1556     ST1             {v14.h}[1], [x0]
    1557     ADD             x0, x0, x9
    1558     ST1             {v22.h}[1], [x0]
    1559     ADD             x0, x0, x9
    1560     ST1             {v14.h}[2], [x0]
    1561     ADD             x0, x0, x9
    1562     ST1             {v22.h}[2], [x0]
    1563     ADD             x0, x0, x9
    1564     ST1             {v14.h}[3], [x0]
    1565     ADD             x0, x0, x9
    1566     ST1             {v22.h}[3], [x0]
    1567     ADD             x0, x0, x9
    1568     ST1             {v15.h}[0], [x5]          // v15/v23 lanes alternate through x5, stepping by x10
    1569     ADD             x5, x5, x10
    1570     ST1             {v23.h}[0], [x5]
    1571     ADD             x5, x5, x10
    1572     ST1             {v15.h}[1], [x5]
    1573     ADD             x5, x5, x10
    1574     ST1             {v23.h}[1], [x5]
    1575     ADD             x5, x5, x10
    1576     ST1             {v15.h}[2], [x5]
    1577     ADD             x5, x5, x10
    1578     ST1             {v23.h}[2], [x5]
    1579     ADD             x5, x5, x10
    1580     ST1             {v15.h}[3], [x5]
    1581     ADD             x5, x5, x10
    1582     ST1             {v23.h}[3], [x5]
    1583     ADD             x5, x5, x10
    1584 
   1585 ARM_EPILOGUE:
   1586 
   1587 ARM_LOOP:
   1588 
   1589     LD2             { v0.4s, v1.4s}, [x1]
   1590     MOV             v2.16b, v1.16b
   1591 
   1592     //VUZP.16         D0, D1
   1593     UZP1            v19.8h, v0.8h, v0.8h
   1594     UZP2            v21.8h, v0.8h, v0.8h
   1595     MOV             v0.d[0], v19.d[0]
   1596     MOV             v0.d[1], v21.d[0]
   1597 
   1598     //VUZP.16         D2, D3
   1599     UZP1            v19.8h, v2.8h, v2.8h
   1600     UZP2            v21.8h, v2.8h, v2.8h
   1601     MOV             v2.d[0], v19.d[0]
   1602     MOV             v2.d[1], v21.d[0]
   1603 
   1604 
   1605     rev64           v0.8h, v0.8h
   1606     MOV             v1.d[0], v0.d[1]
   1607     rev64           v2.8h, v2.8h
   1608     MOV             v3.d[0], v2.d[1]
   1609 
   1610     LD2             {v8.4h, v9.4h}, [x2]
   1611     ADD             x2, x2, #16
   1612 
   1613     LD2             {v4.2s, v5.2s}, [x8]
   1614     ADD             x8, x8, #16
   1615     MOV             v6.16b, v5.16b
   1616     movi            v5.2s, #0x00000000
   1617     movi            v7.2s, #0x00000000
   1618 
   1619     LD1             {v5.s}[0], [x8], #4
   1620     LD1             {v7.s}[0], [x8]
   1621 
   1622     MOV             x12, #16
   1623     MOV             v4.d[1], v5.d[0]
   1624     MOV             v6.d[1], v7.d[0]
   1625 //    VUZP.16         D4, D5
   1626 
   1627     UZP1            v19.8h, v4.8h, v4.8h
   1628     UZP2            v21.8h, v4.8h, v4.8h
   1629     MOV             v4.d[0], v19.d[0]
   1630     MOV             v5.d[0], v21.d[0]
   1631 
   1632 //    VUZP.16         D6, D7
   1633 
   1634     UZP1            v19.8h, v6.8h, v6.8h
   1635     UZP2            v21.8h, v6.8h, v6.8h
   1636     MOV             v6.d[0], v19.d[0]
   1637     MOV             v7.d[0], v21.d[0]
   1638 
   1639     ADD             x6, x6, #16
   1640 
   1641     MOV             x12, #-4
   1642     LD2             {v11.2s, v12.2s}, [x6]
   1643     ADD             x6, x6, x12
   1644     MOV             v13.16b, v12.16b
   1645 
   1646 
   1647     movi            v10.2s, #0x00000000
   1648 
   1649     LD1             {v12.s}[1], [x6]
   1650     ADD             x6, x6, x12
   1651     LD1             {v10.s}[1], [x6]
   1652     ADD             x6, x6, x12
   1653     LD1             {v12.s}[0], [x6]
   1654     ADD             x6, x6, x12
   1655 
   1656     MOV             v10.d[1], v11.d[0]
   1657     MOV             v12.d[1], v13.d[0]
   1658 
   1659     //VUZP.16         D10, D11
   1660 
   1661     UZP1            v19.8h, v10.8h, v10.8h
   1662     UZP2            v21.8h, v10.8h, v10.8h
   1663     MOV             v10.d[0], v19.d[0]
   1664     MOV             v10.d[1], v21.d[0]
   1665 
   1666     //VUZP.16         D12, D13
   1667 
   1668     UZP1            v19.8h, v12.8h, v12.8h
   1669     UZP2            v21.8h, v12.8h, v12.8h
   1670     MOV             v12.d[0], v19.d[0]
   1671     MOV             v12.d[1], v21.d[0]
   1672 
   1673 
   1674     rev64           v10.8h, v10.8h
   1675     MOV             v11.d[0], v10.d[1]
   1676     rev64           v12.8h, v12.8h
   1677     MOV             v13.d[0], v12.d[1]
   1678 
   1679     uMULL           v30.4s, v0.4h, v9.4h
   1680     uMULL           v28.4s, v2.4h, v8.4h
   1681     uMULL           v26.4s, v0.4h, v8.4h
   1682     uMULL           v24.4s, v2.4h, v9.4h
   1683 
   1684     ushR            v30.4s, v30.4s, #16
   1685     ushR            v28.4s, v28.4s, #16
   1686 
   1687     sMLAL           v30.4s, v1.4h, v9.4h
   1688     sMLAL           v28.4s, v3.4h, v8.4h
   1689 
   1690     ushR            v26.4s, v26.4s, #16
   1691     ushR            v24.4s, v24.4s, #16
   1692 
   1693     sMLAL           v26.4s, v1.4h, v8.4h
   1694     sMLAL           v24.4s, v3.4h, v9.4h
   1695 
   1696     ADD             v30.4s, v30.4s , v28.4s
   1697     NEG             v30.4s, v30.4s
   1698 
   1699     uMULL           v22.4s, v4.4h, v8.4h
   1700 
   1701     SUB             v28.4s, v24.4s , v26.4s
   1702 
   1703 
   1704     mov             v26.16b, v30.16b
   1705     mov             v24.16b, v28.16b
   1706 
   1707 //    VUZP.16         D26, D27
   1708 
   1709     UZP1            v19.8h, v26.8h, v26.8h
   1710     UZP2            v21.8h, v26.8h, v26.8h
   1711     MOV             v26.d[0], v19.d[0]
   1712     MOV             v27.d[0], v21.d[0]
   1713 
   1714     //VUZP.16         D24, D25
   1715 
   1716     UZP1            v19.8h, v24.8h, v24.8h
   1717     UZP2            v21.8h, v24.8h, v24.8h
   1718     MOV             v24.d[0], v19.d[0]
   1719     MOV             v25.d[0], v21.d[0]
   1720 
   1721     uMULL           v2.4s, v24.4h, v18.4h
   1722     uMULL           v0.4s, v26.4h, v18.4h
   1723 
   1724     ushR            v22.4s, v22.4s, #16
   1725     sMLAL           v22.4s, v5.4h, v8.4h
   1726 
   1727     ushR            v2.4s, v2.4s, #16
   1728     ushR            v0.4s, v0.4s, #16
   1729     sMLAL           v2.4s, v25.4h, v18.4h
   1730     sMLAL           v0.4s, v27.4h, v18.4h
   1731 
   1732     uMULL           v24.4s, v4.4h, v9.4h
   1733     uMULL           v26.4s, v6.4h, v8.4h
   1734 
   1735     NEG             v2.4s, v2.4s
   1736     ADD             v28.4s, v28.4s , v0.4s
   1737     ADD             v30.4s, v30.4s , v2.4s
   1738 
   1739     uMULL           v0.4s, v6.4h, v9.4h
   1740     sshR            v24.4s, v24.4s, #16
   1741     sMLAL           v24.4s, v5.4h, v9.4h
   1742     sshR            v26.4s, v26.4s, #16
   1743     sshR            v0.4s, v0.4s, #16
   1744     sMLAL           v26.4s, v7.4h, v8.4h
   1745     sMLAL           v0.4s, v7.4h, v9.4h
   1746 
   1747     ADD             v22.4s, v22.4s , v0.4s
   1748     NEG             v22.4s, v22.4s
   1749     SUB             v24.4s, v26.4s , v24.4s
   1750 
   1751     //LDR w11,  [sp, #120]
   1752     //sxtw x11,w11
   1753     MOV             w11, w26
   1754     dup             v14.4s, w11
   1755     SQADD           v28.4s, v28.4s , v14.4s
   1756     //LDR w11,  [sp, #116]
   1757     //sxtw x11,w11
   1758     MOV             w11, w25
   1759     dup             v0.4s, w11
   1760     sQshL           v28.4s, v28.4s, v0.4s
   1761 
   1762     mov             v0.16b, v22.16b
   1763     mov             v14.16b, v24.16b
   1764 
   1765 //    VUZP.16         D22, D23
   1766 
   1767     UZP1            v19.8h, v22.8h, v22.8h
   1768     UZP2            v21.8h, v22.8h, v22.8h
   1769     MOV             v22.d[0], v19.d[0]
   1770     MOV             v23.d[0], v21.d[0]
   1771 
   1772 //   VUZP.16         D24, D25
   1773 
   1774     UZP1            v19.8h, v24.8h, v24.8h
   1775     UZP2            v21.8h, v24.8h, v24.8h
   1776     MOV             v24.d[0], v19.d[0]
   1777     MOV             v25.d[0], v21.d[0]
   1778 
   1779     uMULL           v8.4s, v24.4h, v18.4h
   1780     uMULL           v26.4s, v22.4h, v18.4h
   1781 
   1782     NEG             v2.4s, v30.4s
   1783 //    VUZP.16         D30, D31
   1784 
   1785     UZP1            v19.8h, v30.8h, v30.8h
   1786     UZP2            v21.8h, v30.8h, v30.8h
   1787     MOV             v30.d[0], v19.d[0]
   1788     MOV             v30.d[1], v21.d[0]
   1789 
   1790 //    VUZP.16         D2, D3
   1791 
   1792     UZP1            v19.8h, v2.8h, v2.8h
   1793     UZP2            v21.8h, v2.8h, v2.8h
   1794     MOV             v2.d[0], v19.d[0]
   1795     MOV             v3.d[0], v21.d[0]
   1796 
   1797     uMULL           v4.4s, v30.4h, v12.4h
   1798     uMULL           v6.4s, v2.4h, v13.4h
   1799 
   1800     ushR            v8.4s, v8.4s, #16
   1801     ushR            v26.4s, v26.4s, #16
   1802 
   1803     sMLAL           v8.4s, v25.4h, v18.4h
   1804     sMLAL           v26.4s, v23.4h, v18.4h
   1805 
   1806     ushR            v4.4s, v4.4s, #16
   1807     ushR            v6.4s, v6.4s, #16
   1808 
   1809     MOV             v19.d[0], v30.d[1]
   1810 
   1811     sMLAL           v4.4s, v19.4h, v12.4h
   1812     sMLAL           v6.4s, v3.4h, v13.4h
   1813 
   1814     NEG             v8.4s, v8.4s
   1815     ADD             v14.4s, v14.4s , v26.4s
   1816     ADD             v0.4s, v0.4s , v8.4s
   1817 
   1818     //LDR w11,  [sp, #120]
   1819     //sxtw x11,w11
   1820     MOV             w11, w26
   1821     dup             v8.4s, w11
   1822     SQADD           v0.4s, v0.4s , v8.4s
   1823     //LDR w11,  [sp, #116]
   1824     //sxtw x11,w11
   1825     MOV             w11, w25
   1826     dup             v26.4s, w11
   1827     sQshL           v0.4s, v0.4s, v26.4s
   1828 
   1829     mov             v26.16b, v28.16b
   1830 
   1831     MOV             x6, x4
   1832 
   1833     LD1             {v28.2s, v29.2s}, [x4], #16
   1834     movi            v19.2s, #0x00000000
   1835     LD1             {v30.s}[0], [x4], #4
   1836     LD1             {v30.s}[1], [x4], #4
   1837     LD1             {v19.s}[0], [x4], #4
   1838 
   1839     MOV             v28.d[1], v29.d[0]
   1840     MOV             v30.d[1], v19.d[0]
   1841 
   1842     //VUZP.32         Q14, Q15
   1843 
   1844     UZP1            v19.4s, v28.4s, v30.4s
   1845     UZP2            v30.4s, v28.4s, v30.4s
   1846     MOV             v28.16b, v19.16b
   1847     MOV             v29.d[0], v28.d[1]
   1848 
   1849     ST1             {v26.s}[0], [x6], #4
   1850     ST1             {v0.s}[0], [x6], #4
   1851     ST1             {v26.s}[1], [x6], #4
   1852     ST1             {v0.s}[1], [x6], #4
   1853     ST1             {v26.s}[2], [x6], #4
   1854     ST1             {v0.s}[2], [x6], #4
   1855     ST1             {v26.s}[3], [x6], #4
   1856 
   1857     movi            v1.2s, #0
   1858     //VADDL.S16       Q0, D13, D1
   1859     SADDL           v0.4s, v13.4h, v1.4h
   1860     MOV             v1.d[0], v0.d[1]
   1861 
   1862     sMULL           v26.2d, v28.2s, v0.2s
   1863     Sqxtn           v8.2s, v26.2d
   1864     sMULL           v26.2d, v29.2s, v1.2s
   1865     Sqxtn           v9.2s, v26.2d
   1866     MOV             v8.d[1], v9.d[0]
   1867     movi            v1.2s, #0
   1868     //VADDL.S16       Q0, D12, D1
   1869     SADDL           v0.4s, v12.4h, v1.4h
   1870     MOV             v1.d[0], v0.d[1]
   1871 
   1872     sMULL           v24.2d, v28.2s, v0.2s
   1873     Sqxtn           v26.2s, v24.2d
   1874     sMULL           v24.2d, v29.2s, v1.2s
   1875     Sqxtn           v27.2s, v24.2d
   1876     MOV             v26.d[1], v27.d[0]
   1877 
   1878     sQshL           v4.4s, v4.4s, v16.4s
   1879     sQshL           v6.4s, v6.4s, v16.4s
   1880 
   1881     SQSUB           v4.4s, v4.4s , v8.4s
   1882     SQSUB           v6.4s, v6.4s , v26.4s
   1883 
   1884     NEG             v26.4s, v14.4s
   1885     //VUZP.16         D14, D15
   1886 
   1887     UZP1            v19.8h, v14.8h, v14.8h
   1888     UZP2            v21.8h, v14.8h, v14.8h
   1889     MOV             v14.d[0], v19.d[0]
   1890     MOV             v15.d[0], v21.d[0]
   1891 
   1892 //    VUZP.16         D26, D27
   1893 
   1894     UZP1            v19.8h, v26.8h, v26.8h
   1895     UZP2            v21.8h, v26.8h, v26.8h
   1896     MOV             v26.d[0], v19.d[0]
   1897     MOV             v27.d[0], v21.d[0]
   1898 
   1899 
   1900     movi            v1.2s, #0
   1901     //VADDL.S16       Q0, D10, D1
   1902     SADDL           v0.4s, v10.4h, v1.4h
   1903     MOV             v1.d[0], v0.d[1]
   1904 
   1905     sMULL           v22.2d, v30.2s, v0.2s
   1906     Sqxtn           v24.2s, v22.2d
   1907     sMULL2          v22.2d, v30.4s, v0.4s
   1908     Sqxtn           v25.2s, v22.2d
   1909     MOV             v24.d[1], v25.d[0]
   1910 
   1911     movi            v1.2s, #0
   1912 //    VADDL.S16       Q0, D11, D1
   1913     SADDL           v0.4s, v11.4h, v1.4h
   1914     MOV             v1.d[0], v0.d[1]
   1915 
   1916     sMULL           v8.2d, v30.2s, v0.2s
   1917     Sqxtn           v22.2s, v8.2d
   1918     sMULL2          v8.2d, v30.4s, v0.4s
   1919     Sqxtn           v23.2s, v8.2d
   1920     MOV             v22.d[1], v23.d[0]
   1921 
   1922     uMULL           v8.4s, v26.4h, v11.4h
   1923     uMULL           v30.4s, v14.4h, v10.4h
   1924 
   1925     ushR            v8.4s, v8.4s, #16
   1926 
   1927     ushR            v30.4s, v30.4s, #16
   1928 
   1929     sMLAL           v8.4s, v27.4h, v11.4h
   1930 
   1931     sMLAL           v30.4s, v15.4h, v10.4h
   1932 
   1933     sQshL           v4.4s, v4.4s, #2
   1934 
   1935     sQshL           v6.4s, v6.4s, #2
   1936 
   1937     SQADD           v14.4s, v4.4s , v20.4s
   1938 
   1939     SQADD           v6.4s, v6.4s , v20.4s
   1940 
   1941     sshR            v14.4s, v14.4s, #16
   1942 
   1943 //    VUZP.16         D14, D15
   1944 
   1945     UZP1            v19.8h, v14.8h, v14.8h
   1946     UZP2            v21.8h, v14.8h, v14.8h
   1947     MOV             v14.d[0], v19.d[0]
   1948     MOV             v15.d[0], v21.d[0]
   1949 
   1950     sshR            v6.4s, v6.4s, #16
   1951 
   1952     //VUZP.16         D6, D7
   1953 
   1954     UZP1            v19.8h, v6.8h, v6.8h
   1955     UZP2            v21.8h, v6.8h, v6.8h
   1956     MOV             v6.d[0], v19.d[0]
   1957     MOV             v7.d[0], v21.d[0]
   1958 
   1959     mov             v15.8b, v6.8b
   1960     sQshL           v8.4s, v8.4s, v16.4s
   1961 
   1962     sQshL           v30.4s, v30.4s, v16.4s
   1963 
   1964     SQSUB           v8.4s, v8.4s , v24.4s
   1965 
   1966     SQSUB           v22.4s, v30.4s , v22.4s
   1967 
   1968     sQshL           v30.4s, v8.4s, #2
   1969 
   1970     sQshL           v22.4s, v22.4s, #2
   1971 
   1972     SQADD           v30.4s, v30.4s , v20.4s
   1973     SQADD           v22.4s, v22.4s , v20.4s
   1974 
   1975     sshR            v30.4s, v30.4s, #16
   1976 
   1977 //    VUZP.16         D30, D31
   1978 
   1979     UZP1            v19.8h, v30.8h, v30.8h
   1980     UZP2            v21.8h, v30.8h, v30.8h
   1981     MOV             v30.d[0], v19.d[0]
   1982     MOV             v30.d[1], v21.d[0]
   1983 
   1984     sshR            v22.4s, v22.4s, #16
   1985 
   1986 //    VUZP.16         D22, D23
   1987 
   1988     UZP1            v19.8h, v22.8h, v22.8h
   1989     UZP2            v21.8h, v22.8h, v22.8h
   1990     MOV             v22.d[0], v19.d[0]
   1991     MOV             v23.d[0], v21.d[0]
   1992 
   1993     mov             v23.8b, v30.8b
   1994 
   1995 
   1996 
   1997 
   1998     ST1             {v14.h}[0], [x0]
   1999     ADD             x0, x0, x9
   2000     ST1             {v22.h}[0], [x0]
   2001     ADD             x0, x0, x9
   2002     ST1             {v14.h}[1], [x0]
   2003     ADD             x0, x0, x9
   2004     ST1             {v22.h}[1], [x0]
   2005     ADD             x0, x0, x9
   2006     ST1             {v14.h}[2], [x0]
   2007     ADD             x0, x0, x9
   2008     ST1             {v22.h}[2], [x0]
   2009     ADD             x0, x0, x9
   2010     ST1             {v14.h}[3], [x0]
   2011     ADD             x0, x0, x9
   2012 
   2013     ST1             {v15.h}[0], [x5]
   2014     ADD             x5, x5, x10
   2015     ST1             {v23.h}[0], [x5]
   2016     ADD             x5, x5, x10
   2017     ST1             {v15.h}[1], [x5]
   2018     ADD             x5, x5, x10
   2019     ST1             {v23.h}[1], [x5]
   2020     ADD             x5, x5, x10
   2021     ST1             {v15.h}[2], [x5]
   2022     ADD             x5, x5, x10
   2023     ST1             {v23.h}[2], [x5]
   2024     ADD             x5, x5, x10
   2025     ST1             {v15.h}[3], [x5]
   2026     ADD             x5, x5, x10
   2027 
   2028     // VPOP            {d8 - d15}
   2029     // LDMFD sp!, {x4-x12}
   2030     //ldp x19, x20,[sp],#16
   2031     pop_v_regs
   2032     ret
   2033     //BX              x14
   2034