Home | History | Annotate | Download | only in armv8
      1 ///******************************************************************************
      2 // *
      3 // * Copyright (C) 2018 The Android Open Source Project
      4 // *
      5 // * Licensed under the Apache License, Version 2.0 (the "License");
      6 // * you may not use this file except in compliance with the License.
      7 // * You may obtain a copy of the License at:
      8 // *
      9 // * http://www.apache.org/licenses/LICENSE-2.0
     10 // *
     11 // * Unless required by applicable law or agreed to in writing, software
     12 // * distributed under the License is distributed on an "AS IS" BASIS,
     13 // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 // * See the License for the specific language governing permissions and
     15 // * limitations under the License.
     16 // *
     17 // *****************************************************************************
     18 // * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 
     21 
     22 .macro push_v_regs
        // Prologue helper: pushes q8-q15 (full 128-bit registers) followed by
        // X8-X17 and X29/X30 using pre-indexed stores.
        // Stack use: 4 * 32 + 6 * 16 = 224 bytes; every step is a multiple of
        // 16, so SP keeps its 16-byte alignment.
        // pop_v_regs must restore in exactly the reverse order.
     23     stp             q8, q9, [sp, #-32]!
     24     stp             q10, q11, [sp, #-32]!
     25     stp             q12, q13, [sp, #-32]!
     26     stp             q14, q15, [sp, #-32]!
     27     stp             X8, X9, [sp, #-16]!
     28     stp             X10, X11, [sp, #-16]!
     29     stp             X12, X13, [sp, #-16]!
     30     stp             X14, X15, [sp, #-16]!
     31     stp             X16, X17, [sp, #-16]!
     32     stp             X29, X30, [sp, #-16]!     // last pushed => first popped
     33 .endm
     34 .macro pop_v_regs
        // Epilogue helper: exact mirror of push_v_regs. Restores X29/X30,
        // X16-X8 and then q14-q8 with post-indexed loads, releasing the same
        // 224 bytes (6 * 16 + 4 * 32) that push_v_regs reserved.
     35     ldp             X29, X30, [sp], #16
     36     ldp             X16, X17, [sp], #16
     37     ldp             X14, X15, [sp], #16
     38     ldp             X12, X13, [sp], #16
     39     ldp             X10, X11, [sp], #16
     40     ldp             X8, X9, [sp], #16
     41     ldp             q14, q15, [sp], #32
     42     ldp             q12, q13, [sp], #32
     43     ldp             q10, q11, [sp], #32
     44     ldp             q8, q9, [sp], #32          // SP is back at its value before push_v_regs
     45 .endm
     46 .text
     47 .global ixheaacd_over_lap_add2_armv8
     48 
     49 
        //----------------------------------------------------------------------
        // ixheaacd_over_lap_add2_armv8
        //
        // Two-phase windowed combine of two 32-bit buffers with a 16-bit
        // coefficient table. Every result is rounded, shifted and written to
        // the destination with a caller-supplied stride.
        //
        // Registers on entry, as consumed by the code below:
        //   X0  32-bit source A. Phase 1 reads words [X5 .. 2*X5) upwards;
        //       phase 2 reads downwards starting at word 2*W5-1.
        //   X1  32-bit source B. Phase 1 reads downwards starting at word
        //       X5-1; phase 2 reads upwards starting at word 0.
        //   X2  32-bit destination. Phase 1 starts at X2; phase 2 starts at
        //       X2 + 4 * (W5[15:0] * W6[15:0]) bytes.
        //   X3  16-bit coefficient table. Phase 1 reads upwards from
        //       halfword 0; phase 2 reads downwards from halfword 2*W5-1.
        //   X4  shift control: result = SSHL(acc + (1 << (14 - X4)), X4 - 15).
        //   X5  number of results produced per phase.
        //   X6  destination stride in 32-bit words (byte stride = 4 * X6).
        //   presumably (input, overlap buffer, output, window, shift, size,
        //   channel factor) -- TODO confirm against the C prototype.
        //
        // Arithmetic: each 32-bit x 16-bit product is assembled from halves,
        //   ((lo16 * c) >> 16) + hi16 * c, with lo16 multiplied unsigned
        //   (UMULL/UMLSL) and hi16 multiplied signed (SMLAL/SMLSL).
        //   Phase 1:  acc = A * c_even - reverse(B) * c_odd
        //   Phase 2:  acc = reverse(-A) * reverse(c_odd) - B * reverse(c_even)
        //
        // NOTE(review): X4, X5 and X6 are used as full 64-bit values; if the
        //   C arguments are narrower than 64 bits the caller must have
        //   extended them -- confirm.
        // NOTE(review): both phases are software pipelined and always execute
        //   the loop body at least once, so each phase performs at least 16
        //   stores; X5 is expected to be a multiple of 8 and >= 16 -- confirm
        //   with callers.
        //
        // Stack: 224 bytes via push_v_regs / pop_v_regs. Flags are clobbered.
        //----------------------------------------------------------------------
     50 ixheaacd_over_lap_add2_armv8:
     51     push_v_regs
     52     MOV             X8, X5                      // X8 = phase-1 loop counter
     53     SUB             X12, X5, #1
     54     LSL             X9, X5, #2                  // X9 = 4 * X5 (bytes)
     55     LSL             X12, X12, #2                // X12 = 4 * (X5 - 1) (bytes)
     56     ADD             X10, X0, X9                 // X10 -> word X5 of source A
     57     ADD             X7, X1, X12                 // X7 -> word X5-1 of source B
     58     ADD             X4, X4, #1
     59     LD2             {V0.4H, V1.4H}, [X10], #16  // 4 words of A: V0 = even halfwords (low 16 bits), V1 = odd halfwords (high 16 bits)
     60     LSL             X11, X6, #2                 // X11 = destination stride in bytes
     61     SUB             X7, X7, #12                 // X7 -> four words ending at word X5-1 of B
     62     SUB             X4, X4, #16                 // X4 = X4_in - 15
     63     MOV             X12, #-16                   // post-index step for the descending loads
     64     MOV             X13, #1
     65     ADD             X14, X4, #1
     66     NEG             X14, X14                    // X14 = 14 - X4_in
     67     DUP             V21.4S, W4                  // V21 = per-lane SSHL count (X4_in - 15)
     68     LD2             {V6.4H, V7.4H}, [X7], X12   // 4 words of B (low/high halves), then step back 16 bytes
     69     LSL             X4, X13, X14                // X4 = 1 << (14 - X4_in)
     70     REV64           V4.4H, V6.4H                // V4 = low halves of B in reversed lane order
     71     DUP             V20.4S, W4                  // V20 = rounding constant added before the shift
     72     REV64           V5.4H, V7.4H                // V5 = high halves of B in reversed lane order
     73     MOV             X4, X3                      // keep the table base for phase 2
     74 
     75     MOV             X9, X2                      // keep the destination base for phase 2
     76     LD2             {V2.4H, V3.4H}, [X3], #16   // 8 coefficients: V2 = even-indexed, V3 = odd-indexed
     77 
        // Pipeline prologue: compute result group 0 and preload the operands
        // of group 1 (V8..V13).
     78     UMULL           V23.4S, V0.4H, V2.4H        // acc  = A.lo * c_even
     79     UMLSL           V23.4S, V4.4H, V3.4H        // acc -= Brev.lo * c_odd
     80     LD2             {V8.4H, V9.4H}, [X10], #16  // next 4 words of A
     81     SSHR            V23.4S, V23.4S, #16         // scale the low-half partial products
     82     LD2             {V10.4H, V11.4H}, [X3], #16 // next 8 coefficients
     83     SMLAL           V23.4S, V1.4H, V2.4H        // acc += A.hi * c_even
     84     SMLSL           V23.4S, V5.4H, V3.4H        // acc -= Brev.hi * c_odd
     85     LD2             {V14.4H, V15.4H}, [X7], X12 // previous 4 words of B
     86     REV64           V12.4H, V14.4H
     87     REV64           V13.4H, V15.4H
     88     SQADD           V22.4S, V23.4S, V20.4S      // saturating add of the rounding constant
     89     SSHL            V22.4S, V22.4S, V21.4S      // shift by V21 (a negative count shifts right)
     90     MOV             V24.16B, V22.16B            // V24 = finished group, stored by the next stage
     91     SUB             X8, X8, #8
     92 
        // Phase-1 steady state. On entry V24 holds a finished group waiting to
        // be stored and V8..V13 hold the operands of the following group.
        // Each pass stores 8 results, loads the operands of two more groups
        // and leaves a new finished group in V24.
     93 LOOP_1:
     94 
     95     LD2             {V0.4H, V1.4H}, [X10], #16
     96     UMULL           V19.4S, V8.4H, V10.4H
     97     LD2             {V2.4H, V3.4H}, [X3], #16
     98     UMLSL           V19.4S, V12.4H, V11.4H
     99     LD2             {V6.4H, V7.4H}, [X7], X12
    100     UMULL           V23.4S, V0.4H, V2.4H
    101     REV64           V4.4H, V6.4H
    102     UMLSL           V23.4S, V4.4H, V3.4H
    103     REV64           V5.4H, V7.4H
    104     SSHR            V19.4S, V19.4S, #16
    105     ST1             {V24.S}[0], [X2], X11       // store the pending group one lane at a time, stride X11
    106     SMLAL           V19.4S, V9.4H, V10.4H
    107     ST1             {V24.S}[1], [X2], X11
    108     SSHR            V23.4S, V23.4S, #16
    109     ST1             {V24.S}[2], [X2], X11
    110     SMLAL           V23.4S, V1.4H, V2.4H
    111 
    112     ST1             {V24.S}[3], [X2], X11
    113     SMLSL           V19.4S, V13.4H, V11.4H
    114     SMLSL           V23.4S, V5.4H, V3.4H
    115 
    116     LD2             {V8.4H, V9.4H}, [X10], #16  // preload operands for the next pass
    117     LD2             {V10.4H, V11.4H}, [X3], #16
    118 
    119 
    120     LD2             {V14.4H, V15.4H}, [X7], X12
    121     SQADD           V18.4S, V19.4S, V20.4S
    122     REV64           V12.4H, V14.4H
    123     REV64           V13.4H, V15.4H
    124     SQADD           V22.4S, V23.4S, V20.4S
    125     SSHL            V18.4S, V18.4S, V21.4S
    126     MOV             V16.16B, V18.16B
    127     ST1             {V16.S}[0], [X2], X11
    128     SSHL            V22.4S, V22.4S, V21.4S
    129 
    130 
    131     MOV             V24.16B, V22.16B            // new pending group for the next pass / epilogue
    132     SUBS            X8, X8, #8                  // 8 results accounted for per pass
    133 
    134     ST1             {V16.S}[1], [X2], X11
    135     ST1             {V16.S}[2], [X2], X11
    136     ST1             {V16.S}[3], [X2], X11
    137 
    138 
    139     BGT             LOOP_1                      // flags come from the SUBS above (ST1 does not alter them)
    140 
    141 
        // Phase-1 epilogue: store the pending group, then compute and store the
        // final group from V8..V13. The phase-2 address setup is interleaved.
    142     ST1             {V24.S}[0], [X2], X11
    143     UMULL           V19.4S, V8.4H, V10.4H
    144     UMLSL           V19.4S, V12.4H, V11.4H
    145     ST1             {V24.S}[1], [X2], X11
    146     ST1             {V24.S}[2], [X2], X11
    147     SSHR            V19.4S, V19.4S, #16
    148     ST1             {V24.S}[3], [X2], X11
    149     SMLAL           V19.4S, V9.4H, V10.4H
    150     SMLSL           V19.4S, V13.4H, V11.4H
    151     MOV             X12, #12
    152     MOV             V30.S[0], W5
    153     MOV             V31.S[0], W6
    154     SMULL           V29.4S, V30.4H, V31.4H      // lane 0 = W5[15:0] * W6[15:0] (signed 16x16); other lanes are unused
    155     MOV             W7, V29.S[0]                // W7 = that product (write to W7 clears the upper half of X7)
    156 
    157     LSL             W10, W5, #1                 // X10 = 2 * W5
    158     SQADD           V18.4S, V19.4S, V20.4S
    159     SSHL            V18.4S, V18.4S, V21.4S
    160     MOV             V16.16B, V18.16B
    161 
    162     ST1             {V16.S}[0], [X2], X11
    163     LSL             X7, X7, #2                  // product converted to a byte offset
    164 
    165     ST1             {V16.S}[1], [X2], X11
    166     ADD             X7, X7, X9                  // X7 = phase-2 destination = saved X2 + offset
    167 
    168     ST1             {V16.S}[2], [X2], X11
    169     ST1             {V16.S}[3], [X2], X11
    170 
        // Phase-2 pointer setup.
    171     SUB             X11, X10, #1                // X11 = 2 * W5 - 1
    172     LSL             X10, X11, #2
    173     ADD             X10, X0, X10                // X10 -> word 2*W5-1 of source A
    174     LSL             X11, X11, #1                // byte offset of halfword 2*W5-1
    175     SUB             X10, X10, X12               // back 12 bytes: X10 -> four words ending at word 2*W5-1
    176     LSL             X8, X6, #2                  // X8 = destination stride in bytes
    177     MOV             X12, #-16                   // descending post-index step again
    178     ADD             X11, X11, X4                // X4 = table base saved before phase 1
    179 
    180     LD1             {V6.4S}, [X10], X12         // 4 words of A, then step back 16 bytes
    181     SUB             X11, X11, #14               // X11 -> eight halfwords ending at halfword 2*W5-1
    182 
    183 
    184     REV64           V0.4S, V6.4S                // swap the two words inside each 64-bit half
    185     SQNEG           V0.4S, V0.4S                // saturating negate of A
    186 
    187 
    188     UZP1            V1.8H, V0.8H, V0.8H         // V1 = low halfwords of the negated words
    189     UZP2            V0.8H, V0.8H, V0.8H         // V0 = high halfwords of the negated words
    190     REV64           V1.4S, V1.4S                // with the REV64 above: lanes 0..3 now hold the 4 words in reversed order
    191     REV64           V0.4S, V0.4S
    192     LD2             {V2.4H, V3.4H}, [X11], X12  // 8 coefficients: V2 = even-indexed, V3 = odd-indexed
    193     REV64           V2.4H, V2.4H                // reverse lane order of both coefficient vectors
    194     REV64           V3.4H, V3.4H
    195 
    196     LD2             {V4.4H, V5.4H}, [X1], #16   // 4 words of B ascending: V4 = low halves, V5 = high halves
    197 
    198     UMULL           V23.4S, V1.4H, V3.4H        // acc  = (-A)rev.lo * c_odd_rev
    199     UMLSL           V23.4S, V4.4H, V2.4H        // acc -= B.lo * c_even_rev
    200     SSHR            V23.4S, V23.4S, #16
    201     SMLAL           V23.4S, V0.4H, V3.4H        // acc += (-A)rev.hi * c_odd_rev
    202     SMLSL           V23.4S, V5.4H, V2.4H        // acc -= B.hi * c_even_rev
    203     SQADD           V22.4S, V23.4S, V20.4S
    204     SSHL            V22.4S, V22.4S, V21.4S
    205     MOV             V24.16B, V22.16B
    206 
    207 
        // NOTE(review): the UMULL..MOV V24 sequence below repeats the
        // computation just above with the same V0..V5, V20 and V21 (none of
        // them is written in between), so V24 receives the same value again.
        // New work here is only the preload of the next group into V8..V13.
    208     LD1             {V14.4S}, [X10], X12
    209     UMULL           V23.4S, V1.4H, V3.4H
    210     UMLSL           V23.4S, V4.4H, V2.4H
    211     REV64           V8.4S, V14.4S
    212     SQNEG           V8.4S, V8.4S
    213     LD2             {V10.4H, V11.4H}, [X11], X12
    214     SSHR            V23.4S, V23.4S, #16
    215     LD2             {V12.4H, V13.4H}, [X1], #16
    216     SMLAL           V23.4S, V0.4H, V3.4H
    217     SMLSL           V23.4S, V5.4H, V2.4H
    218     UZP1            V9.8H, V8.8H, V8.8H
    219     UZP2            V8.8H, V8.8H, V8.8H
    220     rev64           v9.4s, v9.4s
    221     rev64           v8.4s, v8.4s
    222     REV64           V10.4H, V10.4H
    223     REV64           V11.4H, V11.4H
    224     SQADD           V22.4S, V23.4S, V20.4S
    225     SUB             X5, X5, #8                  // X5 doubles as the phase-2 loop counter
    226     SSHL            V22.4S, V22.4S, V21.4S
    227     MOV             V24.16B, V22.16B
    228 
    229 
        // Phase-2 steady state; same pipeline shape as LOOP_1. On entry V24
        // holds a finished group and V8..V13 hold the operands of the next
        // one. Results go to X7 with byte stride X8.
    230 LOOP_2:
    231 
    232 
    233     LD1             {V6.4S}, [X10], X12
    234     UMULL           V19.4S, V9.4H, V11.4H
    235     REV64           V0.4S, V6.4S
    236     SQNEG           V0.4S, V0.4S
    237     UZP1            V1.8H, V0.8H, V0.8H
    238     UZP2            V0.8H, V0.8H, V0.8H
    239     REV64           V1.4S, V1.4S
    240     REV64           V0.4S, V0.4S
    241     LD2             {V2.4H, V3.4H}, [X11], X12
    242     REV64           V2.8H, V2.8H                // .8H here vs .4H before the loop; only lanes 0..3 are consumed below
    243     REV64           V3.8H, V3.8H
    244 
    245     LD2             {V4.4H, V5.4H}, [X1], #16
    246     UMLSL           V19.4S, V12.4H, V10.4H
    247     ST1             {V24.S}[0], [X7], X8        // store the pending group one lane at a time
    248     UMULL           V23.4S, V1.4H, V3.4H
    249     ST1             {V24.S}[1], [X7], X8
    250     SSHR            V19.4S, V19.4S, #16
    251     ST1             {V24.S}[2], [X7], X8
    252     UMLSL           V23.4S, V4.4H, V2.4H
    253     ST1             {V24.S}[3], [X7], X8
    254     SMLAL           V19.4S, V8.4H, V11.4H
    255     LD1             {V14.4S}, [X10], X12        // preload operands for the next pass
    256     SSHR            V23.4S, V23.4S, #16
    257     SMLSL           V19.4S, V13.4H, V10.4H
    258     LD2             {V10.4H, V11.4H}, [X11], X12
    259     SMLAL           V23.4S, V0.4H, V3.4H
    260     SMLSL           V23.4S, V5.4H, V2.4H
    261     REV64           V8.4S, V14.4S
    262     LD2             {V12.4H, V13.4H}, [X1], #16
    263     SQNEG           V8.4S, V8.4S
    264     REV64           V11.4H, V11.4h
    265     REV64           V10.4H, V10.4H
    266     SQADD           V18.4S, V19.4S, V20.4S
    267     UZP1            V9.8H, V8.8H, V8.8H
    268     UZP2            V8.8H, V8.8H, V8.8H
    269     rev64           v9.4s, v9.4s
    270     rev64           v8.4s, v8.4s
    271     SQADD           V22.4S, V23.4S, V20.4S
    272     SSHL            V18.4S, V18.4S, V21.4S
    273     SUBS            X5, X5, #8                  // 8 results accounted for per pass
    274     MOV             V16.16B, V18.16B
    275     ST1             {V16.S}[0], [X7], X8
    276     SSHL            V22.4S, V22.4S, V21.4S
    277     ST1             {V16.S}[1], [X7], X8
    278     MOV             V24.16B, V22.16B            // new pending group for the next pass / epilogue
    279 
    280     ST1             {V16.S}[2], [X7], X8
    281     ST1             {V16.S}[3], [X7], X8
    282 
    283     BGT             LOOP_2                      // flags come from the SUBS above
    284 
        // Phase-2 epilogue: store the pending group, then compute and store the
        // final group from V8..V13.
    285     ST1             {V24.S}[0], [X7], X8
    286     UMULL           V19.4S, V9.4H, V11.4H
    287     UMLSL           V19.4S, V12.4H, V10.4H
    288     ST1             {V24.S}[1], [X7], X8
    289     ST1             {V24.S}[2], [X7], X8
    290     SSHR            V19.4S, V19.4S, #16
    291     ST1             {V24.S}[3], [X7], X8
    292 
    293     SMLAL           V19.4S, V8.4H, V11.4H
    294     SMLSL           V19.4S, V13.4H, V10.4H
    295     SQADD           V18.4S, V19.4S, V20.4S
    296     SSHL            V18.4S, V18.4S, V21.4S
    297     MOV             V16.16B, V18.16B
    298 
    299     ST1             {V16.S}[0], [X7], X8
    300     ST1             {V16.S}[1], [X7], X8
    301     ST1             {V16.S}[2], [X7], X8
    302     ST1             {V16.S}[3], [X7], X8
    303 
    304     pop_v_regs                                  // restore the registers saved by push_v_regs
    305     RET
    306