Home | History | Annotate | Download | only in armv8
      1 ///******************************************************************************
      2 // *
      3 // * Copyright (C) 2018 The Android Open Source Project
      4 // *
      5 // * Licensed under the Apache License, Version 2.0 (the "License");
      6 // * you may not use this file except in compliance with the License.
      7 // * You may obtain a copy of the License at:
      8 // *
      9 // * http://www.apache.org/licenses/LICENSE-2.0
     10 // *
     11 // * Unless required by applicable law or agreed to in writing, software
     12 // * distributed under the License is distributed on an "AS IS" BASIS,
     13 // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 // * See the License for the specific language governing permissions and
     15 // * limitations under the License.
     16 // *
     17 // *****************************************************************************
     18 // * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 
     21 .macro push_v_regs
        // Spill q8-q15, x8-x17 and x29/x30 into one 224-byte frame.
        // The first store allocates the whole frame with a single pre-indexed
        // write; every later store uses a fixed offset from the new sp.
        // Frame layout (offsets from sp after the macro has run):
        //   #0 x29/x30   #16 x16/x17  #32 x14/x15  #48 x12/x13  #64 x10/x11
        //   #80 x8/x9    #96 q14/q15  #128 q12/q13 #160 q10/q11 #192 q8/q9
     22     stp             x29, x30, [sp, #-224]!
     23     stp             x16, x17, [sp, #16]
     24     stp             x14, x15, [sp, #32]
     25     stp             x12, x13, [sp, #48]
     26     stp             x10, x11, [sp, #64]
     27     stp             x8, x9, [sp, #80]
     28     stp             q14, q15, [sp, #96]
     29     stp             q12, q13, [sp, #128]
     30     stp             q10, q11, [sp, #160]
     31     stp             q8, q9, [sp, #192]
     32 .endm
     33 .macro pop_v_regs
        // Reload x8-x17 and q8-q15 from the 224-byte spill frame at fixed
        // offsets, then reload x29/x30 from the bottom slot with one
        // post-indexed load that also releases the whole frame.
        // Frame layout (offsets from sp on entry to the macro):
        //   #0 x29/x30   #16 x16/x17  #32 x14/x15  #48 x12/x13  #64 x10/x11
        //   #80 x8/x9    #96 q14/q15  #128 q12/q13 #160 q10/q11 #192 q8/q9
     34     ldp             x16, x17, [sp, #16]
     35     ldp             x14, x15, [sp, #32]
     36     ldp             x12, x13, [sp, #48]
     37     ldp             x10, x11, [sp, #64]
     38     ldp             x8, x9, [sp, #80]
     39     ldp             q14, q15, [sp, #96]
     40     ldp             q12, q13, [sp, #128]
     41     ldp             q10, q11, [sp, #160]
     42     ldp             q8, q9, [sp, #192]
     43     ldp             x29, x30, [sp], #224
     44 .endm
     45 
     46 .text
        //-----------------------------------------------------------------------
        // ixheaacd_over_lap_add1_armv8
        //
        // Register usage on entry, as read by the code below (the C prototype
        // is not visible in this file -- TODO confirm against the caller):
        //   X0  base of a buffer of 32-bit words; 4-word groups are read
        //       backwards, starting with words 2*X5-4 .. 2*X5-1
        //   X1  base of a buffer of 32-bit words, read forwards 4 at a time
        //   X2  base of the 16-bit output buffer
        //   X3  base of a buffer of 16-bit values; 8-halfword groups are read
        //       backwards, starting with halfwords 2*X5-8 .. 2*X5-1, and
        //       de-interleaved into even-indexed (V2) and odd-indexed (V3)
        //   W4  shift count, replicated into V11 and used by SQSHL (register)
        //   X5  element count; reduced by 8 before LOOP_1 and by 8 per pass
        //   X6  output stride in 16-bit elements
        //
        // Per 32-bit lane, with a = X0 word, p = X1 word, we / wo = the
        // even- / odd-indexed X3 halfword:
        //   m(x, w) = ((lo16(x) * w) >> 16) + hi16(x) * w
        //             (low-half product unsigned via UMULL/USHR, high-half
        //              product signed via SMLAL)
        //   r1 = SQSHL(m(a, we), V11)        - (sat32(wo * p) + (-0x2000))
        //   r2 = SQSHL(m(SQNEG(a), wo), V11) - (sat32(we * p) + (-0x2000))
        // Each result is then SQSHL #2, SSHR #16, and its low halfword stored:
        //   r1 through X11, stepping by X4 = -2*X6 bytes per element
        //   r2 through X6,  stepping by X9 = +2*X6 bytes per element
        //
        // The code is software pipelined: the prologue computes the first
        // group of four and preloads the second, each LOOP_1 pass stores and
        // computes two groups, and the epilogue finishes the last group.
        // V8-V15 are written, hence the push_v_regs / pop_v_regs pair.
        //
        // NOTE(review): LOOP_1 has no entry test, so its body always runs at
        // least once; presumably callers only pass X5 >= 16 in multiples of
        // 8 -- confirm.
        // NOTE(review): X5 and X6 are consumed as 64-bit values before any
        // extension; if the prototype declares them narrower than 64 bits this
        // relies on the caller having extended them -- confirm.
        //-----------------------------------------------------------------------
     47 .global ixheaacd_over_lap_add1_armv8
     48 ixheaacd_over_lap_add1_armv8:
     49     push_v_regs
     50     LSL             X10, X5, #1                 // X10 = 2 * X5
     51     SUB             X11, X10, #1                // X11 = 2 * X5 - 1
     52     LSL             X10, X11, #2                // byte offset of 32-bit word (2*X5 - 1)
     53     ADD             X10, X0, X10
     54     SUB             X10, X10, #12               // X10 -> words 2*X5-4 .. 2*X5-1 of the X0 buffer
     55     LSL             X8, X11, #1                 // byte offset of halfword (2*X5 - 1)
     56     ADD             X8, X8, X3
     57     SUB             X8, X8, #14                 // X8 -> halfwords 2*X5-8 .. 2*X5-1 of the X3 buffer
     58     MOV             X12, #-16                   // post-index step: both backward streams retreat 16 bytes per load
     59     DUP             V11.8H, W4                  // shift count replicated into every halfword of V11
     60     LD1             {V3.4S}, [X10], X12         // first four X0 words
     61     MOV             W7, #0x2000
     62 
     63     NEG             W7, W7                      // W7 = -0x2000
     64     SQNEG           V0.4S, V3.4S                // saturating negation of the X0 words
     65     DUP             V10.4S, W7                  // V10 = -0x2000 in every 32-bit lane
     66     UZP1            V31.8H, V0.8H, V0.8H        // V31 = low halfword of each negated word
     67     UZP2            V30.8H, V0.8H, V0.8H        // V30 = high halfword of each negated word
     68     REV64           V31.8h, V31.8h              // reverse lane order: lane 0 = highest-addressed word
     69     REV64           V30.8h, V30.8h
     70     SUB             X11, X5, #1                 // X11 = X5 - 1
     71     UZP1            V7.8H, V3.8H, V3.8H         // V7 = low halfword of each X0 word
     72     UZP2            V6.8H, V3.8H, V3.8H         // V6 = high halfword of each X0 word
     73     REV64           V7.8H, V7.8H
     74     REV64           V6.8H, V6.8H
     75     MOV             V16.S[0], W6
     76     MOV             V17.S[0], W11
     77     SMULL           V17.4S, V16.4H, V17.4H      // lane 0 = signed 16x16 product of the low halfwords: X6 * (X5 - 1)
     78     MOV             W11, V17.S[0]
     79     LSL             X11, X11, #1                // element index -> byte offset
     80 
     81     LD2             {V2.4H, V3.4H}, [X8], X12   // V2 = even-indexed, V3 = odd-indexed X3 halfwords
     82     ADD             X11, X11, X2                // X11 -> output element X6 * (X5 - 1)
     83     REV64           V2.4H, V2.4H                // reverse lanes to match the reversed X0 words
     84     REV64           V3.4H, V3.4H
     85     LSL             X4, X6, #1
     86     NEG             X4, X4                      // X4 = -2 * X6: byte step for the X11 store stream
     87     LSL             X9, X6, #1                  // X9 = +2 * X6: byte step for the X6 store stream
     88     MOV             V16.S[0], W5
     89     MOV             V17.S[0], W6
     90     SMULL           V17.4S, V16.4H, V17.4H      // lane 0 = signed 16x16 product of the low halfwords: X5 * X6
     91     MOV             W6, V17.S[0]
     92     LSL             W6, W6, #1                  // element index -> byte offset
     93     ADD             X6, X6, X2                  // X6 -> output element X5 * (original X6)
     94 
            // ---- first group, r1: m(a, we) - (sat32(wo * p) - 0x2000) ----
     95     UMULL           V15.4S, V7.4H, V2.4H        // unsigned: low halves x even-indexed X3 values
     96     LD1             {V4.4S}, [X1], #16          // next four X1 words
     97     USHR            V15.4S, V15.4S, #16         // keep the upper 16 bits of the low-half product
     98 
     99     SMLAL           V15.4S, V6.4H, V2.4H        // add signed: high halves x even-indexed X3 values
    100     SQSHL           V15.4S, V15.4S, V11.4S      // saturating shift by the per-lane count in V11
    101     SSHLL           V27.4S, V3.4H, #0           // sign-extend odd-indexed X3 values to 32 bits
    102     SMULL           V28.2D, V27.2S, V4.2S       // 64-bit products with the X1 words, lanes 0-1
    103     SMULL2          V29.2D, V27.4S, V4.4S       // ... lanes 2-3
    104     SQXTN           V28.2S, V28.2D              // saturate the products back to 32 bits
    105     SQXTN2          V28.4S, V29.2D
    106     MOV             V14.16B, V28.16B
    107 
    108     SQADD           V14.4S, V14.4S, V10.4S      // add -0x2000
    109     SQSUB           V13.4S, V15.4S, V14.4S
    110     SQSHL           V13.4S, V13.4S, #2
    111     SSHR            V13.4S, V13.4S, #16
    112     UZP1            V26.8H, V13.8H, V13.8H      // V26.H[0..3] = low halfwords: results for the X11 stream
    113 
            // ---- first group, r2: m(-a, wo) - (sat32(we * p) - 0x2000) ----
    114     UMULL           V12.4S, V31.4H, V3.4H       // unsigned: negated low halves x odd-indexed X3 values
    115     USHR            V12.4S, V12.4S, #16
    116     SMLAL           V12.4S, V30.4H, V3.4H       // add signed: negated high halves x odd-indexed X3 values
    117     SQSHL           V12.4S, V12.4S, V11.4S
    118     LD1             {V3.4S}, [X10], X12         // V3 is free again: fetch the next four X0 words
    119 
    120     SSHLL           V27.4S, V2.4H, #0           // sign-extend even-indexed X3 values to 32 bits
    121     SMULL           V28.2D, V27.2S, V4.2S
    122     SMULL2          V29.2D, V27.4S, V4.4S
    123     SQXTN           V28.2S, V28.2D
    124     SQXTN2          V28.4S, V29.2D
    125     MOV             V8.16B, V28.16B
    126 
    127     SQADD           V8.4S, V8.4S, V10.4S        // add -0x2000
    128 
            // Split the next X0 words (and their negation) for the loop.
    129     SQNEG           V0.4S, V3.4S
    130     UZP1            V1.8H, V0.8H, V0.8H         // V1 = low halves of the negated words
    131     UZP2            V0.8H, V0.8H, V0.8H         // V0 = high halves of the negated words
    132     REV64           V1.8h, V1.8h
    133     REV64           V0.8h, V0.8h
    134     SQSUB           V9.4S, V12.4S, V8.4S
    135     UZP1            V7.8H, V3.8H, V3.8H         // V7 = low halves
    136     UZP2            V6.8H, V3.8H, V3.8H         // V6 = high halves
    137     REV64           V7.8h, V7.8h
    138     REV64           V6.8h, V6.8h
    139     SQSHL           V9.4S, V9.4S, #2
    140     LD2             {V2.4H, V3.4H}, [X8], X12   // next eight X3 halfwords, de-interleaved
    141     SSHR            V9.4S, V9.4S, #16
    142     REV64           V2.4H, V2.4H
    143     REV64           V3.4H, V3.4H
    144     UZP1            V18.8H, V9.8H, V9.8H        // V18.H[0..3] = results for the X6 stream
    145 
    146     LD1             {V4.4S}, [X1], #16          // next four X1 words
    147     SUB             W5, W5, #8                  // loop counter = X5 - 8 (the W write clears bits 63:32 of X5)
    148 
    149 
            // Each pass stores the pending V26/V18 results, computes and
            // stores one further group, then computes a third group that is
            // left pending in V26/V18 for the next pass or the epilogue.
    150 LOOP_1:
    151 
    152     ST1             {V26.H}[0], [X11], X4       // pending r1 results out through X11, stepping by X4
    153     UMULL           V15.4S, V7.4H, V2.4H
    154     ST1             {V26.H}[1], [X11], X4
    155     UMULL           V12.4S, V1.4H, V3.4H
    156     ST1             {V26.H}[2], [X11], X4
    157     USHR            V15.4S, V15.4S, #16
    158     ST1             {V26.H}[3], [X11], X4
    159     USHR            V12.4S, V12.4S, #16
    160     ST1             {V18.H}[0], [X6], X9        // pending r2 results out through X6, stepping by X9
    161     SMLAL           V15.4S, V6.4H, V2.4H
    162     ST1             {V18.H}[1], [X6], X9
    163     SMLAL           V12.4S, V0.4H, V3.4H
    164     ST1             {V18.H}[2], [X6], X9
    165     SQSHL           V15.4S, V15.4S, V11.4S
    166     ST1             {V18.H}[3], [X6], X9
    167     SQSHL           V12.4S, V12.4S, V11.4S
    168     LD1             {V6.4S}, [X10], X12         // next X0 words go to V6: V2/V3 still hold live X3 values
    169 
    170     SSHLL           V27.4S, V3.4H, #0           // sat32(odd-indexed X3 values x X1 words) -> V14
    171     SMULL           V28.2D, V27.2S, V4.2S
    172     SMULL2          V29.2D, V27.4S, V4.4S
    173     SQXTN           V28.2S, V28.2D
    174     SQXTN2          V28.4S, V29.2D
    175     MOV             V14.16B, V28.16B
    176 
    177     SSHLL           V27.4S, V2.4H, #0           // sat32(even-indexed X3 values x X1 words) -> V8
    178     SMULL           V28.2D, V27.2S, V4.2S
    179     SMULL2          V29.2D, V27.4S, V4.4S
    180     SQXTN           V28.2S, V28.2D
    181     SQXTN2          V28.4S, V29.2D
    182     MOV             V8.16B, V28.16B
    183 
    184     LD2             {V2.4H, V3.4H}, [X8], X12   // next eight X3 halfwords, de-interleaved
    185 
    186     SQNEG           V0.4S, V6.4S
    187 
    188     LD1             {V4.4S}, [X1], #16          // next four X1 words
    189 
    190     SQADD           V14.4S, V14.4S, V10.4S      // add -0x2000
    191     UZP1            V1.8H, V0.8H, V0.8H
    192     UZP2            V0.8H, V0.8H, V0.8H
    193     REV64           V1.8h, V1.8h
    194     REV64           V0.8h, V0.8h
    195     SQADD           V8.4S, V8.4S, V10.4S        // add -0x2000
    196     UZP1            V7.8H, V6.8H, V6.8H
    197     UZP2            V6.8H, V6.8H, V6.8H         // in place: V6 now holds the high halves
    198     REV64           V7.8h, V7.8h
    199     REV64           V6.8h, V6.8h
    200     SQSUB           V13.4S, V15.4S, V14.4S
    201     REV64           V2.4H, V2.4H
    202     REV64           V3.4H, V3.4H
    203     SQSUB           V9.4S, V12.4S, V8.4S
    204     SQSHL           V13.4S, V13.4S, #2
    205     SQSHL           V9.4S, V9.4S, #2
    206     UMULL           V15.4S, V7.4H, V2.4H        // start the next group while this one is finalised
    207     SSHR            V13.4S, V13.4S, #16
    208     UZP1            V26.8H, V13.8H, V13.8H
    209     SSHR            V9.4S, V9.4S, #16
    210     ST1             {V26.H}[0], [X11], X4       // second set of r1 stores in this pass
    211     UMULL           V12.4S, V1.4H, V3.4H
    212     UZP1            V18.8H, V9.8H, V9.8H
    213     USHR            V15.4S, V15.4S, #16
    214     ST1             {V26.H}[1], [X11], X4
    215     SMLAL           V15.4S, V6.4H, V2.4H
    216     ST1             {V26.H}[2], [X11], X4
    217     USHR            V12.4S, V12.4S, #16
    218     ST1             {V26.H}[3], [X11], X4
    219     SMLAL           V12.4S, V0.4H, V3.4H
    220     ST1             {V18.H}[0], [X6], X9        // second set of r2 stores in this pass
    221     SQSHL           V15.4S, V15.4S, V11.4S
    222     ST1             {V18.H}[1], [X6], X9
    223     SQSHL           V12.4S, V12.4S, V11.4S
    224     ST1             {V18.H}[2], [X6], X9
    225 
    226     SSHLL           V27.4S, V3.4H, #0
    227     SMULL           V28.2D, V27.2S, V4.2S
    228     SMULL2          V29.2D, V27.4S, V4.4S
    229     SQXTN           V28.2S, V28.2D
    230     SQXTN2          V28.4S, V29.2D
    231     MOV             V14.16B, V28.16B
    232 
    233     ST1             {V18.H}[3], [X6], X9
    234 
    235 
    236     SSHLL           V27.4S, V2.4H, #0
    237     SMULL           V28.2D, V27.2S, V4.2S
    238     SMULL2          V29.2D, V27.4S, V4.4S
    239     SQXTN           V28.2S, V28.2D
    240     SQXTN2          V28.4S, V29.2D
    241     MOV             V8.16B, V28.16B
    242 
    243     LD1             {V3.4S}, [X10], X12         // V3 is free again: fetch the next four X0 words
    244     SQADD           V14.4S, V14.4S, V10.4S
    245 
    246     SQNEG           V0.4S, V3.4S
    247     UZP1            V1.8H, V0.8H, V0.8H
    248     UZP2            V0.8H, V0.8H, V0.8H
    249     REV64           V1.8H, V1.8H
    250     REV64           V0.8H, V0.8H
    251     SQSUB           V13.4S, V15.4S, V14.4S
    252     UZP1            V7.8H, V3.8H, V3.8H
    253     UZP2            V6.8H, V3.8H, V3.8H
    254     REV64           V7.8H, V7.8H
    255     REV64           V6.8H, V6.8H
    256     SQADD           V8.4S, V8.4S, V10.4S
    257     LD2             {V2.4H, V3.4H}, [X8], X12   // next eight X3 halfwords, de-interleaved
    258     SQSUB           V9.4S, V12.4S, V8.4S
    259     REV64           V2.4H, V2.4H
    260     REV64           V3.4H, V3.4H
    261     SQSHL           V13.4S, V13.4S, #2
    262     LD1             {V4.4S}, [X1], #16          // next four X1 words
    263 
    264     SQSHL           V9.4S, V9.4S, #2
    265     SSHR            V13.4S, V13.4S, #16
    266     SUBS            X5, X5, #8                  // 8 elements per stream were stored in this pass
    267     SSHR            V9.4S, V9.4S, #16
    268     UZP1            V26.8H, V13.8H, V13.8H      // results left pending for the next pass / epilogue
    269     UZP1            V18.8H, V9.8H, V9.8H
    270 
    271     BGT             LOOP_1                      // repeat while the remaining count is > 0 (signed)
    272 
            // ---- epilogue: store the pending group, then compute and store
            //      the last preloaded group; no further loads are issued ----
    273     ST1             {V26.H}[0], [X11], X4
    274     UMULL           V15.4S, V7.4H, V2.4H
    275     ST1             {V26.H}[1], [X11], X4
    276     UMULL           V12.4s, V1.4H, V3.4H
    277     ST1             {V26.H}[2], [X11], X4
    278     USHR            V15.4S, V15.4S, #16
    279     ST1             {V26.H}[3], [X11], X4
    280     USHR            V12.4S, V12.4S, #16
    281 
    282     ST1             {V18.H}[0], [X6], X9
    283     SMLAL           V15.4S, V6.4H, V2.4H
    284     ST1             {V18.H}[1], [X6], X9
    285     SMLAL           V12.4S, V0.4H, V3.4H
    286     ST1             {V18.H}[2], [X6], X9
    287     SQSHL           V15.4S, V15.4S, V11.4S
    288     ST1             {V18.H}[3], [X6], X9
    289     SQSHL           V12.4S, V12.4S, V11.4S
    290 
    291 
    292     SSHLL           V27.4S, V3.4H, #0
    293     SMULL           V28.2D, V27.2S, V4.2S
    294     SMULL2          V29.2D, V27.4S, V4.4S
    295     SQXTN           V28.2S, V28.2D
    296     SQXTN2          V28.4S, V29.2D
    297     MOV             V14.16B, V28.16B
    298 
    299     SSHLL           V27.4S, V2.4H, #0
    300     SMULL           V28.2D, V27.2S, V4.2S
    301     SMULL2          V29.2D, V27.4S, V4.4S
    302     SQXTN           V28.2S, V28.2D
    303     SQXTN2          V28.4S, V29.2D
    304     MOV             V8.16B, V28.16B
    305 
    306     SQADD           V14.4S, V14.4S, V10.4S
    307     SQADD           V8.4S, V8.4S, V10.4S
    308     SQSUB           V13.4S, V15.4S, V14.4S
    309     SQSUB           V9.4S, V12.4S, V8.4S
    310     SQSHL           V13.4S, V13.4S, #2
    311     SQSHL           V9.4S, V9.4S, #2
    312     SSHR            V13.4S, V13.4S, #16
    313     SSHR            V9.4S, V9.4S, #16
    314     UZP1            V26.8H, V13.8H, V13.8H
    315 
    316     UZP1            V18.8H, V9.8H, V9.8H
    317 
    318 
    319     ST1             {V26.H}[0], [X11], X4       // final r1 results
    320     ST1             {V26.H}[1], [X11], X4
    321     ST1             {V26.H}[2], [X11], X4
    322     ST1             {V26.H}[3], [X11], X4
    323 
    324     ST1             {V18.H}[0], [X6], X9        // final r2 results
    325     ST1             {V18.H}[1], [X6], X9
    326     ST1             {V18.H}[2], [X6], X9
    327     ST1             {V18.H}[3], [X6], X9
    328     pop_v_regs
    329     RET
    330 
    331 
    332 
    333 
    334