Home | History | Annotate | Download | only in AArch64
      1 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V8a
      2 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
      3 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=falkor -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
      4 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
      5 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=saphira -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
      6 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple
      7 
      8 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
      9 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
     10 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
     11 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
     12 declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
     13 declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
     14 
     15 declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
     16 declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
     17 declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
     18 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
     19 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
     20 declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
     21 
     22 declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
     23 declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
     24 declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
     25 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
     26 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
     27 declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
     28 
     29 ;-----------------------------------------------------------------------------
     30 ; RDMA Vector
     31 ; test for SIMDThreeSameVectorSQRDMLxHTiedHS
     32 
     33 define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
        ; 4 x i16 vector form: sqadd(%acc, sqrdmulh(%mhs, %rhs)).
        ; Runs with RDM enabled expect one sqrdmlah; the baseline run expects sqrdmulh.
     34 ; CHECK-LABEL: test_sqrdmlah_v4i16:
     35    %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
     36    %sum = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %mulh)
     37 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
     38 ; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.4h
     39 ; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2
     40    ret <4 x i16> %sum
     41 }
     42 
     43 define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
        ; 8 x i16 vector form: the sqrdmulh product of %mhs and %rhs is folded
        ; into %acc with sqadd; with RDM the pair should select as sqrdmlah.
     44 ; CHECK-LABEL: test_sqrdmlah_v8i16:
     45    %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
     46    %sum = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
     47 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
     48 ; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.8h
     49 ; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
     50    ret <8 x i16> %sum
     51 }
     52 
     53 define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
        ; 2 x i32 vector form of sqadd(%acc, sqrdmulh(%mhs, %rhs)); expected to
        ; become a single sqrdmlah on the .2s arrangement when RDM is available.
     54 ; CHECK-LABEL: test_sqrdmlah_v2i32:
     55    %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
     56    %sum = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
     57 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
     58 ; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.2s
     59 ; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2
     60    ret <2 x i32> %sum
     61 }
     62 
     63 define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
        ; 4 x i32 vector form: sqadd(%acc, sqrdmulh(%mhs, %rhs)).
        ; Runs with RDM enabled expect one sqrdmlah; the baseline run expects sqrdmulh.
        ; The baseline expectation below previously used a prefix that no RUN line
        ; defines, so it was never checked.
     64 ; CHECK-LABEL: test_sqrdmlah_v4i32:
     65    %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
     66    %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
     67 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
     68 ; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.4s
     69 ; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2
     70    ret <4 x i32> %retval
     71 }
     72 
     73 define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
        ; 4 x i16 vector form: sqsub(%acc, sqrdmulh(%mhs, %rhs)).
        ; Runs with RDM enabled expect one sqrdmlsh; the baseline run expects sqrdmulh.
     74 ; CHECK-LABEL: test_sqrdmlsh_v4i16:
     75    %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
     76    %diff = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %mulh)
     77 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
     78 ; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.4h
     79 ; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2
     80    ret <4 x i16> %diff
     81 }
     82 
     83 define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
        ; 8 x i16 vector form: the sqrdmulh product of %mhs and %rhs is removed
        ; from %acc with sqsub; with RDM the pair should select as sqrdmlsh.
     84 ; CHECK-LABEL: test_sqrdmlsh_v8i16:
     85    %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
     86    %diff = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
     87 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
     88 ; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.8h
     89 ; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2
     90    ret <8 x i16> %diff
     91 }
     92 
     93 define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
        ; 2 x i32 vector form of sqsub(%acc, sqrdmulh(%mhs, %rhs)); expected to
        ; become a single sqrdmlsh on the .2s arrangement when RDM is available.
     94 ; CHECK-LABEL: test_sqrdmlsh_v2i32:
     95    %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
     96    %diff = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
     97 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
     98 ; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.2s
     99 ; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2
    100    ret <2 x i32> %diff
    101 }
    102 
    103 define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
        ; 4 x i32 vector form: sqsub(%acc, sqrdmulh(%mhs, %rhs)).
        ; Runs with RDM enabled expect one sqrdmlsh; the baseline run expects sqrdmulh.
    104 ; CHECK-LABEL: test_sqrdmlsh_v4i32:
    105    %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
    106    %diff = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %mulh)
    107 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
    108 ; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.4s
    109 ; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2
    110    ret <4 x i32> %diff
    111 }
    112 
    113 ;-----------------------------------------------------------------------------
    114 ; RDMA Vector, by element
    115 ; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
    116 
    117 define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
        ; By-element form on 4 x i16: lane 3 of %v is splatted, multiplied with %x
        ; via sqrdmulh, and the product is folded into %acc with sqadd.
        ; Runs with RDM enabled expect sqrdmlah indexed by v2.h[3]; the baseline run
        ; expects the indexed sqrdmulh. The baseline directive below previously had
        ; a space before its colon, so FileCheck did not treat it as a check line.
    118 ; CHECK-LABEL: test_sqrdmlah_lane_s16:
    119 entry:
    120   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    121   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
    122   %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
    123 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
    124 ; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.h[3]
    125 ; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2[3]
    126   ret <4 x i16> %retval
    127 }
    128 
    129 define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
        ; By-element form on 8 x i16: lane 2 of %v is splatted and used as the
        ; sqrdmulh multiplier; with RDM the sqadd should fold into sqrdmlah v2.h[2].
    130 ; CHECK-LABEL: test_sqrdmlahq_lane_s16:
    131 entry:
    132   %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
    133   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
    134   %sum = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
    135 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
    136 ; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.h[2]
    137 ; CHECK-V81a-apple: sqrdmlah.8h v0,    v1,    v2[2]
    138   ret <8 x i16> %sum
    139 }
    140 
    141 define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
        ; By-element form on 2 x i32: lane 1 of %v is splatted and used as the
        ; sqrdmulh multiplier; with RDM the sqadd should fold into sqrdmlah v2.s[1].
    142 ; CHECK-LABEL: test_sqrdmlah_lane_s32:
    143 entry:
    144   %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
    145   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
    146   %sum = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
    147 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
    148 ; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.s[1]
    149 ; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2[1]
    150   ret <2 x i32> %sum
    151 }
    152 
    153 define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
        ; By-element form on 4 x i32: lane 0 of %v is splatted (zeroinitializer
        ; mask) and multiplied with %x; with RDM expect sqrdmlah indexed by v2.s[0].
    154 ; CHECK-LABEL: test_sqrdmlahq_lane_s32:
    155 entry:
    156   %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
    157   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
    158   %sum = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %mulh)
    159 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
    160 ; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.s[0]
    161 ; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2[0]
    162   ret <4 x i32> %sum
    163 }
    164 
    165 define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
        ; By-element form on 4 x i16: lane 3 of %v is splatted and used as the
        ; sqrdmulh multiplier; with RDM the sqsub should fold into sqrdmlsh v2.h[3].
    166 ; CHECK-LABEL: test_sqrdmlsh_lane_s16:
    167 entry:
    168   %splat = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    169   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %splat)
    170   %diff = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %mulh)
    171 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
    172 ; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.h[3]
    173 ; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2[3]
    174   ret <4 x i16> %diff
    175 }
    176 
    177 define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
        ; By-element form on 8 x i16: lane 2 of %v is splatted and used as the
        ; sqrdmulh multiplier; with RDM the sqsub should fold into sqrdmlsh v2.h[2].
    178 ; CHECK-LABEL: test_sqrdmlshq_lane_s16:
    179 entry:
    180   %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
    181   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
    182   %diff = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
    183 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
    184 ; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.h[2]
    185 ; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2[2]
    186   ret <8 x i16> %diff
    187 }
    188 
    189 define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
        ; By-element form on 2 x i32: lane 1 of %v is splatted and used as the
        ; sqrdmulh multiplier; with RDM the sqsub should fold into sqrdmlsh v2.s[1].
    190 ; CHECK-LABEL: test_sqrdmlsh_lane_s32:
    191 entry:
    192   %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
    193   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
    194   %diff = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
    195 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
    196 ; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.s[1]
    197 ; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2[1]
    198   ret <2 x i32> %diff
    199 }
    200 
    201 define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
        ; By-element form on 4 x i32: lane 0 of %v is splatted (zeroinitializer
        ; mask) and multiplied with %x; with RDM expect sqrdmlsh indexed by v2.s[0].
    202 ; CHECK-LABEL: test_sqrdmlshq_lane_s32:
    203 entry:
    204   %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
    205   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
    206   %diff = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %mulh)
    207 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
    208 ; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.s[0]
    209 ; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2[0]
    210   ret <4 x i32> %diff
    211 }
    212 
    213 ;-----------------------------------------------------------------------------
    214 ; RDMA Vector, by element, extracted
    215 ; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
    216 ; i32 tests are for   "def : Pat" in SIMDIndexedSQRDMLxHSDTied
    217 
    218 define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
        ; Scalar i16 accumulator, 4 x i16 operands: %acc is placed in lane 0 of an
        ; otherwise undef vector, the by-element product (lane 1 of %v) is added
        ; with sqadd, and only lane 0 of the result is returned.
    219 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
    220 entry:
    221   %splat = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    222   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %splat)
    223   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
    224   %sum.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh)
    225   %lane0 = extractelement <4 x i16> %sum.v, i64 0
    226 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
    227 ; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
    228 ; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}},    v0,    v1[1]
    229   ret i16 %lane0
    230 }
    231 
    232 define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
        ; Scalar i16 accumulator, 8 x i16 operands: %acc goes into lane 0 of an
        ; undef vector, the by-element product (lane 1 of %v) is added with sqadd,
        ; and lane 0 of the sum is returned.
    233 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
    234 entry:
    235   %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    236   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
    237   %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
    238   %sum.v = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh)
    239   %lane0 = extractelement <8 x i16> %sum.v, i64 0
    240 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
    241 ; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
    242 ; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}},    v0,    v1[1]
    243   ret i16 %lane0
    244 }
    245 
    246 define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
        ; Scalar i32 accumulator, 2 x i32 operands: lane 0 of the by-element
        ; sqrdmulh product is extracted and added to %acc with the scalar sqadd.
        ; With RDM the expected instruction is the vector sqrdmlah indexed by v1.s[0].
    247 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
    248 entry:
    249   %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
    250   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
    251   %mulh.lo = extractelement <2 x i32> %mulh, i64 0
    252   %sum = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh.lo)
    253 ; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
    254 ; CHECK-V81a:       sqrdmlah    v2.2s, v0.2s, v1.s[0]
    255 ; CHECK-V81a-apple: sqrdmlah.2s v2,    v0,    v1[0]
    256   ret i32 %sum
    257 }
    258 
    259 define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
        ; Scalar i32 accumulator, 4 x i32 operands: lane 0 of the by-element
        ; sqrdmulh product is extracted and added to %acc with the scalar sqadd.
        ; With RDM the expected instruction is the vector sqrdmlah indexed by v1.s[0].
    260 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
    261 entry:
    262   %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
    263   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
    264   %mulh.lo = extractelement <4 x i32> %mulh, i64 0
    265   %sum = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh.lo)
    266 ; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
    267 ; CHECK-V81a:       sqrdmlah    v2.4s, v0.4s, v1.s[0]
    268 ; CHECK-V81a-apple: sqrdmlah.4s v2,    v0,    v1[0]
    269   ret i32 %sum
    270 }
    271 
    272 define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
        ; Scalar i16 accumulator, 4 x i16 operands: %acc is placed in lane 0 of an
        ; otherwise undef vector, the by-element product (lane 1 of %v) is
        ; subtracted with sqsub, and only lane 0 of the result is returned.
    273 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
    274 entry:
    275   %splat = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    276   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %splat)
    277   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
    278   %diff.v = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh)
    279   %lane0 = extractelement <4 x i16> %diff.v, i64 0
    280 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
    281 ; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
    282 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}},    v0,    v1[1]
    283   ret i16 %lane0
    284 }
    285 
    286 define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
        ; Scalar i16 accumulator, 8 x i16 operands: %acc goes into lane 0 of an
        ; undef vector, the by-element product (lane 1 of %v) is subtracted with
        ; sqsub, and lane 0 of the difference is returned.
    287 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
    288 entry:
    289   %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    290   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
    291   %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
    292   %diff.v = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh)
    293   %lane0 = extractelement <8 x i16> %diff.v, i64 0
    294 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
    295 ; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
    296 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}},    v0,    v1[1]
    297   ret i16 %lane0
    298 }
    299 
    300 define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
        ; Scalar i32 accumulator, 2 x i32 operands: lane 0 of the by-element
        ; sqrdmulh product is extracted and subtracted from %acc with the scalar
        ; sqsub. With RDM the expected instruction is sqrdmlsh indexed by v1.s[0].
    301 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
    302 entry:
    303   %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
    304   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
    305   %mulh.lo = extractelement <2 x i32> %mulh, i64 0
    306   %diff = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh.lo)
    307 ; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
    308 ; CHECK-V81a:       sqrdmlsh    v2.2s, v0.2s, v1.s[0]
    309 ; CHECK-V81a-apple: sqrdmlsh.2s v2,    v0,    v1[0]
    310   ret i32 %diff
    311 }
    312 
    313 define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
        ; Scalar i32 accumulator, 4 x i32 operands: lane 0 of the by-element
        ; sqrdmulh product is extracted and subtracted from %acc with the scalar
        ; sqsub. With RDM the expected instruction is sqrdmlsh indexed by v1.s[0].
    314 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
    315 entry:
    316   %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
    317   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
    318   %mulh.lo = extractelement <4 x i32> %mulh, i64 0
    319   %diff = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh.lo)
    320 ; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
    321 ; CHECK-V81a:       sqrdmlsh    v2.4s, v0.4s, v1.s[0]
    322 ; CHECK-V81a-apple: sqrdmlsh.4s v2,    v0,    v1[0]
    323   ret i32 %diff
    324 }
    325 
    326 ;-----------------------------------------------------------------------------
    327 ; RDMA Scalar
    328 ; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
    329 
    330 define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
        ; All-scalar i16 inputs: each scalar is inserted into lane 0 of an undef
        ; 4 x i16 vector, the vector sqrdmulh and sqadd intrinsics are applied,
        ; and lane 0 of the sum is returned. The checks expect .4h instructions.
    331 ; CHECK-LABEL: test_sqrdmlah_v1i16:
    332   %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
    333   %y.v = insertelement <4 x i16> undef, i16 %y, i64 0
    334   %mulh.v = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %y.v)
    335   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
    336   %sum.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh.v)
    337   %lane0 = extractelement <4 x i16> %sum.v, i64 0
    338 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    339 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    340 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
    341   ret i16 %lane0
    342 }
    343 
    344 define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
        ; All-scalar i32 inputs: each scalar is inserted into lane 0 of an undef
        ; 4 x i32 vector, the vector sqrdmulh and sqadd intrinsics are applied,
        ; and lane 0 of the sum is returned. The checks expect .4s instructions.
    345 ; CHECK-LABEL: test_sqrdmlah_v1i32:
    346   %x.v = insertelement <4 x i32> undef, i32 %x, i64 0
    347   %y.v = insertelement <4 x i32> undef, i32 %y, i64 0
    348   %mulh.v = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x.v, <4 x i32> %y.v)
    349   %acc.v = insertelement <4 x i32> undef, i32 %acc, i64 0
    350   %sum.v = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc.v, <4 x i32> %mulh.v)
    351   %lane0 = extractelement <4 x i32> %sum.v, i64 0
    352 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    353 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    354 ; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
    355   ret i32 %lane0
    356 }
    357 
    358 
    359 define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
        ; All-scalar i16 inputs: each scalar is inserted into lane 0 of an undef
        ; 4 x i16 vector, the vector sqrdmulh and sqsub intrinsics are applied,
        ; and lane 0 of the difference is returned. The checks expect .4h forms.
    360 ; CHECK-LABEL: test_sqrdmlsh_v1i16:
    361   %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
    362   %y.v = insertelement <4 x i16> undef, i16 %y, i64 0
    363   %mulh.v = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %y.v)
    364   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
    365   %diff.v = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh.v)
    366   %lane0 = extractelement <4 x i16> %diff.v, i64 0
    367 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    368 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    369 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
    370   ret i16 %lane0
    371 }
    372 
    373 define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
        ; All-scalar i32 inputs: each scalar is inserted into lane 0 of an undef
        ; 4 x i32 vector, the vector sqrdmulh and sqsub intrinsics are applied,
        ; and lane 0 of the difference is returned. The checks expect .4s forms.
    374 ; CHECK-LABEL: test_sqrdmlsh_v1i32:
    375   %x.v = insertelement <4 x i32> undef, i32 %x, i64 0
    376   %y.v = insertelement <4 x i32> undef, i32 %y, i64 0
    377   %mulh.v = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x.v, <4 x i32> %y.v)
    378   %acc.v = insertelement <4 x i32> undef, i32 %acc, i64 0
    379   %diff.v = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc.v, <4 x i32> %mulh.v)
    380   %lane0 = extractelement <4 x i32> %diff.v, i64 0
    381 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    382 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    383 ; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
    384   ret i32 %lane0
    385 }
    386 define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
        ; Scalar i32 intrinsics only: sqadd(%acc, sqrdmulh(%mhs, %rhs)).
        ; With RDM the checks expect a single sqrdmlah on s registers.
    387 ; CHECK-LABEL: test_sqrdmlah_i32:
    388   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
    389   %sum = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh)
    390 ; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    391 ; CHECK-V81a:       sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    392 ; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    393   ret i32 %sum
    394 }
    395 
    396 define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
        ; Scalar i32 intrinsics only: sqsub(%acc, sqrdmulh(%mhs, %rhs)).
        ; With RDM the checks expect a single sqrdmlsh on s registers.
    397 ; CHECK-LABEL: test_sqrdmlsh_i32:
    398   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
    399   %diff = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh)
    400 ; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    401 ; CHECK-V81a:       sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    402 ; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    403   ret i32 %diff
    404 }
    405 
    406 ;-----------------------------------------------------------------------------
    407 ; RDMA Scalar, by element
    408 ; i16 tests are performed via tests in the section above, with IR in ACLE style
    409 ; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
    410 
    411 define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
        ; Scalar %acc and %x with a 4 x i16 multiplier vector: lane 1 of %y_vec is
        ; splatted, %x and %acc are placed in lane 0 of undef vectors, and lane 0
        ; of the sqadd result is returned. The checks expect indexing by v0.h[1].
    412 ; CHECK-LABEL: test_sqrdmlah_extract_i16:
    413   %splat = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    414   %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
    415   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %splat)
    416   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
    417   %sum.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh)
    418   %lane0 = extractelement <4 x i16> %sum.v, i32 0
    419 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
    420 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
    421 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
    422   ret i16 %lane0
    423 }
    424 
    425 define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
        ; Scalar i32 by element: lane 3 of %rhs is extracted and fed to the scalar
        ; sqrdmulh, then added to %acc with the scalar sqadd. With RDM the checks
        ; expect a scalar sqrdmlah indexed by v0.s[3].
    426 ; CHECK-LABEL: test_sqrdmlah_extract_i32:
    427   %elt = extractelement <4 x i32> %rhs, i32 3
    428   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %elt)
    429   %sum = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh)
    430 ; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
    431 ; CHECK-V81a:       sqrdmlah   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
    432 ; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
    433   ret i32 %sum
    434 }
    435 
    436 define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
        ; Scalar %acc and %x with an 8 x i16 multiplier vector: lane 1 of %y_vec is
        ; splatted, %x and %acc are placed in lane 0 of undef vectors, and lane 0
        ; of the sqsub result is returned. The checks expect indexing by v0.h[1].
    437 ; CHECK-LABEL: test_sqrdmlshq_extract_i16:
    438   %splat = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    439   %x.v = insertelement <8 x i16> undef, i16 %x, i64 0
    440   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x.v, <8 x i16> %splat)
    441   %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
    442   %diff.v = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh)
    443   %lane0 = extractelement <8 x i16> %diff.v, i32 0
    444 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
    445 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
    446 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
    447   ret i16 %lane0
    448 }
    449 
    450 define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
        ; Scalar i32 by element: lane 3 of %rhs is extracted and fed to the scalar
        ; sqrdmulh, then subtracted from %acc with the scalar sqsub. With RDM the
        ; checks expect a scalar sqrdmlsh indexed by v0.s[3].
    451 ; CHECK-LABEL: test_sqrdmlsh_extract_i32:
    452   %elt = extractelement <4 x i32> %rhs, i32 3
    453   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %elt)
    454   %diff = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh)
    455 ; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
    456 ; CHECK-V81a:       sqrdmlsh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
    457 ; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
    458   ret i32 %diff
    459 }
    460