Home | History | Annotate | Download | only in AArch64
      1 ; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V8a
      2 ; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
      3 ; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple
      4 
      5 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
      6 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
      7 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
      8 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
      9 declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
     10 declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
     11 
     12 declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
     13 declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
     14 declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
     15 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
     16 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
     17 declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
     18 
     19 declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
     20 declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
     21 declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
     22 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
     23 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
     24 declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
     25 
     26 ;-----------------------------------------------------------------------------
     27 ; RDMA Vector
     28 ; test for SIMDThreeSameVectorSQRDMLxHTiedHS
     29 
     30 define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { ; saturating add of %acc and sqrdmulh(%mhs, %rhs), 4 x i16
     31 ; CHECK-LABEL: test_sqrdmlah_v4i16:
     32    %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) ; multiply step
     33    %sum = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %mulh) ; accumulate step; the +v8.1a runs expect one sqrdmlah for both steps
     34 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
     35 ; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.4h
     36 ; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2
     37    ret <4 x i16> %sum
     38 }
     39 
     40 define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { ; saturating add of %acc and sqrdmulh(%mhs, %rhs), 8 x i16
     41 ; CHECK-LABEL: test_sqrdmlah_v8i16:
     42    %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) ; multiply step
     43    %sum = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %mulh) ; accumulate step; the +v8.1a runs expect one sqrdmlah for both steps
     44 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
     45 ; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.8h
     46 ; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
     47    ret <8 x i16> %sum
     48 }
     49 
     50 define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { ; saturating add of %acc and sqrdmulh(%mhs, %rhs), 2 x i32
     51 ; CHECK-LABEL: test_sqrdmlah_v2i32:
     52    %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) ; multiply step
     53    %sum = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %mulh) ; accumulate step; the +v8.1a runs expect one sqrdmlah for both steps
     54 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
     55 ; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.2s
     56 ; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2
     57    ret <2 x i32> %sum
     58 }
     59 
     60 define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { ; saturating add of %acc and sqrdmulh(%mhs, %rhs), 4 x i32
     61 ; CHECK-LABEL: test_sqrdmlah_v4i32:
     62    %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs,  <4 x i32> %rhs) ; multiply step; the plain v8 run expects this to stay a separate sqrdmulh
     63    %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc,  <4 x i32> %prod) ; accumulate step; the +v8.1a runs expect one sqrdmlah for both steps
     64 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
     65 ; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.4s
     66 ; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2
     67    ret <4 x i32> %retval
     68 }
     69 
     70 define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { ; saturating subtract of sqrdmulh(%mhs, %rhs) from %acc, 4 x i16
     71 ; CHECK-LABEL: test_sqrdmlsh_v4i16:
     72    %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) ; multiply step
     73    %diff = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %mulh) ; subtract step; the +v8.1a runs expect one sqrdmlsh for both steps
     74 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
     75 ; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.4h
     76 ; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2
     77    ret <4 x i16> %diff
     78 }
     79 
     80 define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { ; saturating subtract of sqrdmulh(%mhs, %rhs) from %acc, 8 x i16
     81 ; CHECK-LABEL: test_sqrdmlsh_v8i16:
     82    %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) ; multiply step
     83    %diff = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %mulh) ; subtract step; the +v8.1a runs expect one sqrdmlsh for both steps
     84 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
     85 ; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.8h
     86 ; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2
     87    ret <8 x i16> %diff
     88 }
     89 
     90 define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { ; saturating subtract of sqrdmulh(%mhs, %rhs) from %acc, 2 x i32
     91 ; CHECK-LABEL: test_sqrdmlsh_v2i32:
     92    %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) ; multiply step
     93    %diff = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %mulh) ; subtract step; the +v8.1a runs expect one sqrdmlsh for both steps
     94 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
     95 ; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.2s
     96 ; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2
     97    ret <2 x i32> %diff
     98 }
     99 
    100 define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { ; saturating subtract of sqrdmulh(%mhs, %rhs) from %acc, 4 x i32
    101 ; CHECK-LABEL: test_sqrdmlsh_v4i32:
    102    %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) ; multiply step
    103    %diff = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %mulh) ; subtract step; the +v8.1a runs expect one sqrdmlsh for both steps
    104 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
    105 ; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.4s
    106 ; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2
    107    ret <4 x i32> %diff
    108 }
    109 
    110 ;-----------------------------------------------------------------------------
    111 ; RDMA Vector, by element
    112 ; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
    113 
    114 define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) { ; saturating add of %acc and sqrdmulh(%x, lane 3 of %v), 4 x i16
    115 ; CHECK-LABEL: test_sqrdmlah_lane_s16:
    116 entry:
    117   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of %v to every lane
    118   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) ; multiply step; the plain v8 run expects the by-element sqrdmulh
    119   %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod) ; accumulate step; the +v8.1a runs expect one by-element sqrdmlah
    120 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
    121 ; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.h[3]
    122 ; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2[3]
    123   ret <4 x i16> %retval
    124 }
    125 
    126 define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) { ; saturating add of %acc and sqrdmulh(%x, lane 2 of %v), 8 x i16
    127 ; CHECK-LABEL: test_sqrdmlahq_lane_s16:
    128 entry:
    129   %dup = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ; broadcast lane 2 of %v
    130   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %dup) ; multiply step
    131   %sum = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %mulh) ; accumulate step
    132 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
    133 ; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.h[2]
    134 ; CHECK-V81a-apple: sqrdmlah.8h v0,    v1,    v2[2]
    135   ret <8 x i16> %sum
    136 }
    137 
    138 define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) { ; saturating add of %acc and sqrdmulh(%x, lane 1 of %v), 2 x i32
    139 ; CHECK-LABEL: test_sqrdmlah_lane_s32:
    140 entry:
    141   %dup = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast lane 1 of %v
    142   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %dup) ; multiply step
    143   %sum = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %mulh) ; accumulate step
    144 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
    145 ; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.s[1]
    146 ; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2[1]
    147   ret <2 x i32> %sum
    148 }
    149 
    150 define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc, <4 x i32> %x, <4 x i32> %v) { ; saturating add of %acc and sqrdmulh(%x, lane 0 of %v), 4 x i32
    151 ; CHECK-LABEL: test_sqrdmlahq_lane_s32:
    152 entry:
    153   %dup = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: broadcast lane 0 of %v
    154   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %dup) ; multiply step
    155   %sum = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %mulh) ; accumulate step
    156 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
    157 ; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.s[0]
    158 ; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2[0]
    159   ret <4 x i32> %sum
    160 }
    161 
    162 define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) { ; saturating subtract of sqrdmulh(%x, lane 3 of %v) from %acc, 4 x i16
    163 ; CHECK-LABEL: test_sqrdmlsh_lane_s16:
    164 entry:
    165   %dup = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of %v
    166   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %dup) ; multiply step
    167   %diff = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %mulh) ; subtract step
    168 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
    169 ; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.h[3]
    170 ; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2[3]
    171   ret <4 x i16> %diff
    172 }
    173 
    174 define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) { ; saturating subtract of sqrdmulh(%x, lane 2 of %v) from %acc, 8 x i16
    175 ; CHECK-LABEL: test_sqrdmlshq_lane_s16:
    176 entry:
    177   %dup = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ; broadcast lane 2 of %v
    178   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %dup) ; multiply step
    179   %diff = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %mulh) ; subtract step
    180 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
    181 ; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.h[2]
    182 ; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2[2]
    183   ret <8 x i16> %diff
    184 }
    185 
    186 define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) { ; saturating subtract of sqrdmulh(%x, lane 1 of %v) from %acc, 2 x i32
    187 ; CHECK-LABEL: test_sqrdmlsh_lane_s32:
    188 entry:
    189   %dup = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast lane 1 of %v
    190   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %dup) ; multiply step
    191   %diff = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %mulh) ; subtract step
    192 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
    193 ; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.s[1]
    194 ; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2[1]
    195   ret <2 x i32> %diff
    196 }
    197 
    198 define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc, <4 x i32> %x, <4 x i32> %v) { ; saturating subtract of sqrdmulh(%x, lane 0 of %v) from %acc, 4 x i32
    199 ; CHECK-LABEL: test_sqrdmlshq_lane_s32:
    200 entry:
    201   %dup = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: broadcast lane 0 of %v
    202   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %dup) ; multiply step
    203   %diff = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %mulh) ; subtract step
    204 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
    205 ; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.s[0]
    206 ; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2[0]
    207   ret <4 x i32> %diff
    208 }
    209 
    210 ;-----------------------------------------------------------------------------
    211 ; RDMA Vector, by element, extracted
    212 ; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
    213 ; i32 tests are for   "def : Pat" in SIMDIndexedSQRDMLxHSDTied
    214 
    215 define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc, <4 x i16> %x, <4 x i16> %v) { ; scalar %acc accumulated with lane 0 of sqrdmulh(%x, lane 1 of %v)
    216 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
    217 entry:
    218   %dup = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of %v
    219   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %dup) ; multiply step
    220   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0 ; scalar accumulator placed in lane 0
    221   %sum.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh) ; accumulate step
    222   %sum = extractelement <4 x i16> %sum.v, i64 0 ; only lane 0 is returned
    223 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
    224 ; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
    225 ; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}},    v0,    v1[1]
    226   ret i16 %sum
    227 }
    228 
    229 define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc, <8 x i16> %x, <8 x i16> %v) { ; scalar %acc accumulated with lane 0 of sqrdmulh(%x, lane 1 of %v)
    230 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
    231 entry:
    232   %dup = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of %v
    233   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %dup) ; multiply step
    234   %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0 ; scalar accumulator placed in lane 0
    235   %sum.v = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh) ; accumulate step
    236   %sum = extractelement <8 x i16> %sum.v, i64 0 ; only lane 0 is returned
    237 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
    238 ; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
    239 ; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}},    v0,    v1[1]
    240   ret i16 %sum
    241 }
    242 
    243 define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc, <2 x i32> %x, <2 x i32> %v) { ; scalar sqadd of %acc and lane 0 of sqrdmulh(%x, lane 0 of %v)
    244 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
    245 entry:
    246   %dup = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask: broadcast lane 0 of %v
    247   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %dup) ; vector multiply step
    248   %lane0 = extractelement <2 x i32> %mulh, i64 0 ; take lane 0 of the product
    249   %sum = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %lane0) ; scalar accumulate step
    250 ; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
    251 ; CHECK-V81a:       sqrdmlah    v2.2s, v0.2s, v1.s[0]
    252 ; CHECK-V81a-apple: sqrdmlah.2s v2,    v0,    v1[0]
    253   ret i32 %sum
    254 }
    255 
    256 define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc, <4 x i32> %x, <4 x i32> %v) { ; scalar sqadd of %acc and lane 0 of sqrdmulh(%x, lane 0 of %v)
    257 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
    258 entry:
    259   %dup = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: broadcast lane 0 of %v
    260   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %dup) ; vector multiply step
    261   %lane0 = extractelement <4 x i32> %mulh, i64 0 ; take lane 0 of the product
    262   %sum = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %lane0) ; scalar accumulate step
    263 ; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
    264 ; CHECK-V81a:       sqrdmlah    v2.4s, v0.4s, v1.s[0]
    265 ; CHECK-V81a-apple: sqrdmlah.4s v2,    v0,    v1[0]
    266   ret i32 %sum
    267 }
    268 
    269 define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc, <4 x i16> %x, <4 x i16> %v) { ; lane 0 of sqrdmulh(%x, lane 1 of %v) subtracted from scalar %acc
    270 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
    271 entry:
    272   %dup = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of %v
    273   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %dup) ; multiply step
    274   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0 ; scalar accumulator placed in lane 0
    275   %diff.v = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh) ; subtract step
    276   %diff = extractelement <4 x i16> %diff.v, i64 0 ; only lane 0 is returned
    277 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
    278 ; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
    279 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}},    v0,    v1[1]
    280   ret i16 %diff
    281 }
    282 
    283 define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc, <8 x i16> %x, <8 x i16> %v) { ; lane 0 of sqrdmulh(%x, lane 1 of %v) subtracted from scalar %acc
    284 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
    285 entry:
    286   %dup = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of %v
    287   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %dup) ; multiply step
    288   %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0 ; scalar accumulator placed in lane 0
    289   %diff.v = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh) ; subtract step
    290   %diff = extractelement <8 x i16> %diff.v, i64 0 ; only lane 0 is returned
    291 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
    292 ; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
    293 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}},    v0,    v1[1]
    294   ret i16 %diff
    295 }
    296 
    297 define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc, <2 x i32> %x, <2 x i32> %v) { ; scalar sqsub of lane 0 of sqrdmulh(%x, lane 0 of %v) from %acc
    298 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
    299 entry:
    300   %dup = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask: broadcast lane 0 of %v
    301   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %dup) ; vector multiply step
    302   %lane0 = extractelement <2 x i32> %mulh, i64 0 ; take lane 0 of the product
    303   %diff = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %lane0) ; scalar subtract step
    304 ; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
    305 ; CHECK-V81a:       sqrdmlsh    v2.2s, v0.2s, v1.s[0]
    306 ; CHECK-V81a-apple: sqrdmlsh.2s v2,    v0,    v1[0]
    307   ret i32 %diff
    308 }
    309 
    310 define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc, <4 x i32> %x, <4 x i32> %v) { ; scalar sqsub of lane 0 of sqrdmulh(%x, lane 0 of %v) from %acc
    311 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
    312 entry:
    313   %dup = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: broadcast lane 0 of %v
    314   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %dup) ; vector multiply step
    315   %lane0 = extractelement <4 x i32> %mulh, i64 0 ; take lane 0 of the product
    316   %diff = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %lane0) ; scalar subtract step
    317 ; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
    318 ; CHECK-V81a:       sqrdmlsh    v2.4s, v0.4s, v1.s[0]
    319 ; CHECK-V81a-apple: sqrdmlsh.4s v2,    v0,    v1[0]
    320   ret i32 %diff
    321 }
    322 
    323 ;-----------------------------------------------------------------------------
    324 ; RDMA Scalar
    325 ; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
    326 
    327 define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) { ; scalar i16 operands routed through lane 0 of <4 x i16> vectors
    328 ; CHECK-LABEL: test_sqrdmlah_v1i16:
    329   %x.v = insertelement <4 x i16> undef, i16 %x, i64 0 ; %x in lane 0
    330   %y.v = insertelement <4 x i16> undef, i16 %y, i64 0 ; %y in lane 0
    331   %mulh.v = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %y.v) ; multiply step
    332   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0 ; %acc in lane 0
    333   %sum.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh.v) ; accumulate step
    334   %sum = extractelement <4 x i16> %sum.v, i64 0 ; only lane 0 is returned
    335 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    336 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    337 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
    338   ret i16 %sum
    339 }
    340 
    341 define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) { ; scalar i32 operands routed through lane 0 of <4 x i32> vectors
    342 ; CHECK-LABEL: test_sqrdmlah_v1i32:
    343   %x.v = insertelement <4 x i32> undef, i32 %x, i64 0 ; %x in lane 0
    344   %y.v = insertelement <4 x i32> undef, i32 %y, i64 0 ; %y in lane 0
    345   %mulh.v = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x.v, <4 x i32> %y.v) ; multiply step
    346   %acc.v = insertelement <4 x i32> undef, i32 %acc, i64 0 ; %acc in lane 0
    347   %sum.v = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc.v, <4 x i32> %mulh.v) ; accumulate step
    348   %sum = extractelement <4 x i32> %sum.v, i64 0 ; only lane 0 is returned
    349 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    350 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    351 ; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
    352   ret i32 %sum
    353 }
    354 
    355 
    356 define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) { ; scalar i16 operands routed through lane 0 of <4 x i16> vectors
    357 ; CHECK-LABEL: test_sqrdmlsh_v1i16:
    358   %x.v = insertelement <4 x i16> undef, i16 %x, i64 0 ; %x in lane 0
    359   %y.v = insertelement <4 x i16> undef, i16 %y, i64 0 ; %y in lane 0
    360   %mulh.v = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %y.v) ; multiply step
    361   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0 ; %acc in lane 0
    362   %diff.v = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh.v) ; subtract step
    363   %diff = extractelement <4 x i16> %diff.v, i64 0 ; only lane 0 is returned
    364 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    365 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    366 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
    367   ret i16 %diff
    368 }
    369 
    370 define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) { ; scalar i32 operands routed through lane 0 of <4 x i32> vectors
    371 ; CHECK-LABEL: test_sqrdmlsh_v1i32:
    372   %x.v = insertelement <4 x i32> undef, i32 %x, i64 0 ; %x in lane 0
    373   %y.v = insertelement <4 x i32> undef, i32 %y, i64 0 ; %y in lane 0
    374   %mulh.v = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x.v, <4 x i32> %y.v) ; multiply step
    375   %acc.v = insertelement <4 x i32> undef, i32 %acc, i64 0 ; %acc in lane 0
    376   %diff.v = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc.v, <4 x i32> %mulh.v) ; subtract step
    377   %diff = extractelement <4 x i32> %diff.v, i64 0 ; only lane 0 is returned
    378 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    379 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    380 ; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
    381   ret i32 %diff
    382 }
    383 define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) { ; scalar intrinsics: sqadd(%acc, sqrdmulh(%mhs, %rhs))
    384 ; CHECK-LABEL: test_sqrdmlah_i32:
    385   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs) ; multiply step
    386   %sum = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh) ; accumulate step
    387 ; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    388 ; CHECK-V81a:       sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    389 ; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    390   ret i32 %sum
    391 }
    392 
    393 define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) { ; scalar intrinsics: sqsub(%acc, sqrdmulh(%mhs, %rhs))
    394 ; CHECK-LABEL: test_sqrdmlsh_i32:
    395   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs) ; multiply step
    396   %diff = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh) ; subtract step
    397 ; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    398 ; CHECK-V81a:       sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    399 ; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
    400   ret i32 %diff
    401 }
    402 
    403 ;-----------------------------------------------------------------------------
    404 ; RDMA Scalar, by element
    405 ; i16 tests are performed via tests in the above section, with IR in ACLE style
    406 ; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
    407 
    408 define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) { ; scalar %acc and %x in lane 0, multiplied by lane 1 of %y_vec
    409 ; CHECK-LABEL: test_sqrdmlah_extract_i16:
    410   %dup = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of %y_vec
    411   %x.v = insertelement <4 x i16> undef, i16 %x, i64 0 ; %x in lane 0
    412   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %dup) ; multiply step
    413   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0 ; %acc in lane 0
    414   %sum.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh) ; accumulate step
    415   %sum = extractelement <4 x i16> %sum.v, i32 0 ; only lane 0 is returned
    416 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
    417 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
    418 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
    419   ret i16 %sum
    420 }
    421 
    422 define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) { ; scalar sqadd(%acc, sqrdmulh(%mhs, lane 3 of %rhs))
    423 ; CHECK-LABEL: test_sqrdmlah_extract_i32:
    424   %lane3 = extractelement <4 x i32> %rhs, i32 3 ; multiplier taken from lane 3
    425   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %lane3) ; multiply step
    426   %sum = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh) ; accumulate step
    427 ; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
    428 ; CHECK-V81a:       sqrdmlah   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
    429 ; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
    430   ret i32 %sum
    431 }
    432 
    433 define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) { ; scalar %acc and %x in lane 0, multiplied by lane 1 of %y_vec
    434 ; CHECK-LABEL: test_sqrdmlshq_extract_i16:
    435   %dup = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of %y_vec
    436   %x.v = insertelement <8 x i16> undef, i16 %x, i64 0 ; %x in lane 0
    437   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x.v, <8 x i16> %dup) ; multiply step
    438   %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0 ; %acc in lane 0
    439   %diff.v = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh) ; subtract step
    440   %diff = extractelement <8 x i16> %diff.v, i32 0 ; only lane 0 is returned
    441 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
    442 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
    443 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
    444   ret i16 %diff
    445 }
    446 
    447 define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) { ; scalar sqsub(%acc, sqrdmulh(%mhs, lane 3 of %rhs))
    448 ; CHECK-LABEL: test_sqrdmlsh_extract_i32:
    449   %lane3 = extractelement <4 x i32> %rhs, i32 3 ; multiplier taken from lane 3
    450   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %lane3) ; multiply step
    451   %diff = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh) ; subtract step
    452 ; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
    453 ; CHECK-V81a:       sqrdmlsh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
    454 ; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
    455   ret i32 %diff
    456 }
    457