; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s

; Splats assembled one lane at a time: every lane of the result is overwritten
; with the same scalar argument, and each function's FileCheck pattern looks
; for a DUP with the matching arrangement.

; 64-bit result vectors.

define <8 x i8> @v_dup8(i8 %A) nounwind {
; CHECK-LABEL: v_dup8:
; CHECK: dup.8b
  %lane0 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
  %lane1 = insertelement <8 x i8> %lane0, i8 %A, i32 1
  %lane2 = insertelement <8 x i8> %lane1, i8 %A, i32 2
  %lane3 = insertelement <8 x i8> %lane2, i8 %A, i32 3
  %lane4 = insertelement <8 x i8> %lane3, i8 %A, i32 4
  %lane5 = insertelement <8 x i8> %lane4, i8 %A, i32 5
  %lane6 = insertelement <8 x i8> %lane5, i8 %A, i32 6
  %lane7 = insertelement <8 x i8> %lane6, i8 %A, i32 7
  ret <8 x i8> %lane7
}

define <4 x i16> @v_dup16(i16 %A) nounwind {
; CHECK-LABEL: v_dup16:
; CHECK: dup.4h
  %lane0 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
  %lane1 = insertelement <4 x i16> %lane0, i16 %A, i32 1
  %lane2 = insertelement <4 x i16> %lane1, i16 %A, i32 2
  %lane3 = insertelement <4 x i16> %lane2, i16 %A, i32 3
  ret <4 x i16> %lane3
}

define <2 x i32> @v_dup32(i32 %A) nounwind {
; CHECK-LABEL: v_dup32:
; CHECK: dup.2s
  %lane0 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
  %lane1 = insertelement <2 x i32> %lane0, i32 %A, i32 1
  ret <2 x i32> %lane1
}

define <2 x float> @v_dupfloat(float %A) nounwind {
; CHECK-LABEL: v_dupfloat:
; CHECK: dup.2s
  %lane0 = insertelement <2 x float> zeroinitializer, float %A, i32 0
  %lane1 = insertelement <2 x float> %lane0, float %A, i32 1
  ret <2 x float> %lane1
}

; 128-bit result vectors.

define <16 x i8> @v_dupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_dupQ8:
; CHECK: dup.16b
  %lane0 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
  %lane1 = insertelement <16 x i8> %lane0, i8 %A, i32 1
  %lane2 = insertelement <16 x i8> %lane1, i8 %A, i32 2
  %lane3 = insertelement <16 x i8> %lane2, i8 %A, i32 3
  %lane4 = insertelement <16 x i8> %lane3, i8 %A, i32 4
  %lane5 = insertelement <16 x i8> %lane4, i8 %A, i32 5
  %lane6 = insertelement <16 x i8> %lane5, i8 %A, i32 6
  %lane7 = insertelement <16 x i8> %lane6, i8 %A, i32 7
  %lane8 = insertelement <16 x i8> %lane7, i8 %A, i32 8
  %lane9 = insertelement <16 x i8> %lane8, i8 %A, i32 9
  %lane10 = insertelement <16 x i8> %lane9, i8 %A, i32 10
  %lane11 = insertelement <16 x i8> %lane10, i8 %A, i32 11
  %lane12 = insertelement <16 x i8> %lane11, i8 %A, i32 12
  %lane13 = insertelement <16 x i8> %lane12, i8 %A, i32 13
  %lane14 = insertelement <16 x i8> %lane13, i8 %A, i32 14
  %lane15 = insertelement <16 x i8> %lane14, i8 %A, i32 15
  ret <16 x i8> %lane15
}

define <8 x i16> @v_dupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_dupQ16:
; CHECK: dup.8h
  %lane0 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
  %lane1 = insertelement <8 x i16> %lane0, i16 %A, i32 1
  %lane2 = insertelement <8 x i16> %lane1, i16 %A, i32 2
  %lane3 = insertelement <8 x i16> %lane2, i16 %A, i32 3
  %lane4 = insertelement <8 x i16> %lane3, i16 %A, i32 4
  %lane5 = insertelement <8 x i16> %lane4, i16 %A, i32 5
  %lane6 = insertelement <8 x i16> %lane5, i16 %A, i32 6
  %lane7 = insertelement <8 x i16> %lane6, i16 %A, i32 7
  ret <8 x i16> %lane7
}

define <4 x i32> @v_dupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_dupQ32:
; CHECK: dup.4s
  %lane0 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
  %lane1 = insertelement <4 x i32> %lane0, i32 %A, i32 1
  %lane2 = insertelement <4 x i32> %lane1, i32 %A, i32 2
  %lane3 = insertelement <4 x i32> %lane2, i32 %A, i32 3
  ret <4 x i32> %lane3
}

define <4 x float> @v_dupQfloat(float %A) nounwind {
; CHECK-LABEL: v_dupQfloat:
; CHECK: dup.4s
  %lane0 = insertelement <4 x float> zeroinitializer, float %A, i32 0
  %lane1 = insertelement <4 x float> %lane0, float %A, i32 1
  %lane2 = insertelement <4 x float> %lane1, float %A, i32 2
  %lane3 = insertelement <4 x float> %lane2, float %A, i32 3
  ret <4 x float> %lane3
}

; Check to make sure it works with shuffles, too.
; Scalar splat written as an insert into lane 0 followed by a shufflevector
; whose mask is all zeros.

define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledup8:
; CHECK: dup.8b
  %ins = insertelement <8 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <8 x i8> %ins, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %splat
}

define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledup16:
; CHECK: dup.4h
  %ins = insertelement <4 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <4 x i16> %ins, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %splat
}

define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledup32:
; CHECK: dup.2s
  %ins = insertelement <2 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <2 x i32> %ins, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %splat
}

define <2 x float> @v_shuffledupfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupfloat:
; CHECK: dup.2s
  %ins = insertelement <2 x float> undef, float %A, i32 0
  %splat = shufflevector <2 x float> %ins, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %splat
}

define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ8:
; CHECK: dup.16b
  %ins = insertelement <16 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <16 x i8> %ins, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %splat
}

define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ16:
; CHECK: dup.8h
  %ins = insertelement <8 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ32:
; CHECK: dup.4s
  %ins = insertelement <4 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}

define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupQfloat:
; CHECK: dup.4s
  %ins = insertelement <4 x float> undef, float %A, i32 0
  %splat = shufflevector <4 x float> %ins, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}

; Lane 1 of a loaded 64-bit vector broadcast across a 64-bit result.

define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplane8:
; CHECK: dup.8b
  %src = load <8 x i8>, <8 x i8>* %A
  %splat = shufflevector <8 x i8> %src, <8 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i8> %splat
}

define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplane16:
; CHECK: dup.4h
  %src = load <4 x i16>, <4 x i16>* %A
  %splat = shufflevector <4 x i16> %src, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i16> %splat
}

define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplane32:
; CHECK: dup.2s
  %src = load <2 x i32>, <2 x i32>* %A
  %splat = shufflevector <2 x i32> %src, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i32> %splat
}

define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplanefloat:
; CHECK: dup.2s
  %src = load <2 x float>, <2 x float>* %A
  %splat = shufflevector <2 x float> %src, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x float> %splat
}

; Lane 1 of a loaded 64-bit vector broadcast across a 128-bit result (the
; shuffle mask is twice as long as the source vector).

define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplaneQ8:
; CHECK: dup.16b
  %src = load <8 x i8>, <8 x i8>* %A
  %splat = shufflevector <8 x i8> %src, <8 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i8> %splat
}

define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplaneQ16:
; CHECK: dup.8h
  %src = load <4 x i16>, <4 x i16>* %A
  %splat = shufflevector <4 x i16> %src, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i16> %splat
}

define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplaneQ32:
; CHECK: dup.4s
  %src = load <2 x i32>, <2 x i32>* %A
  %splat = shufflevector <2 x i32> %src, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %splat
}

define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplaneQfloat:
; CHECK: dup.4s
  %src = load <2 x float>, <2 x float>* %A
  %splat = shufflevector <2 x float> %src, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x float> %splat
}

; 64-bit element splats taken from either lane of a vector argument.

define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: foo:
; CHECK: dup.2d
entry:
  %dup = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %dup
}

define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK: dup.2d
entry:
  %dup = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %dup
}

define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: baz:
; CHECK: dup.2d
entry:
  %dup = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %dup
}

define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: qux:
; CHECK: dup.2d
entry:
  %dup = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %dup
}

; Vectors built from two distinct scalars. The expected sequences below move
; the first scalar in with FMOV and insert each remaining lane individually.

define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: f:
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov.s v0[1], w1
; CHECK-NEXT: ret
  %e0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %e1 = insertelement <2 x i32> %e0, i32 %b, i32 1
  ret <2 x i32> %e1
}

define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: g:
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov.s v0[1], w1
; CHECK-NEXT: mov.s v0[2], w1
; CHECK-NEXT: mov.s v0[3], w0
; CHECK-NEXT: ret
  %e0 = insertelement <4 x i32> undef, i32 %a, i32 0
  %e1 = insertelement <4 x i32> %e0, i32 %b, i32 1
  %e2 = insertelement <4 x i32> %e1, i32 %b, i32 2
  %e3 = insertelement <4 x i32> %e2, i32 %a, i32 3
  ret <4 x i32> %e3
}

define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
; CHECK-LABEL: h:
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: mov.d v0[1], x1
; CHECK-NEXT: ret
  %e0 = insertelement <2 x i64> undef, i64 %a, i32 0
  %e1 = insertelement <2 x i64> %e0, i64 %b, i32 1
  ret <2 x i64> %e1
}

; We used to spot this as a BUILD_VECTOR implementable by dup, but assumed that
; the single value needed was of the same type as the vector. That is false
; when the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
; BUILD_VECTOR will have an i32 as its source). In that case the operation is
; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
;
; *However*, it is a dup vD.4h, vN.h[2*idx].
define <4 x i16> @test_build_illegal(<4 x i32> %in) {
; CHECK-LABEL: test_build_illegal:
; CHECK: dup.4h v0, v0[6]
  %wide = extractelement <4 x i32> %in, i32 3
  %narrow = trunc i32 %wide to i16
  %res = insertelement <4 x i16> undef, i16 %narrow, i32 3
  ret <4 x i16> %res
}

; We used to inherit an already extract_subvectored v4i16 from
; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
; the formation of an indexed-by-7 MLS.
; Lane 7 of the 128-bit %v is splatted into a 64-bit vector that feeds a
; multiply followed by a subtract; the pattern below expects the lane index to
; be folded directly into a by-element MLS.
; NOTE(review): attribute group #0 is not defined in the visible part of this
; file - confirm whether a definition exists elsewhere or the reference is
; intentionally left dangling.
define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
; CHECK-LABEL: test_high_splat:
; CHECK: mls.4h v0, v1, v2[7]
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
  %mul = mul <4 x i16> %shuffle, %b
  %sub = sub <4 x i16> %a, %mul
  ret <4 x i16> %sub
}

; Also test the DUP path in the PerfectShuffle generator.
;
; Each mask below is <0, 0, 4, 5>: lane 0 of %a twice, then the low two lanes
; of %b. The expected code is a DUP of lane 0 followed by an EXT that pulls in
; the low half of the second operand, and nothing else before the return.

; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
; CHECK-NEXT: dup.4h v0, v0[0]
; CHECK-NEXT: ext.8b v0, v0, v1, #4
; CHECK-NEXT: ret
define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
  %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x i16> %r
}

; CHECK-LABEL: test_perfectshuffle_dupext_v4f16:
; CHECK-NEXT: dup.4h v0, v0[0]
; CHECK-NEXT: ext.8b v0, v0, v1, #4
; CHECK-NEXT: ret
define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
  %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x half> %r
}

; CHECK-LABEL: test_perfectshuffle_dupext_v4i32:
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: ext.16b v0, v0, v1, #8
; CHECK-NEXT: ret
define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x i32> %r
}

; CHECK-LABEL: test_perfectshuffle_dupext_v4f32:
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: ext.16b v0, v0, v1, #8
; CHECK-NEXT: ret
define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
  %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x float> %r
}