; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs %s -o - \
; RUN:   | FileCheck %s

; Splats built by inserting the scalar argument %A into every lane of a
; zero-initialised vector. Each function expects a vdup of the element width.

; 64-bit result vectors.

define <8 x i8> @v_dup8(i8 %A) nounwind {
; CHECK-LABEL: v_dup8:
; CHECK: vdup.8
  %ins0 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
  %ins1 = insertelement <8 x i8> %ins0, i8 %A, i32 1
  %ins2 = insertelement <8 x i8> %ins1, i8 %A, i32 2
  %ins3 = insertelement <8 x i8> %ins2, i8 %A, i32 3
  %ins4 = insertelement <8 x i8> %ins3, i8 %A, i32 4
  %ins5 = insertelement <8 x i8> %ins4, i8 %A, i32 5
  %ins6 = insertelement <8 x i8> %ins5, i8 %A, i32 6
  %ins7 = insertelement <8 x i8> %ins6, i8 %A, i32 7
  ret <8 x i8> %ins7
}

define <4 x i16> @v_dup16(i16 %A) nounwind {
; CHECK-LABEL: v_dup16:
; CHECK: vdup.16
  %ins0 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
  %ins1 = insertelement <4 x i16> %ins0, i16 %A, i32 1
  %ins2 = insertelement <4 x i16> %ins1, i16 %A, i32 2
  %ins3 = insertelement <4 x i16> %ins2, i16 %A, i32 3
  ret <4 x i16> %ins3
}

define <2 x i32> @v_dup32(i32 %A) nounwind {
; CHECK-LABEL: v_dup32:
; CHECK: vdup.32
  %ins0 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
  %ins1 = insertelement <2 x i32> %ins0, i32 %A, i32 1
  ret <2 x i32> %ins1
}

define <2 x float> @v_dupfloat(float %A) nounwind {
; CHECK-LABEL: v_dupfloat:
; CHECK: vdup.32
  %ins0 = insertelement <2 x float> zeroinitializer, float %A, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %A, i32 1
  ret <2 x float> %ins1
}

; 128-bit result vectors (the "Q" variants).

define <16 x i8> @v_dupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_dupQ8:
; CHECK: vdup.8
  %ins0 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
  %ins1 = insertelement <16 x i8> %ins0, i8 %A, i32 1
  %ins2 = insertelement <16 x i8> %ins1, i8 %A, i32 2
  %ins3 = insertelement <16 x i8> %ins2, i8 %A, i32 3
  %ins4 = insertelement <16 x i8> %ins3, i8 %A, i32 4
  %ins5 = insertelement <16 x i8> %ins4, i8 %A, i32 5
  %ins6 = insertelement <16 x i8> %ins5, i8 %A, i32 6
  %ins7 = insertelement <16 x i8> %ins6, i8 %A, i32 7
  %ins8 = insertelement <16 x i8> %ins7, i8 %A, i32 8
  %ins9 = insertelement <16 x i8> %ins8, i8 %A, i32 9
  %ins10 = insertelement <16 x i8> %ins9, i8 %A, i32 10
  %ins11 = insertelement <16 x i8> %ins10, i8 %A, i32 11
  %ins12 = insertelement <16 x i8> %ins11, i8 %A, i32 12
  %ins13 = insertelement <16 x i8> %ins12, i8 %A, i32 13
  %ins14 = insertelement <16 x i8> %ins13, i8 %A, i32 14
  %ins15 = insertelement <16 x i8> %ins14, i8 %A, i32 15
  ret <16 x i8> %ins15
}

define <8 x i16> @v_dupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_dupQ16:
; CHECK: vdup.16
  %ins0 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
  %ins1 = insertelement <8 x i16> %ins0, i16 %A, i32 1
  %ins2 = insertelement <8 x i16> %ins1, i16 %A, i32 2
  %ins3 = insertelement <8 x i16> %ins2, i16 %A, i32 3
  %ins4 = insertelement <8 x i16> %ins3, i16 %A, i32 4
  %ins5 = insertelement <8 x i16> %ins4, i16 %A, i32 5
  %ins6 = insertelement <8 x i16> %ins5, i16 %A, i32 6
  %ins7 = insertelement <8 x i16> %ins6, i16 %A, i32 7
  ret <8 x i16> %ins7
}

define <4 x i32> @v_dupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_dupQ32:
; CHECK: vdup.32
  %ins0 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
  %ins1 = insertelement <4 x i32> %ins0, i32 %A, i32 1
  %ins2 = insertelement <4 x i32> %ins1, i32 %A, i32 2
  %ins3 = insertelement <4 x i32> %ins2, i32 %A, i32 3
  ret <4 x i32> %ins3
}

define <4 x float> @v_dupQfloat(float %A) nounwind {
; CHECK-LABEL: v_dupQfloat:
; CHECK: vdup.32
  %ins0 = insertelement <4 x float> zeroinitializer, float %A, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %A, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %A, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %A, i32 3
  ret <4 x float> %ins3
}

; Check to make sure it works with shuffles, too.

; Splat expressed as "insert the scalar into lane 0, then shuffle with an
; all-zero mask". A vdup of the element width is expected for each.

define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledup8:
; CHECK: vdup.8
  %seed = insertelement <8 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <8 x i8> %seed, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %splat
}

define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledup16:
; CHECK: vdup.16
  %seed = insertelement <4 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <4 x i16> %seed, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %splat
}

define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledup32:
; CHECK: vdup.32
  %seed = insertelement <2 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <2 x i32> %seed, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %splat
}

define <2 x float> @v_shuffledupfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupfloat:
; CHECK: vdup.32
  %seed = insertelement <2 x float> undef, float %A, i32 0
  %splat = shufflevector <2 x float> %seed, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %splat
}

define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ8:
; CHECK: vdup.8
  %seed = insertelement <16 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <16 x i8> %seed, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %splat
}

define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ16:
; CHECK: vdup.16
  %seed = insertelement <8 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <8 x i16> %seed, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ32:
; CHECK: vdup.32
  %seed = insertelement <4 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <4 x i32> %seed, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}

define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupQfloat:
; CHECK: vdup.32
  %seed = insertelement <4 x float> undef, float %A, i32 0
  %splat = shufflevector <4 x float> %seed, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}

; Splat of lane 1 of a vector loaded from memory, producing a result with the
; same number of elements as the loaded vector.

define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplane8:
; CHECK: vdup.8
  %vec = load <8 x i8>, <8 x i8>* %A
  %splat = shufflevector <8 x i8> %vec, <8 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i8> %splat
}

define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplane16:
; CHECK: vdup.16
  %vec = load <4 x i16>, <4 x i16>* %A
  %splat = shufflevector <4 x i16> %vec, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i16> %splat
}

define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplane32:
; CHECK: vdup.32
  %vec = load <2 x i32>, <2 x i32>* %A
  %splat = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i32> %splat
}

define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplanefloat:
; CHECK: vdup.32
  %vec = load <2 x float>, <2 x float>* %A
  %splat = shufflevector <2 x float> %vec, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x float> %splat
}

; Same lane-1 splat, but the shuffle mask has twice as many entries as the
; loaded vector, so the result is twice as wide as the input.

define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplaneQ8:
; CHECK: vdup.8
  %vec = load <8 x i8>, <8 x i8>* %A
  %splat = shufflevector <8 x i8> %vec, <8 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i8> %splat
}

define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplaneQ16:
; CHECK: vdup.16
  %vec = load <4 x i16>, <4 x i16>* %A
  %splat = shufflevector <4 x i16> %vec, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i16> %splat
}

define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplaneQ32:
; CHECK: vdup.32
  %vec = load <2 x i32>, <2 x i32>* %A
  %splat = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %splat
}

define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplaneQfloat:
; CHECK: vdup.32
  %vec = load <2 x float>, <2 x float>* %A
  %splat = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x float> %splat
}

; Splats of a single 64-bit element (lane 1 or lane 0) of a two-element
; vector argument. These four functions carry no check patterns of their own.

define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
entry:
  %splat = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %splat
}

define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
entry:
  %splat = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %splat
}

define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
entry:
  %splat = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %splat
}

define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
entry:
  %splat = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %splat
}

; Radar 7373643
; Splat of the constant -128: a vmov.i8 is expected, with no vdup.8 between it
; and the store.
; CHECK-LABEL: redundantVdup:
; CHECK: vmov.i8
; CHECK-NOT: vdup.8
; CHECK: vstr
define void @redundantVdup(<8 x i8>* %ptr) nounwind {
  %seed = insertelement <8 x i8> undef, i8 -128, i32 0
  %splat = shufflevector <8 x i8> %seed, <8 x i8> undef, <8 x i32> zeroinitializer
  store <8 x i8> %splat, <8 x i8>* %ptr, align 8
  ret void
}


; Lanes 0-2 take %x and lane 3 takes %y; a vdup.32 is still expected.
define <4 x i32> @tdupi(i32 %x, i32 %y) {
; CHECK-LABEL: tdupi:
; CHECK: vdup.32
  %ins0 = insertelement <4 x i32> undef, i32 %x, i32 0
  %ins1 = insertelement <4 x i32> %ins0, i32 %x, i32 1
  %ins2 = insertelement <4 x i32> %ins1, i32 %x, i32 2
  %ins3 = insertelement <4 x i32> %ins2, i32 %y, i32 3
  ret <4 x i32> %ins3
}

; Floating-point counterpart of tdupi.
define <4 x float> @tdupf(float %x, float %y) {
; CHECK-LABEL: tdupf:
; CHECK: vdup.32
  %ins0 = insertelement <4 x float> undef, float %x, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %x, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %x, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %y, i32 3
  ret <4 x float> %ins3
}

; This test checks that when splatting an element from a vector into another,
; the value isn't moved out to GPRs first.
define <4 x i32> @tduplane(<4 x i32> %invec) {
; CHECK-LABEL: tduplane:
; CHECK-NOT: vmov {{.*}}, d16[1]
; CHECK: vdup.32 {{.*}}, d16[1]
  %in = extractelement <4 x i32> %invec, i32 1
  %ins0 = insertelement <4 x i32> undef, i32 %in, i32 0
  %ins1 = insertelement <4 x i32> %ins0, i32 %in, i32 1
  %ins2 = insertelement <4 x i32> %ins1, i32 %in, i32 2
  %ins3 = insertelement <4 x i32> %ins2, i32 255, i32 3
  ret <4 x i32> %ins3
}

; The check_* functions extract lane 3 of a 128-bit argument and insert it into
; lanes 0 and 1 of a 64-bit result. The expected vdup reads the element by lane
; from a D register.

define <2 x float> @check_f32(<4 x float> %v) nounwind {
; CHECK-LABEL: check_f32:
; CHECK: vdup.32 {{.*}}, d{{..}}[1]
  %x = extractelement <4 x float> %v, i32 3
  %ins0 = insertelement <2 x float> undef, float %x, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %x, i32 1
  ret <2 x float> %ins1
}

define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
; CHECK-LABEL: check_i32:
; CHECK: vdup.32 {{.*}}, d{{..}}[1]
  %x = extractelement <4 x i32> %v, i32 3
  %ins0 = insertelement <2 x i32> undef, i32 %x, i32 0
  %ins1 = insertelement <2 x i32> %ins0, i32 %x, i32 1
  ret <2 x i32> %ins1
}

define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
; CHECK-LABEL: check_i16:
; CHECK: vdup.16 {{.*}}, d{{..}}[3]
  %x = extractelement <8 x i16> %v, i32 3
  %ins0 = insertelement <4 x i16> undef, i16 %x, i32 0
  %ins1 = insertelement <4 x i16> %ins0, i16 %x, i32 1
  ret <4 x i16> %ins1
}

define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
; CHECK-LABEL: check_i8:
; CHECK: vdup.8 {{.*}}, d{{..}}[3]
  %x = extractelement <16 x i8> %v, i32 3
  %ins0 = insertelement <8 x i8> undef, i8 %x, i32 0
  %ins1 = insertelement <8 x i8> %ins0, i8 %x, i32 1
  ret <8 x i8> %ins1
}

; Check that an SPR splat produces a vdup.

define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat2:
; CHECK: vdup.32 d
  %q.fp = sitofp i16 %q to float
  %seed = insertelement <2 x float> undef, float %q.fp, i32 0
  %splat = shufflevector <2 x float> %seed, <2 x float> undef, <2 x i32> zeroinitializer
  %diff = fsub <2 x float> %splat, %p
  ret <2 x float> %diff
}

define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat4:
; CHECK: vld1.16
  %q.fp = sitofp i16 %q to float
  %seed = insertelement <4 x float> undef, float %q.fp, i32 0
  %splat = shufflevector <4 x float> %seed, <4 x float> undef, <4 x i32> zeroinitializer
  %diff = fsub <4 x float> %splat, %p
  ret <4 x float> %diff
}
; Same codegen as above test; scalar is splatted using vld1, so shuffle index is irrelevant.
define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat4_lane1:
; CHECK: vld1.16
  %q.fp = sitofp i16 %q to float
  ; The converted scalar goes into lane 1 and the shuffle mask selects lane 1.
  %seed = insertelement <4 x float> undef, float %q.fp, i32 1
  %splat = shufflevector <4 x float> %seed, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %diff = fsub <4 x float> %splat, %p
  ret <4 x float> %diff
}

; Also make sure we don't barf on variable-index extractelts, where we almost
; could have generated a vdup.

define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
; CHECK-LABEL: check_i8_varidx:
; CHECK: mov r[[FP:[0-9]+]], sp
; CHECK: ldr r[[IDX:[0-9]+]], [r[[FP]], #4]
; CHECK: mov r[[SPCOPY:[0-9]+]], sp
; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[IDX]]
; CHECK: vld1.8 {d{{.*}}[]}, [r[[SPCOPY]]]
  ; The extract index is a run-time value here, not a constant.
  %x = extractelement <16 x i8> %v, i32 %idx
  %ins0 = insertelement <8 x i8> undef, i8 %x, i32 0
  %ins1 = insertelement <8 x i8> %ins0, i8 %x, i32 1
  ret <8 x i8> %ins1
}
