; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s

; Splat ("dup") code generation tests, compiled with the NEON attribute
; enabled. Most functions below build a vector whose lanes all hold the same
; value and are paired with FileCheck lines that look for a vdup of the
; matching element width in the llc output.

; ---------------------------------------------------------------------------
; Scalar argument inserted into every lane of a 64-bit vector.
; ---------------------------------------------------------------------------

define <8 x i8> @v_dup8(i8 %A) nounwind {
;CHECK: v_dup8:
;CHECK: vdup.8
  %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
  %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
  %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
  %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
  %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
  %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
  %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
  %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
  ret <8 x i8> %tmp8
}

define <4 x i16> @v_dup16(i16 %A) nounwind {
;CHECK: v_dup16:
;CHECK: vdup.16
  %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
  %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
  %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
  %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
  ret <4 x i16> %tmp4
}

define <2 x i32> @v_dup32(i32 %A) nounwind {
;CHECK: v_dup32:
;CHECK: vdup.32
  %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
  %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
  ret <2 x i32> %tmp2
}

define <2 x float> @v_dupfloat(float %A) nounwind {
;CHECK: v_dupfloat:
;CHECK: vdup.32
  %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
  %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
  ret <2 x float> %tmp2
}

; ---------------------------------------------------------------------------
; Scalar argument inserted into every lane of a 128-bit vector.
; ---------------------------------------------------------------------------

define <16 x i8> @v_dupQ8(i8 %A) nounwind {
;CHECK: v_dupQ8:
;CHECK: vdup.8
  %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
  %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
  %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
  %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
  %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
  %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
  %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
  %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
  %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
  %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
  %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
  %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
  %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
  %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
  %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
  %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
  ret <16 x i8> %tmp16
}

define <8 x i16> @v_dupQ16(i16 %A) nounwind {
;CHECK: v_dupQ16:
;CHECK: vdup.16
  %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
  %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
  %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
  %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
  %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
  %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
  %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
  %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
  ret <8 x i16> %tmp8
}

define <4 x i32> @v_dupQ32(i32 %A) nounwind {
;CHECK: v_dupQ32:
;CHECK: vdup.32
  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
  %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
  %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
  %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
  ret <4 x i32> %tmp4
}

define <4 x float> @v_dupQfloat(float %A) nounwind {
;CHECK: v_dupQfloat:
;CHECK: vdup.32
  %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
  %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
  %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
  %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
  ret <4 x float> %tmp4
}

; ---------------------------------------------------------------------------
; Check to make sure it works with shuffles, too: the scalar is inserted into
; lane 0 only and then broadcast by a shufflevector with an all-zero mask.
; ---------------------------------------------------------------------------

define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
;CHECK: v_shuffledup8:
;CHECK: vdup.8
  %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %tmp2
}

define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
;CHECK: v_shuffledup16:
;CHECK: vdup.16
  %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %tmp2
}

define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
;CHECK: v_shuffledup32:
;CHECK: vdup.32
  %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %tmp2
}

define <2 x float> @v_shuffledupfloat(float %A) nounwind {
;CHECK: v_shuffledupfloat:
;CHECK: vdup.32
  %tmp1 = insertelement <2 x float> undef, float %A, i32 0
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %tmp2
}

define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
;CHECK: v_shuffledupQ8:
;CHECK: vdup.8
  %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %tmp2
}

define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
;CHECK: v_shuffledupQ16:
;CHECK: vdup.16
  %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %tmp2
}

define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
;CHECK: v_shuffledupQ32:
;CHECK: vdup.32
  %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %tmp2
}

define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
;CHECK: v_shuffledupQfloat:
;CHECK: vdup.32
  %tmp1 = insertelement <4 x float> undef, float %A, i32 0
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %tmp2
}

; ---------------------------------------------------------------------------
; Lane splat: a vector is loaded from memory and lane 1 is broadcast to every
; lane of a result of the same width.
; ---------------------------------------------------------------------------

define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
;CHECK: vduplane8:
;CHECK: vdup.8
  %tmp1 = load <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <8 x i8> %tmp2
}

define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
;CHECK: vduplane16:
;CHECK: vdup.16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x i16> %tmp2
}

define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
;CHECK: vduplane32:
;CHECK: vdup.32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
  ret <2 x i32> %tmp2
}

define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
;CHECK: vduplanefloat:
;CHECK: vdup.32
  %tmp1 = load <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
  ret <2 x float> %tmp2
}

; ---------------------------------------------------------------------------
; Widening lane splat: the source is a 64-bit vector, and the shuffle mask has
; twice as many entries, so lane 1 is broadcast into a 128-bit result.
; ---------------------------------------------------------------------------

define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
;CHECK: vduplaneQ8:
;CHECK: vdup.8
  %tmp1 = load <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <16 x i8> %tmp2
}

define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
;CHECK: vduplaneQ16:
;CHECK: vdup.16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <8 x i16> %tmp2
}

define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
;CHECK: vduplaneQ32:
;CHECK: vdup.32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x i32> %tmp2
}

define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
;CHECK: vduplaneQfloat:
;CHECK: vdup.32
  %tmp1 = load <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x float> %tmp2
}

; ---------------------------------------------------------------------------
; Splats of 64-bit elements (lane 1 and lane 0, integer and double).
; No FileCheck lines accompany these four functions.
; NOTE(review): presumably they only guard against llc failing on such
; shuffles -- confirm against the commit that introduced them.
; ---------------------------------------------------------------------------

define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
entry:
  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %0
}

define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
entry:
  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %0
}

define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
entry:
  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %0
}

define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
entry:
  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %0
}

; Radar 7373643
; Splat of the constant -128: the checks expect a vmov.i8 and no vdup.8
; between it and the following vstr.
;CHECK: redundantVdup:
;CHECK: vmov.i8
;CHECK-NOT: vdup.8
;CHECK: vstr
define void @redundantVdup(<8 x i8>* %ptr) nounwind {
  %1 = insertelement <8 x i8> undef, i8 -128, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  store <8 x i8> %2, <8 x i8>* %ptr, align 8
  ret void
}

; ---------------------------------------------------------------------------
; Partial splats: lanes 0-2 hold %x and lane 3 holds %y. A vdup.32 is still
; expected in the output.
; ---------------------------------------------------------------------------

define <4 x i32> @tdupi(i32 %x, i32 %y) {
;CHECK: tdupi:
;CHECK: vdup.32
  %1 = insertelement <4 x i32> undef, i32 %x, i32 0
  %2 = insertelement <4 x i32> %1, i32 %x, i32 1
  %3 = insertelement <4 x i32> %2, i32 %x, i32 2
  %4 = insertelement <4 x i32> %3, i32 %y, i32 3
  ret <4 x i32> %4
}

define <4 x float> @tdupf(float %x, float %y) {
;CHECK: tdupf:
;CHECK: vdup.32
  %1 = insertelement <4 x float> undef, float %x, i32 0
  %2 = insertelement <4 x float> %1, float %x, i32 1
  %3 = insertelement <4 x float> %2, float %x, i32 2
  %4 = insertelement <4 x float> %3, float %y, i32 3
  ret <4 x float> %4
}

; This test checks that when splatting an element from a vector into another,
; the value isn't moved out to GPRs first.
; Lane 1 of %invec fills lanes 0-2 of the result; lane 3 is the constant 255.
define <4 x i32> @tduplane(<4 x i32> %invec) {
;CHECK: tduplane:
;CHECK-NOT: vmov {{.*}}, d16[1]
;CHECK: vdup.32 {{.*}}, d16[1]
  %in = extractelement <4 x i32> %invec, i32 1
  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
  %4 = insertelement <4 x i32> %3, i32 255, i32 3
  ret <4 x i32> %4
}