; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW

; Splat of a scalar i32 argument: insert into lane 0, then shuffle with an
; all-zero mask. The checks expect a single vpbroadcastd taken from %edi.
define <16 x i32> @_inreg16xi32(i32 %a) {
; ALL-LABEL: _inreg16xi32:
; ALL: # BB#0:
; ALL-NEXT: vpbroadcastd %edi, %zmm0
; ALL-NEXT: retq
  %b = insertelement <16 x i32> undef, i32 %a, i32 0
  %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  ret <16 x i32> %c
}

; Same splat pattern for a scalar i64 argument. The checks expect a single
; vpbroadcastq taken from %rdi.
define <8 x i64> @_inreg8xi64(i64 %a) {
; ALL-LABEL: _inreg8xi64:
; ALL: # BB#0:
; ALL-NEXT: vpbroadcastq %rdi, %zmm0
; ALL-NEXT: retq
  %b = insertelement <8 x i64> undef, i64 %a, i32 0
  %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
  ret <8 x i64> %c
}

; Zero-mask shuffle that widens a <4 x float> source to 16 copies of its
; element 0. The checks expect a vbroadcastss from %xmm0.
define <16 x float> @_ss16xfloat_v4(<4 x float> %a) {
; ALL-LABEL: _ss16xfloat_v4:
; ALL: # BB#0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
  %b = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float> %b
}

; Splat of a scalar float argument (insertelement + zero-mask shuffle). The
; checks expect a vbroadcastss from %xmm0.
define <16 x float> @_inreg16xfloat(float %a) {
; ALL-LABEL: _inreg16xfloat:
; ALL: # BB#0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
  %b = insertelement <16 x float> undef, float %a, i32 0
  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float> %c
}

; Scalar float splat selected against the pass-through vector %i, where the
; select condition is (%mask1 != 0). The checks expect the compare to produce
; %k1, a merge-masked vbroadcastss into %zmm1, and a copy of %zmm1 to %zmm0.
define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_mask:
; ALL: # BB#0:
; ALL-NEXT: vpxord %zmm3, %zmm3, %zmm3
; ALL-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; ALL-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
; ALL-NEXT: vmovaps %zmm1, %zmm0
; ALL-NEXT: retq
  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
  %b = insertelement <16 x float> undef, float %a, i32 0
  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i
  ret <16 x float> %r
}

; Scalar float splat selected against zeroinitializer under (%mask1 != 0).
; The checks expect a zero-masked ({z}) vbroadcastss.
define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_maskz:
; ALL: # BB#0:
; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq
  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
  %b = insertelement <16 x float> undef, float %a, i32 0
  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer
  ret <16 x float> %r
}

; Splat of a float loaded from %a.ptr. The checks expect the load to be folded
; into vbroadcastss as a memory operand.
define <16 x float> @_ss16xfloat_load(float* %a.ptr) {
; ALL-LABEL: _ss16xfloat_load:
; ALL: # BB#0:
; ALL-NEXT: vbroadcastss (%rdi), %zmm0
; ALL-NEXT: retq
  %a = load float, float* %a.ptr
  %b = insertelement <16 x float> undef, float %a, i32 0
  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float> %c
}

; Loaded-float splat selected against the pass-through vector %i under
; (%mask1 != 0). The checks expect a merge-masked vbroadcastss with a memory
; operand, written directly into %zmm0.
define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_mask_load:
; ALL: # BB#0:
; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastss (%rdi), %zmm0 {%k1}
; ALL-NEXT: retq
  %a = load float, float* %a.ptr
  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
  %b = insertelement <16 x float> undef, float %a, i32 0
  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i
  ret <16 x float> %r
}

; Loaded-float splat selected against zeroinitializer under (%mask1 != 0).
; The checks expect a zero-masked vbroadcastss with a memory operand.
define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_maskz_load:
; ALL: # BB#0:
; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; ALL-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
; ALL-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z}
; ALL-NEXT: retq
  %a = load float, float* %a.ptr
  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
  %b = insertelement <16 x float> undef, float %a, i32 0
  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer
  ret <16 x float> %r
}

; Splat of a scalar double argument. The checks expect a vbroadcastsd from
; %xmm0.
define <8 x double> @_inreg8xdouble(double %a) {
; ALL-LABEL: _inreg8xdouble:
; ALL: # BB#0:
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
; ALL-NEXT: retq
  %b = insertelement <8 x double> undef, double %a, i32 0
  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double> %c
}

; Scalar double splat selected against the pass-through vector %i. The mask
; comes from an <8 x i32> compare; per the checks the zero vector is built with
; a ymm vpxor while the vpcmpneqd itself is emitted on zmm registers. The
; broadcast is merge-masked into %zmm1 and then copied to %zmm0.
define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_mask:
; ALL: # BB#0:
; ALL-NEXT: vpxor %ymm3, %ymm3, %ymm3
; ALL-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; ALL-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
; ALL-NEXT: vmovaps %zmm1, %zmm0
; ALL-NEXT: retq
  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
  %b = insertelement <8 x double> undef, double %a, i32 0
  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i
  ret <8 x double> %r
}

; Scalar double splat selected against zeroinitializer under an <8 x i32>
; compare mask. The checks expect a zero-masked vbroadcastsd.
define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_maskz:
; ALL: # BB#0:
; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq
  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
  %b = insertelement <8 x double> undef, double %a, i32 0
  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer
  ret <8 x double> %r
}

; Splat of a double loaded from %a.ptr. The checks expect the load to be
; folded into vbroadcastsd as a memory operand.
define <8 x double> @_sd8xdouble_load(double* %a.ptr) {
; ALL-LABEL: _sd8xdouble_load:
; ALL: # BB#0:
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0
; ALL-NEXT: retq
  %a = load double, double* %a.ptr
  %b = insertelement <8 x double> undef, double %a, i32 0
  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double> %c
}

; Loaded-double splat selected against the pass-through vector %i under an
; <8 x i32> compare mask. The checks expect a merge-masked vbroadcastsd with a
; memory operand, written directly into %zmm0.
define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_mask_load:
; ALL: # BB#0:
; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1}
; ALL-NEXT: retq
  %a = load double, double* %a.ptr
  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
  %b = insertelement <8 x double> undef, double %a, i32 0
  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i
  ret <8 x double> %r
}

; Loaded-double splat selected against zeroinitializer under an <8 x i32>
; compare mask. The checks expect a zero-masked vbroadcastsd with a memory
; operand.
define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_maskz_load:
; ALL: # BB#0:
; ALL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; ALL-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
; ALL-NEXT: retq
  %a = load double, double* %a.ptr
  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
  %b = insertelement <8 x double> undef, double %a, i32 0
  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer
  ret <8 x double> %r
}

; Splat of element 0 of a full-width <16 x i32> vector argument. The checks
; expect a vpbroadcastd reading the low xmm part of the source.
define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
; ALL-LABEL: _xmm16xi32:
; ALL: # BB#0:
; ALL-NEXT: vpbroadcastd %xmm0, %zmm0
; ALL-NEXT: retq
  %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
  ret <16 x i32> %b
}

; Splat of element 0 of a full-width <16 x float> vector argument. The checks
; expect a vbroadcastss reading the low xmm part of the source.
define <16 x float> @_xmm16xfloat(<16 x float> %a) {
; ALL-LABEL: _xmm16xfloat:
; ALL: # BB#0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
  %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
  ret <16 x i32> %b
}

; Select between two sign-extended i1 vectors (a constant-zero one and the
; result of an unordered fcmp on undef), using the same fcmp as the condition.
; The checks pin the current lowering: vcmpunordps into %k1, a zero-masked
; vpbroadcastd of a rip-relative constant, knotw on %k1, then a zero-masked
; vmovdqu32.
define <16 x i32> @test_vbroadcast() {
; ALL-LABEL: test_vbroadcast:
; ALL: # BB#0: # %entry
; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0
; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1
; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; ALL-NEXT: knotw %k1, %k1
; ALL-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq
entry:
  %0 = sext <16 x i1> zeroinitializer to <16 x i32>
  %1 = fcmp uno <16 x float> undef, zeroinitializer
  %2 = sext <16 x i1> %1 to <16 x i32>
  %3 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> %2
  ret <16 x i32> %3
}

; We implement the set1 intrinsics with vector initializers. Verify that the
; IR generated will produce broadcasts at the end.
; Fills every one of the 8 lanes with %d through a chain of insertelement
; instructions (no shufflevector). The checks expect the chain to collapse
; into a single vbroadcastsd.
; NOTE(review): attribute group #2 is referenced here but its definition is
; not visible in this part of the file - confirm it exists or is meant to be
; empty.
define <8 x double> @test_set1_pd(double %d) #2 {
; ALL-LABEL: test_set1_pd:
; ALL: # BB#0: # %entry
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
; ALL-NEXT: retq
entry:
  %vecinit.i = insertelement <8 x double> undef, double %d, i32 0
  %vecinit1.i = insertelement <8 x double> %vecinit.i, double %d, i32 1
  %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %d, i32 2
  %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %d, i32 3
  %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %d, i32 4
  %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %d, i32 5
  %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %d, i32 6
  %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %d, i32 7
  ret <8 x double> %vecinit7.i
}

; Same insertelement-chain initializer for 8 x i64. The checks expect a single
; vpbroadcastq taken from %rdi.
; NOTE(review): attribute group #2 is not defined in the visible text - confirm.
define <8 x i64> @test_set1_epi64(i64 %d) #2 {
; ALL-LABEL: test_set1_epi64:
; ALL: # BB#0: # %entry
; ALL-NEXT: vpbroadcastq %rdi, %zmm0
; ALL-NEXT: retq
entry:
  %vecinit.i = insertelement <8 x i64> undef, i64 %d, i32 0
  %vecinit1.i = insertelement <8 x i64> %vecinit.i, i64 %d, i32 1
  %vecinit2.i = insertelement <8 x i64> %vecinit1.i, i64 %d, i32 2
  %vecinit3.i = insertelement <8 x i64> %vecinit2.i, i64 %d, i32 3
  %vecinit4.i = insertelement <8 x i64> %vecinit3.i, i64 %d, i32 4
  %vecinit5.i = insertelement <8 x i64> %vecinit4.i, i64 %d, i32 5
  %vecinit6.i = insertelement <8 x i64> %vecinit5.i, i64 %d, i32 6
  %vecinit7.i = insertelement <8 x i64> %vecinit6.i, i64 %d, i32 7
  ret <8 x i64> %vecinit7.i
}

; Insertelement-chain initializer writing %f into each of the 16 float lanes.
; The checks expect a single vbroadcastss.
; NOTE(review): attribute group #2 is not defined in the visible text - confirm.
define <16 x float> @test_set1_ps(float %f) #2 {
; ALL-LABEL: test_set1_ps:
; ALL: # BB#0: # %entry
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
entry:
  %vecinit.i = insertelement <16 x float> undef, float %f, i32 0
  %vecinit1.i = insertelement <16 x float> %vecinit.i, float %f, i32 1
  %vecinit2.i = insertelement <16 x float> %vecinit1.i, float %f, i32 2
  %vecinit3.i = insertelement <16 x float> %vecinit2.i, float %f, i32 3
  %vecinit4.i = insertelement <16 x float> %vecinit3.i, float %f, i32 4
  %vecinit5.i = insertelement <16 x float> %vecinit4.i, float %f, i32 5
  %vecinit6.i = insertelement <16 x float> %vecinit5.i, float %f, i32 6
  %vecinit7.i = insertelement <16 x float> %vecinit6.i, float %f, i32 7
  %vecinit8.i = insertelement <16 x float> %vecinit7.i, float %f, i32 8
  %vecinit9.i = insertelement <16 x float> %vecinit8.i, float %f, i32 9
  %vecinit10.i = insertelement <16 x float> %vecinit9.i, float %f, i32 10
  %vecinit11.i = insertelement <16 x float> %vecinit10.i, float %f, i32 11
  %vecinit12.i = insertelement <16 x float> %vecinit11.i, float %f, i32 12
  %vecinit13.i = insertelement <16 x float> %vecinit12.i, float %f, i32 13
  %vecinit14.i = insertelement <16 x float> %vecinit13.i, float %f, i32 14
  %vecinit15.i = insertelement <16 x float> %vecinit14.i, float %f, i32 15
  ret <16 x float> %vecinit15.i
}

; Insertelement-chain initializer writing %f into each of the 16 i32 lanes.
; The checks expect a single vpbroadcastd taken from %edi.
; NOTE(review): attribute group #2 is not defined in the visible text - confirm.
define <16 x i32> @test_set1_epi32(i32 %f) #2 {
; ALL-LABEL: test_set1_epi32:
; ALL: # BB#0: # %entry
; ALL-NEXT: vpbroadcastd %edi, %zmm0
; ALL-NEXT: retq
entry:
  %vecinit.i = insertelement <16 x i32> undef, i32 %f, i32 0
  %vecinit1.i = insertelement <16 x i32> %vecinit.i, i32 %f, i32 1
  %vecinit2.i = insertelement <16 x i32> %vecinit1.i, i32 %f, i32 2
  %vecinit3.i = insertelement <16 x i32> %vecinit2.i, i32 %f, i32 3
  %vecinit4.i = insertelement <16 x i32> %vecinit3.i, i32 %f, i32 4
  %vecinit5.i = insertelement <16 x i32> %vecinit4.i, i32 %f, i32 5
  %vecinit6.i = insertelement <16 x i32> %vecinit5.i, i32 %f, i32 6
  %vecinit7.i = insertelement <16 x i32> %vecinit6.i, i32 %f, i32 7
  %vecinit8.i = insertelement <16 x i32> %vecinit7.i, i32 %f, i32 8
  %vecinit9.i = insertelement <16 x i32> %vecinit8.i, i32 %f, i32 9
  %vecinit10.i = insertelement <16 x i32> %vecinit9.i, i32 %f, i32 10
  %vecinit11.i = insertelement <16 x i32> %vecinit10.i, i32 %f, i32 11
  %vecinit12.i = insertelement <16 x i32> %vecinit11.i, i32 %f, i32 12
  %vecinit13.i = insertelement <16 x i32> %vecinit12.i, i32 %f, i32 13
  %vecinit14.i = insertelement <16 x i32> %vecinit13.i, i32 %f, i32 14
  %vecinit15.i = insertelement <16 x i32> %vecinit14.i, i32 %f, i32 15
  ret <16 x i32> %vecinit15.i
}

; We implement the scalar broadcast intrinsics with vector initializers.
; Verify that the IR generated will produce the broadcast at the end.
; Here element 0 is extracted from a <2 x double> and inserted into each of the
; 8 result lanes; the checks expect a single vbroadcastsd from %xmm0.
define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) {
; ALL-LABEL: test_mm512_broadcastsd_pd:
; ALL: # BB#0: # %entry
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
; ALL-NEXT: retq
entry:
  %0 = extractelement <2 x double> %a, i32 0
  %vecinit.i = insertelement <8 x double> undef, double %0, i32 0
  %vecinit1.i = insertelement <8 x double> %vecinit.i, double %0, i32 1
  %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %0, i32 2
  %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %0, i32 3
  %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %0, i32 4
  %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %0, i32 5
  %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %0, i32 6
  %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %0, i32 7
  ret <8 x double> %vecinit7.i
}

; Zero-mask shuffle widening an <8 x float> source to 16 copies of its
; element 0. The checks expect a vbroadcastss from %xmm0.
define <16 x float> @test1(<8 x float>%a) {
; ALL-LABEL: test1:
; ALL: # BB#0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
  %res = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float>%res
}

; Zero-mask shuffle widening a <4 x double> source to 8 copies of its
; element 0. The checks expect a vbroadcastsd from %xmm0.
define <8 x double> @test2(<4 x double>%a) {
; ALL-LABEL: test2:
; ALL: # BB#0:
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
; ALL-NEXT: retq
  %res = shufflevector <4 x double> %a, <4 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double>%res
}

; Zero-mask shuffle widening a <32 x i8> source to 64 copies of its element 0.
; The two llc invocations are checked separately: the +avx512f one expects a
; ymm vpbroadcastb followed by a register copy into %zmm1 (presumably because
; the 64 x i8 result is returned in two registers there - confirm), while the
; +avx512bw one expects a single zmm vpbroadcastb.
define <64 x i8> @_invec32xi8(<32 x i8>%a) {
; AVX512F-LABEL: _invec32xi8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vmovaps %zmm0, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _invec32xi8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512BW-NEXT: retq
  %res = shufflevector <32 x i8> %a, <32 x i8> undef, <64 x i32> zeroinitializer
  ret <64 x i8>%res
}

; Zero-mask shuffle widening a <16 x i16> source to 32 copies of its element 0.
; As with the i8 case, the +avx512f invocation expects a ymm vpbroadcastw plus
; a register copy into %zmm1, and the +avx512bw invocation expects a single zmm
; vpbroadcastw.
define <32 x i16> @_invec16xi16(<16 x i16>%a) {
; AVX512F-LABEL: _invec16xi16:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-NEXT: vmovaps %zmm0, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _invec16xi16:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastw %xmm0, %zmm0
; AVX512BW-NEXT: retq
  %res = shufflevector <16 x i16> %a, <16 x i16> undef, <32 x i32> zeroinitializer
  ret <32 x i16>%res
}

; Zero-mask shuffle widening an <8 x i32> source to 16 copies of its element 0.
; Both invocations share the same checks: a single vpbroadcastd from %xmm0.
define <16 x i32> @_invec8xi32(<8 x i32>%a) {
; ALL-LABEL: _invec8xi32:
; ALL: # BB#0:
; ALL-NEXT: vpbroadcastd %xmm0, %zmm0
; ALL-NEXT: retq
  %res = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> zeroinitializer
  ret <16 x i32>%res
}

; Zero-mask shuffle widening a <4 x i64> source to 8 copies of its element 0.
; Both invocations share the same checks: a single vpbroadcastq from %xmm0.
define <8 x i64> @_invec4xi64(<4 x i64>%a) {
; ALL-LABEL: _invec4xi64:
; ALL: # BB#0:
; ALL-NEXT: vpbroadcastq %xmm0, %zmm0
; ALL-NEXT: retq
  %res = shufflevector <4 x i64> %a, <4 x i64> undef, <8 x i32> zeroinitializer
  ret <8 x i64>%res
}
