; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
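
; The masked tests below model clang's mask handling: the i8 mask argument is
; truncated (to i2/i4) or bitcast (to <8 x i1>) into a vector of i1, which
; selects lane-wise between the shuffle result and the passthru (mask) or
; zero (maskz) vector. As an illustrative sketch only (not part of the checked
; IR), the immintrin.h-level call mirrored by test_mm_mask_broadcastd_epi32 is:
;   __m128i r = _mm_mask_broadcastd_epi32(src, mask, a);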

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = bitcast <4 x i32> %res0 to <2 x i64>
  ret <2 x i64> %res1
}

define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp0:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x i32> %res0, <4 x i32> %arg0
  %res2 = bitcast <4 x i32> %res1 to <2 x i64>
  ret <2 x i64> %res2
}

define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp1:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x i32> %res0, <4 x i32> zeroinitializer
  %res2 = bitcast <4 x i32> %res1 to <2 x i64>
  ret <2 x i64> %res2
}

define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = bitcast <8 x i32> %res0 to <4 x i64>
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp2:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x i64> %res0, <2 x i64> %a0
  ret <2 x i64> %res1
}

define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp3:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn0 to <2 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x i64> %res0, <2 x i64> zeroinitializer
  ret <2 x i64> %res1
}

define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    retq
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp4:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp5:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
  ret <4 x i64> %res1
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp6:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp7:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn0 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
; X32-LABEL: test_mm256_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm256_mask_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp8:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp9:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp10:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp11:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm256_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm256_mask_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp12:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp13:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp14:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp15:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp16:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp17:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_mask_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_maskz_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp18:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp19:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_mask_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_maskz_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
; X32-LABEL: test_mm256_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_mask_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp20:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp21:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
  ret <4 x i64> %res1
}

define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp22:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp23:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_mm_mask_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp24:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> %a3, <2 x i32> <i32 1, i32 3>
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp25:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> <i32 1, i32 3>
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp26:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> %a3, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp27:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> %a2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_mm_mask_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp28:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> %a3, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp29:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

!0 = !{i32 1}