; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c

define <4 x float> @test_mm_mask_cvtepi32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_cvtepi32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_cvtepi32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_cvtepi32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
  ret <8 x float> %2
}

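; NOTE: Masking idiom used throughout this file: the i8 mask argument is
; bitcast to <8 x i1>, narrowed to the vector width with a shufflevector
; when fewer than eight lanes are used, and applied with a select. A
; minimal sketch of the 4-lane merge-masking pattern (the value names here
; are illustrative only, not part of the checked functions):
;
;   %m8 = bitcast i8 %__U to <8 x i1>
;   %m4 = shufflevector <8 x i1> %m8, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;   %r  = select <4 x i1> %m4, <4 x float> %conv, <4 x float> %__W
;
; The backend is expected to fold this into a single masked instruction,
; e.g. 'vcvtdq2ps %xmm1, %xmm0 {%k1}' ('{%k1} {z}' for zero-masking).
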
define <2 x i64> @test_mm_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm256_maskz_cvtpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

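; NOTE: The 256-bit source conversions above produce only an xmm result,
; but reading a ymm source leaves the upper AVX state dirty, so a
; 'vzeroupper' is expected before the return, e.g.:
;
;   vcvtpd2dq %ymm1, %xmm0 {%k1}
;   vzeroupper
;   retq
;
; This avoids AVX/SSE transition penalties on some implementations.
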
define <4 x float> @test_mm_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> %__W, i8 %__U) #8
  ret <4 x float> %0
}

define <4 x float> @test_mm_maskz_cvtpd_ps(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> zeroinitializer, i8 %__U) #8
  ret <4 x float> %0
}

define <4 x float> @test_mm256_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
  ret <4 x float> %2
}

define <4 x float> @test_mm256_maskz_cvtpd_ps(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <2 x i64> @test_mm_cvtpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvtpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtpd2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_cvtpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvtpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtpd2udq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_maskz_cvtpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_cvtps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <4 x i64> @test_mm256_mask_cvtps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_cvtps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <2 x double> @test_mm_mask_cvtps_pd(<2 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
  ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
  ret <2 x double> %1
}

define <4 x double> @test_mm256_mask_cvtps_pd(<4 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <4 x float> %__A to <4 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <4 x float> %__A to <4 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <2 x i64> @test_mm_cvtps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvtps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <4 x i64> @test_mm256_cvtps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvtps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2udq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_mask_cvtps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__W to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_cvtps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

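; NOTE: The cvtt* tests below differ from the cvt* tests above only in the
; conversion's rounding: they go through truncating intrinsics such as
; llvm.x86.sse2.cvttps2dq and llvm.x86.avx512.mask.cvttpd2dq.128, and are
; expected to select the truncating instruction forms (vcvttpd2dq,
; vcvttpd2udq, vcvttps2dq, vcvttps2udq) with the same {%k1} masking.
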
define <2 x i64> @test_mm_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm256_maskz_cvttpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm_cvttpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvttpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttpd2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_cvttpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvttpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttpd2udq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_maskz_cvttpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_cvttps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <4 x i64> @test_mm256_mask_cvttps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_cvttps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <2 x i64> @test_mm_cvttps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvttps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttps2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <4 x i64> @test_mm256_cvttps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvttps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttps2udq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_mask_cvttps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__W to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_cvttps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <2 x double> @test_mm_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm_cvtepu32_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2pd %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i = uitofp <2 x i32> %shuffle.i to <2 x double>
  ret <2 x double> %conv.i
}

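; NOTE: The unsigned conversions in this file are written as plain uitofp
; rather than target intrinsics; AVX512VL supplies vcvtudq2pd/vcvtudq2ps,
; so the generic IR is expected to select directly to the masked
; instructions. A sketch of the 128-bit source pattern (illustrative
; names only):
;
;   %lo = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
;   %d  = uitofp <2 x i32> %lo to <2 x double>  ; -> vcvtudq2pd %xmm, %xmm
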
define <2 x double> @test_mm_mask_cvtepu32_pd(<2 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <4 x double> @test_mm256_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepu32_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = uitofp <4 x i32> %0 to <4 x double>
  ret <4 x double> %conv.i
}

define <4 x double> @test_mm256_mask_cvtepu32_pd(<4 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x float> @test_mm_cvtepu32_ps(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepu32_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = uitofp <4 x i32> %0 to <4 x float>
  ret <4 x float> %conv.i
}

define <4 x float> @test_mm_mask_cvtepu32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_cvtepu32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <8 x float> @test_mm256_cvtepu32_ps(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepu32_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2ps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = uitofp <8 x i32> %0 to <8 x float>
  ret <8 x float> %conv.i
}

define <8 x float> @test_mm256_mask_cvtepu32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_cvtepu32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> %__W
  ret <8 x float> %1
}

define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> zeroinitializer
  ret <8 x float> %1
}

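; NOTE: In the 128-bit-lane shuffle tests, the unmasked form is expected to
; select the VEX-encoded vperm2f128/vperm2i128, while the merge- and
; zero-masked forms need the EVEX-encoded vshuff32x4/vshuff64x2/
; vshufi32x4/vshufi64x2, since only the EVEX encodings accept a {%k1}
; operand.
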
define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %shuffle
}

define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shuffle_i32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shuffle_i64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}

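; NOTE: The test/testn intrinsics below are expressed in generic IR: an
; 'and' of the two operands, an icmp against zero ('ne' for test, 'eq' for
; testn), padding of the narrow <N x i1> result to <8 x i1> with a
; shufflevector, then a bitcast to i8. A sketch of the 4-lane case
; (illustrative names only):
;
;   %and = and <2 x i64> %__B, %__A
;   %v   = bitcast <2 x i64> %and to <4 x i32>
;   %ne  = icmp ne <4 x i32> %v, zeroinitializer  ; -> vptestmd
;
; The resulting mask is copied to a GPR with kmovw and zero-extended with
; movzbl to satisfy the zeroext i8 return.
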
define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_test_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmd %xmm0, %xmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp ne <4 x i32> %0, zeroinitializer
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_test_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_test_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp ne <4 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = and <4 x i1> %1, %extract.i
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  ret i8 %5
}

define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_test_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmd %ymm0, %ymm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp ne <8 x i32> %0, zeroinitializer
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_test_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_test_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp ne <8 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = and <8 x i1> %1, %2
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

  ret i8 %2
}

define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_test_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_test_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = and <2 x i1> %0, %extract.i
  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_test_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_test_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_test_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = and <4 x i1> %0, %extract.i
  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}
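
; NOTE: The testn_* tests below repeat the test_* pattern above, but compare
; the AND result for equality with zero (icmp eq instead of icmp ne), which
; is expected to select the vptestnm* instructions rather than vptestm*.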
define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_testn_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp eq <4 x i32> %0, zeroinitializer
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_testn_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_testn_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp eq <4 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = and <4 x i1> %1, %extract.i
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  ret i8 %5
}

define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_testn_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmd %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp eq <8 x i32> %0, zeroinitializer
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_testn_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_testn_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp eq <8 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = and <8 x i1> %1, %2
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_testn_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmq %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_testn_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_testn_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = and <2 x i1> %0, %extract.i
  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_testn_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_testn_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_testn_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = and <4 x i1> %0, %extract.i
  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}
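
; NOTE: The masked set1_epi32 tests splat the constant 5, so the broadcast is
; expected to fold to a masked vpbroadcastd load from the constant pool
; (an absolute {{\.LCPI.*}} address on X86, RIP-relative on X64).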
define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M) {
; X86-LABEL: test_mm_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__O to <4 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %0
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) {
; X86-LABEL: test_mm_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M) {
; X86-LABEL: test_mm256_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__O to <8 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M) {
; X86-LABEL: test_mm256_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}
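
; NOTE: For set1_epi64 the scalar is a function argument rather than a
; constant; on X86 the i64 has to be assembled from two 32-bit stack slots
; (vmovd + vpinsrd) before the masked vpbroadcastq, while X64 can broadcast
; straight from %rsi.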
define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
  %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> %__O
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
  %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> zeroinitializer
  ret <2 x i64> %1
}

define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm256_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm256_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}
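
; NOTE: The broadcast tests express the splat as a shufflevector with an
; all-zero shuffle mask; the unmasked forms may lower to FP-domain broadcasts
; (vbroadcastss/vbroadcastsd) while the masked integer forms use
; vpbroadcastd/vpbroadcastq so that {%k1} applies per element.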
define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = bitcast <4 x i32> %res0 to <2 x i64>
  ret <2 x i64> %res1
}

define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_broadcastd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = bitcast <2 x i64> %__O to <4 x i32>
  %2 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_broadcastd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = bitcast <8 x i32> %res0 to <4 x i64>
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm256_mask_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_maskz_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> %__O
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> zeroinitializer
  ret <2 x i64> %1
}

define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm256_mask_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> %__O
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}

define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %__O, i8 zeroext %__M, <2 x double> %__A) {
; X86-LABEL: test_mm256_mask_broadcastsd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__O
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 zeroext %__M, <2 x double> %__A) {
; X86-LABEL: test_mm256_maskz_broadcastsd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %__O, i8 zeroext %__M, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_broadcastss_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__O
  ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_broadcastss_ps(i8 zeroext %__M, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_broadcastss_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
  ret <4 x float> %1
}

define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm256_mask_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm256_maskz_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}
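
; NOTE: The *dup tests (movddup/movehdup/moveldup and their masked forms)
; check the fixed in-lane duplication patterns via the asm shuffle comments,
; e.g. xmm0[0,0] for vmovddup and ymm0[1,1,3,3,5,5,7,7] for vmovshdup.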
define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_movddup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_movedup_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> %__W
  ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_movedup_pd(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> zeroinitializer
  ret <2 x double> %1
}

define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_movddup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_movedup_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_movedup_pd(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_movehdup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
  ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_movehdup_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_movehdup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
  ret <4 x float> %1
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_mask_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
; X86-LABEL: test_mm256_maskz_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_moveldup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
  ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_moveldup_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_moveldup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
  ret <4 x float> %1
}

define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_mask_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
; X86-LABEL: test_mm256_maskz_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}
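
; NOTE: permutex is a cross-lane permute driven by an immediate; the unmasked
; integer test may lower to the FP-domain vpermpd, while the masked forms use
; vpermq/vpermpd to match the element type under {%k1}.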
define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permutex_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X) {
; X86-LABEL: test_mm256_mask_permutex_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
; X64-NEXT: retq
entry:
  %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  %0 = bitcast i8 %__M to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> %__W
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 zeroext %__M, <4 x i64> %__X) {
; X86-LABEL: test_mm256_maskz_permutex_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
; X64-NEXT: retq
entry:
  %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  %0 = bitcast i8 %__M to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}

define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permutex_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__X) {
; X86-LABEL: test_mm256_mask_permutex_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT: retq
entry:
  %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_permutex_pd(i8 zeroext %__U, <4 x double> %__X) {
; X86-LABEL: test_mm256_maskz_permutex_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT: retq
entry:
  %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> zeroinitializer
  ret <4 x double> %1
}
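
; NOTE: shuffle_pd with the two-element selection [1,3] is exactly the
; high-half interleave, so the xmm forms lower to vunpckhpd; the wider
; selections use vshufpd/vshufps with the pattern decoded in the asm comment.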
define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_mm_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X64-NEXT: retq
entry:
  %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> %__W
  ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_shuffle_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X64-NEXT: retq
entry:
  %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> zeroinitializer
  ret <2 x double> %1
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_mm256_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X64-NEXT: retq
entry:
  %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_shuffle_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT: retq
entry:
  %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_mm_shuffle_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_shuffle_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT: retq
entry:
  %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> %__W
  ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_shuffle_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_shuffle_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT: retq
entry:
  %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> zeroinitializer
  ret <4 x float> %1
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_mm256_shuffle_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
; X86-LABEL: test_mm256_mask_shuffle_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_maskz_shuffle_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}
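
; NOTE: mul_epi32 sign-extends the even 32-bit lanes in IR via shl/ashr-by-32
; before the widening 64-bit multiply (expected to select vpmuldq), while
; mul_epu32 zero-extends with an AND against 4294967295 (selecting vpmuludq).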
define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_mask_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
%tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
%tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
%tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
%tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> %__W
ret <4 x i64> %tmp6
}

define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_maskz_mul_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_mul_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
%tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
%tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
%tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
%tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
%tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> zeroinitializer
ret <4 x i64> %tmp6
}

define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_mask_mul_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_mul_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
; X64-NEXT: retq
%tmp = shl <2 x i64> %__X, <i64 32, i64 32>
%tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
%tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
%tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
%tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> %__W
ret <2 x i64> %tmp6
}

define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_maskz_mul_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_mul_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
%tmp = shl <2 x i64> %__X, <i64 32, i64 32>
%tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
%tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
%tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
%tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> zeroinitializer
ret <2 x i64> %tmp6
}

define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_mask_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <4 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> %__W
ret <4 x i64> %tmp4
}
define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_maskz_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <4 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> zeroinitializer
ret <4 x i64> %tmp4
}

define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_mask_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
%tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <2 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> %__W
ret <2 x i64> %tmp4
}

define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_maskz_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
%tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <2 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> zeroinitializer
ret <2 x i64> %tmp4
}
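; The cvtepi*_epi* tests below cover the truncating conversions. The 128-bit
; sources keep the result in an XMM register, so the truncation lowers to a
; VPSHUFB (or VINSERTPS) shuffle; the 256-bit sources use the dedicated
; VPMOVDW/VPMOVQD/VPMOVQB/VPMOVQW/VPMOVDB down-converting moves instead.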
define <2 x i64> @test_mm_cvtepi32_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i = trunc <4 x i32> %0 to <4 x i8>
%shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <16 x i8> %shuf.i to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm_cvtepi32_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i = trunc <4 x i32> %0 to <4 x i16>
%shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <8 x i16> %shuf.i to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm_cvtepi64_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <2 x i64> %__A to <2 x i8>
%shuf.i = shufflevector <2 x i8> %conv.i, <2 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%0 = bitcast <16 x i8> %shuf.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm_cvtepi64_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <2 x i64> %__A to <2 x i16>
%shuf.i = shufflevector <2 x i16> %conv.i, <2 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%0 = bitcast <8 x i16> %shuf.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <2 x i64> %__A to <2 x i32>
%shuf.i = shufflevector <2 x i32> %conv.i, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i32> %shuf.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm256_cvtepi32_epi16(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi32_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovdw %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%conv.i = trunc <8 x i32> %0 to <8 x i16>
%1 = bitcast <8 x i16> %conv.i to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtepi32_epi16(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_epi16:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovdw %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_epi16:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdw %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <2 x i64> %__O to <8 x i16>
%2 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> %1, i8 %__M)
%3 = bitcast <8 x i16> %2 to <2 x i64>
ret <2 x i64> %3
}
define <2 x i64> @test_mm256_maskz_cvtepi32_epi16(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> zeroinitializer, i8 %__M)
%2 = bitcast <8 x i16> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm256_cvtepi64_epi32(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi64_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqd %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <4 x i64> %__A to <4 x i32>
%0 = bitcast <4 x i32> %conv.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm256_mask_cvtepi64_epi32(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi64_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepi64_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%conv.i.i = trunc <4 x i64> %__A to <4 x i32>
%0 = bitcast <2 x i64> %__O to <4 x i32>
%1 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> %0
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <2 x i64> @test_mm256_maskz_cvtepi64_epi32(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%conv.i.i = trunc <4 x i64> %__A to <4 x i32>
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> zeroinitializer
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}
define <2 x i64> @test_mm256_cvtepi64_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqb %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <4 x i64> %__A to <4 x i8>
%shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <16 x i8> %shuf.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm256_cvtepi64_epi16(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqw %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <4 x i64> %__A to <4 x i16>
%shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <8 x i16> %shuf.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm256_cvtepi32_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi32_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovdb %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%conv.i = trunc <8 x i32> %0 to <8 x i8>
%shuf.i = shufflevector <8 x i8> %conv.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%1 = bitcast <16 x i8> %shuf.i to <2 x i64>
ret <2 x i64> %1
}
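; The ternarylogic tests below exercise VPTERNLOGD/VPTERNLOGQ. The immediate
; (4 in every test here) is the 8-bit truth table applied bitwise to the
; three sources; the masked forms additionally blend the result with the
; first source, or with zero, via the select on the bitcast mask.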
define <2 x i64> @test_mm_ternarylogic_epi32(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; CHECK-LABEL: test_mm_ternarylogic_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__B to <4 x i32>
%2 = bitcast <2 x i64> %__C to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32) #2

define <2 x i64> @test_mm_mask_ternarylogic_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_mask_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__B to <4 x i32>
%2 = bitcast <2 x i64> %__C to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
%4 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> %0
%6 = bitcast <4 x i32> %5 to <2 x i64>
ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_ternarylogic_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_maskz_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__B to <4 x i32>
%2 = bitcast <2 x i64> %__C to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
%4 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> zeroinitializer
%6 = bitcast <4 x i32> %5 to <2 x i64>
ret <2 x i64> %6
}

define <4 x i64> @test_mm256_ternarylogic_epi32(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; CHECK-LABEL: test_mm256_ternarylogic_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__B to <8 x i32>
%2 = bitcast <4 x i64> %__C to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
%4 = bitcast <8 x i32> %3 to <4 x i64>
ret <4 x i64> %4
}

declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32) #2

define <4 x i64> @test_mm256_mask_ternarylogic_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_mask_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__B to <8 x i32>
%2 = bitcast <4 x i64> %__C to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
%6 = bitcast <8 x i32> %5 to <4 x i64>
ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_ternarylogic_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_maskz_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__B to <8 x i32>
%2 = bitcast <4 x i64> %__C to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
%6 = bitcast <8 x i32> %5 to <4 x i64>
ret <4 x i64> %6
}
define <2 x i64> @test_mm_ternarylogic_epi64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; CHECK-LABEL: test_mm_ternarylogic_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
ret <2 x i64> %0
}

declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32) #2

define <2 x i64> @test_mm_mask_ternarylogic_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_mask_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__A
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_ternarylogic_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_maskz_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_ternarylogic_epi64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; CHECK-LABEL: test_mm256_ternarylogic_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
ret <4 x i64> %0
}

declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32) #2

define <4 x i64> @test_mm256_mask_ternarylogic_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_mask_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__A
ret <4 x i64> %2
}
define <4 x i64> @test_mm256_maskz_ternarylogic_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_maskz_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
ret <4 x i64> %2
}
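; The mask2_permutex2var tests below check the vpermi2* form, where the index
; vector doubles as the passthru for masked-off lanes: codegen keeps the
; indices in the destination register of vpermi2 and copies the result out
; with a separate move afterwards.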
define <2 x i64> @test_mm_mask2_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask2_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__I to <4 x i32>
%2 = bitcast <2 x i64> %__B to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %1
%6 = bitcast <4 x i32> %5 to <2 x i64>
ret <2 x i64> %6
}

define <4 x i64> @test_mm256_mask2_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT: vmovdqa %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__I to <8 x i32>
%2 = bitcast <4 x i64> %__B to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %1
%6 = bitcast <8 x i32> %5 to <4 x i64>
ret <4 x i64> %6
}

define <2 x double> @test_mm_mask2_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x double> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT: vmovapd %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask2_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT: vmovapd %xmm1, %xmm0
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
%1 = bitcast <2 x i64> %__I to <2 x double>
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%3 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %1
ret <2 x double> %3
}

define <4 x double> @test_mm256_mask2_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT: vmovapd %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT: vmovapd %ymm1, %ymm0
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
%1 = bitcast <4 x i64> %__I to <4 x double>
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %1
ret <4 x double> %3
}

define <4 x float> @test_mm_mask2_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, i8 zeroext %__U, <4 x float> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT: vmovaps %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask2_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__I to <4 x i32>
%1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
%2 = bitcast <2 x i64> %__I to <4 x float>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %2
ret <4 x float> %4
}

define <8 x float> @test_mm256_mask2_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, i8 zeroext %__U, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT: vmovaps %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT: vmovaps %ymm1, %ymm0
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__I to <8 x i32>
%1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
%2 = bitcast <4 x i64> %__I to <8 x float>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2
ret <8 x float> %4
}
define <2 x i64> @test_mm_mask2_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask2_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__I
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_mask2_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT: vmovdqa %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__I
ret <4 x i64> %2
}
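; The plain and mask/maskz permutex2var tests below lower to the vpermt2*
; form instead, since there the first data operand (not the index vector) is
; the register that is overwritten and reused as the masked passthru.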
define <2 x i64> @test_mm_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_permutex2var_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2d %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__I to <4 x i32>
%2 = bitcast <2 x i64> %__B to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

define <2 x i64> @test_mm_mask_permutex2var_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__I to <4 x i32>
%2 = bitcast <2 x i64> %__B to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
%6 = bitcast <4 x i32> %5 to <2 x i64>
ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_permutex2var_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__I to <4 x i32>
%2 = bitcast <2 x i64> %__B to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
%6 = bitcast <4 x i32> %5 to <2 x i64>
ret <2 x i64> %6
}

define <4 x i64> @test_mm256_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__I to <8 x i32>
%2 = bitcast <4 x i64> %__B to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
%4 = bitcast <8 x i32> %3 to <4 x i64>
ret <4 x i64> %4
}

define <4 x i64> @test_mm256_mask_permutex2var_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__I to <8 x i32>
%2 = bitcast <4 x i64> %__B to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
%6 = bitcast <8 x i32> %5 to <4 x i64>
ret <4 x i64> %6
}
define <4 x i64> @test_mm256_maskz_permutex2var_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__I to <8 x i32>
%2 = bitcast <4 x i64> %__B to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
%6 = bitcast <8 x i32> %5 to <4 x i64>
ret <4 x i64> %6
}

define <2 x double> @test_mm_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
; CHECK-LABEL: test_mm_permutex2var_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
ret <2 x double> %0
}

define <2 x double> @test_mm_mask_permutex2var_pd(<2 x double> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_permutex2var_pd(i8 zeroext %__U, <2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
ret <2 x double> %2
}

define <4 x double> @test_mm256_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
ret <4 x double> %0
}
define <4 x double> @test_mm256_mask_permutex2var_pd(<4 x double> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_permutex2var_pd(i8 zeroext %__U, <4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
ret <4 x double> %2
}

define <4 x float> @test_mm_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
; CHECK-LABEL: test_mm_permutex2var_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__I to <4 x i32>
%1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
ret <4 x float> %1
}

define <4 x float> @test_mm_mask_permutex2var_ps(<4 x float> %__A, i8 zeroext %__U, <2 x i64> %__I, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__I to <4 x i32>
%1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %__A
ret <4 x float> %3
}
define <4 x float> @test_mm_maskz_permutex2var_ps(i8 zeroext %__U, <4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__I to <4 x i32>
%1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> zeroinitializer
ret <4 x float> %3
}

define <8 x float> @test_mm256_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__I to <8 x i32>
%1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
ret <8 x float> %1
}

define <8 x float> @test_mm256_mask_permutex2var_ps(<8 x float> %__A, i8 zeroext %__U, <4 x i64> %__I, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__I to <8 x i32>
%1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
%2 = bitcast i8 %__U to <8 x i1>
%3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %__A
ret <8 x float> %3
}

define <8 x float> @test_mm256_maskz_permutex2var_ps(i8 zeroext %__U, <8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__I to <8 x i32>
%1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
%2 = bitcast i8 %__U to <8 x i1>
%3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
ret <8 x float> %3
}

define <2 x i64> @test_mm_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_permutex2var_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2q %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
ret <2 x i64> %0
}
define <2 x i64> @test_mm_mask_permutex2var_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__A
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_permutex2var_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2q %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_permutex2var_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__A
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_permutex2var_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
ret <4 x i64> %2
}
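; The FMA tests below check operand placement for the masked fused
; multiply-add family as seen in these cases: mask_ variants keep the first
; source (132 form), mask3_ variants keep the addend (231 form), and maskz_
; variants zero the masked lanes (213 form). fmsub/fnmadd/fnmsub are
; expressed in IR by negating operands with an fsub from -0.0 before calling
; llvm.fma.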
define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
ret <2 x double> %2
}

define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
ret <2 x double> %2
}

define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
ret <2 x double> %2
}
define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
ret <2 x double> %2
}
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 4394 ret <2 x double> %2 4395 } 4396 4397 define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 4398 ; X86-LABEL: test_mm_maskz_fnmsub_pd: 4399 ; X86: # %bb.0: # %entry 4400 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4401 ; X86-NEXT: kmovw %eax, %k1 4402 ; X86-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 4403 ; X86-NEXT: retl 4404 ; 4405 ; X64-LABEL: test_mm_maskz_fnmsub_pd: 4406 ; X64: # %bb.0: # %entry 4407 ; X64-NEXT: kmovw %edi, %k1 4408 ; X64-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 4409 ; X64-NEXT: retq 4410 entry: 4411 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A 4412 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 4413 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9 4414 %1 = bitcast i8 %__U to <8 x i1> 4415 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4416 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 4417 ret <2 x double> %2 4418 } 4419 4420 define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 4421 ; X86-LABEL: test_mm256_mask_fmadd_pd: 4422 ; X86: # %bb.0: # %entry 4423 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4424 ; X86-NEXT: kmovw %eax, %k1 4425 ; X86-NEXT: vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2 4426 ; X86-NEXT: retl 4427 ; 4428 ; X64-LABEL: test_mm256_mask_fmadd_pd: 4429 ; X64: # %bb.0: # %entry 4430 ; X64-NEXT: kmovw %edi, %k1 4431 ; X64-NEXT: vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2 4432 ; X64-NEXT: retq 4433 entry: 4434 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 4435 %1 = bitcast i8 %__U to <8 x i1> 4436 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4437 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 4438 ret <4 x double> %2 4439 } 4440 4441 define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 4442 ; X86-LABEL: test_mm256_mask_fmsub_pd: 4443 ; X86: # %bb.0: # %entry 4444 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4445 ; X86-NEXT: kmovw %eax, %k1 4446 ; X86-NEXT: vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 4447 ; X86-NEXT: retl 4448 ; 4449 ; X64-LABEL: test_mm256_mask_fmsub_pd: 4450 ; X64: # %bb.0: # %entry 4451 ; X64-NEXT: kmovw %edi, %k1 4452 ; X64-NEXT: vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 4453 ; X64-NEXT: retq 4454 entry: 4455 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4456 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 4457 %1 = bitcast i8 %__U to <8 x i1> 4458 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4459 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 4460 ret <4 x double> %2 4461 } 4462 4463 define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 4464 ; X86-LABEL: test_mm256_mask3_fmadd_pd: 4465 ; X86: # %bb.0: # %entry 4466 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4467 ; X86-NEXT: kmovw %eax, %k1 4468 ; X86-NEXT: vfmadd231pd {{.*#+}} ymm2 = 
(ymm0 * ymm1) + ymm2 4469 ; X86-NEXT: vmovapd %ymm2, %ymm0 4470 ; X86-NEXT: retl 4471 ; 4472 ; X64-LABEL: test_mm256_mask3_fmadd_pd: 4473 ; X64: # %bb.0: # %entry 4474 ; X64-NEXT: kmovw %edi, %k1 4475 ; X64-NEXT: vfmadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2 4476 ; X64-NEXT: vmovapd %ymm2, %ymm0 4477 ; X64-NEXT: retq 4478 entry: 4479 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 4480 %1 = bitcast i8 %__U to <8 x i1> 4481 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4482 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 4483 ret <4 x double> %2 4484 } 4485 4486 define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 4487 ; X86-LABEL: test_mm256_mask3_fnmadd_pd: 4488 ; X86: # %bb.0: # %entry 4489 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4490 ; X86-NEXT: kmovw %eax, %k1 4491 ; X86-NEXT: vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 4492 ; X86-NEXT: vmovapd %ymm2, %ymm0 4493 ; X86-NEXT: retl 4494 ; 4495 ; X64-LABEL: test_mm256_mask3_fnmadd_pd: 4496 ; X64: # %bb.0: # %entry 4497 ; X64-NEXT: kmovw %edi, %k1 4498 ; X64-NEXT: vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 4499 ; X64-NEXT: vmovapd %ymm2, %ymm0 4500 ; X64-NEXT: retq 4501 entry: 4502 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4503 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9 4504 %1 = bitcast i8 %__U to <8 x i1> 4505 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4506 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 4507 ret <4 x double> %2 4508 } 4509 4510 define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4511 ; X86-LABEL: test_mm256_maskz_fmadd_pd: 4512 ; X86: # %bb.0: # %entry 4513 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4514 ; X86-NEXT: kmovw %eax, %k1 4515 ; X86-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 4516 ; X86-NEXT: retl 4517 ; 4518 ; X64-LABEL: test_mm256_maskz_fmadd_pd: 4519 ; X64: # %bb.0: # %entry 4520 ; X64-NEXT: kmovw %edi, %k1 4521 ; X64-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 4522 ; X64-NEXT: retq 4523 entry: 4524 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 4525 %1 = bitcast i8 %__U to <8 x i1> 4526 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4527 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4528 ret <4 x double> %2 4529 } 4530 4531 define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4532 ; X86-LABEL: test_mm256_maskz_fmsub_pd: 4533 ; X86: # %bb.0: # %entry 4534 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4535 ; X86-NEXT: kmovw %eax, %k1 4536 ; X86-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 4537 ; X86-NEXT: retl 4538 ; 4539 ; X64-LABEL: test_mm256_maskz_fmsub_pd: 4540 ; X64: # %bb.0: # %entry 4541 ; X64-NEXT: kmovw %edi, %k1 4542 ; X64-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 4543 ; X64-NEXT: retq 4544 entry: 4545 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4546 %0 = tail call <4 x double> 
@llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 4547 %1 = bitcast i8 %__U to <8 x i1> 4548 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4549 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4550 ret <4 x double> %2 4551 } 4552 4553 define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4554 ; X86-LABEL: test_mm256_maskz_fnmadd_pd: 4555 ; X86: # %bb.0: # %entry 4556 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4557 ; X86-NEXT: kmovw %eax, %k1 4558 ; X86-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 4559 ; X86-NEXT: retl 4560 ; 4561 ; X64-LABEL: test_mm256_maskz_fnmadd_pd: 4562 ; X64: # %bb.0: # %entry 4563 ; X64-NEXT: kmovw %edi, %k1 4564 ; X64-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 4565 ; X64-NEXT: retq 4566 entry: 4567 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4568 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9 4569 %1 = bitcast i8 %__U to <8 x i1> 4570 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4571 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4572 ret <4 x double> %2 4573 } 4574 4575 define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4576 ; X86-LABEL: test_mm256_maskz_fnmsub_pd: 4577 ; X86: # %bb.0: # %entry 4578 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4579 ; X86-NEXT: kmovw %eax, %k1 4580 ; X86-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 4581 ; X86-NEXT: retl 4582 ; 4583 ; X64-LABEL: test_mm256_maskz_fnmsub_pd: 4584 ; X64: # %bb.0: # %entry 4585 ; X64-NEXT: kmovw %edi, %k1 4586 ; X64-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 4587 ; X64-NEXT: retq 4588 entry: 4589 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4590 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4591 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9 4592 %1 = bitcast i8 %__U to <8 x i1> 4593 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4594 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4595 ret <4 x double> %2 4596 } 4597 4598 define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 4599 ; X86-LABEL: test_mm_mask_fmadd_ps: 4600 ; X86: # %bb.0: # %entry 4601 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4602 ; X86-NEXT: kmovw %eax, %k1 4603 ; X86-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 4604 ; X86-NEXT: retl 4605 ; 4606 ; X64-LABEL: test_mm_mask_fmadd_ps: 4607 ; X64: # %bb.0: # %entry 4608 ; X64-NEXT: kmovw %edi, %k1 4609 ; X64-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 4610 ; X64-NEXT: retq 4611 entry: 4612 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 4613 %1 = bitcast i8 %__U to <8 x i1> 4614 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4615 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 4616 ret <4 x float> %2 4617 } 
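; The <4 x float> tests that follow mirror the <2 x double> and <4 x double> patterns above:
; the i8 mask is bitcast to <8 x i1> and its low four lanes are extracted for the 128-bit
; selects, so only the mnemonic suffix and register class change.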

define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
ret <4 x float> %2
}

define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
ret <4 x float> %2
}

define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
ret <8 x float> %2
}

define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
ret <8 x float> %2
}

define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
%3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A
ret <2 x double> %5
}

define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
%1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A
ret <2 x double> %4
}

define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
%3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C
ret <2 x double> %5
}

define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
%3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer
ret <2 x double> %5
}

define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
%1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer
ret <2 x double> %4
}

define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
%3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A
ret <4 x double> %5
}

define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
%1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A
ret <4 x double> %4
}

define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
%3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C
ret <4 x double> %5
}

define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
%3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer
ret <4 x double> %5
}

define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
%1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer
ret <4 x double> %4
}

define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
%3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A
ret <4 x float> %5
}

define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A
ret <4 x float> %4
}

define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
%3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C
ret <4 x float> %5
}

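; The fmaddsub/fmsubadd tests are expressed without a dedicated intrinsic: each issues two
; @llvm.fma calls (one with %__C negated) and blends them with a shufflevector, placing the
; subtracted lanes in the even positions for fmaddsub and the added lanes in the even
; positions for fmsubadd, before applying the usual mask select.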
define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
%3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer
ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer
ret <4 x float> %4
}

define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
%3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A
ret <8 x float> %5
}

define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A
ret <8 x float> %4
}

define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
%3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C
ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
%3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer
ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer
ret <8 x float> %4
}

define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
ret <2 x double> %2
}

define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
ret <4 x double> %2
}

define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
ret <4 x float> %2
}

define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
ret <8 x float> %2
}

define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
%1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C
ret <2 x double> %4
}

define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
%1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C
ret <4 x double> %4
}

define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C
ret <4 x float> %4
}

define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C
ret <8 x float> %4
}

define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
ret <8 x float> %2
}

define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
%sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
ret <2 x double> %2
}

define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
%sub1.i = fsub <2 x double> <double -0.000000e+00, double
-0.000000e+00>, %__C 5755 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9 5756 %1 = bitcast i8 %__U to <8 x i1> 5757 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5758 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C 5759 ret <2 x double> %2 5760 } 5761 5762 define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 5763 ; X86-LABEL: test_mm256_mask_fnmsub_pd: 5764 ; X86: # %bb.0: # %entry 5765 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5766 ; X86-NEXT: kmovw %eax, %k1 5767 ; X86-NEXT: vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2 5768 ; X86-NEXT: retl 5769 ; 5770 ; X64-LABEL: test_mm256_mask_fnmsub_pd: 5771 ; X64: # %bb.0: # %entry 5772 ; X64-NEXT: kmovw %edi, %k1 5773 ; X64-NEXT: vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2 5774 ; X64-NEXT: retq 5775 entry: 5776 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 5777 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5778 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9 5779 %1 = bitcast i8 %__U to <8 x i1> 5780 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5781 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 5782 ret <4 x double> %2 5783 } 5784 5785 define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 5786 ; X86-LABEL: test_mm256_mask3_fnmsub_pd: 5787 ; X86: # %bb.0: # %entry 5788 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5789 ; X86-NEXT: kmovw %eax, %k1 5790 ; X86-NEXT: vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2 5791 ; X86-NEXT: vmovapd %ymm2, %ymm0 5792 ; X86-NEXT: retl 5793 ; 5794 ; X64-LABEL: test_mm256_mask3_fnmsub_pd: 5795 ; X64: # %bb.0: # %entry 5796 ; X64-NEXT: kmovw %edi, %k1 5797 ; X64-NEXT: vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2 5798 ; X64-NEXT: vmovapd %ymm2, %ymm0 5799 ; X64-NEXT: retq 5800 entry: 5801 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 5802 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5803 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9 5804 %1 = bitcast i8 %__U to <8 x i1> 5805 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5806 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 5807 ret <4 x double> %2 5808 } 5809 5810 define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 5811 ; X86-LABEL: test_mm_mask_fnmsub_ps: 5812 ; X86: # %bb.0: # %entry 5813 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5814 ; X86-NEXT: kmovw %eax, %k1 5815 ; X86-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 5816 ; X86-NEXT: retl 5817 ; 5818 ; X64-LABEL: test_mm_mask_fnmsub_ps: 5819 ; X64: # %bb.0: # %entry 5820 ; X64-NEXT: kmovw %edi, %k1 5821 ; X64-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 5822 ; X64-NEXT: retq 5823 entry: 5824 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 
5825 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5826 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9 5827 %1 = bitcast i8 %__U to <8 x i1> 5828 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5829 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 5830 ret <4 x float> %2 5831 } 5832 5833 define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 5834 ; X86-LABEL: test_mm_mask3_fnmsub_ps: 5835 ; X86: # %bb.0: # %entry 5836 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5837 ; X86-NEXT: kmovw %eax, %k1 5838 ; X86-NEXT: vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 5839 ; X86-NEXT: vmovaps %xmm2, %xmm0 5840 ; X86-NEXT: retl 5841 ; 5842 ; X64-LABEL: test_mm_mask3_fnmsub_ps: 5843 ; X64: # %bb.0: # %entry 5844 ; X64-NEXT: kmovw %edi, %k1 5845 ; X64-NEXT: vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 5846 ; X64-NEXT: vmovaps %xmm2, %xmm0 5847 ; X64-NEXT: retq 5848 entry: 5849 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5850 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5851 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9 5852 %1 = bitcast i8 %__U to <8 x i1> 5853 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5854 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C 5855 ret <4 x float> %2 5856 } 5857 5858 define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 5859 ; X86-LABEL: test_mm256_mask_fnmsub_ps: 5860 ; X86: # %bb.0: # %entry 5861 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5862 ; X86-NEXT: kmovw %eax, %k1 5863 ; X86-NEXT: vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2 5864 ; X86-NEXT: retl 5865 ; 5866 ; X64-LABEL: test_mm256_mask_fnmsub_ps: 5867 ; X64: # %bb.0: # %entry 5868 ; X64-NEXT: kmovw %edi, %k1 5869 ; X64-NEXT: vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2 5870 ; X64-NEXT: retq 5871 entry: 5872 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5873 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5874 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9 5875 %1 = bitcast i8 %__U to <8 x i1> 5876 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A 5877 ret <8 x float> %2 5878 } 5879 5880 define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 5881 ; X86-LABEL: test_mm256_mask3_fnmsub_ps: 5882 ; X86: # %bb.0: # %entry 5883 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5884 ; X86-NEXT: kmovw %eax, %k1 5885 ; X86-NEXT: vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2 5886 ; X86-NEXT: vmovaps %ymm2, %ymm0 5887 ; X86-NEXT: retl 5888 ; 5889 ; X64-LABEL: test_mm256_mask3_fnmsub_ps: 5890 ; X64: # %bb.0: # %entry 5891 ; X64-NEXT: kmovw %edi, %k1 5892 ; X64-NEXT: vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * 
ymm1) - ymm2 5893 ; X64-NEXT: vmovaps %ymm2, %ymm0 5894 ; X64-NEXT: retq 5895 entry: 5896 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5897 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5898 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9 5899 %1 = bitcast i8 %__U to <8 x i1> 5900 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C 5901 ret <8 x float> %2 5902 } 5903 5904 define <2 x double> @test_mm_mask_expandloadu_pd(<2 x double> %__W, i8 zeroext %__U, i8* readonly %__P) { 5905 ; X86-LABEL: test_mm_mask_expandloadu_pd: 5906 ; X86: # %bb.0: # %entry 5907 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 5908 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 5909 ; X86-NEXT: kmovw %ecx, %k1 5910 ; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1} 5911 ; X86-NEXT: retl 5912 ; 5913 ; X64-LABEL: test_mm_mask_expandloadu_pd: 5914 ; X64: # %bb.0: # %entry 5915 ; X64-NEXT: kmovw %edi, %k1 5916 ; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1} 5917 ; X64-NEXT: retq 5918 entry: 5919 %0 = bitcast i8* %__P to double* 5920 %1 = bitcast i8 %__U to <8 x i1> 5921 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5922 %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> %__W) 5923 ret <2 x double> %2 5924 } 5925 5926 define <2 x double> @test_mm_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) { 5927 ; X86-LABEL: test_mm_maskz_expandloadu_pd: 5928 ; X86: # %bb.0: # %entry 5929 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 5930 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 5931 ; X86-NEXT: kmovw %ecx, %k1 5932 ; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1} {z} 5933 ; X86-NEXT: retl 5934 ; 5935 ; X64-LABEL: test_mm_maskz_expandloadu_pd: 5936 ; X64: # %bb.0: # %entry 5937 ; X64-NEXT: kmovw %edi, %k1 5938 ; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1} {z} 5939 ; X64-NEXT: retq 5940 entry: 5941 %0 = bitcast i8* %__P to double* 5942 %1 = bitcast i8 %__U to <8 x i1> 5943 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5944 %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> zeroinitializer) 5945 ret <2 x double> %2 5946 } 5947 5948 define <4 x double> @test_mm256_mask_expandloadu_pd(<4 x double> %__W, i8 zeroext %__U, i8* readonly %__P) { 5949 ; X86-LABEL: test_mm256_mask_expandloadu_pd: 5950 ; X86: # %bb.0: # %entry 5951 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 5952 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 5953 ; X86-NEXT: kmovw %ecx, %k1 5954 ; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1} 5955 ; X86-NEXT: retl 5956 ; 5957 ; X64-LABEL: test_mm256_mask_expandloadu_pd: 5958 ; X64: # %bb.0: # %entry 5959 ; X64-NEXT: kmovw %edi, %k1 5960 ; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1} 5961 ; X64-NEXT: retq 5962 entry: 5963 %0 = bitcast i8* %__P to double* 5964 %1 = bitcast i8 %__U to <8 x i1> 5965 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5966 %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> %__W) 5967 ret <4 x double> %2 5968 } 5969 5970 define <4 x double> @test_mm256_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) { 5971 ; 
X86-LABEL: test_mm256_maskz_expandloadu_pd: 5972 ; X86: # %bb.0: # %entry 5973 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 5974 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 5975 ; X86-NEXT: kmovw %ecx, %k1 5976 ; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1} {z} 5977 ; X86-NEXT: retl 5978 ; 5979 ; X64-LABEL: test_mm256_maskz_expandloadu_pd: 5980 ; X64: # %bb.0: # %entry 5981 ; X64-NEXT: kmovw %edi, %k1 5982 ; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1} {z} 5983 ; X64-NEXT: retq 5984 entry: 5985 %0 = bitcast i8* %__P to double* 5986 %1 = bitcast i8 %__U to <8 x i1> 5987 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5988 %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> zeroinitializer) 5989 ret <4 x double> %2 5990 } 5991 5992 define <2 x i64> @test_mm_mask_expandloadu_epi64(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 5993 ; X86-LABEL: test_mm_mask_expandloadu_epi64: 5994 ; X86: # %bb.0: # %entry 5995 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 5996 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 5997 ; X86-NEXT: kmovw %ecx, %k1 5998 ; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1} 5999 ; X86-NEXT: retl 6000 ; 6001 ; X64-LABEL: test_mm_mask_expandloadu_epi64: 6002 ; X64: # %bb.0: # %entry 6003 ; X64-NEXT: kmovw %edi, %k1 6004 ; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1} 6005 ; X64-NEXT: retq 6006 entry: 6007 %0 = bitcast i8* %__P to i64* 6008 %1 = bitcast i8 %__U to <8 x i1> 6009 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6010 %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> %__W) #10 6011 ret <2 x i64> %2 6012 } 6013 6014 define <2 x i64> @test_mm_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) { 6015 ; X86-LABEL: test_mm_maskz_expandloadu_epi64: 6016 ; X86: # %bb.0: # %entry 6017 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6018 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6019 ; X86-NEXT: kmovw %ecx, %k1 6020 ; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1} {z} 6021 ; X86-NEXT: retl 6022 ; 6023 ; X64-LABEL: test_mm_maskz_expandloadu_epi64: 6024 ; X64: # %bb.0: # %entry 6025 ; X64-NEXT: kmovw %edi, %k1 6026 ; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1} {z} 6027 ; X64-NEXT: retq 6028 entry: 6029 %0 = bitcast i8* %__P to i64* 6030 %1 = bitcast i8 %__U to <8 x i1> 6031 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6032 %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> zeroinitializer) 6033 ret <2 x i64> %2 6034 } 6035 6036 define <4 x i64> @test_mm256_mask_expandloadu_epi64(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6037 ; X86-LABEL: test_mm256_mask_expandloadu_epi64: 6038 ; X86: # %bb.0: # %entry 6039 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6040 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6041 ; X86-NEXT: kmovw %ecx, %k1 6042 ; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1} 6043 ; X86-NEXT: retl 6044 ; 6045 ; X64-LABEL: test_mm256_mask_expandloadu_epi64: 6046 ; X64: # %bb.0: # %entry 6047 ; X64-NEXT: kmovw %edi, %k1 6048 ; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1} 6049 ; X64-NEXT: retq 6050 entry: 6051 %0 = bitcast i8* %__P to i64* 6052 %1 = bitcast i8 %__U to <8 x i1> 6053 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6054 %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> %__W) #10 6055 ret <4 x i64> %2 6056 } 6057 6058 define <4 x i64> @test_mm256_maskz_expandloadu_epi64(i8 
zeroext %__U, i8* readonly %__P) { 6059 ; X86-LABEL: test_mm256_maskz_expandloadu_epi64: 6060 ; X86: # %bb.0: # %entry 6061 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6062 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6063 ; X86-NEXT: kmovw %ecx, %k1 6064 ; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1} {z} 6065 ; X86-NEXT: retl 6066 ; 6067 ; X64-LABEL: test_mm256_maskz_expandloadu_epi64: 6068 ; X64: # %bb.0: # %entry 6069 ; X64-NEXT: kmovw %edi, %k1 6070 ; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1} {z} 6071 ; X64-NEXT: retq 6072 entry: 6073 %0 = bitcast i8* %__P to i64* 6074 %1 = bitcast i8 %__U to <8 x i1> 6075 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6076 %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> zeroinitializer) 6077 ret <4 x i64> %2 6078 } 6079 6080 define <4 x float> @test_mm_mask_expandloadu_ps(<4 x float> %__W, i8 zeroext %__U, i8* readonly %__P) { 6081 ; X86-LABEL: test_mm_mask_expandloadu_ps: 6082 ; X86: # %bb.0: # %entry 6083 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6084 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6085 ; X86-NEXT: kmovw %ecx, %k1 6086 ; X86-NEXT: vexpandps (%eax), %xmm0 {%k1} 6087 ; X86-NEXT: retl 6088 ; 6089 ; X64-LABEL: test_mm_mask_expandloadu_ps: 6090 ; X64: # %bb.0: # %entry 6091 ; X64-NEXT: kmovw %edi, %k1 6092 ; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1} 6093 ; X64-NEXT: retq 6094 entry: 6095 %0 = bitcast i8* %__P to float* 6096 %1 = bitcast i8 %__U to <8 x i1> 6097 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6098 %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> %__W) 6099 ret <4 x float> %2 6100 } 6101 6102 define <4 x float> @test_mm_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) { 6103 ; X86-LABEL: test_mm_maskz_expandloadu_ps: 6104 ; X86: # %bb.0: # %entry 6105 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6106 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6107 ; X86-NEXT: kmovw %ecx, %k1 6108 ; X86-NEXT: vexpandps (%eax), %xmm0 {%k1} {z} 6109 ; X86-NEXT: retl 6110 ; 6111 ; X64-LABEL: test_mm_maskz_expandloadu_ps: 6112 ; X64: # %bb.0: # %entry 6113 ; X64-NEXT: kmovw %edi, %k1 6114 ; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1} {z} 6115 ; X64-NEXT: retq 6116 entry: 6117 %0 = bitcast i8* %__P to float* 6118 %1 = bitcast i8 %__U to <8 x i1> 6119 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6120 %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> zeroinitializer) 6121 ret <4 x float> %2 6122 } 6123 6124 define <8 x float> @test_mm256_mask_expandloadu_ps(<8 x float> %__W, i8 zeroext %__U, i8* readonly %__P) { 6125 ; X86-LABEL: test_mm256_mask_expandloadu_ps: 6126 ; X86: # %bb.0: # %entry 6127 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6128 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6129 ; X86-NEXT: kmovw %ecx, %k1 6130 ; X86-NEXT: vexpandps (%eax), %ymm0 {%k1} 6131 ; X86-NEXT: retl 6132 ; 6133 ; X64-LABEL: test_mm256_mask_expandloadu_ps: 6134 ; X64: # %bb.0: # %entry 6135 ; X64-NEXT: kmovw %edi, %k1 6136 ; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1} 6137 ; X64-NEXT: retq 6138 entry: 6139 %0 = bitcast i8* %__P to float* 6140 %1 = bitcast i8 %__U to <8 x i1> 6141 %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> %__W) 6142 ret <8 x float> %2 6143 } 6144 6145 define <8 x float> @test_mm256_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) { 6146 ; 
X86-LABEL: test_mm256_maskz_expandloadu_ps: 6147 ; X86: # %bb.0: # %entry 6148 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6149 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6150 ; X86-NEXT: kmovw %ecx, %k1 6151 ; X86-NEXT: vexpandps (%eax), %ymm0 {%k1} {z} 6152 ; X86-NEXT: retl 6153 ; 6154 ; X64-LABEL: test_mm256_maskz_expandloadu_ps: 6155 ; X64: # %bb.0: # %entry 6156 ; X64-NEXT: kmovw %edi, %k1 6157 ; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1} {z} 6158 ; X64-NEXT: retq 6159 entry: 6160 %0 = bitcast i8* %__P to float* 6161 %1 = bitcast i8 %__U to <8 x i1> 6162 %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> zeroinitializer) 6163 ret <8 x float> %2 6164 } 6165 6166 define <2 x i64> @test_mm_mask_expandloadu_epi32(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6167 ; X86-LABEL: test_mm_mask_expandloadu_epi32: 6168 ; X86: # %bb.0: # %entry 6169 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6170 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6171 ; X86-NEXT: kmovw %ecx, %k1 6172 ; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1} 6173 ; X86-NEXT: retl 6174 ; 6175 ; X64-LABEL: test_mm_mask_expandloadu_epi32: 6176 ; X64: # %bb.0: # %entry 6177 ; X64-NEXT: kmovw %edi, %k1 6178 ; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1} 6179 ; X64-NEXT: retq 6180 entry: 6181 %0 = bitcast <2 x i64> %__W to <4 x i32> 6182 %1 = bitcast i8* %__P to i32* 6183 %2 = bitcast i8 %__U to <8 x i1> 6184 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6185 %3 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %1, <4 x i1> %extract.i, <4 x i32> %0) 6186 %4 = bitcast <4 x i32> %3 to <2 x i64> 6187 ret <2 x i64> %4 6188 } 6189 6190 define <2 x i64> @test_mm_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) { 6191 ; X86-LABEL: test_mm_maskz_expandloadu_epi32: 6192 ; X86: # %bb.0: # %entry 6193 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6194 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6195 ; X86-NEXT: kmovw %ecx, %k1 6196 ; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1} {z} 6197 ; X86-NEXT: retl 6198 ; 6199 ; X64-LABEL: test_mm_maskz_expandloadu_epi32: 6200 ; X64: # %bb.0: # %entry 6201 ; X64-NEXT: kmovw %edi, %k1 6202 ; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1} {z} 6203 ; X64-NEXT: retq 6204 entry: 6205 %0 = bitcast i8* %__P to i32* 6206 %1 = bitcast i8 %__U to <8 x i1> 6207 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6208 %2 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %0, <4 x i1> %extract.i, <4 x i32> zeroinitializer) 6209 %3 = bitcast <4 x i32> %2 to <2 x i64> 6210 ret <2 x i64> %3 6211 } 6212 6213 define <4 x i64> @test_mm256_mask_expandloadu_epi32(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6214 ; X86-LABEL: test_mm256_mask_expandloadu_epi32: 6215 ; X86: # %bb.0: # %entry 6216 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6217 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6218 ; X86-NEXT: kmovw %ecx, %k1 6219 ; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1} 6220 ; X86-NEXT: retl 6221 ; 6222 ; X64-LABEL: test_mm256_mask_expandloadu_epi32: 6223 ; X64: # %bb.0: # %entry 6224 ; X64-NEXT: kmovw %edi, %k1 6225 ; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1} 6226 ; X64-NEXT: retq 6227 entry: 6228 %0 = bitcast <4 x i64> %__W to <8 x i32> 6229 %1 = bitcast i8* %__P to i32* 6230 %2 = bitcast i8 %__U to <8 x i1> 6231 %3 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %1, <8 x i1> %2, <8 x i32> %0) 6232 %4 = bitcast <8 x i32> %3 to <4 x i64> 6233 ret <4 x i64> %4 6234 } 6235 6236 define <4 x i64> 
@test_mm256_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) { 6237 ; X86-LABEL: test_mm256_maskz_expandloadu_epi32: 6238 ; X86: # %bb.0: # %entry 6239 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6240 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6241 ; X86-NEXT: kmovw %ecx, %k1 6242 ; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1} {z} 6243 ; X86-NEXT: retl 6244 ; 6245 ; X64-LABEL: test_mm256_maskz_expandloadu_epi32: 6246 ; X64: # %bb.0: # %entry 6247 ; X64-NEXT: kmovw %edi, %k1 6248 ; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1} {z} 6249 ; X64-NEXT: retq 6250 entry: 6251 %0 = bitcast i8* %__P to i32* 6252 %1 = bitcast i8 %__U to <8 x i1> 6253 %2 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %0, <8 x i1> %1, <8 x i32> zeroinitializer) 6254 %3 = bitcast <8 x i32> %2 to <4 x i64> 6255 ret <4 x i64> %3 6256 } 6257 6258 define void @test_mm_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <2 x double> %__A) { 6259 ; X86-LABEL: test_mm_mask_compressstoreu_pd: 6260 ; X86: # %bb.0: # %entry 6261 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6262 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6263 ; X86-NEXT: kmovw %eax, %k1 6264 ; X86-NEXT: vcompresspd %xmm0, (%ecx) {%k1} 6265 ; X86-NEXT: retl 6266 ; 6267 ; X64-LABEL: test_mm_mask_compressstoreu_pd: 6268 ; X64: # %bb.0: # %entry 6269 ; X64-NEXT: kmovw %esi, %k1 6270 ; X64-NEXT: vcompresspd %xmm0, (%rdi) {%k1} 6271 ; X64-NEXT: retq 6272 entry: 6273 %0 = bitcast i8* %__P to double* 6274 %1 = bitcast i8 %__U to <8 x i1> 6275 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6276 tail call void @llvm.masked.compressstore.v2f64(<2 x double> %__A, double* %0, <2 x i1> %extract.i) 6277 ret void 6278 } 6279 6280 define void @test_mm256_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <4 x double> %__A) { 6281 ; X86-LABEL: test_mm256_mask_compressstoreu_pd: 6282 ; X86: # %bb.0: # %entry 6283 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6284 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6285 ; X86-NEXT: kmovw %eax, %k1 6286 ; X86-NEXT: vcompresspd %ymm0, (%ecx) {%k1} 6287 ; X86-NEXT: vzeroupper 6288 ; X86-NEXT: retl 6289 ; 6290 ; X64-LABEL: test_mm256_mask_compressstoreu_pd: 6291 ; X64: # %bb.0: # %entry 6292 ; X64-NEXT: kmovw %esi, %k1 6293 ; X64-NEXT: vcompresspd %ymm0, (%rdi) {%k1} 6294 ; X64-NEXT: vzeroupper 6295 ; X64-NEXT: retq 6296 entry: 6297 %0 = bitcast i8* %__P to double* 6298 %1 = bitcast i8 %__U to <8 x i1> 6299 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6300 tail call void @llvm.masked.compressstore.v4f64(<4 x double> %__A, double* %0, <4 x i1> %extract.i) 6301 ret void 6302 } 6303 6304 define void @test_mm_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) { 6305 ; X86-LABEL: test_mm_mask_compressstoreu_epi64: 6306 ; X86: # %bb.0: # %entry 6307 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6308 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6309 ; X86-NEXT: kmovw %eax, %k1 6310 ; X86-NEXT: vpcompressq %xmm0, (%ecx) {%k1} 6311 ; X86-NEXT: retl 6312 ; 6313 ; X64-LABEL: test_mm_mask_compressstoreu_epi64: 6314 ; X64: # %bb.0: # %entry 6315 ; X64-NEXT: kmovw %esi, %k1 6316 ; X64-NEXT: vpcompressq %xmm0, (%rdi) {%k1} 6317 ; X64-NEXT: retq 6318 entry: 6319 %0 = bitcast i8* %__P to i64* 6320 %1 = bitcast i8 %__U to <8 x i1> 6321 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6322 tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %__A, i64* %0, <2 x i1> %extract.i) 6323 ret void 6324 } 6325 6326 define void @test_mm256_mask_compressstoreu_epi64(i8* 
%__P, i8 zeroext %__U, <4 x i64> %__A) { 6327 ; X86-LABEL: test_mm256_mask_compressstoreu_epi64: 6328 ; X86: # %bb.0: # %entry 6329 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6330 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6331 ; X86-NEXT: kmovw %eax, %k1 6332 ; X86-NEXT: vpcompressq %ymm0, (%ecx) {%k1} 6333 ; X86-NEXT: vzeroupper 6334 ; X86-NEXT: retl 6335 ; 6336 ; X64-LABEL: test_mm256_mask_compressstoreu_epi64: 6337 ; X64: # %bb.0: # %entry 6338 ; X64-NEXT: kmovw %esi, %k1 6339 ; X64-NEXT: vpcompressq %ymm0, (%rdi) {%k1} 6340 ; X64-NEXT: vzeroupper 6341 ; X64-NEXT: retq 6342 entry: 6343 %0 = bitcast i8* %__P to i64* 6344 %1 = bitcast i8 %__U to <8 x i1> 6345 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6346 tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %__A, i64* %0, <4 x i1> %extract.i) 6347 ret void 6348 } 6349 6350 define void @test_mm_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <4 x float> %__A) { 6351 ; X86-LABEL: test_mm_mask_compressstoreu_ps: 6352 ; X86: # %bb.0: # %entry 6353 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6354 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6355 ; X86-NEXT: kmovw %eax, %k1 6356 ; X86-NEXT: vcompressps %xmm0, (%ecx) {%k1} 6357 ; X86-NEXT: retl 6358 ; 6359 ; X64-LABEL: test_mm_mask_compressstoreu_ps: 6360 ; X64: # %bb.0: # %entry 6361 ; X64-NEXT: kmovw %esi, %k1 6362 ; X64-NEXT: vcompressps %xmm0, (%rdi) {%k1} 6363 ; X64-NEXT: retq 6364 entry: 6365 %0 = bitcast i8* %__P to float* 6366 %1 = bitcast i8 %__U to <8 x i1> 6367 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6368 tail call void @llvm.masked.compressstore.v4f32(<4 x float> %__A, float* %0, <4 x i1> %extract.i) 6369 ret void 6370 } 6371 6372 define void @test_mm256_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <8 x float> %__A) { 6373 ; X86-LABEL: test_mm256_mask_compressstoreu_ps: 6374 ; X86: # %bb.0: # %entry 6375 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6376 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6377 ; X86-NEXT: kmovw %eax, %k1 6378 ; X86-NEXT: vcompressps %ymm0, (%ecx) {%k1} 6379 ; X86-NEXT: vzeroupper 6380 ; X86-NEXT: retl 6381 ; 6382 ; X64-LABEL: test_mm256_mask_compressstoreu_ps: 6383 ; X64: # %bb.0: # %entry 6384 ; X64-NEXT: kmovw %esi, %k1 6385 ; X64-NEXT: vcompressps %ymm0, (%rdi) {%k1} 6386 ; X64-NEXT: vzeroupper 6387 ; X64-NEXT: retq 6388 entry: 6389 %0 = bitcast i8* %__P to float* 6390 %1 = bitcast i8 %__U to <8 x i1> 6391 tail call void @llvm.masked.compressstore.v8f32(<8 x float> %__A, float* %0, <8 x i1> %1) 6392 ret void 6393 } 6394 6395 define void @test_mm_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) { 6396 ; X86-LABEL: test_mm_mask_compressstoreu_epi32: 6397 ; X86: # %bb.0: # %entry 6398 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6399 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6400 ; X86-NEXT: kmovw %eax, %k1 6401 ; X86-NEXT: vpcompressd %xmm0, (%ecx) {%k1} 6402 ; X86-NEXT: retl 6403 ; 6404 ; X64-LABEL: test_mm_mask_compressstoreu_epi32: 6405 ; X64: # %bb.0: # %entry 6406 ; X64-NEXT: kmovw %esi, %k1 6407 ; X64-NEXT: vpcompressd %xmm0, (%rdi) {%k1} 6408 ; X64-NEXT: retq 6409 entry: 6410 %0 = bitcast <2 x i64> %__A to <4 x i32> 6411 %1 = bitcast i8* %__P to i32* 6412 %2 = bitcast i8 %__U to <8 x i1> 6413 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6414 tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %0, i32* %1, <4 x i1> %extract.i) 6415 ret void 6416 } 6417 6418 define void 
@test_mm256_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) { 6419 ; X86-LABEL: test_mm256_mask_compressstoreu_epi32: 6420 ; X86: # %bb.0: # %entry 6421 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6422 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6423 ; X86-NEXT: kmovw %eax, %k1 6424 ; X86-NEXT: vpcompressd %ymm0, (%ecx) {%k1} 6425 ; X86-NEXT: vzeroupper 6426 ; X86-NEXT: retl 6427 ; 6428 ; X64-LABEL: test_mm256_mask_compressstoreu_epi32: 6429 ; X64: # %bb.0: # %entry 6430 ; X64-NEXT: kmovw %esi, %k1 6431 ; X64-NEXT: vpcompressd %ymm0, (%rdi) {%k1} 6432 ; X64-NEXT: vzeroupper 6433 ; X64-NEXT: retq 6434 entry: 6435 %0 = bitcast <4 x i64> %__A to <8 x i32> 6436 %1 = bitcast i8* %__P to i32* 6437 %2 = bitcast i8 %__U to <8 x i1> 6438 tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %0, i32* %1, <8 x i1> %2) #10 6439 ret void 6440 } 6441 6442 6443 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8 6444 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8 6445 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8 6446 declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8 6447 6448 define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) { 6449 ; X86-LABEL: test_mm_mask_sqrt_pd: 6450 ; X86: # %bb.0: # %entry 6451 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6452 ; X86-NEXT: kmovw %eax, %k1 6453 ; X86-NEXT: vsqrtpd %xmm1, %xmm0 {%k1} 6454 ; X86-NEXT: retl 6455 ; 6456 ; X64-LABEL: test_mm_mask_sqrt_pd: 6457 ; X64: # %bb.0: # %entry 6458 ; X64-NEXT: kmovw %edi, %k1 6459 ; X64-NEXT: vsqrtpd %xmm1, %xmm0 {%k1} 6460 ; X64-NEXT: retq 6461 entry: 6462 %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2 6463 %1 = bitcast i8 %__U to <8 x i1> 6464 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6465 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W 6466 ret <2 x double> %2 6467 } 6468 6469 declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) 6470 6471 define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) { 6472 ; X86-LABEL: test_mm_maskz_sqrt_pd: 6473 ; X86: # %bb.0: # %entry 6474 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6475 ; X86-NEXT: kmovw %eax, %k1 6476 ; X86-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z} 6477 ; X86-NEXT: retl 6478 ; 6479 ; X64-LABEL: test_mm_maskz_sqrt_pd: 6480 ; X64: # %bb.0: # %entry 6481 ; X64-NEXT: kmovw %edi, %k1 6482 ; X64-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z} 6483 ; X64-NEXT: retq 6484 entry: 6485 %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2 6486 %1 = bitcast i8 %__U to <8 x i1> 6487 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6488 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 6489 ret <2 x double> %2 6490 } 6491 6492 define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) { 6493 ; X86-LABEL: test_mm256_mask_sqrt_pd: 6494 ; X86: # %bb.0: # %entry 6495 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6496 ; X86-NEXT: kmovw %eax, %k1 6497 ; X86-NEXT: vsqrtpd %ymm1, %ymm0 {%k1} 6498 ; X86-NEXT: retl 6499 ; 6500 ; X64-LABEL: test_mm256_mask_sqrt_pd: 6501 ; X64: # %bb.0: # %entry 6502 ; X64-NEXT: kmovw %edi, %k1 6503 ; X64-NEXT: vsqrtpd %ymm1, %ymm0 {%k1} 6504 ; X64-NEXT: retq 6505 entry: 6506 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2 6507 %1 = bitcast i8 %__U to <8 x i1> 6508 %extract.i = 
shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6509 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W 6510 ret <4 x double> %2 6511 } 6512 6513 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) 6514 6515 define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) { 6516 ; X86-LABEL: test_mm256_maskz_sqrt_pd: 6517 ; X86: # %bb.0: # %entry 6518 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6519 ; X86-NEXT: kmovw %eax, %k1 6520 ; X86-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} 6521 ; X86-NEXT: retl 6522 ; 6523 ; X64-LABEL: test_mm256_maskz_sqrt_pd: 6524 ; X64: # %bb.0: # %entry 6525 ; X64-NEXT: kmovw %edi, %k1 6526 ; X64-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} 6527 ; X64-NEXT: retq 6528 entry: 6529 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2 6530 %1 = bitcast i8 %__U to <8 x i1> 6531 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6532 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 6533 ret <4 x double> %2 6534 } 6535 6536 define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) { 6537 ; X86-LABEL: test_mm_mask_sqrt_ps: 6538 ; X86: # %bb.0: # %entry 6539 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6540 ; X86-NEXT: kmovw %eax, %k1 6541 ; X86-NEXT: vsqrtps %xmm1, %xmm0 {%k1} 6542 ; X86-NEXT: retl 6543 ; 6544 ; X64-LABEL: test_mm_mask_sqrt_ps: 6545 ; X64: # %bb.0: # %entry 6546 ; X64-NEXT: kmovw %edi, %k1 6547 ; X64-NEXT: vsqrtps %xmm1, %xmm0 {%k1} 6548 ; X64-NEXT: retq 6549 entry: 6550 %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2 6551 %1 = bitcast i8 %__U to <8 x i1> 6552 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6553 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W 6554 ret <4 x float> %2 6555 } 6556 6557 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) 6558 6559 define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) { 6560 ; X86-LABEL: test_mm_maskz_sqrt_ps: 6561 ; X86: # %bb.0: # %entry 6562 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6563 ; X86-NEXT: kmovw %eax, %k1 6564 ; X86-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z} 6565 ; X86-NEXT: retl 6566 ; 6567 ; X64-LABEL: test_mm_maskz_sqrt_ps: 6568 ; X64: # %bb.0: # %entry 6569 ; X64-NEXT: kmovw %edi, %k1 6570 ; X64-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z} 6571 ; X64-NEXT: retq 6572 entry: 6573 %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2 6574 %1 = bitcast i8 %__U to <8 x i1> 6575 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6576 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 6577 ret <4 x float> %2 6578 } 6579 6580 define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) { 6581 ; X86-LABEL: test_mm256_mask_sqrt_ps: 6582 ; X86: # %bb.0: # %entry 6583 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6584 ; X86-NEXT: kmovw %eax, %k1 6585 ; X86-NEXT: vsqrtps %ymm1, %ymm0 {%k1} 6586 ; X86-NEXT: retl 6587 ; 6588 ; X64-LABEL: test_mm256_mask_sqrt_ps: 6589 ; X64: # %bb.0: # %entry 6590 ; X64-NEXT: kmovw %edi, %k1 6591 ; X64-NEXT: vsqrtps %ymm1, %ymm0 {%k1} 6592 ; X64-NEXT: retq 6593 entry: 6594 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2 6595 %1 = bitcast i8 %__U to <8 x i1> 6596 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W 6597 ret <8 x float> %2 6598 } 6599 6600 define <8 x float> 
@test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) { 6601 ; X86-LABEL: test_mm256_maskz_sqrt_ps: 6602 ; X86: # %bb.0: # %entry 6603 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6604 ; X86-NEXT: kmovw %eax, %k1 6605 ; X86-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} 6606 ; X86-NEXT: retl 6607 ; 6608 ; X64-LABEL: test_mm256_maskz_sqrt_ps: 6609 ; X64: # %bb.0: # %entry 6610 ; X64-NEXT: kmovw %edi, %k1 6611 ; X64-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} 6612 ; X64-NEXT: retq 6613 entry: 6614 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2 6615 %1 = bitcast i8 %__U to <8 x i1> 6616 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 6617 ret <8 x float> %2 6618 } 6619 6620 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) 6621 6622 define <2 x i64> @test_mm_rol_epi32(<2 x i64> %__A) { 6623 ; CHECK-LABEL: test_mm_rol_epi32: 6624 ; CHECK: # %bb.0: # %entry 6625 ; CHECK-NEXT: vprold $5, %xmm0, %xmm0 6626 ; CHECK-NEXT: ret{{[l|q]}} 6627 entry: 6628 %0 = bitcast <2 x i64> %__A to <4 x i32> 6629 %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5) 6630 %2 = bitcast <4 x i32> %1 to <2 x i64> 6631 ret <2 x i64> %2 6632 } 6633 6634 declare <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32>, i32) 6635 6636 define <2 x i64> @test_mm_mask_rol_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) { 6637 ; X86-LABEL: test_mm_mask_rol_epi32: 6638 ; X86: # %bb.0: # %entry 6639 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6640 ; X86-NEXT: kmovw %eax, %k1 6641 ; X86-NEXT: vprold $5, %xmm1, %xmm0 {%k1} 6642 ; X86-NEXT: retl 6643 ; 6644 ; X64-LABEL: test_mm_mask_rol_epi32: 6645 ; X64: # %bb.0: # %entry 6646 ; X64-NEXT: kmovw %edi, %k1 6647 ; X64-NEXT: vprold $5, %xmm1, %xmm0 {%k1} 6648 ; X64-NEXT: retq 6649 entry: 6650 %0 = bitcast <2 x i64> %__A to <4 x i32> 6651 %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5) 6652 %2 = bitcast <2 x i64> %__W to <4 x i32> 6653 %3 = bitcast i8 %__U to <8 x i1> 6654 %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6655 %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2 6656 %5 = bitcast <4 x i32> %4 to <2 x i64> 6657 ret <2 x i64> %5 6658 } 6659 6660 define <2 x i64> @test_mm_maskz_rol_epi32(i8 zeroext %__U, <2 x i64> %__A) { 6661 ; X86-LABEL: test_mm_maskz_rol_epi32: 6662 ; X86: # %bb.0: # %entry 6663 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6664 ; X86-NEXT: kmovw %eax, %k1 6665 ; X86-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z} 6666 ; X86-NEXT: retl 6667 ; 6668 ; X64-LABEL: test_mm_maskz_rol_epi32: 6669 ; X64: # %bb.0: # %entry 6670 ; X64-NEXT: kmovw %edi, %k1 6671 ; X64-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z} 6672 ; X64-NEXT: retq 6673 entry: 6674 %0 = bitcast <2 x i64> %__A to <4 x i32> 6675 %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5) 6676 %2 = bitcast i8 %__U to <8 x i1> 6677 %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6678 %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer 6679 %4 = bitcast <4 x i32> %3 to <2 x i64> 6680 ret <2 x i64> %4 6681 } 6682 6683 define <4 x i64> @test_mm256_rol_epi32(<4 x i64> %__A) { 6684 ; CHECK-LABEL: test_mm256_rol_epi32: 6685 ; CHECK: # %bb.0: # %entry 6686 ; CHECK-NEXT: vprold $5, %ymm0, %ymm0 6687 ; CHECK-NEXT: ret{{[l|q]}} 6688 entry: 6689 %0 = bitcast <4 x i64> %__A to <8 x i32> 6690 %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5) 6691 %2 = bitcast <8 x i32> %1 to <4 x i64> 6692 ret <4 x i64> %2 6693 } 6694 6695 
declare <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32>, i32) 6696 6697 define <4 x i64> @test_mm256_mask_rol_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) { 6698 ; X86-LABEL: test_mm256_mask_rol_epi32: 6699 ; X86: # %bb.0: # %entry 6700 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6701 ; X86-NEXT: kmovw %eax, %k1 6702 ; X86-NEXT: vprold $5, %ymm1, %ymm0 {%k1} 6703 ; X86-NEXT: retl 6704 ; 6705 ; X64-LABEL: test_mm256_mask_rol_epi32: 6706 ; X64: # %bb.0: # %entry 6707 ; X64-NEXT: kmovw %edi, %k1 6708 ; X64-NEXT: vprold $5, %ymm1, %ymm0 {%k1} 6709 ; X64-NEXT: retq 6710 entry: 6711 %0 = bitcast <4 x i64> %__A to <8 x i32> 6712 %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5) 6713 %2 = bitcast <4 x i64> %__W to <8 x i32> 6714 %3 = bitcast i8 %__U to <8 x i1> 6715 %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2 6716 %5 = bitcast <8 x i32> %4 to <4 x i64> 6717 ret <4 x i64> %5 6718 } 6719 6720 define <4 x i64> @test_mm256_maskz_rol_epi32(i8 zeroext %__U, <4 x i64> %__A) { 6721 ; X86-LABEL: test_mm256_maskz_rol_epi32: 6722 ; X86: # %bb.0: # %entry 6723 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6724 ; X86-NEXT: kmovw %eax, %k1 6725 ; X86-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z} 6726 ; X86-NEXT: retl 6727 ; 6728 ; X64-LABEL: test_mm256_maskz_rol_epi32: 6729 ; X64: # %bb.0: # %entry 6730 ; X64-NEXT: kmovw %edi, %k1 6731 ; X64-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z} 6732 ; X64-NEXT: retq 6733 entry: 6734 %0 = bitcast <4 x i64> %__A to <8 x i32> 6735 %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5) 6736 %2 = bitcast i8 %__U to <8 x i1> 6737 %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer 6738 %4 = bitcast <8 x i32> %3 to <4 x i64> 6739 ret <4 x i64> %4 6740 } 6741 6742 define <2 x i64> @test_mm_rol_epi64(<2 x i64> %__A) { 6743 ; CHECK-LABEL: test_mm_rol_epi64: 6744 ; CHECK: # %bb.0: # %entry 6745 ; CHECK-NEXT: vprolq $5, %xmm0, %xmm0 6746 ; CHECK-NEXT: ret{{[l|q]}} 6747 entry: 6748 %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5) 6749 ret <2 x i64> %0 6750 } 6751 6752 declare <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64>, i32) 6753 6754 define <2 x i64> @test_mm_mask_rol_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) { 6755 ; X86-LABEL: test_mm_mask_rol_epi64: 6756 ; X86: # %bb.0: # %entry 6757 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6758 ; X86-NEXT: kmovw %eax, %k1 6759 ; X86-NEXT: vprolq $5, %xmm1, %xmm0 {%k1} 6760 ; X86-NEXT: retl 6761 ; 6762 ; X64-LABEL: test_mm_mask_rol_epi64: 6763 ; X64: # %bb.0: # %entry 6764 ; X64-NEXT: kmovw %edi, %k1 6765 ; X64-NEXT: vprolq $5, %xmm1, %xmm0 {%k1} 6766 ; X64-NEXT: retq 6767 entry: 6768 %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5) 6769 %1 = bitcast i8 %__U to <8 x i1> 6770 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6771 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W 6772 ret <2 x i64> %2 6773 } 6774 6775 define <2 x i64> @test_mm_maskz_rol_epi64(i8 zeroext %__U, <2 x i64> %__A) { 6776 ; X86-LABEL: test_mm_maskz_rol_epi64: 6777 ; X86: # %bb.0: # %entry 6778 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6779 ; X86-NEXT: kmovw %eax, %k1 6780 ; X86-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z} 6781 ; X86-NEXT: retl 6782 ; 6783 ; X64-LABEL: test_mm_maskz_rol_epi64: 6784 ; X64: # %bb.0: # %entry 6785 ; X64-NEXT: kmovw %edi, %k1 6786 ; X64-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z} 6787 ; X64-NEXT: retq 6788 entry: 6789 %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5) 6790 
%1 = bitcast i8 %__U to <8 x i1> 6791 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6792 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer 6793 ret <2 x i64> %2 6794 } 6795 6796 define <4 x i64> @test_mm256_rol_epi64(<4 x i64> %__A) { 6797 ; CHECK-LABEL: test_mm256_rol_epi64: 6798 ; CHECK: # %bb.0: # %entry 6799 ; CHECK-NEXT: vprolq $5, %ymm0, %ymm0 6800 ; CHECK-NEXT: ret{{[l|q]}} 6801 entry: 6802 %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5) 6803 ret <4 x i64> %0 6804 } 6805 6806 declare <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64>, i32) 6807 6808 define <4 x i64> @test_mm256_mask_rol_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) { 6809 ; X86-LABEL: test_mm256_mask_rol_epi64: 6810 ; X86: # %bb.0: # %entry 6811 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6812 ; X86-NEXT: kmovw %eax, %k1 6813 ; X86-NEXT: vprolq $5, %ymm1, %ymm0 {%k1} 6814 ; X86-NEXT: retl 6815 ; 6816 ; X64-LABEL: test_mm256_mask_rol_epi64: 6817 ; X64: # %bb.0: # %entry 6818 ; X64-NEXT: kmovw %edi, %k1 6819 ; X64-NEXT: vprolq $5, %ymm1, %ymm0 {%k1} 6820 ; X64-NEXT: retq 6821 entry: 6822 %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5) 6823 %1 = bitcast i8 %__U to <8 x i1> 6824 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6825 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W 6826 ret <4 x i64> %2 6827 } 6828 6829 define <4 x i64> @test_mm256_maskz_rol_epi64(i8 zeroext %__U, <4 x i64> %__A) { 6830 ; X86-LABEL: test_mm256_maskz_rol_epi64: 6831 ; X86: # %bb.0: # %entry 6832 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6833 ; X86-NEXT: kmovw %eax, %k1 6834 ; X86-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z} 6835 ; X86-NEXT: retl 6836 ; 6837 ; X64-LABEL: test_mm256_maskz_rol_epi64: 6838 ; X64: # %bb.0: # %entry 6839 ; X64-NEXT: kmovw %edi, %k1 6840 ; X64-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z} 6841 ; X64-NEXT: retq 6842 entry: 6843 %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5) 6844 %1 = bitcast i8 %__U to <8 x i1> 6845 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6846 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer 6847 ret <4 x i64> %2 6848 } 6849 6850 define <2 x i64> @test_mm_rolv_epi32(<2 x i64> %__A, <2 x i64> %__B) { 6851 ; CHECK-LABEL: test_mm_rolv_epi32: 6852 ; CHECK: # %bb.0: # %entry 6853 ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0 6854 ; CHECK-NEXT: ret{{[l|q]}} 6855 entry: 6856 %0 = bitcast <2 x i64> %__A to <4 x i32> 6857 %1 = bitcast <2 x i64> %__B to <4 x i32> 6858 %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1) 6859 %3 = bitcast <4 x i32> %2 to <2 x i64> 6860 ret <2 x i64> %3 6861 } 6862 6863 define <2 x i64> @test_mm_mask_rolv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 6864 ; X86-LABEL: test_mm_mask_rolv_epi32: 6865 ; X86: # %bb.0: # %entry 6866 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6867 ; X86-NEXT: kmovw %eax, %k1 6868 ; X86-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1} 6869 ; X86-NEXT: retl 6870 ; 6871 ; X64-LABEL: test_mm_mask_rolv_epi32: 6872 ; X64: # %bb.0: # %entry 6873 ; X64-NEXT: kmovw %edi, %k1 6874 ; X64-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1} 6875 ; X64-NEXT: retq 6876 entry: 6877 %0 = bitcast <2 x i64> %__A to <4 x i32> 6878 %1 = bitcast <2 x i64> %__B to <4 x i32> 6879 %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1) 6880 %3 = bitcast <2 x i64> %__W to 
<4 x i32> 6881 %4 = bitcast i8 %__U to <8 x i1> 6882 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6883 %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3 6884 %6 = bitcast <4 x i32> %5 to <2 x i64> 6885 ret <2 x i64> %6 6886 } 6887 6888 define <2 x i64> @test_mm_maskz_rolv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 6889 ; X86-LABEL: test_mm_maskz_rolv_epi32: 6890 ; X86: # %bb.0: # %entry 6891 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6892 ; X86-NEXT: kmovw %eax, %k1 6893 ; X86-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z} 6894 ; X86-NEXT: retl 6895 ; 6896 ; X64-LABEL: test_mm_maskz_rolv_epi32: 6897 ; X64: # %bb.0: # %entry 6898 ; X64-NEXT: kmovw %edi, %k1 6899 ; X64-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z} 6900 ; X64-NEXT: retq 6901 entry: 6902 %0 = bitcast <2 x i64> %__A to <4 x i32> 6903 %1 = bitcast <2 x i64> %__B to <4 x i32> 6904 %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1) 6905 %3 = bitcast i8 %__U to <8 x i1> 6906 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6907 %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer 6908 %5 = bitcast <4 x i32> %4 to <2 x i64> 6909 ret <2 x i64> %5 6910 } 6911 6912 define <4 x i64> @test_mm256_rolv_epi32(<4 x i64> %__A, <4 x i64> %__B) { 6913 ; CHECK-LABEL: test_mm256_rolv_epi32: 6914 ; CHECK: # %bb.0: # %entry 6915 ; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0 6916 ; CHECK-NEXT: ret{{[l|q]}} 6917 entry: 6918 %0 = bitcast <4 x i64> %__A to <8 x i32> 6919 %1 = bitcast <4 x i64> %__B to <8 x i32> 6920 %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1) 6921 %3 = bitcast <8 x i32> %2 to <4 x i64> 6922 ret <4 x i64> %3 6923 } 6924 6925 define <4 x i64> @test_mm256_mask_rolv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 6926 ; X86-LABEL: test_mm256_mask_rolv_epi32: 6927 ; X86: # %bb.0: # %entry 6928 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6929 ; X86-NEXT: kmovw %eax, %k1 6930 ; X86-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1} 6931 ; X86-NEXT: retl 6932 ; 6933 ; X64-LABEL: test_mm256_mask_rolv_epi32: 6934 ; X64: # %bb.0: # %entry 6935 ; X64-NEXT: kmovw %edi, %k1 6936 ; X64-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1} 6937 ; X64-NEXT: retq 6938 entry: 6939 %0 = bitcast <4 x i64> %__A to <8 x i32> 6940 %1 = bitcast <4 x i64> %__B to <8 x i32> 6941 %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1) 6942 %3 = bitcast <4 x i64> %__W to <8 x i32> 6943 %4 = bitcast i8 %__U to <8 x i1> 6944 %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3 6945 %6 = bitcast <8 x i32> %5 to <4 x i64> 6946 ret <4 x i64> %6 6947 } 6948 6949 define <4 x i64> @test_mm256_maskz_rolv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 6950 ; X86-LABEL: test_mm256_maskz_rolv_epi32: 6951 ; X86: # %bb.0: # %entry 6952 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6953 ; X86-NEXT: kmovw %eax, %k1 6954 ; X86-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z} 6955 ; X86-NEXT: retl 6956 ; 6957 ; X64-LABEL: test_mm256_maskz_rolv_epi32: 6958 ; X64: # %bb.0: # %entry 6959 ; X64-NEXT: kmovw %edi, %k1 6960 ; X64-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z} 6961 ; X64-NEXT: retq 6962 entry: 6963 %0 = bitcast <4 x i64> %__A to <8 x i32> 6964 %1 = bitcast <4 x i64> %__B to <8 x i32> 6965 %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1) 6966 %3 = bitcast i8 %__U to <8 x i1> 6967 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> 
zeroinitializer 6968 %5 = bitcast <8 x i32> %4 to <4 x i64> 6969 ret <4 x i64> %5 6970 } 6971 6972 define <2 x i64> @test_mm_rolv_epi64(<2 x i64> %__A, <2 x i64> %__B) { 6973 ; CHECK-LABEL: test_mm_rolv_epi64: 6974 ; CHECK: # %bb.0: # %entry 6975 ; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0 6976 ; CHECK-NEXT: ret{{[l|q]}} 6977 entry: 6978 %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B) 6979 ret <2 x i64> %0 6980 } 6981 6982 define <2 x i64> @test_mm_mask_rolv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 6983 ; X86-LABEL: test_mm_mask_rolv_epi64: 6984 ; X86: # %bb.0: # %entry 6985 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6986 ; X86-NEXT: kmovw %eax, %k1 6987 ; X86-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1} 6988 ; X86-NEXT: retl 6989 ; 6990 ; X64-LABEL: test_mm_mask_rolv_epi64: 6991 ; X64: # %bb.0: # %entry 6992 ; X64-NEXT: kmovw %edi, %k1 6993 ; X64-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1} 6994 ; X64-NEXT: retq 6995 entry: 6996 %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B) 6997 %1 = bitcast i8 %__U to <8 x i1> 6998 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6999 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W 7000 ret <2 x i64> %2 7001 } 7002 7003 define <2 x i64> @test_mm_maskz_rolv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 7004 ; X86-LABEL: test_mm_maskz_rolv_epi64: 7005 ; X86: # %bb.0: # %entry 7006 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7007 ; X86-NEXT: kmovw %eax, %k1 7008 ; X86-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z} 7009 ; X86-NEXT: retl 7010 ; 7011 ; X64-LABEL: test_mm_maskz_rolv_epi64: 7012 ; X64: # %bb.0: # %entry 7013 ; X64-NEXT: kmovw %edi, %k1 7014 ; X64-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z} 7015 ; X64-NEXT: retq 7016 entry: 7017 %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B) 7018 %1 = bitcast i8 %__U to <8 x i1> 7019 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 7020 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer 7021 ret <2 x i64> %2 7022 } 7023 7024 define <4 x i64> @test_mm256_rolv_epi64(<4 x i64> %__A, <4 x i64> %__B) { 7025 ; CHECK-LABEL: test_mm256_rolv_epi64: 7026 ; CHECK: # %bb.0: # %entry 7027 ; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0 7028 ; CHECK-NEXT: ret{{[l|q]}} 7029 entry: 7030 %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B) 7031 ret <4 x i64> %0 7032 } 7033 7034 define <4 x i64> @test_mm256_mask_rolv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 7035 ; X86-LABEL: test_mm256_mask_rolv_epi64: 7036 ; X86: # %bb.0: # %entry 7037 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7038 ; X86-NEXT: kmovw %eax, %k1 7039 ; X86-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1} 7040 ; X86-NEXT: retl 7041 ; 7042 ; X64-LABEL: test_mm256_mask_rolv_epi64: 7043 ; X64: # %bb.0: # %entry 7044 ; X64-NEXT: kmovw %edi, %k1 7045 ; X64-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1} 7046 ; X64-NEXT: retq 7047 entry: 7048 %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B) 7049 %1 = bitcast i8 %__U to <8 x i1> 7050 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7051 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W 7052 ret <4 x i64> %2 7053 } 7054 7055 define <4 x i64> @test_mm256_maskz_rolv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 7056 ; X86-LABEL: 

define <4 x i64> @test_mm256_maskz_rolv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}
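; Rotate-by-immediate tests: the rotation count (5 here) is passed to the
; pror intrinsics as an i32 constant and is expected to fold into the
; instruction encoding as the $5 immediate of vprord/vprorq.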

define <2 x i64> @test_mm_ror_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_ror_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprord $5, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

declare <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32>, i32)

define <2 x i64> @test_mm_mask_ror_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast <2 x i64> %__W to <4 x i32>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  ret <2 x i64> %5
}

define <2 x i64> @test_mm_maskz_ror_epi32(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <4 x i64> @test_mm256_ror_epi32(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_ror_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprord $5, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

declare <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32>, i32)

define <4 x i64> @test_mm256_mask_ror_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast <4 x i64> %__W to <8 x i32>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
  %5 = bitcast <8 x i32> %4 to <4 x i64>
  ret <4 x i64> %5
}

define <4 x i64> @test_mm256_maskz_ror_epi32(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <2 x i64> @test_mm_ror_epi64(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_ror_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorq $5, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  ret <2 x i64> %0
}

declare <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64>, i32)

define <2 x i64> @test_mm_mask_ror_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_ror_epi64(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_ror_epi64(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_ror_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorq $5, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  ret <4 x i64> %0
}

declare <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64>, i32)

define <4 x i64> @test_mm256_mask_ror_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_ror_epi64(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}
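; Variable rotate-right (prorv) tests follow, mirroring the prolv tests above
; but selecting vprorvd/vprorvq.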

define <2 x i64> @test_mm_rorv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rorv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm_mask_rorv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast <2 x i64> %__W to <4 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_rorv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  ret <2 x i64> %5
}

define <4 x i64> @test_mm256_rorv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rorv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_mask_rorv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <4 x i64> %__W to <8 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_rorv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  %5 = bitcast <8 x i32> %4 to <4 x i64>
  ret <4 x i64> %5
}

define <2 x i64> @test_mm_rorv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rorv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  ret <2 x i64> %0
}

define <2 x i64> @test_mm_mask_rorv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_rorv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_rorv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rorv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_rorv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_rorv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}
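; Declarations for the intrinsics exercised in this file.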

declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>)
declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)
declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8)
declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>)
declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8)
declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8)
declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>)
declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
declare <2 x double> @llvm.masked.expandload.v2f64(double*, <2 x i1>, <2 x double>)
declare <4 x double> @llvm.masked.expandload.v4f64(double*, <4 x i1>, <4 x double>)
declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
declare <4 x i64> @llvm.masked.expandload.v4i64(i64*, <4 x i1>, <4 x i64>)
declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.expandload.v8f32(float*, <8 x i1>, <8 x float>)
declare <4 x i32> @llvm.masked.expandload.v4i32(i32*, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.expandload.v8i32(i32*, <8 x i1>, <8 x i32>)
declare void @llvm.masked.compressstore.v2f64(<2 x double>, double*, <2 x i1>)
declare void @llvm.masked.compressstore.v4f64(<4 x double>, double*, <4 x i1>)
declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)
declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64*, <4 x i1>)
declare void @llvm.masked.compressstore.v4f32(<4 x float>, float*, <4 x i1>)
declare void @llvm.masked.compressstore.v8f32(<8 x float>, float*, <8 x i1>)
declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32*, <4 x i1>)
declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32*, <8 x i1>)
declare <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32>, <8 x i32>)
declare <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64>, <4 x i64>)
declare <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32>, <8 x i32>)
declare <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64>, <4 x i64>)

!0 = !{i32 1}