; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c


define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_kunpackb:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vmovdqa64 136(%ebp), %zmm3
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: kunpckbw %k0, %k1, %k1
; X86-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kunpackb:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: kunpckbw %k0, %k1, %k1
; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__E to <16 x i32>
  %1 = bitcast <8 x i64> %__F to <16 x i32>
  %2 = bitcast <8 x i64> %__A to <16 x i32>
  %3 = bitcast <8 x i64> %__B to <16 x i32>
  %4 = icmp ne <16 x i32> %2, %3
  %5 = bitcast <8 x i64> %__C to <16 x i32>
  %6 = bitcast <8 x i64> %__D to <16 x i32>
  %7 = icmp ne <16 x i32> %5, %6
  %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %11 = icmp ne <16 x i32> %0, %1
  %12 = and <16 x i1> %11, %10
  %13 = bitcast <16 x i1> %12 to i16
  ret i16 %13
}
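; The kortest tests OR two compare results with korw and then check the
; combined mask against all-ones (kortestc) or all-zeros (kortestz).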
define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestc:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: korw %k0, %k1, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: cmpw $-1, %ax
; X86-NEXT: sete %al
; X86-NEXT: andb $1, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kortestc:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: korw %k0, %k1, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: cmpw $-1, %ax
; X64-NEXT: sete %al
; X64-NEXT: andb $1, %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, -1
  %9 = zext i1 %8 to i32
  ret i32 %9
}

define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestz:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: korw %k0, %k1, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: cmpw $0, %ax
; X86-NEXT: sete %al
; X86-NEXT: andb $1, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kortestz:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: korw %k0, %k1, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: cmpw $0, %ax
; X64-NEXT: sete %al
; X64-NEXT: andb $1, %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, 0
  %9 = zext i1 %8 to i32
  ret i32 %9
}
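; The shuffle_f32x4/f64x2 tests select whole 128-bit lanes from the two
; sources; the shufflevector mask should fold to a single vshuff32x4 or
; vshuff64x2 with an immediate lane selector.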
define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  ret <16 x float> %shuffle
}


define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
  ret <16 x float> %1
}

define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
  ret <16 x float> %1
}

define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x double> %shuffle
}

define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
  ret <8 x double> %1
}
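; Integer versions of the lane shuffles. Without a write-mask the element
; width does not matter, so the unmasked i32x4 case is free to lower to the
; vshufi64x2 encoding.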
define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast <8 x i64> %__W to <16 x i32>
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}
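; The test/testn tests AND the operands and compare each element against
; zero (icmp ne for test, icmp eq for testn), which should select vptestm
; and vptestnm; the mask_ variants AND with the caller's mask, which folds
; into the {%k1} operand.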
define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast <16 x i1> %1 to i16
  ret i16 %2
}

define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_testn_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast <8 x i1> %0 to i8
  ret i8 %1
}

define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_testn_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_test_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp ne <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_test_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}
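; The set1 tests broadcast a GPR value. On X86 a 64-bit element must first
; be assembled in an xmm register (vmovd + vpinsrd) since there is no
; 64-bit GPR to broadcast from.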
define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}
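; Register broadcasts from the low element of an xmm. Without a write-mask
; the integer broadcasts can use the FP-domain vbroadcastss/vbroadcastsd;
; the masked forms must keep the element width (vpbroadcastd/vpbroadcastq).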
define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
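; Zeroing form of the broadcast above: selecting against zeroinitializer
; maps to the {z} variant of the masked instruction.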
define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastq_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastq_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}
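; FP broadcasts of the low double/float, in plain, merge-masked ({%k1}) and
; zero-masked ({%k1} {z}) forms.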
define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm512_broadcastsd_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
; X86-LABEL: test_mm512_mask_broadcastsd_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastsd_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm512_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm512_mask_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}
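; The movedup/movehdup/moveldup tests duplicate the even doubles
; (vmovddup), odd floats (vmovshdup) and even floats (vmovsldup).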
define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_movedup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_movedup_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movedup_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_movedup_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movedup_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}
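; The permute tests are in-lane permutes with an immediate
; (vpermilpd/vpermilps).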
define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permute_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permute_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permute_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_permute_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_permute_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_permute_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}
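; The permutex tests permute 64-bit elements across lanes. The unmasked
; integer case can use the FP-domain vpermpd; the masked integer forms use
; vpermq.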
define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_permutex_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_permutex_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permutex_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permutex_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}
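; Per-lane dword shuffles. The unmasked case can lower to the FP-domain
; vpermilps; the masked forms use vpshufd.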
define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_shuffle_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_shuffle_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_shuffle_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
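; The shuffle_pd tests pick one double per position from either source
; within each 128-bit lane (vshufpd with an immediate).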
define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_shuffle_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_shuffle_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}
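; The unpackhi tests interleave the high elements of each 128-bit lane.
; Unmasked integer cases can use the FP-domain vunpckhps/vunpckhpd; the
; masked forms use vpunpckhdq/vpunpckhqdq.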
define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}
define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}
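; The unpacklo tests mirror the unpackhi ones, interleaving the low
; elements of each 128-bit lane.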
define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x double> %res
}
define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}
define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_ps:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextpd128_pd512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextpd256_pd512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextps128_ps512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextps256_ps512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %res
}

define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextsi128_si512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextsi256_si512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %res
}
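; NOTE: In the IR below (matching the clang builtin output), the shl/ashr-by-32
; pairs sign-extend the low 32 bits of each 64-bit lane; the backend folds the
; whole pattern into a single vpmuldq.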
define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
; CHECK-LABEL: test_mm512_mul_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0
; CHECK-NEXT: vpsraq $32, %zmm0, %zmm0
; CHECK-NEXT: vpsllq $32, %zmm1, %zmm1
; CHECK-NEXT: vpsraq $32, %zmm1, %zmm1
; CHECK-NEXT: vpmuldq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
  ret <8 x i64> %tmp4
}

define <8 x i64> @test_mm512_maskz_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
; X86-LABEL: test_mm512_maskz_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %4 = mul nsw <8 x i64> %3, %1
  %5 = bitcast i8 %__k to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_mask_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
; X86-LABEL: test_mm512_mask_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
entry:
  %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %4 = mul nsw <8 x i64> %3, %1
  %5 = bitcast i8 %__k to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %__src
  ret <8 x i64> %6
}
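; NOTE: For _mm512_mul_epu32 the 'and' with 0xffffffff is lowered as zero-masked
; moves: 0xAAAA selects the odd dword lanes, knotw flips it to the even (low)
; lanes, so only those survive into vpmuludq.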
define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
; CHECK-LABEL: test_mm512_mul_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA
; CHECK-NEXT: kmovw %eax, %k0
; CHECK-NEXT: knotw %k0, %k1
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
  ret <8 x i64> %tmp2
}

define <8 x i64> @test_mm512_maskz_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
; X86-LABEL: test_mm512_maskz_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %2 = mul nuw <8 x i64> %1, %0
  %3 = bitcast i8 %__k to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_mask_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
; X86-LABEL: test_mm512_mask_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
entry:
  %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %2 = mul nuw <8 x i64> %1, %0
  %3 = bitcast i8 %__k to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %__src
  ret <8 x i64> %4
}

define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind {
; X86-LABEL: test_mm512_set1_epi8:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpbroadcastb %xmm0, %ymm0
; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_set1_epi8:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpbroadcastb %xmm0, %ymm0
; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-NEXT: retq
entry:
  %vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0
  %vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast <64 x i8> %vecinit63.i to <8 x double>
  ret <8 x double> %0
}
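; NOTE: Only x86-64 has native 64-bit unsigned conversions (vcvtusi2sdq/ssq);
; the i386 versions of the 64-bit tests below expand uitofp i64 with SSE or x87
; sequences instead.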
define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_cvtu32_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0
; X64-NEXT: retq
entry:
  %conv.i = uitofp i32 %__B to double
  %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; X86-NEXT: vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
; X86-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_cvtu64_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: vcvtusi2sdq %rdi, %xmm0, %xmm0
; X64-NEXT: retq
entry:
  %conv.i = uitofp i64 %__B to double
  %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_cvtu32_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0
; X64-NEXT: retq
entry:
  %conv.i = uitofp i32 %__B to float
  %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 12(%ebp), %eax
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
; X86-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setns %cl
; X86-NEXT: fildll {{[0-9]+}}(%esp)
; X86-NEXT: fadds {{\.LCPI.*}}(,%ecx,4)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
;
; X64-LABEL: test_mm_cvtu64_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: vcvtusi2ssq %rdi, %xmm0, %xmm0
; X64-NEXT: retq
entry:
  %conv.i = uitofp i64 %__B to float
  %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
  ret <4 x float> %vecins.i
}

define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
; CHECK-LABEL: test_mm512_cvtps_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %conv.i = fpext <8 x float> %__A to <8 x double>
  ret <8 x double> %conv.i
}
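; NOTE: _mm512_cvtpslo_pd only converts the low 8 floats, so the extracting
; shufflevector folds away and the codegen matches _mm512_cvtps_pd above.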
define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_cvtpslo_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
  ret <8 x double> %conv.i.i
}

define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm512_mask_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %conv.i.i = fpext <8 x float> %__A to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_cvtpslo_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_cvtpslo_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm512_maskz_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %conv.i.i = fpext <8 x float> %__A to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer
  ret <8 x double> %1
}

define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi32_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovdb %zmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %conv.i = trunc <16 x i32> %0 to <16 x i8>
  %1 = bitcast <16 x i8> %conv.i to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_cvtepi32_epi8:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <2 x i64> %__O to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M)
  %3 = bitcast <16 x i8> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_cvtepi32_epi8:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M)
  %2 = bitcast <16 x i8> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi64_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqd %zmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %conv.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast <8 x i32> %conv.i to <4 x i64>
  ret <4 x i64> %0
}

define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi64_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_cvtepi64_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast <4 x i64> %__O to <8 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %conv.i.i, <8 x i32> %0
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi64_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_cvtepi64_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi64_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqw %zmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %conv.i = trunc <8 x i64> %__A to <8 x i16>
  %0 = bitcast <8 x i16> %conv.i to <2 x i64>
  ret <2 x i64> %0
}
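; NOTE: The masked epi64->epi16 truncations below go through the
; @llvm.x86.avx512.mask.pmov.qw.512 intrinsic (declared after these tests)
; rather than plain trunc+select.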
define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi64_epi16:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_cvtepi64_epi16:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__O to <8 x i16>
  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M)
  %2 = bitcast <8 x i16> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M)
  %1 = bitcast <8 x i16> %0 to <2 x i64>
  ret <2 x i64> %1
}

declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
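; NOTE: vpternlog takes an 8-bit immediate that encodes the full truth table of
; its three sources; all of the ternarylogic tests below use imm 4.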
define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; CHECK-LABEL: test_mm512_ternarylogic_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1

define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; CHECK-LABEL: test_mm512_ternarylogic_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  ret <8 x i64> %0
}

declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1

define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_mask_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_maskz_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}
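; NOTE: The mask2_ variants merge into the index operand, so they select the
; vpermi2* instructions; the plain/mask/maskz variants further down merge into
; the first source and select vpermt2* instead.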
declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)

define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)

define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovapd %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovapd %zmm1, %zmm0
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast <8 x i64> %__I to <8 x double>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1
  ret <8 x double> %3
}

declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)

define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovaps %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovaps %zmm1, %zmm0
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast <8 x i64> %__I to <16 x float>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
  ret <16 x float> %4
}

declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)

define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__I
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  ret <8 x double> %0
}
define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}
define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}
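; NOTE: The masked scalar add/sub/mul tests below test bit 0 of the mask in IR
; (and i8 %__U, 1 / icmp / select); codegen still folds this to a single masked
; scalar op.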
define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_add_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_add_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_add_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_add_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_add_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_add_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_add_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_add_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_sub_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_sub_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}
define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_sub_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_sub_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_sub_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_sub_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_sub_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_sub_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}
%vecext.i.i = extractelement <4 x float> %__B, i32 0 2778 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2779 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i 2780 %0 = and i8 %__U, 1 2781 %tobool.i = icmp eq i8 %0, 0 2782 %vecext1.i = extractelement <4 x float> %__W, i32 0 2783 %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i 2784 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2785 ret <4 x float> %vecins.i 2786 } 2787 2788 define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2789 ; X86-LABEL: test_mm_maskz_mul_ss: 2790 ; X86: # %bb.0: # %entry 2791 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2792 ; X86-NEXT: kmovw %eax, %k1 2793 ; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z} 2794 ; X86-NEXT: retl 2795 ; 2796 ; X64-LABEL: test_mm_maskz_mul_ss: 2797 ; X64: # %bb.0: # %entry 2798 ; X64-NEXT: kmovw %edi, %k1 2799 ; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z} 2800 ; X64-NEXT: retq 2801 entry: 2802 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2803 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2804 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i 2805 %0 = and i8 %__U, 1 2806 %tobool.i = icmp eq i8 %0, 0 2807 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i 2808 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2809 ret <4 x float> %vecins.i 2810 } 2811 2812 define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2813 ; X86-LABEL: test_mm_mask_mul_sd: 2814 ; X86: # %bb.0: # %entry 2815 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2816 ; X86-NEXT: kmovw %eax, %k1 2817 ; X86-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1} 2818 ; X86-NEXT: retl 2819 ; 2820 ; X64-LABEL: test_mm_mask_mul_sd: 2821 ; X64: # %bb.0: # %entry 2822 ; X64-NEXT: kmovw %edi, %k1 2823 ; X64-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1} 2824 ; X64-NEXT: retq 2825 entry: 2826 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2827 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2828 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i 2829 %0 = and i8 %__U, 1 2830 %tobool.i = icmp eq i8 %0, 0 2831 %vecext1.i = extractelement <2 x double> %__W, i32 0 2832 %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i 2833 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2834 ret <2 x double> %vecins.i 2835 } 2836 2837 define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2838 ; X86-LABEL: test_mm_maskz_mul_sd: 2839 ; X86: # %bb.0: # %entry 2840 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2841 ; X86-NEXT: kmovw %eax, %k1 2842 ; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2843 ; X86-NEXT: retl 2844 ; 2845 ; X64-LABEL: test_mm_maskz_mul_sd: 2846 ; X64: # %bb.0: # %entry 2847 ; X64-NEXT: kmovw %edi, %k1 2848 ; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2849 ; X64-NEXT: retq 2850 entry: 2851 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2852 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2853 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i 2854 %0 = and i8 %__U, 1 2855 %tobool.i = icmp eq i8 %0, 0 2856 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i 2857 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2858 ret <2 x double> %vecins.i 2859 } 2860 2861 define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2862 ; X86-LABEL: test_mm_mask_div_ss: 2863 ; X86: # %bb.0: # %entry 2864 ; 
X86-NEXT: movb {{[0-9]+}}(%esp), %al 2865 ; X86-NEXT: kmovw %eax, %k1 2866 ; X86-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1} 2867 ; X86-NEXT: retl 2868 ; 2869 ; X64-LABEL: test_mm_mask_div_ss: 2870 ; X64: # %bb.0: # %entry 2871 ; X64-NEXT: kmovw %edi, %k1 2872 ; X64-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1} 2873 ; X64-NEXT: retq 2874 entry: 2875 %0 = extractelement <4 x float> %__A, i64 0 2876 %1 = extractelement <4 x float> %__B, i64 0 2877 %2 = extractelement <4 x float> %__W, i64 0 2878 %3 = fdiv float %0, %1 2879 %4 = bitcast i8 %__U to <8 x i1> 2880 %5 = extractelement <8 x i1> %4, i64 0 2881 %6 = select i1 %5, float %3, float %2 2882 %7 = insertelement <4 x float> %__A, float %6, i64 0 2883 ret <4 x float> %7 2884 } 2885 2886 define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2887 ; X86-LABEL: test_mm_maskz_div_ss: 2888 ; X86: # %bb.0: # %entry 2889 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2890 ; X86-NEXT: kmovw %eax, %k1 2891 ; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z} 2892 ; X86-NEXT: retl 2893 ; 2894 ; X64-LABEL: test_mm_maskz_div_ss: 2895 ; X64: # %bb.0: # %entry 2896 ; X64-NEXT: kmovw %edi, %k1 2897 ; X64-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z} 2898 ; X64-NEXT: retq 2899 entry: 2900 %0 = extractelement <4 x float> %__A, i64 0 2901 %1 = extractelement <4 x float> %__B, i64 0 2902 %2 = fdiv float %0, %1 2903 %3 = bitcast i8 %__U to <8 x i1> 2904 %4 = extractelement <8 x i1> %3, i64 0 2905 %5 = select i1 %4, float %2, float 0.000000e+00 2906 %6 = insertelement <4 x float> %__A, float %5, i64 0 2907 ret <4 x float> %6 2908 } 2909 2910 define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2911 ; X86-LABEL: test_mm_mask_div_sd: 2912 ; X86: # %bb.0: # %entry 2913 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2914 ; X86-NEXT: kmovw %eax, %k1 2915 ; X86-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1} 2916 ; X86-NEXT: retl 2917 ; 2918 ; X64-LABEL: test_mm_mask_div_sd: 2919 ; X64: # %bb.0: # %entry 2920 ; X64-NEXT: kmovw %edi, %k1 2921 ; X64-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1} 2922 ; X64-NEXT: retq 2923 entry: 2924 %0 = extractelement <2 x double> %__A, i64 0 2925 %1 = extractelement <2 x double> %__B, i64 0 2926 %2 = extractelement <2 x double> %__W, i64 0 2927 %3 = fdiv double %0, %1 2928 %4 = bitcast i8 %__U to <8 x i1> 2929 %5 = extractelement <8 x i1> %4, i64 0 2930 %6 = select i1 %5, double %3, double %2 2931 %7 = insertelement <2 x double> %__A, double %6, i64 0 2932 ret <2 x double> %7 2933 } 2934 2935 define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2936 ; X86-LABEL: test_mm_maskz_div_sd: 2937 ; X86: # %bb.0: # %entry 2938 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2939 ; X86-NEXT: kmovw %eax, %k1 2940 ; X86-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2941 ; X86-NEXT: retl 2942 ; 2943 ; X64-LABEL: test_mm_maskz_div_sd: 2944 ; X64: # %bb.0: # %entry 2945 ; X64-NEXT: kmovw %edi, %k1 2946 ; X64-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2947 ; X64-NEXT: retq 2948 entry: 2949 %0 = extractelement <2 x double> %__A, i64 0 2950 %1 = extractelement <2 x double> %__B, i64 0 2951 %2 = fdiv double %0, %1 2952 %3 = bitcast i8 %__U to <8 x i1> 2953 %4 = extractelement <8 x i1> %3, i64 0 2954 %5 = select i1 %4, double %2, double 0.000000e+00 2955 %6 = insertelement <2 x double> %__A, double %5, i64 0 2956 ret <2 x double> %6 2957 } 2958 2959 2960 define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 2961 ; 
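; NOTE: The scalar mask/maskz tests above all share one IR shape: the operation
; is performed on element 0 only, and bit 0 of the i8 mask selects between the
; result and either the passthrough element (mask forms) or +0.0 (maskz forms).
; A minimal sketch of the maskz select step, with illustrative names %u and %r
; that do not appear in the tests:
;   %bit = and i8 %u, 1
;   %is.zero = icmp eq i8 %bit, 0
;   %res = select i1 %is.zero, float 0.000000e+00, float %r
; The div_ss/div_sd tests use the newer form of the same idea: bitcast the i8
; mask to <8 x i1>, extract bit 0, and select on that i1 directly.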
define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1

define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_fmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_fnmadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT: vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}
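; NOTE: The *_round_pd tests above all funnel into the
; llvm.x86.avx512.vfmadd.pd.512 intrinsic: the fmsub/fnmadd/fnmsub variants are
; expressed by negating operands with an fsub from -0.0, and masking is a plain
; vector select on the <8 x i1> bitcast of the i8 mask. The rounding argument
; i32 8 is what prints as {rn-sae} in the checks (round-to-nearest with
; exceptions suppressed, assuming the usual _MM_FROUND_TO_NEAREST_INT |
; _MM_FROUND_NO_EXC encoding). A sketch of the masked pattern, with
; illustrative names %a/%b/%c/%u:
;   %r = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 8)
;   %m = bitcast i8 %u to <8 x i1>
;   %sel = select <8 x i1> %m, <8 x double> %r, <8 x double> %a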
define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1

define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}
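; NOTE: The non-rounding pd tests above use the generic llvm.fma.v8f64
; intrinsic. Where an operand is negated (fmsub/fnmadd/fnmsub), the IR carries
; no dedicated intrinsic; the negation is just an fsub from a splat of -0.0,
; which the backend folds into an FMA opcode variant or a vpxorq with a
; broadcast sign-mask constant, as the checks show. The idiom (elided splat
; shown with "..." for brevity) is simply:
;   %neg = fsub <8 x double> <double -0.000000e+00, ...>, %c
;   %r = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %neg)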
define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_fmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fnmadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_fnmadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
ret <16 x float> %0
}

define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT: vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
ret <16 x float> %0
}
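; NOTE: The ps rounding tests mirror the pd ones: llvm.x86.avx512.vfmadd.ps.512
; with i32 8, an i16 mask bitcast to <16 x i1> for the select, and vpxord (or a
; broadcast -0.0 constant held in a register, for fnmsub) to realize the
; negated operands. Sketch of the maskz select, with illustrative %u and %r:
;   %m16 = bitcast i16 %u to <16 x i1>
;   %sel = select <16 x i1> %m16, <16 x float> %r, <16 x float> zeroinitializer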
define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmadd_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
ret <16 x float> %0
}

define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
ret <16 x float> %0
}

define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}
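; NOTE: fmaddsub/fmsubadd: the rounding forms below call
; llvm.x86.avx512.vfmaddsub.pd.512 directly, while the non-rounding forms are
; built from two llvm.fma calls (one with %__C negated) interleaved by a
; shufflevector that takes even lanes from the subtract result and odd lanes
; from the add result. Sketch with illustrative names (elided splat shown
; with "..."):
;   %add = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c)
;   %negc = fsub <8 x double> <double -0.000000e+00, ...>, %c
;   %sub = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %negc)
;   %res = shufflevector <8 x double> %sub, <8 x double> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>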
{ 3873 ; CHECK-LABEL: test_mm512_fmaddsub_round_pd: 3874 ; CHECK: # %bb.0: # %entry 3875 ; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3876 ; CHECK-NEXT: ret{{[l|q]}} 3877 entry: 3878 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3879 ret <8 x double> %0 3880 } 3881 3882 declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1 3883 3884 define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 3885 ; X86-LABEL: test_mm512_mask_fmaddsub_round_pd: 3886 ; X86: # %bb.0: # %entry 3887 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3888 ; X86-NEXT: kmovw %eax, %k1 3889 ; X86-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3890 ; X86-NEXT: retl 3891 ; 3892 ; X64-LABEL: test_mm512_mask_fmaddsub_round_pd: 3893 ; X64: # %bb.0: # %entry 3894 ; X64-NEXT: kmovw %edi, %k1 3895 ; X64-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3896 ; X64-NEXT: retq 3897 entry: 3898 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3899 %1 = bitcast i8 %__U to <8 x i1> 3900 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 3901 ret <8 x double> %2 3902 } 3903 3904 define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 3905 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd: 3906 ; X86: # %bb.0: # %entry 3907 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3908 ; X86-NEXT: kmovw %eax, %k1 3909 ; X86-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3910 ; X86-NEXT: vmovapd %zmm2, %zmm0 3911 ; X86-NEXT: retl 3912 ; 3913 ; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd: 3914 ; X64: # %bb.0: # %entry 3915 ; X64-NEXT: kmovw %edi, %k1 3916 ; X64-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3917 ; X64-NEXT: vmovapd %zmm2, %zmm0 3918 ; X64-NEXT: retq 3919 entry: 3920 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3921 %1 = bitcast i8 %__U to <8 x i1> 3922 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 3923 ret <8 x double> %2 3924 } 3925 3926 define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3927 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd: 3928 ; X86: # %bb.0: # %entry 3929 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3930 ; X86-NEXT: kmovw %eax, %k1 3931 ; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3932 ; X86-NEXT: retl 3933 ; 3934 ; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd: 3935 ; X64: # %bb.0: # %entry 3936 ; X64-NEXT: kmovw %edi, %k1 3937 ; X64-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3938 ; X64-NEXT: retq 3939 entry: 3940 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3941 %1 = bitcast i8 %__U to <8 x i1> 3942 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3943 ret <8 x double> %2 3944 } 3945 3946 define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3947 ; X86-LABEL: test_mm512_fmsubadd_round_pd: 3948 ; X86: # %bb.0: # %entry 3949 ; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2 3950 ; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3951 ; X86-NEXT: retl 
define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
%3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
ret <8 x double> %3
}
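
; NOTE: Without rounding control, fmaddsub is expressed as two llvm.fma
; calls, one on %__C and one on its negation, whose results are interleaved
; by a shufflevector: even lanes take the subtracted result, odd lanes the
; added one. The CHECK lines verify this pattern is matched back to a
; single vfmaddsub instruction.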
define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
%3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A
ret <8 x double> %5
}

define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
%3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C
ret <8 x double> %5
}

define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
%3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer
ret <8 x double> %5
}
define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A
ret <8 x double> %4
}

define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
ret <8 x double> %4
}
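
; NOTE: The <16 x float> tests that follow mirror the <8 x double> patterns
; above, using an i16 mask bitcast to <16 x i1> and 16-element -0.0 splats.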
define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1

define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}
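
; NOTE: As in the pd case, the unmasked fmsubadd_round_ps test folds the
; negation of %__C into a broadcast vpxord ({1to16}) of the sign-bit
; constant ahead of the vfmaddsub.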
define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}
define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A
ret <16 x float> %5
}

define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C
ret <16 x float> %5
}
define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer
ret <16 x float> %5
}

define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x float> %2
}
define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A
ret <16 x float> %4
}

define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
ret <16 x float> %4
}

define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}
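
; NOTE: The mask3 fmsub tests negate only the addend %__C. The rounding
; forms go through llvm.x86.avx512.vfmadd.{pd,ps}.512 with rounding
; argument i32 8 (printed as {rn-sae}); in each case the result is computed
; into zmm2 under {%k1} with %__C as passthrough and then copied to zmm0.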
define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}
define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C
ret <8 x double> %4
}

define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}
define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C
ret <16 x float> %4
}
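
; NOTE: The fnmadd tests below negate a multiplicand rather than the
; addend: a single fsub from the -0.0 splat feeds the first FMA operand.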
define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}
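
; NOTE: fnmsub negates both a multiplicand (%__B here) and the addend %__C,
; so two fsubs from the -0.0 splat feed the FMA.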
define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}
define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}
define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}
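
; NOTE: The remaining tests cover the masked scalar (_ss) forms, which
; operate on element 0 only. The non-rounding variants test bit 0 of the
; mask with an and/icmp pair and re-insert the selected scalar into the
; source vector; the rounding variants instead bitcast the i8 mask to
; <8 x i1> and extract element 0.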
define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__A, i64 0
%2 = extractelement <4 x float> %__B, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <4 x float> %__W, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %3
%vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__A, i64 0
%2 = extractelement <4 x float> %__B, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %0
%7 = insertelement <4 x float> %__W, float %6, i64 0
ret <4 x float> %7
}

declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1

define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%1 = extractelement <4 x float> %__B, i64 0
%2 = extractelement <4 x float> %__C, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}
define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%1 = extractelement <4 x float> %__B, i64 0
%2 = extractelement <4 x float> %__C, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float 0.000000e+00
%7 = insertelement <4 x float> %__A, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__X, i64 0
%2 = extractelement <4 x float> %__Y, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <4 x float> %__Y, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %3
%vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__X, i64 0
%2 = extractelement <4 x float> %__Y, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %2
%7 = insertelement <4 x float> %__Y, float %6, i64 0
ret <4 x float> %7
}
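
; NOTE: The scalar fmsub tests below negate the element-0 addend (the
; %.rhs extract) with a scalar fsub before the fma call; the mask handling
; matches the fmadd tests above.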
test_mm_mask3_fmadd_round_ss: 5003 ; X64: # %bb.0: # %entry 5004 ; X64-NEXT: kmovw %edi, %k1 5005 ; X64-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5006 ; X64-NEXT: vmovaps %xmm2, %xmm0 5007 ; X64-NEXT: retq 5008 entry: 5009 %0 = extractelement <4 x float> %__W, i64 0 5010 %1 = extractelement <4 x float> %__X, i64 0 5011 %2 = extractelement <4 x float> %__Y, i64 0 5012 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5013 %4 = bitcast i8 %__U to <8 x i1> 5014 %5 = extractelement <8 x i1> %4, i64 0 5015 %6 = select i1 %5, float %3, float %2 5016 %7 = insertelement <4 x float> %__Y, float %6, i64 0 5017 ret <4 x float> %7 5018 } 5019 5020 define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5021 ; X86-LABEL: test_mm_mask_fmsub_ss: 5022 ; X86: # %bb.0: # %entry 5023 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5024 ; X86-NEXT: kmovw %eax, %k1 5025 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5026 ; X86-NEXT: retl 5027 ; 5028 ; X64-LABEL: test_mm_mask_fmsub_ss: 5029 ; X64: # %bb.0: # %entry 5030 ; X64-NEXT: kmovw %edi, %k1 5031 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5032 ; X64-NEXT: retq 5033 entry: 5034 %0 = extractelement <4 x float> %__W, i64 0 5035 %1 = extractelement <4 x float> %__A, i64 0 5036 %.rhs.i = extractelement <4 x float> %__B, i64 0 5037 %2 = fsub float -0.000000e+00, %.rhs.i 5038 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5039 %4 = and i8 %__U, 1 5040 %tobool.i = icmp eq i8 %4, 0 5041 %vecext1.i = extractelement <4 x float> %__W, i32 0 5042 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5043 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0 5044 ret <4 x float> %vecins.i 5045 } 5046 5047 define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5048 ; X86-LABEL: test_mm_mask_fmsub_round_ss: 5049 ; X86: # %bb.0: # %entry 5050 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5051 ; X86-NEXT: kmovw %eax, %k1 5052 ; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5053 ; X86-NEXT: retl 5054 ; 5055 ; X64-LABEL: test_mm_mask_fmsub_round_ss: 5056 ; X64: # %bb.0: # %entry 5057 ; X64-NEXT: kmovw %edi, %k1 5058 ; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5059 ; X64-NEXT: retq 5060 entry: 5061 %0 = extractelement <4 x float> %__W, i64 0 5062 %1 = extractelement <4 x float> %__A, i64 0 5063 %.rhs = extractelement <4 x float> %__B, i64 0 5064 %2 = fsub float -0.000000e+00, %.rhs 5065 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5066 %4 = bitcast i8 %__U to <8 x i1> 5067 %5 = extractelement <8 x i1> %4, i64 0 5068 %6 = select i1 %5, float %3, float %0 5069 %7 = insertelement <4 x float> %__W, float %6, i64 0 5070 ret <4 x float> %7 5071 } 5072 5073 define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5074 ; X86-LABEL: test_mm_maskz_fmsub_ss: 5075 ; X86: # %bb.0: # %entry 5076 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5077 ; X86-NEXT: kmovw %eax, %k1 5078 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5079 ; X86-NEXT: retl 5080 ; 5081 ; X64-LABEL: test_mm_maskz_fmsub_ss: 5082 ; X64: # %bb.0: # %entry 5083 ; X64-NEXT: kmovw %edi, %k1 5084 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5085 ; X64-NEXT: retq 5086 entry: 5087 %0 = extractelement <4 x float> %__A, i64 0 5088 %1 = extractelement <4 x 
float> %__B, i64 0 5089 %.rhs.i = extractelement <4 x float> %__C, i64 0 5090 %2 = fsub float -0.000000e+00, %.rhs.i 5091 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5092 %4 = and i8 %__U, 1 5093 %tobool.i = icmp eq i8 %4, 0 5094 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3 5095 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 5096 ret <4 x float> %vecins.i 5097 } 5098 5099 define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5100 ; X86-LABEL: test_mm_maskz_fmsub_round_ss: 5101 ; X86: # %bb.0: # %entry 5102 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5103 ; X86-NEXT: kmovw %eax, %k1 5104 ; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5105 ; X86-NEXT: retl 5106 ; 5107 ; X64-LABEL: test_mm_maskz_fmsub_round_ss: 5108 ; X64: # %bb.0: # %entry 5109 ; X64-NEXT: kmovw %edi, %k1 5110 ; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5111 ; X64-NEXT: retq 5112 entry: 5113 %0 = extractelement <4 x float> %__A, i64 0 5114 %1 = extractelement <4 x float> %__B, i64 0 5115 %.rhs = extractelement <4 x float> %__C, i64 0 5116 %2 = fsub float -0.000000e+00, %.rhs 5117 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5118 %4 = bitcast i8 %__U to <8 x i1> 5119 %5 = extractelement <8 x i1> %4, i64 0 5120 %6 = select i1 %5, float %3, float 0.000000e+00 5121 %7 = insertelement <4 x float> %__A, float %6, i64 0 5122 ret <4 x float> %7 5123 } 5124 5125 define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5126 ; X86-LABEL: test_mm_mask3_fmsub_ss: 5127 ; X86: # %bb.0: # %entry 5128 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5129 ; X86-NEXT: kmovw %eax, %k1 5130 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 5131 ; X86-NEXT: vmovaps %xmm2, %xmm0 5132 ; X86-NEXT: retl 5133 ; 5134 ; X64-LABEL: test_mm_mask3_fmsub_ss: 5135 ; X64: # %bb.0: # %entry 5136 ; X64-NEXT: kmovw %edi, %k1 5137 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 5138 ; X64-NEXT: vmovaps %xmm2, %xmm0 5139 ; X64-NEXT: retq 5140 entry: 5141 %0 = extractelement <4 x float> %__W, i64 0 5142 %1 = extractelement <4 x float> %__X, i64 0 5143 %.rhs.i = extractelement <4 x float> %__Y, i64 0 5144 %2 = fsub float -0.000000e+00, %.rhs.i 5145 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5146 %4 = and i8 %__U, 1 5147 %tobool.i = icmp eq i8 %4, 0 5148 %vecext1.i = extractelement <4 x float> %__Y, i32 0 5149 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5150 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0 5151 ret <4 x float> %vecins.i 5152 } 5153 5154 define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5155 ; X86-LABEL: test_mm_mask3_fmsub_round_ss: 5156 ; X86: # %bb.0: # %entry 5157 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5158 ; X86-NEXT: kmovw %eax, %k1 5159 ; X86-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5160 ; X86-NEXT: vmovaps %xmm2, %xmm0 5161 ; X86-NEXT: retl 5162 ; 5163 ; X64-LABEL: test_mm_mask3_fmsub_round_ss: 5164 ; X64: # %bb.0: # %entry 5165 ; X64-NEXT: kmovw %edi, %k1 5166 ; X64-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5167 ; X64-NEXT: vmovaps %xmm2, %xmm0 5168 ; X64-NEXT: retq 5169 entry: 5170 %0 = extractelement <4 x float> %__W, i64 0 5171 %1 = extractelement <4 x float> %__X, i64 0 5172 %.rhs = extractelement <4 x float> %__Y, i64 0 
5173 %2 = fsub float -0.000000e+00, %.rhs 5174 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5175 %4 = bitcast i8 %__U to <8 x i1> 5176 %5 = extractelement <8 x i1> %4, i64 0 5177 %6 = select i1 %5, float %3, float %.rhs 5178 %7 = insertelement <4 x float> %__Y, float %6, i64 0 5179 ret <4 x float> %7 5180 } 5181 5182 define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5183 ; X86-LABEL: test_mm_mask_fnmadd_ss: 5184 ; X86: # %bb.0: # %entry 5185 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5186 ; X86-NEXT: kmovw %eax, %k1 5187 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5188 ; X86-NEXT: retl 5189 ; 5190 ; X64-LABEL: test_mm_mask_fnmadd_ss: 5191 ; X64: # %bb.0: # %entry 5192 ; X64-NEXT: kmovw %edi, %k1 5193 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5194 ; X64-NEXT: retq 5195 entry: 5196 %0 = extractelement <4 x float> %__W, i64 0 5197 %.rhs.i = extractelement <4 x float> %__A, i64 0 5198 %1 = fsub float -0.000000e+00, %.rhs.i 5199 %2 = extractelement <4 x float> %__B, i64 0 5200 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5201 %4 = and i8 %__U, 1 5202 %tobool.i = icmp eq i8 %4, 0 5203 %vecext1.i = extractelement <4 x float> %__W, i32 0 5204 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5205 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0 5206 ret <4 x float> %vecins.i 5207 } 5208 5209 define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5210 ; X86-LABEL: test_mm_mask_fnmadd_round_ss: 5211 ; X86: # %bb.0: # %entry 5212 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5213 ; X86-NEXT: kmovw %eax, %k1 5214 ; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5215 ; X86-NEXT: retl 5216 ; 5217 ; X64-LABEL: test_mm_mask_fnmadd_round_ss: 5218 ; X64: # %bb.0: # %entry 5219 ; X64-NEXT: kmovw %edi, %k1 5220 ; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5221 ; X64-NEXT: retq 5222 entry: 5223 %0 = extractelement <4 x float> %__W, i64 0 5224 %.rhs = extractelement <4 x float> %__A, i64 0 5225 %1 = fsub float -0.000000e+00, %.rhs 5226 %2 = extractelement <4 x float> %__B, i64 0 5227 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5228 %4 = bitcast i8 %__U to <8 x i1> 5229 %5 = extractelement <8 x i1> %4, i64 0 5230 %6 = select i1 %5, float %3, float %0 5231 %7 = insertelement <4 x float> %__W, float %6, i64 0 5232 ret <4 x float> %7 5233 } 5234 5235 define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5236 ; X86-LABEL: test_mm_maskz_fnmadd_ss: 5237 ; X86: # %bb.0: # %entry 5238 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5239 ; X86-NEXT: kmovw %eax, %k1 5240 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5241 ; X86-NEXT: retl 5242 ; 5243 ; X64-LABEL: test_mm_maskz_fnmadd_ss: 5244 ; X64: # %bb.0: # %entry 5245 ; X64-NEXT: kmovw %edi, %k1 5246 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5247 ; X64-NEXT: retq 5248 entry: 5249 %0 = extractelement <4 x float> %__A, i64 0 5250 %.rhs.i = extractelement <4 x float> %__B, i64 0 5251 %1 = fsub float -0.000000e+00, %.rhs.i 5252 %2 = extractelement <4 x float> %__C, i64 0 5253 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5254 %4 = and i8 %__U, 1 5255 %tobool.i = icmp eq i8 %4, 0 5256 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3 
5257 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 5258 ret <4 x float> %vecins.i 5259 } 5260 5261 define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5262 ; X86-LABEL: test_mm_maskz_fnmadd_round_ss: 5263 ; X86: # %bb.0: # %entry 5264 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5265 ; X86-NEXT: kmovw %eax, %k1 5266 ; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5267 ; X86-NEXT: retl 5268 ; 5269 ; X64-LABEL: test_mm_maskz_fnmadd_round_ss: 5270 ; X64: # %bb.0: # %entry 5271 ; X64-NEXT: kmovw %edi, %k1 5272 ; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5273 ; X64-NEXT: retq 5274 entry: 5275 %0 = extractelement <4 x float> %__A, i64 0 5276 %.rhs = extractelement <4 x float> %__B, i64 0 5277 %1 = fsub float -0.000000e+00, %.rhs 5278 %2 = extractelement <4 x float> %__C, i64 0 5279 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5280 %4 = bitcast i8 %__U to <8 x i1> 5281 %5 = extractelement <8 x i1> %4, i64 0 5282 %6 = select i1 %5, float %3, float 0.000000e+00 5283 %7 = insertelement <4 x float> %__A, float %6, i64 0 5284 ret <4 x float> %7 5285 } 5286 5287 define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5288 ; X86-LABEL: test_mm_mask3_fnmadd_ss: 5289 ; X86: # %bb.0: # %entry 5290 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5291 ; X86-NEXT: kmovw %eax, %k1 5292 ; X86-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 5293 ; X86-NEXT: vmovaps %xmm2, %xmm0 5294 ; X86-NEXT: retl 5295 ; 5296 ; X64-LABEL: test_mm_mask3_fnmadd_ss: 5297 ; X64: # %bb.0: # %entry 5298 ; X64-NEXT: kmovw %edi, %k1 5299 ; X64-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 5300 ; X64-NEXT: vmovaps %xmm2, %xmm0 5301 ; X64-NEXT: retq 5302 entry: 5303 %0 = extractelement <4 x float> %__W, i64 0 5304 %.rhs.i = extractelement <4 x float> %__X, i64 0 5305 %1 = fsub float -0.000000e+00, %.rhs.i 5306 %2 = extractelement <4 x float> %__Y, i64 0 5307 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5308 %4 = and i8 %__U, 1 5309 %tobool.i = icmp eq i8 %4, 0 5310 %vecext1.i = extractelement <4 x float> %__Y, i32 0 5311 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5312 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0 5313 ret <4 x float> %vecins.i 5314 } 5315 5316 define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5317 ; X86-LABEL: test_mm_mask3_fnmadd_round_ss: 5318 ; X86: # %bb.0: # %entry 5319 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5320 ; X86-NEXT: kmovw %eax, %k1 5321 ; X86-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5322 ; X86-NEXT: vmovaps %xmm2, %xmm0 5323 ; X86-NEXT: retl 5324 ; 5325 ; X64-LABEL: test_mm_mask3_fnmadd_round_ss: 5326 ; X64: # %bb.0: # %entry 5327 ; X64-NEXT: kmovw %edi, %k1 5328 ; X64-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5329 ; X64-NEXT: vmovaps %xmm2, %xmm0 5330 ; X64-NEXT: retq 5331 entry: 5332 %0 = extractelement <4 x float> %__W, i64 0 5333 %.rhs = extractelement <4 x float> %__X, i64 0 5334 %1 = fsub float -0.000000e+00, %.rhs 5335 %2 = extractelement <4 x float> %__Y, i64 0 5336 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5337 %4 = bitcast i8 %__U to <8 x i1> 5338 %5 = extractelement <8 x i1> %4, i64 0 5339 %6 = select i1 %5, float %3, float %2 5340 %7 = insertelement <4 x float> 
%__Y, float %6, i64 0 5341 ret <4 x float> %7 5342 } 5343 5344 define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5345 ; X86-LABEL: test_mm_mask_fnmsub_ss: 5346 ; X86: # %bb.0: # %entry 5347 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5348 ; X86-NEXT: kmovw %eax, %k1 5349 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 5350 ; X86-NEXT: retl 5351 ; 5352 ; X64-LABEL: test_mm_mask_fnmsub_ss: 5353 ; X64: # %bb.0: # %entry 5354 ; X64-NEXT: kmovw %edi, %k1 5355 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 5356 ; X64-NEXT: retq 5357 entry: 5358 %0 = extractelement <4 x float> %__W, i64 0 5359 %.rhs.i = extractelement <4 x float> %__A, i64 0 5360 %1 = fsub float -0.000000e+00, %.rhs.i 5361 %.rhs7.i = extractelement <4 x float> %__B, i64 0 5362 %2 = fsub float -0.000000e+00, %.rhs7.i 5363 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5364 %4 = and i8 %__U, 1 5365 %tobool.i = icmp eq i8 %4, 0 5366 %vecext2.i = extractelement <4 x float> %__W, i32 0 5367 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3 5368 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0 5369 ret <4 x float> %vecins.i 5370 } 5371 5372 define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5373 ; X86-LABEL: test_mm_mask_fnmsub_round_ss: 5374 ; X86: # %bb.0: # %entry 5375 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5376 ; X86-NEXT: kmovw %eax, %k1 5377 ; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5378 ; X86-NEXT: retl 5379 ; 5380 ; X64-LABEL: test_mm_mask_fnmsub_round_ss: 5381 ; X64: # %bb.0: # %entry 5382 ; X64-NEXT: kmovw %edi, %k1 5383 ; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5384 ; X64-NEXT: retq 5385 entry: 5386 %0 = extractelement <4 x float> %__W, i64 0 5387 %.rhs = extractelement <4 x float> %__A, i64 0 5388 %1 = fsub float -0.000000e+00, %.rhs 5389 %.rhs2 = extractelement <4 x float> %__B, i64 0 5390 %2 = fsub float -0.000000e+00, %.rhs2 5391 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5392 %4 = bitcast i8 %__U to <8 x i1> 5393 %5 = extractelement <8 x i1> %4, i64 0 5394 %6 = select i1 %5, float %3, float %0 5395 %7 = insertelement <4 x float> %__W, float %6, i64 0 5396 ret <4 x float> %7 5397 } 5398 5399 define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5400 ; X86-LABEL: test_mm_maskz_fnmsub_ss: 5401 ; X86: # %bb.0: # %entry 5402 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5403 ; X86-NEXT: kmovw %eax, %k1 5404 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 5405 ; X86-NEXT: retl 5406 ; 5407 ; X64-LABEL: test_mm_maskz_fnmsub_ss: 5408 ; X64: # %bb.0: # %entry 5409 ; X64-NEXT: kmovw %edi, %k1 5410 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 5411 ; X64-NEXT: retq 5412 entry: 5413 %0 = extractelement <4 x float> %__A, i64 0 5414 %.rhs.i = extractelement <4 x float> %__B, i64 0 5415 %1 = fsub float -0.000000e+00, %.rhs.i 5416 %.rhs5.i = extractelement <4 x float> %__C, i64 0 5417 %2 = fsub float -0.000000e+00, %.rhs5.i 5418 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5419 %4 = and i8 %__U, 1 5420 %tobool.i = icmp eq i8 %4, 0 5421 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3 5422 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 5423 ret <4 x float> %vecins.i 5424 } 5425 5426 define <4 x float> 
@test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5427 ; X86-LABEL: test_mm_maskz_fnmsub_round_ss: 5428 ; X86: # %bb.0: # %entry 5429 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5430 ; X86-NEXT: kmovw %eax, %k1 5431 ; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5432 ; X86-NEXT: retl 5433 ; 5434 ; X64-LABEL: test_mm_maskz_fnmsub_round_ss: 5435 ; X64: # %bb.0: # %entry 5436 ; X64-NEXT: kmovw %edi, %k1 5437 ; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5438 ; X64-NEXT: retq 5439 entry: 5440 %0 = extractelement <4 x float> %__A, i64 0 5441 %.rhs = extractelement <4 x float> %__B, i64 0 5442 %1 = fsub float -0.000000e+00, %.rhs 5443 %.rhs2 = extractelement <4 x float> %__C, i64 0 5444 %2 = fsub float -0.000000e+00, %.rhs2 5445 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5446 %4 = bitcast i8 %__U to <8 x i1> 5447 %5 = extractelement <8 x i1> %4, i64 0 5448 %6 = select i1 %5, float %3, float 0.000000e+00 5449 %7 = insertelement <4 x float> %__A, float %6, i64 0 5450 ret <4 x float> %7 5451 } 5452 5453 define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5454 ; X86-LABEL: test_mm_mask3_fnmsub_ss: 5455 ; X86: # %bb.0: # %entry 5456 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5457 ; X86-NEXT: kmovw %eax, %k1 5458 ; X86-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 5459 ; X86-NEXT: vmovaps %xmm2, %xmm0 5460 ; X86-NEXT: retl 5461 ; 5462 ; X64-LABEL: test_mm_mask3_fnmsub_ss: 5463 ; X64: # %bb.0: # %entry 5464 ; X64-NEXT: kmovw %edi, %k1 5465 ; X64-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 5466 ; X64-NEXT: vmovaps %xmm2, %xmm0 5467 ; X64-NEXT: retq 5468 entry: 5469 %0 = extractelement <4 x float> %__W, i64 0 5470 %.rhs.i = extractelement <4 x float> %__X, i64 0 5471 %1 = fsub float -0.000000e+00, %.rhs.i 5472 %.rhs7.i = extractelement <4 x float> %__Y, i64 0 5473 %2 = fsub float -0.000000e+00, %.rhs7.i 5474 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5475 %4 = and i8 %__U, 1 5476 %tobool.i = icmp eq i8 %4, 0 5477 %vecext2.i = extractelement <4 x float> %__Y, i32 0 5478 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3 5479 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0 5480 ret <4 x float> %vecins.i 5481 } 5482 5483 define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5484 ; X86-LABEL: test_mm_mask3_fnmsub_round_ss: 5485 ; X86: # %bb.0: # %entry 5486 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5487 ; X86-NEXT: kmovw %eax, %k1 5488 ; X86-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5489 ; X86-NEXT: vmovaps %xmm2, %xmm0 5490 ; X86-NEXT: retl 5491 ; 5492 ; X64-LABEL: test_mm_mask3_fnmsub_round_ss: 5493 ; X64: # %bb.0: # %entry 5494 ; X64-NEXT: kmovw %edi, %k1 5495 ; X64-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5496 ; X64-NEXT: vmovaps %xmm2, %xmm0 5497 ; X64-NEXT: retq 5498 entry: 5499 %0 = extractelement <4 x float> %__W, i64 0 5500 %.rhs = extractelement <4 x float> %__X, i64 0 5501 %1 = fsub float -0.000000e+00, %.rhs 5502 %.rhs1 = extractelement <4 x float> %__Y, i64 0 5503 %2 = fsub float -0.000000e+00, %.rhs1 5504 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5505 %4 = bitcast i8 %__U to <8 x i1> 5506 %5 = extractelement <8 x i1> %4, i64 0 5507 %6 = select i1 %5, float %3, float %.rhs1 5508 %7 = insertelement <4 x 
float> %__Y, float %6, i64 0 5509 ret <4 x float> %7 5510 } 5511 5512 define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5513 ; X86-LABEL: test_mm_mask_fmadd_sd: 5514 ; X86: # %bb.0: # %entry 5515 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5516 ; X86-NEXT: kmovw %eax, %k1 5517 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 5518 ; X86-NEXT: retl 5519 ; 5520 ; X64-LABEL: test_mm_mask_fmadd_sd: 5521 ; X64: # %bb.0: # %entry 5522 ; X64-NEXT: kmovw %edi, %k1 5523 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 5524 ; X64-NEXT: retq 5525 entry: 5526 %0 = extractelement <2 x double> %__W, i64 0 5527 %1 = extractelement <2 x double> %__A, i64 0 5528 %2 = extractelement <2 x double> %__B, i64 0 5529 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5530 %4 = and i8 %__U, 1 5531 %tobool.i = icmp eq i8 %4, 0 5532 %vecext1.i = extractelement <2 x double> %__W, i32 0 5533 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5534 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 5535 ret <2 x double> %vecins.i 5536 } 5537 5538 define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5539 ; X86-LABEL: test_mm_mask_fmadd_round_sd: 5540 ; X86: # %bb.0: # %entry 5541 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5542 ; X86-NEXT: kmovw %eax, %k1 5543 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5544 ; X86-NEXT: retl 5545 ; 5546 ; X64-LABEL: test_mm_mask_fmadd_round_sd: 5547 ; X64: # %bb.0: # %entry 5548 ; X64-NEXT: kmovw %edi, %k1 5549 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5550 ; X64-NEXT: retq 5551 entry: 5552 %0 = extractelement <2 x double> %__W, i64 0 5553 %1 = extractelement <2 x double> %__A, i64 0 5554 %2 = extractelement <2 x double> %__B, i64 0 5555 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5556 %4 = bitcast i8 %__U to <8 x i1> 5557 %5 = extractelement <8 x i1> %4, i64 0 5558 %6 = select i1 %5, double %3, double %0 5559 %7 = insertelement <2 x double> %__W, double %6, i64 0 5560 ret <2 x double> %7 5561 } 5562 5563 declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1 5564 5565 define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5566 ; X86-LABEL: test_mm_maskz_fmadd_sd: 5567 ; X86: # %bb.0: # %entry 5568 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5569 ; X86-NEXT: kmovw %eax, %k1 5570 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 5571 ; X86-NEXT: retl 5572 ; 5573 ; X64-LABEL: test_mm_maskz_fmadd_sd: 5574 ; X64: # %bb.0: # %entry 5575 ; X64-NEXT: kmovw %edi, %k1 5576 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 5577 ; X64-NEXT: retq 5578 entry: 5579 %0 = extractelement <2 x double> %__A, i64 0 5580 %1 = extractelement <2 x double> %__B, i64 0 5581 %2 = extractelement <2 x double> %__C, i64 0 5582 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5583 %4 = and i8 %__U, 1 5584 %tobool.i = icmp eq i8 %4, 0 5585 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 5586 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 5587 ret <2 x double> %vecins.i 5588 } 5589 5590 define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5591 ; X86-LABEL: test_mm_maskz_fmadd_round_sd: 5592 ; X86: # %bb.0: # 
%entry 5593 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5594 ; X86-NEXT: kmovw %eax, %k1 5595 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5596 ; X86-NEXT: retl 5597 ; 5598 ; X64-LABEL: test_mm_maskz_fmadd_round_sd: 5599 ; X64: # %bb.0: # %entry 5600 ; X64-NEXT: kmovw %edi, %k1 5601 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5602 ; X64-NEXT: retq 5603 entry: 5604 %0 = extractelement <2 x double> %__A, i64 0 5605 %1 = extractelement <2 x double> %__B, i64 0 5606 %2 = extractelement <2 x double> %__C, i64 0 5607 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5608 %4 = bitcast i8 %__U to <8 x i1> 5609 %5 = extractelement <8 x i1> %4, i64 0 5610 %6 = select i1 %5, double %3, double 0.000000e+00 5611 %7 = insertelement <2 x double> %__A, double %6, i64 0 5612 ret <2 x double> %7 5613 } 5614 5615 define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5616 ; X86-LABEL: test_mm_mask3_fmadd_sd: 5617 ; X86: # %bb.0: # %entry 5618 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5619 ; X86-NEXT: kmovw %eax, %k1 5620 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 5621 ; X86-NEXT: vmovapd %xmm2, %xmm0 5622 ; X86-NEXT: retl 5623 ; 5624 ; X64-LABEL: test_mm_mask3_fmadd_sd: 5625 ; X64: # %bb.0: # %entry 5626 ; X64-NEXT: kmovw %edi, %k1 5627 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 5628 ; X64-NEXT: vmovapd %xmm2, %xmm0 5629 ; X64-NEXT: retq 5630 entry: 5631 %0 = extractelement <2 x double> %__W, i64 0 5632 %1 = extractelement <2 x double> %__X, i64 0 5633 %2 = extractelement <2 x double> %__Y, i64 0 5634 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5635 %4 = and i8 %__U, 1 5636 %tobool.i = icmp eq i8 %4, 0 5637 %vecext1.i = extractelement <2 x double> %__Y, i32 0 5638 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5639 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 5640 ret <2 x double> %vecins.i 5641 } 5642 5643 define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5644 ; X86-LABEL: test_mm_mask3_fmadd_round_sd: 5645 ; X86: # %bb.0: # %entry 5646 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5647 ; X86-NEXT: kmovw %eax, %k1 5648 ; X86-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5649 ; X86-NEXT: vmovapd %xmm2, %xmm0 5650 ; X86-NEXT: retl 5651 ; 5652 ; X64-LABEL: test_mm_mask3_fmadd_round_sd: 5653 ; X64: # %bb.0: # %entry 5654 ; X64-NEXT: kmovw %edi, %k1 5655 ; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5656 ; X64-NEXT: vmovapd %xmm2, %xmm0 5657 ; X64-NEXT: retq 5658 entry: 5659 %0 = extractelement <2 x double> %__W, i64 0 5660 %1 = extractelement <2 x double> %__X, i64 0 5661 %2 = extractelement <2 x double> %__Y, i64 0 5662 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5663 %4 = bitcast i8 %__U to <8 x i1> 5664 %5 = extractelement <8 x i1> %4, i64 0 5665 %6 = select i1 %5, double %3, double %2 5666 %7 = insertelement <2 x double> %__Y, double %6, i64 0 5667 ret <2 x double> %7 5668 } 5669 5670 define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5671 ; X86-LABEL: test_mm_mask_fmsub_sd: 5672 ; X86: # %bb.0: # %entry 5673 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5674 ; X86-NEXT: kmovw %eax, %k1 5675 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5676 ; X86-NEXT: retl 5677 ; 
5678 ; X64-LABEL: test_mm_mask_fmsub_sd: 5679 ; X64: # %bb.0: # %entry 5680 ; X64-NEXT: kmovw %edi, %k1 5681 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5682 ; X64-NEXT: retq 5683 entry: 5684 %0 = extractelement <2 x double> %__W, i64 0 5685 %1 = extractelement <2 x double> %__A, i64 0 5686 %.rhs.i = extractelement <2 x double> %__B, i64 0 5687 %2 = fsub double -0.000000e+00, %.rhs.i 5688 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5689 %4 = and i8 %__U, 1 5690 %tobool.i = icmp eq i8 %4, 0 5691 %vecext1.i = extractelement <2 x double> %__W, i32 0 5692 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5693 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 5694 ret <2 x double> %vecins.i 5695 } 5696 5697 define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5698 ; X86-LABEL: test_mm_mask_fmsub_round_sd: 5699 ; X86: # %bb.0: # %entry 5700 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5701 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, %xmm2 5702 ; X86-NEXT: kmovw %eax, %k1 5703 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5704 ; X86-NEXT: retl 5705 ; 5706 ; X64-LABEL: test_mm_mask_fmsub_round_sd: 5707 ; X64: # %bb.0: # %entry 5708 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm2 5709 ; X64-NEXT: kmovw %edi, %k1 5710 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5711 ; X64-NEXT: retq 5712 entry: 5713 %0 = extractelement <2 x double> %__W, i64 0 5714 %1 = extractelement <2 x double> %__A, i64 0 5715 %.rhs = extractelement <2 x double> %__B, i64 0 5716 %2 = fsub double -0.000000e+00, %.rhs 5717 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5718 %4 = bitcast i8 %__U to <8 x i1> 5719 %5 = extractelement <8 x i1> %4, i64 0 5720 %6 = select i1 %5, double %3, double %0 5721 %7 = insertelement <2 x double> %__W, double %6, i64 0 5722 ret <2 x double> %7 5723 } 5724 5725 define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5726 ; X86-LABEL: test_mm_maskz_fmsub_sd: 5727 ; X86: # %bb.0: # %entry 5728 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5729 ; X86-NEXT: kmovw %eax, %k1 5730 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5731 ; X86-NEXT: retl 5732 ; 5733 ; X64-LABEL: test_mm_maskz_fmsub_sd: 5734 ; X64: # %bb.0: # %entry 5735 ; X64-NEXT: kmovw %edi, %k1 5736 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5737 ; X64-NEXT: retq 5738 entry: 5739 %0 = extractelement <2 x double> %__A, i64 0 5740 %1 = extractelement <2 x double> %__B, i64 0 5741 %.rhs.i = extractelement <2 x double> %__C, i64 0 5742 %2 = fsub double -0.000000e+00, %.rhs.i 5743 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5744 %4 = and i8 %__U, 1 5745 %tobool.i = icmp eq i8 %4, 0 5746 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 5747 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 5748 ret <2 x double> %vecins.i 5749 } 5750 5751 define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5752 ; X86-LABEL: test_mm_maskz_fmsub_round_sd: 5753 ; X86: # %bb.0: # %entry 5754 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5755 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, %xmm2 5756 ; X86-NEXT: kmovw %eax, %k1 5757 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5758 ; X86-NEXT: retl 5759 ; 5760 ; X64-LABEL: 
test_mm_maskz_fmsub_round_sd: 5761 ; X64: # %bb.0: # %entry 5762 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm2 5763 ; X64-NEXT: kmovw %edi, %k1 5764 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5765 ; X64-NEXT: retq 5766 entry: 5767 %0 = extractelement <2 x double> %__A, i64 0 5768 %1 = extractelement <2 x double> %__B, i64 0 5769 %.rhs = extractelement <2 x double> %__C, i64 0 5770 %2 = fsub double -0.000000e+00, %.rhs 5771 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5772 %4 = bitcast i8 %__U to <8 x i1> 5773 %5 = extractelement <8 x i1> %4, i64 0 5774 %6 = select i1 %5, double %3, double 0.000000e+00 5775 %7 = insertelement <2 x double> %__A, double %6, i64 0 5776 ret <2 x double> %7 5777 } 5778 5779 define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5780 ; X86-LABEL: test_mm_mask3_fmsub_sd: 5781 ; X86: # %bb.0: # %entry 5782 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5783 ; X86-NEXT: kmovw %eax, %k1 5784 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 5785 ; X86-NEXT: vmovapd %xmm2, %xmm0 5786 ; X86-NEXT: retl 5787 ; 5788 ; X64-LABEL: test_mm_mask3_fmsub_sd: 5789 ; X64: # %bb.0: # %entry 5790 ; X64-NEXT: kmovw %edi, %k1 5791 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 5792 ; X64-NEXT: vmovapd %xmm2, %xmm0 5793 ; X64-NEXT: retq 5794 entry: 5795 %0 = extractelement <2 x double> %__W, i64 0 5796 %1 = extractelement <2 x double> %__X, i64 0 5797 %.rhs.i = extractelement <2 x double> %__Y, i64 0 5798 %2 = fsub double -0.000000e+00, %.rhs.i 5799 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5800 %4 = and i8 %__U, 1 5801 %tobool.i = icmp eq i8 %4, 0 5802 %vecext1.i = extractelement <2 x double> %__Y, i32 0 5803 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5804 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 5805 ret <2 x double> %vecins.i 5806 } 5807 5808 define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5809 ; X86-LABEL: test_mm_mask3_fmsub_round_sd: 5810 ; X86: # %bb.0: # %entry 5811 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5812 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, %xmm3 5813 ; X86-NEXT: vfmadd213sd %xmm3, %xmm0, %xmm1 5814 ; X86-NEXT: kmovw %eax, %k1 5815 ; X86-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} 5816 ; X86-NEXT: vmovapd %xmm2, %xmm0 5817 ; X86-NEXT: retl 5818 ; 5819 ; X64-LABEL: test_mm_mask3_fmsub_round_sd: 5820 ; X64: # %bb.0: # %entry 5821 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm3 5822 ; X64-NEXT: vfmadd213sd %xmm3, %xmm0, %xmm1 5823 ; X64-NEXT: kmovw %edi, %k1 5824 ; X64-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} 5825 ; X64-NEXT: vmovapd %xmm2, %xmm0 5826 ; X64-NEXT: retq 5827 entry: 5828 %0 = extractelement <2 x double> %__W, i64 0 5829 %1 = extractelement <2 x double> %__X, i64 0 5830 %.rhs = extractelement <2 x double> %__Y, i64 0 5831 %2 = fsub double -0.000000e+00, %.rhs 5832 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5833 %4 = bitcast i8 %__U to <8 x i1> 5834 %5 = extractelement <8 x i1> %4, i64 0 5835 %6 = select i1 %5, double %3, double %.rhs 5836 %7 = insertelement <2 x double> %__Y, double %6, i64 0 5837 ret <2 x double> %7 5838 } 5839 5840 define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5841 ; X86-LABEL: test_mm_mask_fnmadd_sd: 5842 ; X86: # %bb.0: # %entry 5843 
; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5844 ; X86-NEXT: kmovw %eax, %k1 5845 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5846 ; X86-NEXT: retl 5847 ; 5848 ; X64-LABEL: test_mm_mask_fnmadd_sd: 5849 ; X64: # %bb.0: # %entry 5850 ; X64-NEXT: kmovw %edi, %k1 5851 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5852 ; X64-NEXT: retq 5853 entry: 5854 %0 = extractelement <2 x double> %__W, i64 0 5855 %.rhs.i = extractelement <2 x double> %__A, i64 0 5856 %1 = fsub double -0.000000e+00, %.rhs.i 5857 %2 = extractelement <2 x double> %__B, i64 0 5858 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5859 %4 = and i8 %__U, 1 5860 %tobool.i = icmp eq i8 %4, 0 5861 %vecext1.i = extractelement <2 x double> %__W, i32 0 5862 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5863 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 5864 ret <2 x double> %vecins.i 5865 } 5866 5867 define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5868 ; X86-LABEL: test_mm_mask_fnmadd_round_sd: 5869 ; X86: # %bb.0: # %entry 5870 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5871 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1 5872 ; X86-NEXT: kmovw %eax, %k1 5873 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5874 ; X86-NEXT: retl 5875 ; 5876 ; X64-LABEL: test_mm_mask_fnmadd_round_sd: 5877 ; X64: # %bb.0: # %entry 5878 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 5879 ; X64-NEXT: kmovw %edi, %k1 5880 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5881 ; X64-NEXT: retq 5882 entry: 5883 %0 = extractelement <2 x double> %__W, i64 0 5884 %.rhs = extractelement <2 x double> %__A, i64 0 5885 %1 = fsub double -0.000000e+00, %.rhs 5886 %2 = extractelement <2 x double> %__B, i64 0 5887 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5888 %4 = bitcast i8 %__U to <8 x i1> 5889 %5 = extractelement <8 x i1> %4, i64 0 5890 %6 = select i1 %5, double %3, double %0 5891 %7 = insertelement <2 x double> %__W, double %6, i64 0 5892 ret <2 x double> %7 5893 } 5894 5895 define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5896 ; X86-LABEL: test_mm_maskz_fnmadd_sd: 5897 ; X86: # %bb.0: # %entry 5898 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5899 ; X86-NEXT: kmovw %eax, %k1 5900 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5901 ; X86-NEXT: retl 5902 ; 5903 ; X64-LABEL: test_mm_maskz_fnmadd_sd: 5904 ; X64: # %bb.0: # %entry 5905 ; X64-NEXT: kmovw %edi, %k1 5906 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5907 ; X64-NEXT: retq 5908 entry: 5909 %0 = extractelement <2 x double> %__A, i64 0 5910 %.rhs.i = extractelement <2 x double> %__B, i64 0 5911 %1 = fsub double -0.000000e+00, %.rhs.i 5912 %2 = extractelement <2 x double> %__C, i64 0 5913 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5914 %4 = and i8 %__U, 1 5915 %tobool.i = icmp eq i8 %4, 0 5916 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 5917 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 5918 ret <2 x double> %vecins.i 5919 } 5920 5921 define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5922 ; X86-LABEL: test_mm_maskz_fnmadd_round_sd: 5923 ; X86: # %bb.0: # %entry 5924 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5925 ; X86-NEXT: vxorpd {{\.LCPI.*}}, 
%xmm1, %xmm1 5926 ; X86-NEXT: kmovw %eax, %k1 5927 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5928 ; X86-NEXT: retl 5929 ; 5930 ; X64-LABEL: test_mm_maskz_fnmadd_round_sd: 5931 ; X64: # %bb.0: # %entry 5932 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 5933 ; X64-NEXT: kmovw %edi, %k1 5934 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5935 ; X64-NEXT: retq 5936 entry: 5937 %0 = extractelement <2 x double> %__A, i64 0 5938 %.rhs = extractelement <2 x double> %__B, i64 0 5939 %1 = fsub double -0.000000e+00, %.rhs 5940 %2 = extractelement <2 x double> %__C, i64 0 5941 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5942 %4 = bitcast i8 %__U to <8 x i1> 5943 %5 = extractelement <8 x i1> %4, i64 0 5944 %6 = select i1 %5, double %3, double 0.000000e+00 5945 %7 = insertelement <2 x double> %__A, double %6, i64 0 5946 ret <2 x double> %7 5947 } 5948 5949 define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5950 ; X86-LABEL: test_mm_mask3_fnmadd_sd: 5951 ; X86: # %bb.0: # %entry 5952 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5953 ; X86-NEXT: kmovw %eax, %k1 5954 ; X86-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 5955 ; X86-NEXT: vmovapd %xmm2, %xmm0 5956 ; X86-NEXT: retl 5957 ; 5958 ; X64-LABEL: test_mm_mask3_fnmadd_sd: 5959 ; X64: # %bb.0: # %entry 5960 ; X64-NEXT: kmovw %edi, %k1 5961 ; X64-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 5962 ; X64-NEXT: vmovapd %xmm2, %xmm0 5963 ; X64-NEXT: retq 5964 entry: 5965 %0 = extractelement <2 x double> %__W, i64 0 5966 %.rhs.i = extractelement <2 x double> %__X, i64 0 5967 %1 = fsub double -0.000000e+00, %.rhs.i 5968 %2 = extractelement <2 x double> %__Y, i64 0 5969 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5970 %4 = and i8 %__U, 1 5971 %tobool.i = icmp eq i8 %4, 0 5972 %vecext1.i = extractelement <2 x double> %__Y, i32 0 5973 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5974 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 5975 ret <2 x double> %vecins.i 5976 } 5977 5978 define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5979 ; X86-LABEL: test_mm_mask3_fnmadd_round_sd: 5980 ; X86: # %bb.0: # %entry 5981 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5982 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1 5983 ; X86-NEXT: kmovw %eax, %k1 5984 ; X86-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5985 ; X86-NEXT: vmovapd %xmm2, %xmm0 5986 ; X86-NEXT: retl 5987 ; 5988 ; X64-LABEL: test_mm_mask3_fnmadd_round_sd: 5989 ; X64: # %bb.0: # %entry 5990 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 5991 ; X64-NEXT: kmovw %edi, %k1 5992 ; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5993 ; X64-NEXT: vmovapd %xmm2, %xmm0 5994 ; X64-NEXT: retq 5995 entry: 5996 %0 = extractelement <2 x double> %__W, i64 0 5997 %.rhs = extractelement <2 x double> %__X, i64 0 5998 %1 = fsub double -0.000000e+00, %.rhs 5999 %2 = extractelement <2 x double> %__Y, i64 0 6000 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6001 %4 = bitcast i8 %__U to <8 x i1> 6002 %5 = extractelement <8 x i1> %4, i64 0 6003 %6 = select i1 %5, double %3, double %2 6004 %7 = insertelement <2 x double> %__Y, double %6, i64 0 6005 ret <2 x double> %7 6006 } 6007 6008 define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> 
%__A, <2 x double> %__B) { 6009 ; X86-LABEL: test_mm_mask_fnmsub_sd: 6010 ; X86: # %bb.0: # %entry 6011 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6012 ; X86-NEXT: kmovw %eax, %k1 6013 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 6014 ; X86-NEXT: retl 6015 ; 6016 ; X64-LABEL: test_mm_mask_fnmsub_sd: 6017 ; X64: # %bb.0: # %entry 6018 ; X64-NEXT: kmovw %edi, %k1 6019 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 6020 ; X64-NEXT: retq 6021 entry: 6022 %0 = extractelement <2 x double> %__W, i64 0 6023 %.rhs.i = extractelement <2 x double> %__A, i64 0 6024 %1 = fsub double -0.000000e+00, %.rhs.i 6025 %.rhs7.i = extractelement <2 x double> %__B, i64 0 6026 %2 = fsub double -0.000000e+00, %.rhs7.i 6027 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 6028 %4 = and i8 %__U, 1 6029 %tobool.i = icmp eq i8 %4, 0 6030 %vecext2.i = extractelement <2 x double> %__W, i32 0 6031 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3 6032 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 6033 ret <2 x double> %vecins.i 6034 } 6035 6036 define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 6037 ; X86-LABEL: test_mm_mask_fnmsub_round_sd: 6038 ; X86: # %bb.0: # %entry 6039 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6040 ; X86-NEXT: kmovw %eax, %k1 6041 ; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 6042 ; X86-NEXT: retl 6043 ; 6044 ; X64-LABEL: test_mm_mask_fnmsub_round_sd: 6045 ; X64: # %bb.0: # %entry 6046 ; X64-NEXT: kmovw %edi, %k1 6047 ; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 6048 ; X64-NEXT: retq 6049 entry: 6050 %0 = extractelement <2 x double> %__W, i64 0 6051 %.rhs = extractelement <2 x double> %__A, i64 0 6052 %1 = fsub double -0.000000e+00, %.rhs 6053 %.rhs2 = extractelement <2 x double> %__B, i64 0 6054 %2 = fsub double -0.000000e+00, %.rhs2 6055 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6056 %4 = bitcast i8 %__U to <8 x i1> 6057 %5 = extractelement <8 x i1> %4, i64 0 6058 %6 = select i1 %5, double %3, double %0 6059 %7 = insertelement <2 x double> %__W, double %6, i64 0 6060 ret <2 x double> %7 6061 } 6062 6063 define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 6064 ; X86-LABEL: test_mm_maskz_fnmsub_sd: 6065 ; X86: # %bb.0: # %entry 6066 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6067 ; X86-NEXT: kmovw %eax, %k1 6068 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 6069 ; X86-NEXT: retl 6070 ; 6071 ; X64-LABEL: test_mm_maskz_fnmsub_sd: 6072 ; X64: # %bb.0: # %entry 6073 ; X64-NEXT: kmovw %edi, %k1 6074 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 6075 ; X64-NEXT: retq 6076 entry: 6077 %0 = extractelement <2 x double> %__A, i64 0 6078 %.rhs.i = extractelement <2 x double> %__B, i64 0 6079 %1 = fsub double -0.000000e+00, %.rhs.i 6080 %.rhs5.i = extractelement <2 x double> %__C, i64 0 6081 %2 = fsub double -0.000000e+00, %.rhs5.i 6082 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 6083 %4 = and i8 %__U, 1 6084 %tobool.i = icmp eq i8 %4, 0 6085 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 6086 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 6087 ret <2 x double> %vecins.i 6088 } 6089 6090 define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 6091 ; 
X86-LABEL: test_mm_maskz_fnmsub_round_sd: 6092 ; X86: # %bb.0: # %entry 6093 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6094 ; X86-NEXT: kmovw %eax, %k1 6095 ; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 6096 ; X86-NEXT: retl 6097 ; 6098 ; X64-LABEL: test_mm_maskz_fnmsub_round_sd: 6099 ; X64: # %bb.0: # %entry 6100 ; X64-NEXT: kmovw %edi, %k1 6101 ; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 6102 ; X64-NEXT: retq 6103 entry: 6104 %0 = extractelement <2 x double> %__A, i64 0 6105 %.rhs = extractelement <2 x double> %__B, i64 0 6106 %1 = fsub double -0.000000e+00, %.rhs 6107 %.rhs2 = extractelement <2 x double> %__C, i64 0 6108 %2 = fsub double -0.000000e+00, %.rhs2 6109 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6110 %4 = bitcast i8 %__U to <8 x i1> 6111 %5 = extractelement <8 x i1> %4, i64 0 6112 %6 = select i1 %5, double %3, double 0.000000e+00 6113 %7 = insertelement <2 x double> %__A, double %6, i64 0 6114 ret <2 x double> %7 6115 } 6116 6117 define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 6118 ; X86-LABEL: test_mm_mask3_fnmsub_sd: 6119 ; X86: # %bb.0: # %entry 6120 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6121 ; X86-NEXT: kmovw %eax, %k1 6122 ; X86-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 6123 ; X86-NEXT: vmovapd %xmm2, %xmm0 6124 ; X86-NEXT: retl 6125 ; 6126 ; X64-LABEL: test_mm_mask3_fnmsub_sd: 6127 ; X64: # %bb.0: # %entry 6128 ; X64-NEXT: kmovw %edi, %k1 6129 ; X64-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 6130 ; X64-NEXT: vmovapd %xmm2, %xmm0 6131 ; X64-NEXT: retq 6132 entry: 6133 %0 = extractelement <2 x double> %__W, i64 0 6134 %.rhs.i = extractelement <2 x double> %__X, i64 0 6135 %1 = fsub double -0.000000e+00, %.rhs.i 6136 %.rhs7.i = extractelement <2 x double> %__Y, i64 0 6137 %2 = fsub double -0.000000e+00, %.rhs7.i 6138 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 6139 %4 = and i8 %__U, 1 6140 %tobool.i = icmp eq i8 %4, 0 6141 %vecext2.i = extractelement <2 x double> %__Y, i32 0 6142 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3 6143 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 6144 ret <2 x double> %vecins.i 6145 } 6146 6147 define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 6148 ; X86-LABEL: test_mm_mask3_fnmsub_round_sd: 6149 ; X86: # %bb.0: # %entry 6150 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6151 ; X86-NEXT: kmovw %eax, %k1 6152 ; X86-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 6153 ; X86-NEXT: vmovapd %xmm2, %xmm0 6154 ; X86-NEXT: retl 6155 ; 6156 ; X64-LABEL: test_mm_mask3_fnmsub_round_sd: 6157 ; X64: # %bb.0: # %entry 6158 ; X64-NEXT: kmovw %edi, %k1 6159 ; X64-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 6160 ; X64-NEXT: vmovapd %xmm2, %xmm0 6161 ; X64-NEXT: retq 6162 entry: 6163 %0 = extractelement <2 x double> %__W, i64 0 6164 %.rhs = extractelement <2 x double> %__X, i64 0 6165 %1 = fsub double -0.000000e+00, %.rhs 6166 %.rhs1 = extractelement <2 x double> %__Y, i64 0 6167 %2 = fsub double -0.000000e+00, %.rhs1 6168 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6169 %4 = bitcast i8 %__U to <8 x i1> 6170 %5 = extractelement <8 x i1> %4, i64 0 6171 %6 = select i1 %5, double %3, double %.rhs1 6172 %7 = insertelement <2 x double> %__Y, double %6, i64 0 6173 ret <2 x double> %7 6174 } 
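; The remaining tests move on from the scalar FMA intrinsics to the masked
; expand-load (vpexpandq, vexpandpd, vpexpandd, vexpandps) and compress-store
; (vpcompressq, vcompresspd, vpcompressd, vcompressps) forms, and then to the
; _mm512_reduce_* horizontal reductions, which lower to extract/op ladders
; rather than a single instruction.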
6175 6176 define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6177 ; X86-LABEL: test_mm512_mask_expandloadu_epi64: 6178 ; X86: # %bb.0: # %entry 6179 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6180 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6181 ; X86-NEXT: kmovw %ecx, %k1 6182 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} 6183 ; X86-NEXT: retl 6184 ; 6185 ; X64-LABEL: test_mm512_mask_expandloadu_epi64: 6186 ; X64: # %bb.0: # %entry 6187 ; X64-NEXT: kmovw %edi, %k1 6188 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} 6189 ; X64-NEXT: retq 6190 entry: 6191 %0 = bitcast i8* %__P to i64* 6192 %1 = bitcast i8 %__U to <8 x i1> 6193 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W) 6194 ret <8 x i64> %2 6195 } 6196 6197 define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) { 6198 ; X86-LABEL: test_mm512_maskz_expandloadu_epi64: 6199 ; X86: # %bb.0: # %entry 6200 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6201 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6202 ; X86-NEXT: kmovw %ecx, %k1 6203 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z} 6204 ; X86-NEXT: retl 6205 ; 6206 ; X64-LABEL: test_mm512_maskz_expandloadu_epi64: 6207 ; X64: # %bb.0: # %entry 6208 ; X64-NEXT: kmovw %edi, %k1 6209 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z} 6210 ; X64-NEXT: retq 6211 entry: 6212 %0 = bitcast i8* %__P to i64* 6213 %1 = bitcast i8 %__U to <8 x i1> 6214 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer) 6215 ret <8 x i64> %2 6216 } 6217 6218 define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) { 6219 ; X86-LABEL: test_mm512_mask_expandloadu_pd: 6220 ; X86: # %bb.0: # %entry 6221 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6222 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6223 ; X86-NEXT: kmovw %ecx, %k1 6224 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} 6225 ; X86-NEXT: retl 6226 ; 6227 ; X64-LABEL: test_mm512_mask_expandloadu_pd: 6228 ; X64: # %bb.0: # %entry 6229 ; X64-NEXT: kmovw %edi, %k1 6230 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} 6231 ; X64-NEXT: retq 6232 entry: 6233 %0 = bitcast i8* %__P to double* 6234 %1 = bitcast i8 %__U to <8 x i1> 6235 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W) 6236 ret <8 x double> %2 6237 } 6238 6239 define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) { 6240 ; X86-LABEL: test_mm512_maskz_expandloadu_pd: 6241 ; X86: # %bb.0: # %entry 6242 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6243 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6244 ; X86-NEXT: kmovw %ecx, %k1 6245 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} {z} 6246 ; X86-NEXT: retl 6247 ; 6248 ; X64-LABEL: test_mm512_maskz_expandloadu_pd: 6249 ; X64: # %bb.0: # %entry 6250 ; X64-NEXT: kmovw %edi, %k1 6251 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} {z} 6252 ; X64-NEXT: retq 6253 entry: 6254 %0 = bitcast i8* %__P to double* 6255 %1 = bitcast i8 %__U to <8 x i1> 6256 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer) 6257 ret <8 x double> %2 6258 } 6259 6260 define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) { 6261 ; X86-LABEL: test_mm512_mask_expandloadu_epi32: 6262 ; X86: # %bb.0: # %entry 6263 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6264 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 6265 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} 6266 ; 
X86-NEXT: retl 6267 ; 6268 ; X64-LABEL: test_mm512_mask_expandloadu_epi32: 6269 ; X64: # %bb.0: # %entry 6270 ; X64-NEXT: kmovw %edi, %k1 6271 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} 6272 ; X64-NEXT: retq 6273 entry: 6274 %0 = bitcast <8 x i64> %__W to <16 x i32> 6275 %1 = bitcast i8* %__P to i32* 6276 %2 = bitcast i16 %__U to <16 x i1> 6277 %3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11 6278 %4 = bitcast <16 x i32> %3 to <8 x i64> 6279 ret <8 x i64> %4 6280 } 6281 6282 define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) { 6283 ; X86-LABEL: test_mm512_maskz_expandloadu_epi32: 6284 ; X86: # %bb.0: # %entry 6285 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6286 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 6287 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z} 6288 ; X86-NEXT: retl 6289 ; 6290 ; X64-LABEL: test_mm512_maskz_expandloadu_epi32: 6291 ; X64: # %bb.0: # %entry 6292 ; X64-NEXT: kmovw %edi, %k1 6293 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} {z} 6294 ; X64-NEXT: retq 6295 entry: 6296 %0 = bitcast i8* %__P to i32* 6297 %1 = bitcast i16 %__U to <16 x i1> 6298 %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer) 6299 %3 = bitcast <16 x i32> %2 to <8 x i64> 6300 ret <8 x i64> %3 6301 } 6302 6303 define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) { 6304 ; X86-LABEL: test_mm512_mask_expandloadu_ps: 6305 ; X86: # %bb.0: # %entry 6306 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6307 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 6308 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} 6309 ; X86-NEXT: retl 6310 ; 6311 ; X64-LABEL: test_mm512_mask_expandloadu_ps: 6312 ; X64: # %bb.0: # %entry 6313 ; X64-NEXT: kmovw %edi, %k1 6314 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} 6315 ; X64-NEXT: retq 6316 entry: 6317 %0 = bitcast i8* %__P to float* 6318 %1 = bitcast i16 %__U to <16 x i1> 6319 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11 6320 ret <16 x float> %2 6321 } 6322 6323 define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) { 6324 ; X86-LABEL: test_mm512_maskz_expandloadu_ps: 6325 ; X86: # %bb.0: # %entry 6326 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6327 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 6328 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z} 6329 ; X86-NEXT: retl 6330 ; 6331 ; X64-LABEL: test_mm512_maskz_expandloadu_ps: 6332 ; X64: # %bb.0: # %entry 6333 ; X64-NEXT: kmovw %edi, %k1 6334 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} {z} 6335 ; X64-NEXT: retq 6336 entry: 6337 %0 = bitcast i8* %__P to float* 6338 %1 = bitcast i16 %__U to <16 x i1> 6339 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer) 6340 ret <16 x float> %2 6341 } 6342 6343 define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) { 6344 ; X86-LABEL: test_mm512_mask_compressstoreu_pd: 6345 ; X86: # %bb.0: # %entry 6346 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6347 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6348 ; X86-NEXT: kmovw %eax, %k1 6349 ; X86-NEXT: vcompresspd %zmm0, (%ecx) {%k1} 6350 ; X86-NEXT: vzeroupper 6351 ; X86-NEXT: retl 6352 ; 6353 ; X64-LABEL: test_mm512_mask_compressstoreu_pd: 6354 ; X64: # %bb.0: # %entry 6355 ; X64-NEXT: kmovw %esi, %k1 6356 ; X64-NEXT: vcompresspd %zmm0, (%rdi) {%k1} 6357 ; X64-NEXT: vzeroupper 6358 ; X64-NEXT: retq 6359 entry: 6360 %0 = 
bitcast i8* %__P to double* 6361 %1 = bitcast i8 %__U to <8 x i1> 6362 tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1) 6363 ret void 6364 } 6365 6366 define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) { 6367 ; X86-LABEL: test_mm512_mask_compressstoreu_epi64: 6368 ; X86: # %bb.0: # %entry 6369 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6370 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6371 ; X86-NEXT: kmovw %eax, %k1 6372 ; X86-NEXT: vpcompressq %zmm0, (%ecx) {%k1} 6373 ; X86-NEXT: vzeroupper 6374 ; X86-NEXT: retl 6375 ; 6376 ; X64-LABEL: test_mm512_mask_compressstoreu_epi64: 6377 ; X64: # %bb.0: # %entry 6378 ; X64-NEXT: kmovw %esi, %k1 6379 ; X64-NEXT: vpcompressq %zmm0, (%rdi) {%k1} 6380 ; X64-NEXT: vzeroupper 6381 ; X64-NEXT: retq 6382 entry: 6383 %0 = bitcast i8* %__P to i64* 6384 %1 = bitcast i8 %__U to <8 x i1> 6385 tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1) 6386 ret void 6387 } 6388 6389 define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) { 6390 ; X86-LABEL: test_mm512_mask_compressstoreu_ps: 6391 ; X86: # %bb.0: # %entry 6392 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 6393 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6394 ; X86-NEXT: vcompressps %zmm0, (%eax) {%k1} 6395 ; X86-NEXT: vzeroupper 6396 ; X86-NEXT: retl 6397 ; 6398 ; X64-LABEL: test_mm512_mask_compressstoreu_ps: 6399 ; X64: # %bb.0: # %entry 6400 ; X64-NEXT: kmovw %esi, %k1 6401 ; X64-NEXT: vcompressps %zmm0, (%rdi) {%k1} 6402 ; X64-NEXT: vzeroupper 6403 ; X64-NEXT: retq 6404 entry: 6405 %0 = bitcast i8* %__P to float* 6406 %1 = bitcast i16 %__U to <16 x i1> 6407 tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1) 6408 ret void 6409 } 6410 6411 define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) { 6412 ; X86-LABEL: test_mm512_mask_compressstoreu_epi32: 6413 ; X86: # %bb.0: # %entry 6414 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 6415 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6416 ; X86-NEXT: vpcompressd %zmm0, (%eax) {%k1} 6417 ; X86-NEXT: vzeroupper 6418 ; X86-NEXT: retl 6419 ; 6420 ; X64-LABEL: test_mm512_mask_compressstoreu_epi32: 6421 ; X64: # %bb.0: # %entry 6422 ; X64-NEXT: kmovw %esi, %k1 6423 ; X64-NEXT: vpcompressd %zmm0, (%rdi) {%k1} 6424 ; X64-NEXT: vzeroupper 6425 ; X64-NEXT: retq 6426 entry: 6427 %0 = bitcast <8 x i64> %__A to <16 x i32> 6428 %1 = bitcast i8* %__P to i32* 6429 %2 = bitcast i16 %__U to <16 x i1> 6430 tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2) 6431 ret void 6432 } 6433 6434 define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) { 6435 ; X86-LABEL: test_mm512_reduce_add_epi64: 6436 ; X86: # %bb.0: # %entry 6437 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6438 ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0 6439 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 6440 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0 6441 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 6442 ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 6443 ; X86-NEXT: vmovd %xmm0, %eax 6444 ; X86-NEXT: vpextrd $1, %xmm0, %edx 6445 ; X86-NEXT: vzeroupper 6446 ; X86-NEXT: retl 6447 ; 6448 ; X64-LABEL: test_mm512_reduce_add_epi64: 6449 ; X64: # %bb.0: # %entry 6450 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6451 ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 6452 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 6453 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0 6454 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 
define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_add_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = add <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%add7.i = add <2 x i64> %shuffle6.i, %add4.i
%vecext.i = extractelement <2 x i64> %add7.i, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_mul_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpsrlq $32, %ymm0, %ymm2
; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; X86-NEXT: vpsrlq $32, %ymm1, %ymm3
; X86-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT: vpsllq $32, %ymm2, %ymm2
; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
; X86-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
; X86-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpsrlq $32, %ymm0, %ymm2
; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; X64-NEXT: vpsrlq $32, %ymm1, %ymm3
; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT: vpsllq $32, %ymm2, %ymm2
; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
; X64-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
%vecext.i = extractelement <2 x i64> %mul7.i, i32 0
ret i64 %vecext.i
}

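; The bitwise reductions reuse the same halving tree with vpor/vpand.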
define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_or_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_or_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or.i = or <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%or7.i = or <2 x i64> %shuffle6.i, %or4.i
%vecext.i = extractelement <2 x i64> %or7.i, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_and_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_and_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and.i = and <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%and7.i = and <2 x i64> %shuffle6.i, %and4.i
%vecext.i = extractelement <2 x i64> %and7.i, i32 0
ret i64 %vecext.i
}

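; Masked reductions first select between the input and the operation's
; identity (0 for add/or, 1 for mul, all-ones for and), then run the same
; unmasked reduction tree on the result.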
define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = add <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%add7.i = add <2 x i64> %shuffle6.i, %add4.i
%vecext.i = extractelement <2 x i64> %add7.i, i32 0
ret i64 %vecext.i
}

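; The <8 x i64> splat of 1 used as the mul identity is printed as dword
; pairs [1,0] on i386 and broadcast with vpbroadcastq on x86-64.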
define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpsrlq $32, %ymm1, %ymm2
; X86-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; X86-NEXT: vpsrlq $32, %ymm0, %ymm3
; X86-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT: vpsllq $32, %ymm2, %ymm2
; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
; X86-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
; X86-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpsrlq $32, %ymm1, %ymm2
; X64-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; X64-NEXT: vpsrlq $32, %ymm0, %ymm3
; X64-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT: vpsllq $32, %ymm2, %ymm2
; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
; X64-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
%vecext.i = extractelement <2 x i64> %mul7.i, i32 0
ret i64 %vecext.i
}

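; The all-ones identity for the and reduction is materialized in-register
; with vpternlogd $255 instead of being loaded from memory.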
define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and.i = and <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%and7.i = and <2 x i64> %shuffle6.i, %and4.i
%vecext.i = extractelement <2 x i64> %and7.i, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or.i = or <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%or7.i = or <2 x i64> %shuffle6.i, %or4.i
%vecext.i = extractelement <2 x i64> %or7.i, i32 0
ret i64 %vecext.i
}

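; The 32-bit reductions view the <8 x i64> argument as <16 x i32>, adding
; one more level to the tree; the scalar result fits in eax on both targets,
; so the unmasked variants share a single CHECK block.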
define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_add_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%add.i = add <8 x i32> %0, %1
%2 = bitcast <8 x i32> %add.i to <4 x i64>
%extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%3 = bitcast <2 x i64> %extract3.i to <4 x i32>
%extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%4 = bitcast <2 x i64> %extract4.i to <4 x i32>
%add5.i = add <4 x i32> %3, %4
%shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add6.i = add <4 x i32> %shuffle.i, %add5.i
%shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%add8.i = add <4 x i32> %shuffle7.i, %add6.i
%vecext.i = extractelement <4 x i32> %add8.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_mul_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%mul.i = mul <8 x i32> %0, %1
%2 = bitcast <8 x i32> %mul.i to <4 x i64>
%extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%3 = bitcast <2 x i64> %extract3.i to <4 x i32>
%extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%4 = bitcast <2 x i64> %extract4.i to <4 x i32>
%mul5.i = mul <4 x i32> %3, %4
%shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
%shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
%vecext.i = extractelement <4 x i32> %mul8.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_or_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or25.i = or <4 x i64> %extract.i, %extract2.i
%extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or526.i = or <2 x i64> %extract3.i, %extract4.i
%or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%or6.i = or <4 x i32> %shuffle.i, %or5.i
%shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%or8.i = or <4 x i32> %shuffle7.i, %or6.i
%vecext.i = extractelement <4 x i32> %or8.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_and_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and25.i = and <4 x i64> %extract.i, %extract2.i
%extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and526.i = and <2 x i64> %extract3.i, %extract4.i
%and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%and6.i = and <4 x i32> %shuffle.i, %and5.i
%shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%and8.i = and <4 x i32> %shuffle7.i, %and6.i
%vecext.i = extractelement <4 x i32> %and8.i, i32 0
ret i32 %vecext.i
}

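; Masked <16 x i32> reductions take an i16 mask and select against zero or
; the splat identity before reducing, as in the i64 cases above.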
define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%5 = bitcast <4 x i64> %extract3.i to <8 x i32>
%add.i = add <8 x i32> %4, %5
%6 = bitcast <8 x i32> %add.i to <4 x i64>
%extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%7 = bitcast <2 x i64> %extract4.i to <4 x i32>
%extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%8 = bitcast <2 x i64> %extract5.i to <4 x i32>
%add6.i = add <4 x i32> %7, %8
%shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add7.i = add <4 x i32> %shuffle.i, %add6.i
%shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%add9.i = add <4 x i32> %shuffle8.i, %add7.i
%vecext.i = extractelement <4 x i32> %add9.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%mul.i = mul <8 x i32> %4, %5
%6 = bitcast <8 x i32> %mul.i to <4 x i64>
%extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%7 = bitcast <2 x i64> %extract5.i to <4 x i32>
%extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%8 = bitcast <2 x i64> %extract6.i to <4 x i32>
%mul7.i = mul <4 x i32> %7, %8
%shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
%shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
%vecext.i = extractelement <4 x i32> %mul10.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and28.i = and <4 x i64> %extract.i, %extract4.i
%extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and729.i = and <2 x i64> %extract5.i, %extract6.i
%and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%and8.i = and <4 x i32> %shuffle.i, %and7.i
%shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%and10.i = and <4 x i32> %shuffle9.i, %and8.i
%vecext.i = extractelement <4 x i32> %and10.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or27.i = or <4 x i64> %extract.i, %extract3.i
%extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or628.i = or <2 x i64> %extract4.i, %extract5.i
%or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%or7.i = or <4 x i32> %shuffle.i, %or6.i
%shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%or9.i = or <4 x i32> %shuffle8.i, %or7.i
%vecext.i = extractelement <4 x i32> %or9.i, i32 0
ret i32 %vecext.i
}

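; Floating-point reductions use the same tree with vaddpd/vmulpd and their
; ps counterparts. On i386 the scalar result must come back in st(0), so it
; is spilled to an aligned stack slot and reloaded with fldl/flds; the frame
; setup in the double cases exists to create that 8-byte-aligned slot.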
define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_add_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
}

define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_mul_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
}

define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_add_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%add.i = fadd <8 x float> %1, %2
%extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add5.i = fadd <4 x float> %extract3.i, %extract4.i
%shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add6.i = fadd <4 x float> %add5.i, %shuffle.i
%shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%add8.i = fadd <4 x float> %add6.i, %shuffle7.i
%vecext.i = extractelement <4 x float> %add8.i, i32 0
ret float %vecext.i
}

define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_mul_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%mul.i = fmul <8 x float> %1, %2
%extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul5.i = fmul <4 x float> %extract3.i, %extract4.i
%shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul6.i = fmul <4 x float> %mul5.i, %shuffle.i
%shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i
%vecext.i = extractelement <4 x float> %mul8.i, i32 0
ret float %vecext.i
}

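; Masked FP reductions select zero (add) or a splat of 1.0 (mul) under the
; mask before running the reduction tree.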
define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer
%shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
}

define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
%shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
}

define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract3.i to <8 x float>
%add.i = fadd <8 x float> %3, %4
%extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add6.i = fadd <4 x float> %extract4.i, %extract5.i
%shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add7.i = fadd <4 x float> %add6.i, %shuffle.i
%shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%add9.i = fadd <4 x float> %add7.i, %shuffle8.i
%vecext.i = extractelement <4 x float> %add9.i, i32 0
ret float %vecext.i
}

define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
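; The integer min/max reductions below stay at 512-bit width, swapping lanes
; with vshufi64x2/vpermq/vpshufd and combining with vpmax*/vpmin*, which the
; icmp+select IR is matched to; the FP versions extract halves and call the
; avx/sse2 max/min intrinsics directly.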
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmulps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmulps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract4.i to <8 x float>
%mul.i = fmul <8 x float> %3, %4
%extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul7.i = fmul <4 x float> %extract5.i, %extract6.i
%shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul8.i = fmul <4 x float> %mul7.i, %shuffle.i
%shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i
%vecext.i = extractelement <4 x float> %mul10.i, i32 0
ret float %vecext.i
}

define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp slt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp sgt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp sgt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp ult <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp ugt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp ugt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}

define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
X86-NEXT: .cfi_offset %ebp, -8 7701 ; X86-NEXT: movl %esp, %ebp 7702 ; X86-NEXT: .cfi_def_cfa_register %ebp 7703 ; X86-NEXT: andl $-8, %esp 7704 ; X86-NEXT: subl $8, %esp 7705 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7706 ; X86-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 7707 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7708 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 7709 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7710 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 7711 ; X86-NEXT: vmovlpd %xmm0, (%esp) 7712 ; X86-NEXT: fldl (%esp) 7713 ; X86-NEXT: movl %ebp, %esp 7714 ; X86-NEXT: popl %ebp 7715 ; X86-NEXT: .cfi_def_cfa %esp, 4 7716 ; X86-NEXT: vzeroupper 7717 ; X86-NEXT: retl 7718 ; 7719 ; X64-LABEL: test_mm512_reduce_max_pd: 7720 ; X64: # %bb.0: # %entry 7721 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7722 ; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 7723 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7724 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 7725 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7726 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 7727 ; X64-NEXT: vzeroupper 7728 ; X64-NEXT: retq 7729 entry: 7730 %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7731 %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7732 %0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i) 7733 %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1> 7734 %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3> 7735 %1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i) 7736 %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0> 7737 %2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i) 7738 %vecext.i = extractelement <2 x double> %2, i32 0 7739 ret double %vecext.i 7740 } 7741 7742 define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) { 7743 ; X86-LABEL: test_mm512_reduce_min_epi64: 7744 ; X86: # %bb.0: # %entry 7745 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7746 ; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0 7747 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7748 ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7749 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7750 ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7751 ; X86-NEXT: vmovd %xmm0, %eax 7752 ; X86-NEXT: vpextrd $1, %xmm0, %edx 7753 ; X86-NEXT: vzeroupper 7754 ; X86-NEXT: retl 7755 ; 7756 ; X64-LABEL: test_mm512_reduce_min_epi64: 7757 ; X64: # %bb.0: # %entry 7758 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7759 ; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0 7760 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7761 ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7762 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7763 ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7764 ; X64-NEXT: vmovq %xmm0, %rax 7765 ; X64-NEXT: vzeroupper 7766 ; X64-NEXT: retq 7767 entry: 7768 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 7769 %0 = icmp sgt <8 x i64> %shuffle.i, %__W 7770 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i 7771 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> 7772 %2 = icmp slt <8 x i64> %1, 
%shuffle1.i 7773 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i 7774 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 7775 %4 = icmp slt <8 x i64> %3, %shuffle3.i 7776 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7777 %vecext.i = extractelement <8 x i64> %5, i32 0 7778 ret i64 %vecext.i 7779 } 7780 7781 define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) { 7782 ; X86-LABEL: test_mm512_reduce_min_epu64: 7783 ; X86: # %bb.0: # %entry 7784 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7785 ; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0 7786 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7787 ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7788 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7789 ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7790 ; X86-NEXT: vmovd %xmm0, %eax 7791 ; X86-NEXT: vpextrd $1, %xmm0, %edx 7792 ; X86-NEXT: vzeroupper 7793 ; X86-NEXT: retl 7794 ; 7795 ; X64-LABEL: test_mm512_reduce_min_epu64: 7796 ; X64: # %bb.0: # %entry 7797 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7798 ; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0 7799 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7800 ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7801 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7802 ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7803 ; X64-NEXT: vmovq %xmm0, %rax 7804 ; X64-NEXT: vzeroupper 7805 ; X64-NEXT: retq 7806 entry: 7807 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 7808 %0 = icmp ugt <8 x i64> %shuffle.i, %__W 7809 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i 7810 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> 7811 %2 = icmp ult <8 x i64> %1, %shuffle1.i 7812 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i 7813 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 7814 %4 = icmp ult <8 x i64> %3, %shuffle3.i 7815 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7816 %vecext.i = extractelement <8 x i64> %5, i32 0 7817 ret i64 %vecext.i 7818 } 7819 7820 define double @test_mm512_reduce_min_pd(<8 x double> %__W) { 7821 ; X86-LABEL: test_mm512_reduce_min_pd: 7822 ; X86: # %bb.0: # %entry 7823 ; X86-NEXT: pushl %ebp 7824 ; X86-NEXT: .cfi_def_cfa_offset 8 7825 ; X86-NEXT: .cfi_offset %ebp, -8 7826 ; X86-NEXT: movl %esp, %ebp 7827 ; X86-NEXT: .cfi_def_cfa_register %ebp 7828 ; X86-NEXT: andl $-8, %esp 7829 ; X86-NEXT: subl $8, %esp 7830 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7831 ; X86-NEXT: vminpd %ymm1, %ymm0, %ymm0 7832 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7833 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0 7834 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7835 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0 7836 ; X86-NEXT: vmovlpd %xmm0, (%esp) 7837 ; X86-NEXT: fldl (%esp) 7838 ; X86-NEXT: movl %ebp, %esp 7839 ; X86-NEXT: popl %ebp 7840 ; X86-NEXT: .cfi_def_cfa %esp, 4 7841 ; X86-NEXT: vzeroupper 7842 ; X86-NEXT: retl 7843 ; 7844 ; X64-LABEL: test_mm512_reduce_min_pd: 7845 ; X64: # %bb.0: # %entry 7846 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7847 ; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0 7848 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7849 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0 7850 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = 
xmm0[1,0] 7851 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0 7852 ; X64-NEXT: vzeroupper 7853 ; X64-NEXT: retq 7854 entry: 7855 %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7856 %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7857 %0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i) 7858 %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1> 7859 %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3> 7860 %1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i) 7861 %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0> 7862 %2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i) 7863 %vecext.i = extractelement <2 x double> %2, i32 0 7864 ret double %vecext.i 7865 } 7866 7867 define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) { 7868 ; X86-LABEL: test_mm512_mask_reduce_max_epi64: 7869 ; X86: # %bb.0: # %entry 7870 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7871 ; X86-NEXT: kmovw %eax, %k1 7872 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648] 7873 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 7874 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3] 7875 ; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 7876 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7877 ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7878 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7879 ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7880 ; X86-NEXT: vmovd %xmm0, %eax 7881 ; X86-NEXT: vpextrd $1, %xmm0, %edx 7882 ; X86-NEXT: vzeroupper 7883 ; X86-NEXT: retl 7884 ; 7885 ; X64-LABEL: test_mm512_mask_reduce_max_epi64: 7886 ; X64: # %bb.0: # %entry 7887 ; X64-NEXT: kmovw %edi, %k1 7888 ; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 7889 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 7890 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3] 7891 ; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 7892 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7893 ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7894 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7895 ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7896 ; X64-NEXT: vmovq %xmm0, %rax 7897 ; X64-NEXT: vzeroupper 7898 ; X64-NEXT: retq 7899 entry: 7900 %0 = bitcast i8 %__M to <8 x i1> 7901 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808> 7902 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 7903 %2 = icmp sgt <8 x i64> %1, %shuffle.i 7904 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i 7905 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> 7906 %4 = icmp sgt <8 x i64> %3, %shuffle3.i 7907 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7908 
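
; The masked i64 reductions below all follow one pattern: first blend the
; reduction identity into the masked-off lanes (INT64_MIN for signed max,
; zero for unsigned max, INT64_MAX for signed min, all-ones for unsigned
; min), then run the same log2(n) shuffle-and-compare tree as the unmasked
; forms above. A hedged C-level sketch of the presumed source (the exact
; builtin test lives in clang/test/CodeGen/avx512f-builtins.c):
;   long long r = _mm512_mask_reduce_max_epi64(m, v);
; which clang expands to the select-then-reduce IR checked next.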
define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp sgt <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp sgt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp sgt <8 x i64> %5, %shuffle5.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp ugt <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp ugt <8 x i64> %3, %shuffle2.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
%shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp ugt <8 x i64> %5, %shuffle4.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
%extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
%extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
%shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
%vecext.i = extractelement <2 x double> %4, i32 0
ret double %vecext.i
}

define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp slt <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp slt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp slt <8 x i64> %5, %shuffle5.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp ult <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp ult <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp ult <8 x i64> %5, %shuffle5.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vminpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vminpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
%extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
%extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
%shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
%vecext.i = extractelement <2 x double> %4, i32 0
ret double %vecext.i
}
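
; The i32 reductions start from 16 lanes, so they need one extra halving step
; relative to the i64 forms above. Each half is extracted as <4 x i64> or
; <2 x i64> and bitcast back to i32 lanes, matching how the intrinsics header
; splits __m512i into __m256i/__m128i halves. Hedged sketch of the presumed
; C source for the next test:
;   int r = _mm512_reduce_max_epi32(v); // vpmaxsd tree: 512->256->128->64->32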
define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp sgt <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp sgt <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp sgt <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp sgt <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp ugt <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp ugt <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp ugt <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp ugt <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
%extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
%shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i)
%shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i)
%vecext.i = extractelement <4 x float> %6, i32 0
ret float %vecext.i
}

define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp slt <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp slt <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp slt <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp slt <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp ult <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp ult <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp ult <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp ult <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vminps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vminps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
%extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
%shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i)
%shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i)
%vecext.i = extractelement <4 x float> %6, i32 0
ret float %vecext.i
}
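
; As with the i64 forms, the masked i32/float variants first blend the
; reduction identity into the inactive lanes: INT32_MIN (2147483648 when
; printed unsigned) for signed max, zero for unsigned max, and -Inf for the
; float max, broadcast from a constant pool. Hedged sketch of the presumed
; C source:
;   int   ri = _mm512_mask_reduce_max_epi32(m, v);
;   float rf = _mm512_mask_reduce_max_ps(m, w);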
define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%6 = icmp sgt <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract6.i to <4 x i32>
%10 = bitcast <2 x i64> %extract7.i to <4 x i32>
%11 = icmp sgt <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp sgt <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp sgt <4 x i32> %14, %shuffle10.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract3.i to <8 x i32>
%6 = icmp ugt <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract5.i to <4 x i32>
%10 = bitcast <2 x i64> %extract6.i to <4 x i32>
%11 = icmp ugt <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp ugt <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp ugt <4 x i32> %14, %shuffle9.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmaxps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmaxps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract4.i to <8 x float>
%5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %3, <8 x float> %4)
%extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
%shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i)
%shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i)
%vecext.i = extractelement <4 x float> %8, i32 0
ret float %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpminsd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpminsd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%6 = icmp slt <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract6.i to <4 x i32>
%10 = bitcast <2 x i64> %extract7.i to <4 x i32>
%11 = icmp slt <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp slt <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp slt <4 x i32> %14, %shuffle10.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}
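
; For the unsigned-min reduction the identity is all-ones (UINT32_MAX), which
; the backend materializes with vpternlogd $255 (a register-only all-ones
; idiom, ternary function 0xFF) rather than loading a constant. Sketch of the
; presumed C source (assumption):
;   unsigned r = _mm512_mask_reduce_min_epu32(m, v);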
define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpminud %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpminud %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%6 = icmp ult <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract6.i to <4 x i32>
%10 = bitcast <2 x i64> %extract7.i to <4 x i32>
%11 = icmp ult <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp ult <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp ult <4 x i32> %14, %shuffle10.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vminps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vminps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract4.i to <8 x float>
%5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4)
%extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
%shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i)
%shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i)
%vecext.i = extractelement <4 x float> %8, i32 0
ret float %vecext.i
}

define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_max_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
ret <8 x double> %0
}

define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_max_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
ret <16 x float> %0
}

define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw
%edi, %k1 9014 ; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} 9015 ; X64-NEXT: retq 9016 entry: 9017 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 9018 %1 = bitcast i8 %__U to <8 x i1> 9019 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W 9020 ret <8 x double> %2 9021 } 9022 9023 declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) 9024 9025 define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 9026 ; X86-LABEL: test_mm512_maskz_min_round_pd: 9027 ; X86: # %bb.0: # %entry 9028 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9029 ; X86-NEXT: kmovw %eax, %k1 9030 ; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z} 9031 ; X86-NEXT: retl 9032 ; 9033 ; X64-LABEL: test_mm512_maskz_min_round_pd: 9034 ; X64: # %bb.0: # %entry 9035 ; X64-NEXT: kmovw %edi, %k1 9036 ; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z} 9037 ; X64-NEXT: retq 9038 entry: 9039 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 9040 %1 = bitcast i8 %__U to <8 x i1> 9041 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 9042 ret <8 x double> %2 9043 } 9044 9045 define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) { 9046 ; CHECK-LABEL: test_mm512_min_round_pd: 9047 ; CHECK: # %bb.0: # %entry 9048 ; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0 9049 ; CHECK-NEXT: ret{{[l|q]}} 9050 entry: 9051 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 9052 ret <8 x double> %0 9053 } 9054 9055 define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 9056 ; X86-LABEL: test_mm512_mask_min_ps: 9057 ; X86: # %bb.0: # %entry 9058 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9059 ; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} 9060 ; X86-NEXT: retl 9061 ; 9062 ; X64-LABEL: test_mm512_mask_min_ps: 9063 ; X64: # %bb.0: # %entry 9064 ; X64-NEXT: kmovw %edi, %k1 9065 ; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} 9066 ; X64-NEXT: retq 9067 entry: 9068 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 9069 %1 = bitcast i16 %__U to <16 x i1> 9070 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W 9071 ret <16 x float> %2 9072 } 9073 9074 define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 9075 ; X86-LABEL: test_mm512_maskz_min_ps: 9076 ; X86: # %bb.0: # %entry 9077 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9078 ; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} 9079 ; X86-NEXT: retl 9080 ; 9081 ; X64-LABEL: test_mm512_maskz_min_ps: 9082 ; X64: # %bb.0: # %entry 9083 ; X64-NEXT: kmovw %edi, %k1 9084 ; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} 9085 ; X64-NEXT: retq 9086 entry: 9087 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 9088 %1 = bitcast i16 %__U to <16 x i1> 9089 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 9090 ret <16 x float> %2 9091 } 9092 9093 define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 9094 ; X86-LABEL: test_mm512_mask_min_round_ps: 9095 ; X86: # %bb.0: # %entry 9096 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9097 ; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} 9098 ; X86-NEXT: retl 9099 ; 9100 ; X64-LABEL: test_mm512_mask_min_round_ps: 9101 ; 
X64: # %bb.0: # %entry 9102 ; X64-NEXT: kmovw %edi, %k1 9103 ; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} 9104 ; X64-NEXT: retq 9105 entry: 9106 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 9107 %1 = bitcast i16 %__U to <16 x i1> 9108 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W 9109 ret <16 x float> %2 9110 } 9111 9112 declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) 9113 9114 define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 9115 ; X86-LABEL: test_mm512_maskz_min_round_ps: 9116 ; X86: # %bb.0: # %entry 9117 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9118 ; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} 9119 ; X86-NEXT: retl 9120 ; 9121 ; X64-LABEL: test_mm512_maskz_min_round_ps: 9122 ; X64: # %bb.0: # %entry 9123 ; X64-NEXT: kmovw %edi, %k1 9124 ; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} 9125 ; X64-NEXT: retq 9126 entry: 9127 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 9128 %1 = bitcast i16 %__U to <16 x i1> 9129 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 9130 ret <16 x float> %2 9131 } 9132 9133 define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) { 9134 ; CHECK-LABEL: test_mm512_min_round_ps: 9135 ; CHECK: # %bb.0: # %entry 9136 ; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0 9137 ; CHECK-NEXT: ret{{[l|q]}} 9138 entry: 9139 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 9140 ret <16 x float> %0 9141 } 9142 9143 define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) { 9144 ; CHECK-LABEL: test_mm512_sqrt_pd: 9145 ; CHECK: # %bb.0: # %entry 9146 ; CHECK-NEXT: vsqrtpd %zmm0, %zmm0 9147 ; CHECK-NEXT: ret{{[l|q]}} 9148 entry: 9149 %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a) 9150 ret <8 x double> %0 9151 } 9152 9153 define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) { 9154 ; X86-LABEL: test_mm512_mask_sqrt_pd: 9155 ; X86: # %bb.0: # %entry 9156 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9157 ; X86-NEXT: kmovw %eax, %k1 9158 ; X86-NEXT: vsqrtpd %zmm1, %zmm0 {%k1} 9159 ; X86-NEXT: retl 9160 ; 9161 ; X64-LABEL: test_mm512_mask_sqrt_pd: 9162 ; X64: # %bb.0: # %entry 9163 ; X64-NEXT: kmovw %edi, %k1 9164 ; X64-NEXT: vsqrtpd %zmm1, %zmm0 {%k1} 9165 ; X64-NEXT: retq 9166 entry: 9167 %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A) 9168 %1 = bitcast i8 %__U to <8 x i1> 9169 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W 9170 ret <8 x double> %2 9171 } 9172 9173 define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) { 9174 ; X86-LABEL: test_mm512_maskz_sqrt_pd: 9175 ; X86: # %bb.0: # %entry 9176 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9177 ; X86-NEXT: kmovw %eax, %k1 9178 ; X86-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z} 9179 ; X86-NEXT: retl 9180 ; 9181 ; X64-LABEL: test_mm512_maskz_sqrt_pd: 9182 ; X64: # %bb.0: # %entry 9183 ; X64-NEXT: kmovw %edi, %k1 9184 ; X64-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z} 9185 ; X64-NEXT: retq 9186 entry: 9187 %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A) 9188 %1 = bitcast i8 %__U to <8 x i1> 9189 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 9190 ret <8 x double> %2 9191 } 9192 9193 define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) { 9194 ; 
X86-LABEL: test_mm512_mask_sqrt_round_pd: 9195 ; X86: # %bb.0: # %entry 9196 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9197 ; X86-NEXT: kmovw %eax, %k1 9198 ; X86-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1} 9199 ; X86-NEXT: retl 9200 ; 9201 ; X64-LABEL: test_mm512_mask_sqrt_round_pd: 9202 ; X64: # %bb.0: # %entry 9203 ; X64-NEXT: kmovw %edi, %k1 9204 ; X64-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1} 9205 ; X64-NEXT: retq 9206 entry: 9207 %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8) 9208 %1 = bitcast i8 %__U to <8 x i1> 9209 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W 9210 ret <8 x double> %2 9211 } 9212 9213 declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) 9214 9215 define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) { 9216 ; X86-LABEL: test_mm512_maskz_sqrt_round_pd: 9217 ; X86: # %bb.0: # %entry 9218 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9219 ; X86-NEXT: kmovw %eax, %k1 9220 ; X86-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z} 9221 ; X86-NEXT: retl 9222 ; 9223 ; X64-LABEL: test_mm512_maskz_sqrt_round_pd: 9224 ; X64: # %bb.0: # %entry 9225 ; X64-NEXT: kmovw %edi, %k1 9226 ; X64-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z} 9227 ; X64-NEXT: retq 9228 entry: 9229 %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8) 9230 %1 = bitcast i8 %__U to <8 x i1> 9231 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 9232 ret <8 x double> %2 9233 } 9234 9235 define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) { 9236 ; CHECK-LABEL: test_mm512_sqrt_round_pd: 9237 ; CHECK: # %bb.0: # %entry 9238 ; CHECK-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 9239 ; CHECK-NEXT: ret{{[l|q]}} 9240 entry: 9241 %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8) 9242 ret <8 x double> %0 9243 } 9244 9245 define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) { 9246 ; CHECK-LABEL: test_mm512_sqrt_ps: 9247 ; CHECK: # %bb.0: # %entry 9248 ; CHECK-NEXT: vsqrtps %zmm0, %zmm0 9249 ; CHECK-NEXT: ret{{[l|q]}} 9250 entry: 9251 %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a) 9252 ret <16 x float> %0 9253 } 9254 9255 define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) { 9256 ; X86-LABEL: test_mm512_mask_sqrt_ps: 9257 ; X86: # %bb.0: # %entry 9258 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9259 ; X86-NEXT: vsqrtps %zmm1, %zmm0 {%k1} 9260 ; X86-NEXT: retl 9261 ; 9262 ; X64-LABEL: test_mm512_mask_sqrt_ps: 9263 ; X64: # %bb.0: # %entry 9264 ; X64-NEXT: kmovw %edi, %k1 9265 ; X64-NEXT: vsqrtps %zmm1, %zmm0 {%k1} 9266 ; X64-NEXT: retq 9267 entry: 9268 %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A) 9269 %1 = bitcast i16 %__U to <16 x i1> 9270 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W 9271 ret <16 x float> %2 9272 } 9273 9274 define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) { 9275 ; X86-LABEL: test_mm512_maskz_sqrt_ps: 9276 ; X86: # %bb.0: # %entry 9277 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9278 ; X86-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z} 9279 ; X86-NEXT: retl 9280 ; 9281 ; X64-LABEL: test_mm512_maskz_sqrt_ps: 9282 ; X64: # %bb.0: # %entry 9283 ; X64-NEXT: kmovw %edi, %k1 9284 ; X64-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z} 9285 ; X64-NEXT: retq 9286 entry: 9287 %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A) 9288 %1 = bitcast i16 %__U to <16 x i1> 9289 %2 = select <16 x i1> %1, <16 x float> %0, <16 x 
float> zeroinitializer 9290 ret <16 x float> %2 9291 } 9292 9293 define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) { 9294 ; X86-LABEL: test_mm512_mask_sqrt_round_ps: 9295 ; X86: # %bb.0: # %entry 9296 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9297 ; X86-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1} 9298 ; X86-NEXT: retl 9299 ; 9300 ; X64-LABEL: test_mm512_mask_sqrt_round_ps: 9301 ; X64: # %bb.0: # %entry 9302 ; X64-NEXT: kmovw %edi, %k1 9303 ; X64-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1} 9304 ; X64-NEXT: retq 9305 entry: 9306 %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8) 9307 %1 = bitcast i16 %__U to <16 x i1> 9308 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W 9309 ret <16 x float> %2 9310 } 9311 9312 declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) 9313 9314 define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) { 9315 ; X86-LABEL: test_mm512_maskz_sqrt_round_ps: 9316 ; X86: # %bb.0: # %entry 9317 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9318 ; X86-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z} 9319 ; X86-NEXT: retl 9320 ; 9321 ; X64-LABEL: test_mm512_maskz_sqrt_round_ps: 9322 ; X64: # %bb.0: # %entry 9323 ; X64-NEXT: kmovw %edi, %k1 9324 ; X64-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z} 9325 ; X64-NEXT: retq 9326 entry: 9327 %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8) 9328 %1 = bitcast i16 %__U to <16 x i1> 9329 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 9330 ret <16 x float> %2 9331 } 9332 9333 define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) { 9334 ; CHECK-LABEL: test_mm512_sqrt_round_ps: 9335 ; CHECK: # %bb.0: # %entry 9336 ; CHECK-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 9337 ; CHECK-NEXT: ret{{[l|q]}} 9338 entry: 9339 %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8) 9340 ret <16 x float> %0 9341 } 9342 9343 define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 { 9344 ; CHECK-LABEL: test_mm512_rol_epi32: 9345 ; CHECK: # %bb.0: # %entry 9346 ; CHECK-NEXT: vprold $5, %zmm0, %zmm0 9347 ; CHECK-NEXT: ret{{[l|q]}} 9348 entry: 9349 %0 = bitcast <8 x i64> %__A to <16 x i32> 9350 %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5) 9351 %2 = bitcast <16 x i32> %1 to <8 x i64> 9352 ret <8 x i64> %2 9353 } 9354 9355 declare <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32>, i32) #1 9356 9357 define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) { 9358 ; X86-LABEL: test_mm512_mask_rol_epi32: 9359 ; X86: # %bb.0: # %entry 9360 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9361 ; X86-NEXT: vprold $5, %zmm1, %zmm0 {%k1} 9362 ; X86-NEXT: retl 9363 ; 9364 ; X64-LABEL: test_mm512_mask_rol_epi32: 9365 ; X64: # %bb.0: # %entry 9366 ; X64-NEXT: kmovw %edi, %k1 9367 ; X64-NEXT: vprold $5, %zmm1, %zmm0 {%k1} 9368 ; X64-NEXT: retq 9369 entry: 9370 %0 = bitcast <8 x i64> %__A to <16 x i32> 9371 %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5) 9372 %2 = bitcast <8 x i64> %__W to <16 x i32> 9373 %3 = bitcast i16 %__U to <16 x i1> 9374 %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2 9375 %5 = bitcast <16 x i32> %4 to <8 x i64> 9376 ret <8 x i64> %5 9377 } 9378 9379 define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) { 9380 ; X86-LABEL: test_mm512_maskz_rol_epi32: 9381 ; X86: # %bb.0: # %entry 9382 ; 
X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9383 ; X86-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z} 9384 ; X86-NEXT: retl 9385 ; 9386 ; X64-LABEL: test_mm512_maskz_rol_epi32: 9387 ; X64: # %bb.0: # %entry 9388 ; X64-NEXT: kmovw %edi, %k1 9389 ; X64-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z} 9390 ; X64-NEXT: retq 9391 entry: 9392 %0 = bitcast <8 x i64> %__A to <16 x i32> 9393 %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5) 9394 %2 = bitcast i16 %__U to <16 x i1> 9395 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer 9396 %4 = bitcast <16 x i32> %3 to <8 x i64> 9397 ret <8 x i64> %4 9398 } 9399 9400 define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) { 9401 ; CHECK-LABEL: test_mm512_rol_epi64: 9402 ; CHECK: # %bb.0: # %entry 9403 ; CHECK-NEXT: vprolq $5, %zmm0, %zmm0 9404 ; CHECK-NEXT: ret{{[l|q]}} 9405 entry: 9406 %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5) 9407 ret <8 x i64> %0 9408 } 9409 9410 declare <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64>, i32) #1 9411 9412 define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) { 9413 ; X86-LABEL: test_mm512_mask_rol_epi64: 9414 ; X86: # %bb.0: # %entry 9415 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9416 ; X86-NEXT: kmovw %eax, %k1 9417 ; X86-NEXT: vprolq $5, %zmm1, %zmm0 {%k1} 9418 ; X86-NEXT: retl 9419 ; 9420 ; X64-LABEL: test_mm512_mask_rol_epi64: 9421 ; X64: # %bb.0: # %entry 9422 ; X64-NEXT: kmovw %edi, %k1 9423 ; X64-NEXT: vprolq $5, %zmm1, %zmm0 {%k1} 9424 ; X64-NEXT: retq 9425 entry: 9426 %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5) 9427 %1 = bitcast i8 %__U to <8 x i1> 9428 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W 9429 ret <8 x i64> %2 9430 } 9431 9432 define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) { 9433 ; X86-LABEL: test_mm512_maskz_rol_epi64: 9434 ; X86: # %bb.0: # %entry 9435 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9436 ; X86-NEXT: kmovw %eax, %k1 9437 ; X86-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z} 9438 ; X86-NEXT: retl 9439 ; 9440 ; X64-LABEL: test_mm512_maskz_rol_epi64: 9441 ; X64: # %bb.0: # %entry 9442 ; X64-NEXT: kmovw %edi, %k1 9443 ; X64-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z} 9444 ; X64-NEXT: retq 9445 entry: 9446 %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5) 9447 %1 = bitcast i8 %__U to <8 x i1> 9448 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer 9449 ret <8 x i64> %2 9450 } 9451 9452 define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) { 9453 ; CHECK-LABEL: test_mm512_rolv_epi32: 9454 ; CHECK: # %bb.0: # %entry 9455 ; CHECK-NEXT: vprolvd %zmm1, %zmm0, %zmm0 9456 ; CHECK-NEXT: ret{{[l|q]}} 9457 entry: 9458 %0 = bitcast <8 x i64> %__A to <16 x i32> 9459 %1 = bitcast <8 x i64> %__B to <16 x i32> 9460 %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1) 9461 %3 = bitcast <16 x i32> %2 to <8 x i64> 9462 ret <8 x i64> %3 9463 } 9464 9465 define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { 9466 ; X86-LABEL: test_mm512_mask_rolv_epi32: 9467 ; X86: # %bb.0: # %entry 9468 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9469 ; X86-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1} 9470 ; X86-NEXT: retl 9471 ; 9472 ; X64-LABEL: test_mm512_mask_rolv_epi32: 9473 ; X64: # %bb.0: # %entry 9474 ; X64-NEXT: kmovw %edi, %k1 9475 ; X64-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1} 9476 ; X64-NEXT: retq 9477 entry: 9478 %0 = bitcast <8 x 
i64> %__A to <16 x i32> 9479 %1 = bitcast <8 x i64> %__B to <16 x i32> 9480 %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1) 9481 %3 = bitcast <8 x i64> %__W to <16 x i32> 9482 %4 = bitcast i16 %__U to <16 x i1> 9483 %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3 9484 %6 = bitcast <16 x i32> %5 to <8 x i64> 9485 ret <8 x i64> %6 9486 } 9487 9488 define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { 9489 ; X86-LABEL: test_mm512_maskz_rolv_epi32: 9490 ; X86: # %bb.0: # %entry 9491 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9492 ; X86-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z} 9493 ; X86-NEXT: retl 9494 ; 9495 ; X64-LABEL: test_mm512_maskz_rolv_epi32: 9496 ; X64: # %bb.0: # %entry 9497 ; X64-NEXT: kmovw %edi, %k1 9498 ; X64-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z} 9499 ; X64-NEXT: retq 9500 entry: 9501 %0 = bitcast <8 x i64> %__A to <16 x i32> 9502 %1 = bitcast <8 x i64> %__B to <16 x i32> 9503 %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1) 9504 %3 = bitcast i16 %__U to <16 x i1> 9505 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 9506 %5 = bitcast <16 x i32> %4 to <8 x i64> 9507 ret <8 x i64> %5 9508 } 9509 9510 define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) { 9511 ; CHECK-LABEL: test_mm512_rolv_epi64: 9512 ; CHECK: # %bb.0: # %entry 9513 ; CHECK-NEXT: vprolvq %zmm1, %zmm0, %zmm0 9514 ; CHECK-NEXT: ret{{[l|q]}} 9515 entry: 9516 %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B) 9517 ret <8 x i64> %0 9518 } 9519 9520 define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { 9521 ; X86-LABEL: test_mm512_mask_rolv_epi64: 9522 ; X86: # %bb.0: # %entry 9523 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9524 ; X86-NEXT: kmovw %eax, %k1 9525 ; X86-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1} 9526 ; X86-NEXT: retl 9527 ; 9528 ; X64-LABEL: test_mm512_mask_rolv_epi64: 9529 ; X64: # %bb.0: # %entry 9530 ; X64-NEXT: kmovw %edi, %k1 9531 ; X64-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1} 9532 ; X64-NEXT: retq 9533 entry: 9534 %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B) 9535 %1 = bitcast i8 %__U to <8 x i1> 9536 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W 9537 ret <8 x i64> %2 9538 } 9539 9540 define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { 9541 ; X86-LABEL: test_mm512_maskz_rolv_epi64: 9542 ; X86: # %bb.0: # %entry 9543 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9544 ; X86-NEXT: kmovw %eax, %k1 9545 ; X86-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z} 9546 ; X86-NEXT: retl 9547 ; 9548 ; X64-LABEL: test_mm512_maskz_rolv_epi64: 9549 ; X64: # %bb.0: # %entry 9550 ; X64-NEXT: kmovw %edi, %k1 9551 ; X64-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z} 9552 ; X64-NEXT: retq 9553 entry: 9554 %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B) 9555 %1 = bitcast i8 %__U to <8 x i1> 9556 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer 9557 ret <8 x i64> %2 9558 } 9559 9560 define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) { 9561 ; CHECK-LABEL: test_mm512_ror_epi32: 9562 ; CHECK: # %bb.0: # %entry 9563 ; CHECK-NEXT: vprord $5, %zmm0, %zmm0 9564 ; CHECK-NEXT: ret{{[l|q]}} 9565 entry: 9566 %0 = bitcast <8 x i64> %__A to <16 x i32> 9567 %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5) 9568 %2 = bitcast <16 x 
i32> %1 to <8 x i64> 9569 ret <8 x i64> %2 9570 } 9571 9572 declare <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32>, i32) #1 9573 9574 define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) { 9575 ; X86-LABEL: test_mm512_mask_ror_epi32: 9576 ; X86: # %bb.0: # %entry 9577 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9578 ; X86-NEXT: vprord $5, %zmm1, %zmm0 {%k1} 9579 ; X86-NEXT: retl 9580 ; 9581 ; X64-LABEL: test_mm512_mask_ror_epi32: 9582 ; X64: # %bb.0: # %entry 9583 ; X64-NEXT: kmovw %edi, %k1 9584 ; X64-NEXT: vprord $5, %zmm1, %zmm0 {%k1} 9585 ; X64-NEXT: retq 9586 entry: 9587 %0 = bitcast <8 x i64> %__A to <16 x i32> 9588 %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5) 9589 %2 = bitcast <8 x i64> %__W to <16 x i32> 9590 %3 = bitcast i16 %__U to <16 x i1> 9591 %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2 9592 %5 = bitcast <16 x i32> %4 to <8 x i64> 9593 ret <8 x i64> %5 9594 } 9595 9596 define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) { 9597 ; X86-LABEL: test_mm512_maskz_ror_epi32: 9598 ; X86: # %bb.0: # %entry 9599 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9600 ; X86-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z} 9601 ; X86-NEXT: retl 9602 ; 9603 ; X64-LABEL: test_mm512_maskz_ror_epi32: 9604 ; X64: # %bb.0: # %entry 9605 ; X64-NEXT: kmovw %edi, %k1 9606 ; X64-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z} 9607 ; X64-NEXT: retq 9608 entry: 9609 %0 = bitcast <8 x i64> %__A to <16 x i32> 9610 %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5) 9611 %2 = bitcast i16 %__U to <16 x i1> 9612 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer 9613 %4 = bitcast <16 x i32> %3 to <8 x i64> 9614 ret <8 x i64> %4 9615 } 9616 9617 define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) { 9618 ; CHECK-LABEL: test_mm512_ror_epi64: 9619 ; CHECK: # %bb.0: # %entry 9620 ; CHECK-NEXT: vprorq $5, %zmm0, %zmm0 9621 ; CHECK-NEXT: ret{{[l|q]}} 9622 entry: 9623 %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5) 9624 ret <8 x i64> %0 9625 } 9626 9627 declare <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64>, i32) #1 9628 9629 define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) { 9630 ; X86-LABEL: test_mm512_mask_ror_epi64: 9631 ; X86: # %bb.0: # %entry 9632 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9633 ; X86-NEXT: kmovw %eax, %k1 9634 ; X86-NEXT: vprorq $5, %zmm1, %zmm0 {%k1} 9635 ; X86-NEXT: retl 9636 ; 9637 ; X64-LABEL: test_mm512_mask_ror_epi64: 9638 ; X64: # %bb.0: # %entry 9639 ; X64-NEXT: kmovw %edi, %k1 9640 ; X64-NEXT: vprorq $5, %zmm1, %zmm0 {%k1} 9641 ; X64-NEXT: retq 9642 entry: 9643 %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5) 9644 %1 = bitcast i8 %__U to <8 x i1> 9645 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W 9646 ret <8 x i64> %2 9647 } 9648 9649 define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) { 9650 ; X86-LABEL: test_mm512_maskz_ror_epi64: 9651 ; X86: # %bb.0: # %entry 9652 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9653 ; X86-NEXT: kmovw %eax, %k1 9654 ; X86-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z} 9655 ; X86-NEXT: retl 9656 ; 9657 ; X64-LABEL: test_mm512_maskz_ror_epi64: 9658 ; X64: # %bb.0: # %entry 9659 ; X64-NEXT: kmovw %edi, %k1 9660 ; X64-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z} 9661 ; X64-NEXT: retq 9662 entry: 9663 %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5) 9664 %1 = bitcast i8 %__U to <8 x i1> 9665 %2 
= select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer 9666 ret <8 x i64> %2 9667 } 9668 9669 define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) { 9670 ; CHECK-LABEL: test_mm512_rorv_epi32: 9671 ; CHECK: # %bb.0: # %entry 9672 ; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0 9673 ; CHECK-NEXT: ret{{[l|q]}} 9674 entry: 9675 %0 = bitcast <8 x i64> %__A to <16 x i32> 9676 %1 = bitcast <8 x i64> %__B to <16 x i32> 9677 %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1) 9678 %3 = bitcast <16 x i32> %2 to <8 x i64> 9679 ret <8 x i64> %3 9680 } 9681 9682 define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { 9683 ; X86-LABEL: test_mm512_mask_rorv_epi32: 9684 ; X86: # %bb.0: # %entry 9685 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9686 ; X86-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1} 9687 ; X86-NEXT: retl 9688 ; 9689 ; X64-LABEL: test_mm512_mask_rorv_epi32: 9690 ; X64: # %bb.0: # %entry 9691 ; X64-NEXT: kmovw %edi, %k1 9692 ; X64-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1} 9693 ; X64-NEXT: retq 9694 entry: 9695 %0 = bitcast <8 x i64> %__A to <16 x i32> 9696 %1 = bitcast <8 x i64> %__B to <16 x i32> 9697 %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1) 9698 %3 = bitcast <8 x i64> %__W to <16 x i32> 9699 %4 = bitcast i16 %__U to <16 x i1> 9700 %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3 9701 %6 = bitcast <16 x i32> %5 to <8 x i64> 9702 ret <8 x i64> %6 9703 } 9704 9705 define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { 9706 ; X86-LABEL: test_mm512_maskz_rorv_epi32: 9707 ; X86: # %bb.0: # %entry 9708 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9709 ; X86-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z} 9710 ; X86-NEXT: retl 9711 ; 9712 ; X64-LABEL: test_mm512_maskz_rorv_epi32: 9713 ; X64: # %bb.0: # %entry 9714 ; X64-NEXT: kmovw %edi, %k1 9715 ; X64-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z} 9716 ; X64-NEXT: retq 9717 entry: 9718 %0 = bitcast <8 x i64> %__A to <16 x i32> 9719 %1 = bitcast <8 x i64> %__B to <16 x i32> 9720 %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1) 9721 %3 = bitcast i16 %__U to <16 x i1> 9722 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 9723 %5 = bitcast <16 x i32> %4 to <8 x i64> 9724 ret <8 x i64> %5 9725 } 9726 9727 define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) { 9728 ; CHECK-LABEL: test_mm512_rorv_epi64: 9729 ; CHECK: # %bb.0: # %entry 9730 ; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0 9731 ; CHECK-NEXT: ret{{[l|q]}} 9732 entry: 9733 %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B) 9734 ret <8 x i64> %0 9735 } 9736 9737 define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { 9738 ; X86-LABEL: test_mm512_mask_rorv_epi64: 9739 ; X86: # %bb.0: # %entry 9740 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9741 ; X86-NEXT: kmovw %eax, %k1 9742 ; X86-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1} 9743 ; X86-NEXT: retl 9744 ; 9745 ; X64-LABEL: test_mm512_mask_rorv_epi64: 9746 ; X64: # %bb.0: # %entry 9747 ; X64-NEXT: kmovw %edi, %k1 9748 ; X64-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1} 9749 ; X64-NEXT: retq 9750 entry: 9751 %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B) 9752 %1 = bitcast i8 %__U to <8 x i1> 9753 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W 9754 ret <8 x i64> %2 9755 } 9756 9757 
define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) { 9758 ; X86-LABEL: test_mm512_maskz_rorv_epi64: 9759 ; X86: # %bb.0: # %entry 9760 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9761 ; X86-NEXT: kmovw %eax, %k1 9762 ; X86-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z} 9763 ; X86-NEXT: retl 9764 ; 9765 ; X64-LABEL: test_mm512_maskz_rorv_epi64: 9766 ; X64: # %bb.0: # %entry 9767 ; X64-NEXT: kmovw %edi, %k1 9768 ; X64-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z} 9769 ; X64-NEXT: retq 9770 entry: 9771 %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B) 9772 %1 = bitcast i8 %__U to <8 x i1> 9773 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer 9774 ret <8 x i64> %2 9775 } 9776 9777 declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9 9778 declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9 9779 declare float @llvm.fma.f32(float, float, float) #9 9780 declare double @llvm.fma.f64(double, double, double) #9 9781 declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>) 9782 declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>) 9783 declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10 9784 declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>) 9785 declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>) 9786 declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>) 9787 declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>) 9788 declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>) 9789 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) 9790 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) 9791 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) 9792 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) 9793 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) 9794 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) 9795 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) 9796 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) 9797 declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) 9798 declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) 9799 declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32>, <16 x i32>) 9800 declare <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64>, <8 x i64>) 9801 declare <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32>, <16 x i32>) 9802 declare <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64>, <8 x i64>) 9803 9804 !0 = !{i32 1} 9805 9806