1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X86 3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X64 4 5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c 6 7 define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 8 ; CHECK-LABEL: test_mm256_add_pd: 9 ; CHECK: # %bb.0: 10 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 11 ; CHECK-NEXT: ret{{[l|q]}} 12 %res = fadd <4 x double> %a0, %a1 13 ret <4 x double> %res 14 } 15 16 define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 17 ; CHECK-LABEL: test_mm256_add_ps: 18 ; CHECK: # %bb.0: 19 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 20 ; CHECK-NEXT: ret{{[l|q]}} 21 %res = fadd <8 x float> %a0, %a1 22 ret <8 x float> %res 23 } 24 25 define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 26 ; CHECK-LABEL: test_mm256_addsub_pd: 27 ; CHECK: # %bb.0: 28 ; CHECK-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 29 ; CHECK-NEXT: ret{{[l|q]}} 30 %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) 31 ret <4 x double> %res 32 } 33 declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone 34 35 define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 36 ; CHECK-LABEL: test_mm256_addsub_ps: 37 ; CHECK: # %bb.0: 38 ; CHECK-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 39 ; CHECK-NEXT: ret{{[l|q]}} 40 %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) 41 ret <8 x float> %res 42 } 43 declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone 44 45 define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 46 ; CHECK-LABEL: test_mm256_and_pd: 47 ; CHECK: # %bb.0: 48 ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 49 ; CHECK-NEXT: ret{{[l|q]}} 50 %1 = bitcast <4 x double> %a0 to <4 x i64> 51 %2 = bitcast <4 x double> %a1 to <4 x i64> 52 %res = and <4 x i64> %1, %2 53 %bc = bitcast <4 x i64> %res to <4 x double> 54 ret <4 x double> %bc 55 } 56 57 define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 58 ; CHECK-LABEL: test_mm256_and_ps: 59 ; CHECK: # %bb.0: 60 ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 61 ; CHECK-NEXT: ret{{[l|q]}} 62 %1 = bitcast <8 x float> %a0 to <8 x i32> 63 %2 = bitcast <8 x float> %a1 to <8 x i32> 64 %res = and <8 x i32> %1, %2 65 %bc = bitcast <8 x i32> %res to <8 x float> 66 ret <8 x float> %bc 67 } 68 69 define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 70 ; CHECK-LABEL: test_mm256_andnot_pd: 71 ; CHECK: # %bb.0: 72 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 73 ; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 74 ; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 75 ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 76 ; CHECK-NEXT: ret{{[l|q]}} 77 %1 = bitcast <4 x double> %a0 to <4 x i64> 78 %2 = bitcast <4 x double> %a1 to <4 x i64> 79 %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1> 80 %res = and <4 x i64> %3, %2 81 %bc = bitcast <4 x i64> %res to <4 x double> 82 ret <4 x double> %bc 83 } 84 85 define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 86 ; CHECK-LABEL: test_mm256_andnot_ps: 87 ; CHECK: # %bb.0: 88 ; CHECK-NEXT: vandnps %ymm1, %ymm0, 
%ymm0 89 ; CHECK-NEXT: ret{{[l|q]}} 90 %1 = bitcast <8 x float> %a0 to <8 x i32> 91 %2 = bitcast <8 x float> %a1 to <8 x i32> 92 %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 93 %res = and <8 x i32> %3, %2 94 %bc = bitcast <8 x i32> %res to <8 x float> 95 ret <8 x float> %bc 96 } 97 98 define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 99 ; CHECK-LABEL: test_mm256_blend_pd: 100 ; CHECK: # %bb.0: 101 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] 102 ; CHECK-NEXT: ret{{[l|q]}} 103 %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3> 104 ret <4 x double> %res 105 } 106 107 define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 108 ; CHECK-LABEL: test_mm256_blend_ps: 109 ; CHECK: # %bb.0: 110 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7] 111 ; CHECK-NEXT: ret{{[l|q]}} 112 %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15> 113 ret <8 x float> %res 114 } 115 116 define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind { 117 ; CHECK-LABEL: test_mm256_blendv_pd: 118 ; CHECK: # %bb.0: 119 ; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 120 ; CHECK-NEXT: ret{{[l|q]}} 121 %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) 122 ret <4 x double> %res 123 } 124 declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone 125 126 define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind { 127 ; CHECK-LABEL: test_mm256_blendv_ps: 128 ; CHECK: # %bb.0: 129 ; CHECK-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 130 ; CHECK-NEXT: ret{{[l|q]}} 131 %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) 132 ret <8 x float> %res 133 } 134 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone 135 136 define <4 x double> @test_mm256_broadcast_pd(<2 x double>* %a0) nounwind { 137 ; X86-LABEL: test_mm256_broadcast_pd: 138 ; X86: # %bb.0: 139 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 140 ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 141 ; X86-NEXT: retl 142 ; 143 ; X64-LABEL: test_mm256_broadcast_pd: 144 ; X64: # %bb.0: 145 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 146 ; X64-NEXT: retq 147 %ld = load <2 x double>, <2 x double>* %a0 148 %res = shufflevector <2 x double> %ld, <2 x double> %ld, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 149 ret <4 x double> %res 150 } 151 152 define <8 x float> @test_mm256_broadcast_ps(<4 x float>* %a0) nounwind { 153 ; X86-LABEL: test_mm256_broadcast_ps: 154 ; X86: # %bb.0: 155 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 156 ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 157 ; X86-NEXT: retl 158 ; 159 ; X64-LABEL: test_mm256_broadcast_ps: 160 ; X64: # %bb.0: 161 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 162 ; X64-NEXT: retq 163 %ld = load <4 x float>, <4 x float>* %a0 164 %res = shufflevector <4 x float> %ld, <4 x float> %ld, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 165 ret <8 x float> %res 166 } 167 168 define <4 x double> @test_mm256_broadcast_sd(double* %a0) nounwind { 169 ; X86-LABEL: test_mm256_broadcast_sd: 170 ; X86: # %bb.0: 171 ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax 172 ; X86-NEXT: vbroadcastsd (%eax), %ymm0 173 ; X86-NEXT: retl 174 ; 175 ; X64-LABEL: test_mm256_broadcast_sd: 176 ; X64: # %bb.0: 177 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0 178 ; X64-NEXT: retq 179 %ld = load double, double* %a0 180 %ins0 = insertelement <4 x double> undef, double %ld, i32 0 181 %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1 182 %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2 183 %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3 184 ret <4 x double> %ins3 185 } 186 187 define <4 x float> @test_mm_broadcast_ss(float* %a0) nounwind { 188 ; X86-LABEL: test_mm_broadcast_ss: 189 ; X86: # %bb.0: 190 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 191 ; X86-NEXT: vbroadcastss (%eax), %xmm0 192 ; X86-NEXT: retl 193 ; 194 ; X64-LABEL: test_mm_broadcast_ss: 195 ; X64: # %bb.0: 196 ; X64-NEXT: vbroadcastss (%rdi), %xmm0 197 ; X64-NEXT: retq 198 %ld = load float, float* %a0 199 %ins0 = insertelement <4 x float> undef, float %ld, i32 0 200 %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1 201 %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2 202 %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3 203 ret <4 x float> %ins3 204 } 205 206 define <8 x float> @test_mm256_broadcast_ss(float* %a0) nounwind { 207 ; X86-LABEL: test_mm256_broadcast_ss: 208 ; X86: # %bb.0: 209 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 210 ; X86-NEXT: vbroadcastss (%eax), %ymm0 211 ; X86-NEXT: retl 212 ; 213 ; X64-LABEL: test_mm256_broadcast_ss: 214 ; X64: # %bb.0: 215 ; X64-NEXT: vbroadcastss (%rdi), %ymm0 216 ; X64-NEXT: retq 217 %ld = load float, float* %a0 218 %ins0 = insertelement <8 x float> undef, float %ld, i32 0 219 %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1 220 %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2 221 %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3 222 %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4 223 %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5 224 %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6 225 %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7 226 ret <8 x float> %ins7 227 } 228 229 define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind { 230 ; CHECK-LABEL: test_mm256_castpd_ps: 231 ; CHECK: # %bb.0: 232 ; CHECK-NEXT: ret{{[l|q]}} 233 %res = bitcast <4 x double> %a0 to <8 x float> 234 ret <8 x float> %res 235 } 236 237 define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind { 238 ; CHECK-LABEL: test_mm256_castpd_si256: 239 ; CHECK: # %bb.0: 240 ; CHECK-NEXT: ret{{[l|q]}} 241 %res = bitcast <4 x double> %a0 to <4 x i64> 242 ret <4 x i64> %res 243 } 244 245 define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind { 246 ; CHECK-LABEL: test_mm256_castpd128_pd256: 247 ; CHECK: # %bb.0: 248 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 249 ; CHECK-NEXT: ret{{[l|q]}} 250 %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 251 ret <4 x double> %res 252 } 253 254 define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind { 255 ; CHECK-LABEL: test_mm256_castpd256_pd128: 256 ; CHECK: # %bb.0: 257 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 258 ; CHECK-NEXT: vzeroupper 259 ; CHECK-NEXT: ret{{[l|q]}} 260 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1> 261 ret <2 x double> %res 262 } 263 264 define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind { 265 ; CHECK-LABEL: 
test_mm256_castps_pd: 266 ; CHECK: # %bb.0: 267 ; CHECK-NEXT: ret{{[l|q]}} 268 %res = bitcast <8 x float> %a0 to <4 x double> 269 ret <4 x double> %res 270 } 271 272 define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind { 273 ; CHECK-LABEL: test_mm256_castps_si256: 274 ; CHECK: # %bb.0: 275 ; CHECK-NEXT: ret{{[l|q]}} 276 %res = bitcast <8 x float> %a0 to <4 x i64> 277 ret <4 x i64> %res 278 } 279 280 define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind { 281 ; CHECK-LABEL: test_mm256_castps128_ps256: 282 ; CHECK: # %bb.0: 283 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 284 ; CHECK-NEXT: ret{{[l|q]}} 285 %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 286 ret <8 x float> %res 287 } 288 289 define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind { 290 ; CHECK-LABEL: test_mm256_castps256_ps128: 291 ; CHECK: # %bb.0: 292 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 293 ; CHECK-NEXT: vzeroupper 294 ; CHECK-NEXT: ret{{[l|q]}} 295 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 296 ret <4 x float> %res 297 } 298 299 define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind { 300 ; CHECK-LABEL: test_mm256_castsi128_si256: 301 ; CHECK: # %bb.0: 302 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 303 ; CHECK-NEXT: ret{{[l|q]}} 304 %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 305 ret <4 x i64> %res 306 } 307 308 define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind { 309 ; CHECK-LABEL: test_mm256_castsi256_pd: 310 ; CHECK: # %bb.0: 311 ; CHECK-NEXT: ret{{[l|q]}} 312 %res = bitcast <4 x i64> %a0 to <4 x double> 313 ret <4 x double> %res 314 } 315 316 define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind { 317 ; CHECK-LABEL: test_mm256_castsi256_ps: 318 ; CHECK: # %bb.0: 319 ; CHECK-NEXT: ret{{[l|q]}} 320 %res = bitcast <4 x i64> %a0 to <8 x float> 321 ret <8 x float> %res 322 } 323 324 define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind { 325 ; CHECK-LABEL: test_mm256_castsi256_si128: 326 ; CHECK: # %bb.0: 327 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 328 ; CHECK-NEXT: vzeroupper 329 ; CHECK-NEXT: ret{{[l|q]}} 330 %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1> 331 ret <2 x i64> %res 332 } 333 334 define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind { 335 ; CHECK-LABEL: test_mm256_ceil_pd: 336 ; CHECK: # %bb.0: 337 ; CHECK-NEXT: vroundpd $2, %ymm0, %ymm0 338 ; CHECK-NEXT: ret{{[l|q]}} 339 %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2) 340 ret <4 x double> %res 341 } 342 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone 343 344 define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind { 345 ; CHECK-LABEL: test_mm256_ceil_ps: 346 ; CHECK: # %bb.0: 347 ; CHECK-NEXT: vroundps $2, %ymm0, %ymm0 348 ; CHECK-NEXT: ret{{[l|q]}} 349 %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2) 350 ret <8 x float> %res 351 } 352 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone 353 354 define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind { 355 ; CHECK-LABEL: test_mm_cmp_pd: 356 ; CHECK: # %bb.0: 357 ; CHECK-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0 358 ; CHECK-NEXT: ret{{[l|q]}} 359 %res = call <2 
x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13) 360 ret <2 x double> %res 361 } 362 declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone 363 364 define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 365 ; CHECK-LABEL: test_mm256_cmp_pd: 366 ; CHECK: # %bb.0: 367 ; CHECK-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0 368 ; CHECK-NEXT: ret{{[l|q]}} 369 %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13) 370 ret <4 x double> %res 371 } 372 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone 373 374 define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind { 375 ; CHECK-LABEL: test_mm_cmp_ps: 376 ; CHECK: # %bb.0: 377 ; CHECK-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0 378 ; CHECK-NEXT: ret{{[l|q]}} 379 %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13) 380 ret <4 x float> %res 381 } 382 declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone 383 384 define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 385 ; CHECK-LABEL: test_mm256_cmp_ps: 386 ; CHECK: # %bb.0: 387 ; CHECK-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0 388 ; CHECK-NEXT: ret{{[l|q]}} 389 %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13) 390 ret <8 x float> %res 391 } 392 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone 393 394 define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind { 395 ; CHECK-LABEL: test_mm_cmp_sd: 396 ; CHECK: # %bb.0: 397 ; CHECK-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0 398 ; CHECK-NEXT: ret{{[l|q]}} 399 %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13) 400 ret <2 x double> %res 401 } 402 declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone 403 404 define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind { 405 ; CHECK-LABEL: test_mm_cmp_ss: 406 ; CHECK: # %bb.0: 407 ; CHECK-NEXT: vcmpgess %xmm1, %xmm0, %xmm0 408 ; CHECK-NEXT: ret{{[l|q]}} 409 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13) 410 ret <4 x float> %res 411 } 412 declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone 413 414 define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind { 415 ; CHECK-LABEL: test_mm256_cvtepi32_pd: 416 ; CHECK: # %bb.0: 417 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 418 ; CHECK-NEXT: ret{{[l|q]}} 419 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 420 %res = sitofp <4 x i32> %arg0 to <4 x double> 421 ret <4 x double> %res 422 } 423 424 define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind { 425 ; CHECK-LABEL: test_mm256_cvtepi32_ps: 426 ; CHECK: # %bb.0: 427 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 428 ; CHECK-NEXT: ret{{[l|q]}} 429 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 430 %res = sitofp <8 x i32> %arg0 to <8 x float> 431 ret <8 x float> %res 432 } 433 434 define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind { 435 ; CHECK-LABEL: test_mm256_cvtpd_epi32: 436 ; CHECK: # %bb.0: 437 ; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0 438 ; CHECK-NEXT: vzeroupper 439 ; CHECK-NEXT: ret{{[l|q]}} 440 %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) 441 %res = bitcast <4 x i32> %cvt to <2 x i64> 442 ret <2 x i64> %res 443 } 444 declare <4 x i32> 
@llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone 445 446 define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind { 447 ; CHECK-LABEL: test_mm256_cvtpd_ps: 448 ; CHECK: # %bb.0: 449 ; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0 450 ; CHECK-NEXT: vzeroupper 451 ; CHECK-NEXT: ret{{[l|q]}} 452 %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0) 453 ret <4 x float> %res 454 } 455 declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone 456 457 define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind { 458 ; CHECK-LABEL: test_mm256_cvtps_epi32: 459 ; CHECK: # %bb.0: 460 ; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 461 ; CHECK-NEXT: ret{{[l|q]}} 462 %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0) 463 %res = bitcast <8 x i32> %cvt to <4 x i64> 464 ret <4 x i64> %res 465 } 466 declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone 467 468 define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind { 469 ; CHECK-LABEL: test_mm256_cvtps_pd: 470 ; CHECK: # %bb.0: 471 ; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0 472 ; CHECK-NEXT: ret{{[l|q]}} 473 %res = fpext <4 x float> %a0 to <4 x double> 474 ret <4 x double> %res 475 } 476 477 define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind { 478 ; CHECK-LABEL: test_mm256_cvttpd_epi32: 479 ; CHECK: # %bb.0: 480 ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 481 ; CHECK-NEXT: vzeroupper 482 ; CHECK-NEXT: ret{{[l|q]}} 483 %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) 484 %res = bitcast <4 x i32> %cvt to <2 x i64> 485 ret <2 x i64> %res 486 } 487 declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone 488 489 define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind { 490 ; CHECK-LABEL: test_mm256_cvttps_epi32: 491 ; CHECK: # %bb.0: 492 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 493 ; CHECK-NEXT: ret{{[l|q]}} 494 %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) 495 %res = bitcast <8 x i32> %cvt to <4 x i64> 496 ret <4 x i64> %res 497 } 498 declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone 499 500 define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 501 ; CHECK-LABEL: test_mm256_div_pd: 502 ; CHECK: # %bb.0: 503 ; CHECK-NEXT: vdivpd %ymm1, %ymm0, %ymm0 504 ; CHECK-NEXT: ret{{[l|q]}} 505 %res = fdiv <4 x double> %a0, %a1 506 ret <4 x double> %res 507 } 508 509 define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 510 ; CHECK-LABEL: test_mm256_div_ps: 511 ; CHECK: # %bb.0: 512 ; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 513 ; CHECK-NEXT: ret{{[l|q]}} 514 %res = fdiv <8 x float> %a0, %a1 515 ret <8 x float> %res 516 } 517 518 define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 519 ; CHECK-LABEL: test_mm256_dp_ps: 520 ; CHECK: # %bb.0: 521 ; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 522 ; CHECK-NEXT: ret{{[l|q]}} 523 %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) 524 ret <8 x float> %res 525 } 526 declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone 527 528 define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind { 529 ; CHECK-LABEL: test_mm256_extract_epi8: 530 ; CHECK: # %bb.0: 531 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 532 ; CHECK-NEXT: vpextrb $15, %xmm0, %eax 533 ; CHECK-NEXT: movzbl %al, %eax 534 ; CHECK-NEXT: vzeroupper 535 ; CHECK-NEXT: ret{{[l|q]}} 536 %arg0 = bitcast <4 x i64> %a0 to 
<32 x i8> 537 %ext = extractelement <32 x i8> %arg0, i32 31 538 %res = zext i8 %ext to i32 539 ret i32 %res 540 } 541 542 define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind { 543 ; CHECK-LABEL: test_mm256_extract_epi16: 544 ; CHECK: # %bb.0: 545 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 546 ; CHECK-NEXT: vpextrw $3, %xmm0, %eax 547 ; CHECK-NEXT: movzwl %ax, %eax 548 ; CHECK-NEXT: vzeroupper 549 ; CHECK-NEXT: ret{{[l|q]}} 550 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 551 %ext = extractelement <16 x i16> %arg0, i32 11 552 %res = zext i16 %ext to i32 553 ret i32 %res 554 } 555 556 define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind { 557 ; CHECK-LABEL: test_mm256_extract_epi32: 558 ; CHECK: # %bb.0: 559 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 560 ; CHECK-NEXT: vextractps $1, %xmm0, %eax 561 ; CHECK-NEXT: vzeroupper 562 ; CHECK-NEXT: ret{{[l|q]}} 563 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 564 %res = extractelement <8 x i32> %arg0, i32 5 565 ret i32 %res 566 } 567 568 define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind { 569 ; X86-LABEL: test_mm256_extract_epi64: 570 ; X86: # %bb.0: 571 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 572 ; X86-NEXT: vextractps $2, %xmm0, %eax 573 ; X86-NEXT: vextractps $3, %xmm0, %edx 574 ; X86-NEXT: vzeroupper 575 ; X86-NEXT: retl 576 ; 577 ; X64-LABEL: test_mm256_extract_epi64: 578 ; X64: # %bb.0: 579 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 580 ; X64-NEXT: vpextrq $1, %xmm0, %rax 581 ; X64-NEXT: vzeroupper 582 ; X64-NEXT: retq 583 %res = extractelement <4 x i64> %a0, i32 3 584 ret i64 %res 585 } 586 587 define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind { 588 ; CHECK-LABEL: test_mm256_extractf128_pd: 589 ; CHECK: # %bb.0: 590 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 591 ; CHECK-NEXT: vzeroupper 592 ; CHECK-NEXT: ret{{[l|q]}} 593 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3> 594 ret <2 x double> %res 595 } 596 597 define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind { 598 ; CHECK-LABEL: test_mm256_extractf128_ps: 599 ; CHECK: # %bb.0: 600 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 601 ; CHECK-NEXT: vzeroupper 602 ; CHECK-NEXT: ret{{[l|q]}} 603 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 604 ret <4 x float> %res 605 } 606 607 define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind { 608 ; CHECK-LABEL: test_mm256_extractf128_si256: 609 ; CHECK: # %bb.0: 610 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 611 ; CHECK-NEXT: vzeroupper 612 ; CHECK-NEXT: ret{{[l|q]}} 613 %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3> 614 ret <2 x i64> %res 615 } 616 617 define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind { 618 ; CHECK-LABEL: test_mm256_floor_pd: 619 ; CHECK: # %bb.0: 620 ; CHECK-NEXT: vroundpd $1, %ymm0, %ymm0 621 ; CHECK-NEXT: ret{{[l|q]}} 622 %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1) 623 ret <4 x double> %res 624 } 625 626 define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind { 627 ; CHECK-LABEL: test_mm256_floor_ps: 628 ; CHECK: # %bb.0: 629 ; CHECK-NEXT: vroundps $1, %ymm0, %ymm0 630 ; CHECK-NEXT: ret{{[l|q]}} 631 %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1) 632 ret <8 x float> %res 633 } 634 635 define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 636 ; CHECK-LABEL: test_mm256_hadd_pd: 637 ; CHECK: # %bb.0: 638 ; CHECK-NEXT: vhaddpd 
%ymm1, %ymm0, %ymm0 639 ; CHECK-NEXT: ret{{[l|q]}} 640 %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) 641 ret <4 x double> %res 642 } 643 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone 644 645 define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 646 ; CHECK-LABEL: test_mm256_hadd_ps: 647 ; CHECK: # %bb.0: 648 ; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0 649 ; CHECK-NEXT: ret{{[l|q]}} 650 %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) 651 ret <8 x float> %res 652 } 653 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone 654 655 define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 656 ; CHECK-LABEL: test_mm256_hsub_pd: 657 ; CHECK: # %bb.0: 658 ; CHECK-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 659 ; CHECK-NEXT: ret{{[l|q]}} 660 %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) 661 ret <4 x double> %res 662 } 663 declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone 664 665 define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 666 ; CHECK-LABEL: test_mm256_hsub_ps: 667 ; CHECK: # %bb.0: 668 ; CHECK-NEXT: vhsubps %ymm1, %ymm0, %ymm0 669 ; CHECK-NEXT: ret{{[l|q]}} 670 %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) 671 ret <8 x float> %res 672 } 673 declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone 674 675 define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind { 676 ; X86-LABEL: test_mm256_insert_epi8: 677 ; X86: # %bb.0: 678 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 679 ; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1 680 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 681 ; X86-NEXT: retl 682 ; 683 ; X64-LABEL: test_mm256_insert_epi8: 684 ; X64: # %bb.0: 685 ; X64-NEXT: movzbl %dil, %eax 686 ; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1 687 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 688 ; X64-NEXT: retq 689 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 690 %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4 691 %bc = bitcast <32 x i8> %res to <4 x i64> 692 ret <4 x i64> %bc 693 } 694 695 define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind { 696 ; X86-LABEL: test_mm256_insert_epi16: 697 ; X86: # %bb.0: 698 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 699 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 700 ; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 701 ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 702 ; X86-NEXT: retl 703 ; 704 ; X64-LABEL: test_mm256_insert_epi16: 705 ; X64: # %bb.0: 706 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 707 ; X64-NEXT: vpinsrw $6, %edi, %xmm1, %xmm1 708 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 709 ; X64-NEXT: retq 710 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 711 %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14 712 %bc = bitcast <16 x i16> %res to <4 x i64> 713 ret <4 x i64> %bc 714 } 715 716 define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind { 717 ; X86-LABEL: test_mm256_insert_epi32: 718 ; X86: # %bb.0: 719 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1 720 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 721 ; X86-NEXT: retl 722 ; 723 ; X64-LABEL: test_mm256_insert_epi32: 724 ; X64: # %bb.0: 725 ; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm1 726 ; X64-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 727 ; X64-NEXT: retq 728 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 729 %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3 730 %bc = bitcast <8 x i32> %res to <4 x i64> 731 ret <4 x i64> %bc 732 } 733 734 define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind { 735 ; X86-LABEL: test_mm256_insert_epi64: 736 ; X86: # %bb.0: 737 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 738 ; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 739 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 740 ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 741 ; X86-NEXT: retl 742 ; 743 ; X64-LABEL: test_mm256_insert_epi64: 744 ; X64: # %bb.0: 745 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 746 ; X64-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 747 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 748 ; X64-NEXT: retq 749 %res = insertelement <4 x i64> %a0, i64 %a1, i32 3 750 ret <4 x i64> %res 751 } 752 753 define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind { 754 ; CHECK-LABEL: test_mm256_insertf128_pd: 755 ; CHECK: # %bb.0: 756 ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 757 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 758 ; CHECK-NEXT: ret{{[l|q]}} 759 %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 760 %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 761 ret <4 x double> %res 762 } 763 764 define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind { 765 ; CHECK-LABEL: test_mm256_insertf128_ps: 766 ; CHECK: # %bb.0: 767 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 768 ; CHECK-NEXT: ret{{[l|q]}} 769 %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 770 %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> 771 ret <8 x float> %res 772 } 773 774 define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind { 775 ; CHECK-LABEL: test_mm256_insertf128_si256: 776 ; CHECK: # %bb.0: 777 ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 778 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 779 ; CHECK-NEXT: ret{{[l|q]}} 780 %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 781 %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 782 ret <4 x i64> %res 783 } 784 785 define <4 x i64> @test_mm256_lddqu_si256(<4 x i64>* %a0) nounwind { 786 ; X86-LABEL: test_mm256_lddqu_si256: 787 ; X86: # %bb.0: 788 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 789 ; X86-NEXT: vlddqu (%eax), %ymm0 790 ; X86-NEXT: retl 791 ; 792 ; X64-LABEL: test_mm256_lddqu_si256: 793 ; X64: # %bb.0: 794 ; X64-NEXT: vlddqu (%rdi), %ymm0 795 ; X64-NEXT: retq 796 %arg0 = bitcast <4 x i64>* %a0 to i8* 797 %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %arg0) 798 %bc = bitcast <32 x i8> %res to <4 x i64> 799 ret <4 x i64> %bc 800 } 801 declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readnone 802 803 define <4 x double> @test_mm256_load_pd(double* %a0) nounwind { 804 ; X86-LABEL: test_mm256_load_pd: 805 ; X86: # %bb.0: 806 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 807 ; X86-NEXT: vmovaps (%eax), %ymm0 808 ; X86-NEXT: retl 809 ; 810 ; X64-LABEL: test_mm256_load_pd: 811 ; X64: # %bb.0: 812 ; X64-NEXT: 
vmovaps (%rdi), %ymm0 813 ; X64-NEXT: retq 814 %arg0 = bitcast double* %a0 to <4 x double>* 815 %res = load <4 x double>, <4 x double>* %arg0, align 32 816 ret <4 x double> %res 817 } 818 819 define <8 x float> @test_mm256_load_ps(float* %a0) nounwind { 820 ; X86-LABEL: test_mm256_load_ps: 821 ; X86: # %bb.0: 822 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 823 ; X86-NEXT: vmovaps (%eax), %ymm0 824 ; X86-NEXT: retl 825 ; 826 ; X64-LABEL: test_mm256_load_ps: 827 ; X64: # %bb.0: 828 ; X64-NEXT: vmovaps (%rdi), %ymm0 829 ; X64-NEXT: retq 830 %arg0 = bitcast float* %a0 to <8 x float>* 831 %res = load <8 x float>, <8 x float>* %arg0, align 32 832 ret <8 x float> %res 833 } 834 835 define <4 x i64> @test_mm256_load_si256(<4 x i64>* %a0) nounwind { 836 ; X86-LABEL: test_mm256_load_si256: 837 ; X86: # %bb.0: 838 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 839 ; X86-NEXT: vmovaps (%eax), %ymm0 840 ; X86-NEXT: retl 841 ; 842 ; X64-LABEL: test_mm256_load_si256: 843 ; X64: # %bb.0: 844 ; X64-NEXT: vmovaps (%rdi), %ymm0 845 ; X64-NEXT: retq 846 %res = load <4 x i64>, <4 x i64>* %a0, align 32 847 ret <4 x i64> %res 848 } 849 850 define <4 x double> @test_mm256_loadu_pd(double* %a0) nounwind { 851 ; X86-LABEL: test_mm256_loadu_pd: 852 ; X86: # %bb.0: 853 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 854 ; X86-NEXT: vmovups (%eax), %ymm0 855 ; X86-NEXT: retl 856 ; 857 ; X64-LABEL: test_mm256_loadu_pd: 858 ; X64: # %bb.0: 859 ; X64-NEXT: vmovups (%rdi), %ymm0 860 ; X64-NEXT: retq 861 %arg0 = bitcast double* %a0 to <4 x double>* 862 %res = load <4 x double>, <4 x double>* %arg0, align 1 863 ret <4 x double> %res 864 } 865 866 define <8 x float> @test_mm256_loadu_ps(float* %a0) nounwind { 867 ; X86-LABEL: test_mm256_loadu_ps: 868 ; X86: # %bb.0: 869 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 870 ; X86-NEXT: vmovups (%eax), %ymm0 871 ; X86-NEXT: retl 872 ; 873 ; X64-LABEL: test_mm256_loadu_ps: 874 ; X64: # %bb.0: 875 ; X64-NEXT: vmovups (%rdi), %ymm0 876 ; X64-NEXT: retq 877 %arg0 = bitcast float* %a0 to <8 x float>* 878 %res = load <8 x float>, <8 x float>* %arg0, align 1 879 ret <8 x float> %res 880 } 881 882 define <4 x i64> @test_mm256_loadu_si256(<4 x i64>* %a0) nounwind { 883 ; X86-LABEL: test_mm256_loadu_si256: 884 ; X86: # %bb.0: 885 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 886 ; X86-NEXT: vmovups (%eax), %ymm0 887 ; X86-NEXT: retl 888 ; 889 ; X64-LABEL: test_mm256_loadu_si256: 890 ; X64: # %bb.0: 891 ; X64-NEXT: vmovups (%rdi), %ymm0 892 ; X64-NEXT: retq 893 %res = load <4 x i64>, <4 x i64>* %a0, align 1 894 ret <4 x i64> %res 895 } 896 897 define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind { 898 ; X86-LABEL: test_mm256_loadu2_m128: 899 ; X86: # %bb.0: 900 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 901 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 902 ; X86-NEXT: vmovups (%eax), %xmm0 903 ; X86-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0 904 ; X86-NEXT: retl 905 ; 906 ; X64-LABEL: test_mm256_loadu2_m128: 907 ; X64: # %bb.0: 908 ; X64-NEXT: vmovups (%rsi), %xmm0 909 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 910 ; X64-NEXT: retq 911 %arg0 = bitcast float* %a0 to <4 x float>* 912 %hi4 = load <4 x float>, <4 x float>* %arg0, align 1 913 %hi8 = shufflevector <4 x float> %hi4, <4 x float> %hi4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 914 %arg1 = bitcast float* %a1 to <4 x float>* 915 %lo4 = load <4 x float>, <4 x float>* %arg1, align 1 916 %lo8 = shufflevector <4 x float> %lo4, <4 x float> %lo4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 
undef, i32 undef> 917 %res = shufflevector <8 x float> %lo8, <8 x float> %hi8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> 918 ret <8 x float> %res 919 } 920 921 define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind { 922 ; X86-LABEL: test_mm256_loadu2_m128d: 923 ; X86: # %bb.0: 924 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 925 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 926 ; X86-NEXT: vmovups (%eax), %xmm0 927 ; X86-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0 928 ; X86-NEXT: retl 929 ; 930 ; X64-LABEL: test_mm256_loadu2_m128d: 931 ; X64: # %bb.0: 932 ; X64-NEXT: vmovups (%rsi), %xmm0 933 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 934 ; X64-NEXT: retq 935 %arg0 = bitcast double* %a0 to <2 x double>* 936 %hi2 = load <2 x double>, <2 x double>* %arg0, align 1 937 %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 938 %arg1 = bitcast double* %a1 to <2 x double>* 939 %lo2 = load <2 x double>, <2 x double>* %arg1, align 1 940 %lo4 = shufflevector <2 x double> %lo2, <2 x double> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 941 %res = shufflevector <4 x double> %lo4, <4 x double> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 942 ret <4 x double> %res 943 } 944 945 define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind { 946 ; X86-LABEL: test_mm256_loadu2_m128i: 947 ; X86: # %bb.0: 948 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 949 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 950 ; X86-NEXT: vmovups (%eax), %xmm0 951 ; X86-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0 952 ; X86-NEXT: retl 953 ; 954 ; X64-LABEL: test_mm256_loadu2_m128i: 955 ; X64: # %bb.0: 956 ; X64-NEXT: vmovups (%rsi), %xmm0 957 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 958 ; X64-NEXT: retq 959 %arg0 = bitcast i64* %a0 to <2 x i64>* 960 %hi2 = load <2 x i64>, <2 x i64>* %arg0, align 1 961 %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 962 %arg1 = bitcast i64* %a1 to <2 x i64>* 963 %lo2 = load <2 x i64>, <2 x i64>* %arg1, align 1 964 %lo4 = shufflevector <2 x i64> %lo2, <2 x i64> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 965 %res = shufflevector <4 x i64> %lo4, <4 x i64> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 966 ret <4 x i64> %res 967 } 968 969 define <2 x double> @test_mm_maskload_pd(double* %a0, <2 x i64> %a1) nounwind { 970 ; X86-LABEL: test_mm_maskload_pd: 971 ; X86: # %bb.0: 972 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 973 ; X86-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0 974 ; X86-NEXT: retl 975 ; 976 ; X64-LABEL: test_mm_maskload_pd: 977 ; X64: # %bb.0: 978 ; X64-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm0 979 ; X64-NEXT: retq 980 %arg0 = bitcast double* %a0 to i8* 981 %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %arg0, <2 x i64> %a1) 982 ret <2 x double> %res 983 } 984 declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readnone 985 986 define <4 x double> @test_mm256_maskload_pd(double* %a0, <4 x i64> %a1) nounwind { 987 ; X86-LABEL: test_mm256_maskload_pd: 988 ; X86: # %bb.0: 989 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 990 ; X86-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0 991 ; X86-NEXT: retl 992 ; 993 ; X64-LABEL: test_mm256_maskload_pd: 994 ; X64: # %bb.0: 995 ; X64-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 996 ; X64-NEXT: retq 997 %arg0 = bitcast double* %a0 to i8* 998 %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %arg0, <4 x i64> %a1) 999 ret <4 x double> %res 1000 } 1001 declare <4 x double> 
@llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readnone 1002 1003 define <4 x float> @test_mm_maskload_ps(float* %a0, <2 x i64> %a1) nounwind { 1004 ; X86-LABEL: test_mm_maskload_ps: 1005 ; X86: # %bb.0: 1006 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1007 ; X86-NEXT: vmaskmovps (%eax), %xmm0, %xmm0 1008 ; X86-NEXT: retl 1009 ; 1010 ; X64-LABEL: test_mm_maskload_ps: 1011 ; X64: # %bb.0: 1012 ; X64-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 1013 ; X64-NEXT: retq 1014 %arg0 = bitcast float* %a0 to i8* 1015 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1016 %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %arg0, <4 x i32> %arg1) 1017 ret <4 x float> %res 1018 } 1019 declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readnone 1020 1021 define <8 x float> @test_mm256_maskload_ps(float* %a0, <4 x i64> %a1) nounwind { 1022 ; X86-LABEL: test_mm256_maskload_ps: 1023 ; X86: # %bb.0: 1024 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1025 ; X86-NEXT: vmaskmovps (%eax), %ymm0, %ymm0 1026 ; X86-NEXT: retl 1027 ; 1028 ; X64-LABEL: test_mm256_maskload_ps: 1029 ; X64: # %bb.0: 1030 ; X64-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 1031 ; X64-NEXT: retq 1032 %arg0 = bitcast float* %a0 to i8* 1033 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 1034 %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %arg0, <8 x i32> %arg1) 1035 ret <8 x float> %res 1036 } 1037 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readnone 1038 1039 define void @test_mm_maskstore_pd(double* %a0, <2 x i64> %a1, <2 x double> %a2) nounwind { 1040 ; X86-LABEL: test_mm_maskstore_pd: 1041 ; X86: # %bb.0: 1042 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1043 ; X86-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax) 1044 ; X86-NEXT: retl 1045 ; 1046 ; X64-LABEL: test_mm_maskstore_pd: 1047 ; X64: # %bb.0: 1048 ; X64-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) 1049 ; X64-NEXT: retq 1050 %arg0 = bitcast double* %a0 to i8* 1051 call void @llvm.x86.avx.maskstore.pd(i8* %arg0, <2 x i64> %a1, <2 x double> %a2) 1052 ret void 1053 } 1054 declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind readnone 1055 1056 define void @test_mm256_maskstore_pd(double* %a0, <4 x i64> %a1, <4 x double> %a2) nounwind { 1057 ; X86-LABEL: test_mm256_maskstore_pd: 1058 ; X86: # %bb.0: 1059 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1060 ; X86-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) 1061 ; X86-NEXT: vzeroupper 1062 ; X86-NEXT: retl 1063 ; 1064 ; X64-LABEL: test_mm256_maskstore_pd: 1065 ; X64: # %bb.0: 1066 ; X64-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) 1067 ; X64-NEXT: vzeroupper 1068 ; X64-NEXT: retq 1069 %arg0 = bitcast double* %a0 to i8* 1070 call void @llvm.x86.avx.maskstore.pd.256(i8* %arg0, <4 x i64> %a1, <4 x double> %a2) 1071 ret void 1072 } 1073 declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind readnone 1074 1075 define void @test_mm_maskstore_ps(float* %a0, <2 x i64> %a1, <4 x float> %a2) nounwind { 1076 ; X86-LABEL: test_mm_maskstore_ps: 1077 ; X86: # %bb.0: 1078 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1079 ; X86-NEXT: vmaskmovps %xmm1, %xmm0, (%eax) 1080 ; X86-NEXT: retl 1081 ; 1082 ; X64-LABEL: test_mm_maskstore_ps: 1083 ; X64: # %bb.0: 1084 ; X64-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) 1085 ; X64-NEXT: retq 1086 %arg0 = bitcast float* %a0 to i8* 1087 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1088 call void @llvm.x86.avx.maskstore.ps(i8* %arg0, <4 x i32> %arg1, <4 x float> %a2) 1089 ret void 1090 } 1091 declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind readnone 1092 
1093 define void @test_mm256_maskstore_ps(float* %a0, <4 x i64> %a1, <8 x float> %a2) nounwind { 1094 ; X86-LABEL: test_mm256_maskstore_ps: 1095 ; X86: # %bb.0: 1096 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1097 ; X86-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) 1098 ; X86-NEXT: vzeroupper 1099 ; X86-NEXT: retl 1100 ; 1101 ; X64-LABEL: test_mm256_maskstore_ps: 1102 ; X64: # %bb.0: 1103 ; X64-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) 1104 ; X64-NEXT: vzeroupper 1105 ; X64-NEXT: retq 1106 %arg0 = bitcast float* %a0 to i8* 1107 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 1108 call void @llvm.x86.avx.maskstore.ps.256(i8* %arg0, <8 x i32> %arg1, <8 x float> %a2) 1109 ret void 1110 } 1111 declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind readnone 1112 1113 define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 1114 ; CHECK-LABEL: test_mm256_max_pd: 1115 ; CHECK: # %bb.0: 1116 ; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 1117 ; CHECK-NEXT: ret{{[l|q]}} 1118 %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) 1119 ret <4 x double> %res 1120 } 1121 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone 1122 1123 define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 1124 ; CHECK-LABEL: test_mm256_max_ps: 1125 ; CHECK: # %bb.0: 1126 ; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 1127 ; CHECK-NEXT: ret{{[l|q]}} 1128 %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) 1129 ret <8 x float> %res 1130 } 1131 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone 1132 1133 define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 1134 ; CHECK-LABEL: test_mm256_min_pd: 1135 ; CHECK: # %bb.0: 1136 ; CHECK-NEXT: vminpd %ymm1, %ymm0, %ymm0 1137 ; CHECK-NEXT: ret{{[l|q]}} 1138 %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) 1139 ret <4 x double> %res 1140 } 1141 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone 1142 1143 define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 1144 ; CHECK-LABEL: test_mm256_min_ps: 1145 ; CHECK: # %bb.0: 1146 ; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 1147 ; CHECK-NEXT: ret{{[l|q]}} 1148 %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) 1149 ret <8 x float> %res 1150 } 1151 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone 1152 1153 define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind { 1154 ; CHECK-LABEL: test_mm256_movedup_pd: 1155 ; CHECK: # %bb.0: 1156 ; CHECK-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 1157 ; CHECK-NEXT: ret{{[l|q]}} 1158 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 1159 ret <4 x double> %res 1160 } 1161 1162 define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind { 1163 ; CHECK-LABEL: test_mm256_movehdup_ps: 1164 ; CHECK: # %bb.0: 1165 ; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] 1166 ; CHECK-NEXT: ret{{[l|q]}} 1167 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> 1168 ret <8 x float> %res 1169 } 1170 1171 define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind { 1172 ; CHECK-LABEL: test_mm256_moveldup_ps: 1173 ; CHECK: # %bb.0: 1174 ; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = 
ymm0[0,0,2,2,4,4,6,6] 1175 ; CHECK-NEXT: ret{{[l|q]}} 1176 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1177 ret <8 x float> %res 1178 } 1179 1180 define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind { 1181 ; CHECK-LABEL: test_mm256_movemask_pd: 1182 ; CHECK: # %bb.0: 1183 ; CHECK-NEXT: vmovmskpd %ymm0, %eax 1184 ; CHECK-NEXT: vzeroupper 1185 ; CHECK-NEXT: ret{{[l|q]}} 1186 %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) 1187 ret i32 %res 1188 } 1189 declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone 1190 1191 define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind { 1192 ; CHECK-LABEL: test_mm256_movemask_ps: 1193 ; CHECK: # %bb.0: 1194 ; CHECK-NEXT: vmovmskps %ymm0, %eax 1195 ; CHECK-NEXT: vzeroupper 1196 ; CHECK-NEXT: ret{{[l|q]}} 1197 %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) 1198 ret i32 %res 1199 } 1200 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone 1201 1202 define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 1203 ; CHECK-LABEL: test_mm256_mul_pd: 1204 ; CHECK: # %bb.0: 1205 ; CHECK-NEXT: vmulpd %ymm1, %ymm0, %ymm0 1206 ; CHECK-NEXT: ret{{[l|q]}} 1207 %res = fmul <4 x double> %a0, %a1 1208 ret <4 x double> %res 1209 } 1210 1211 define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 1212 ; CHECK-LABEL: test_mm256_mul_ps: 1213 ; CHECK: # %bb.0: 1214 ; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0 1215 ; CHECK-NEXT: ret{{[l|q]}} 1216 %res = fmul <8 x float> %a0, %a1 1217 ret <8 x float> %res 1218 } 1219 1220 define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 1221 ; CHECK-LABEL: test_mm256_or_pd: 1222 ; CHECK: # %bb.0: 1223 ; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 1224 ; CHECK-NEXT: ret{{[l|q]}} 1225 %1 = bitcast <4 x double> %a0 to <4 x i64> 1226 %2 = bitcast <4 x double> %a1 to <4 x i64> 1227 %res = or <4 x i64> %1, %2 1228 %bc = bitcast <4 x i64> %res to <4 x double> 1229 ret <4 x double> %bc 1230 } 1231 1232 define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 1233 ; CHECK-LABEL: test_mm256_or_ps: 1234 ; CHECK: # %bb.0: 1235 ; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 1236 ; CHECK-NEXT: ret{{[l|q]}} 1237 %1 = bitcast <8 x float> %a0 to <8 x i32> 1238 %2 = bitcast <8 x float> %a1 to <8 x i32> 1239 %res = or <8 x i32> %1, %2 1240 %bc = bitcast <8 x i32> %res to <8 x float> 1241 ret <8 x float> %bc 1242 } 1243 1244 define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind { 1245 ; CHECK-LABEL: test_mm_permute_pd: 1246 ; CHECK: # %bb.0: 1247 ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1248 ; CHECK-NEXT: ret{{[l|q]}} 1249 %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0> 1250 ret <2 x double> %res 1251 } 1252 1253 define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind { 1254 ; CHECK-LABEL: test_mm256_permute_pd: 1255 ; CHECK: # %bb.0: 1256 ; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 1257 ; CHECK-NEXT: ret{{[l|q]}} 1258 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 1259 ret <4 x double> %res 1260 } 1261 1262 define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind { 1263 ; CHECK-LABEL: test_mm_permute_ps: 1264 ; CHECK: # %bb.0: 1265 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] 1266 ; CHECK-NEXT: ret{{[l|q]}} 1267 %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x 
i32> <i32 3, i32 2, i32 1, i32 0> 1268 ret <4 x float> %res 1269 } 1270 1271 define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind { 1272 ; CHECK-LABEL: test2_mm_permute_ps: 1273 ; CHECK: # %bb.0: 1274 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3] 1275 ; CHECK-NEXT: ret{{[l|q]}} 1276 %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3> 1277 ret <4 x float> %res 1278 } 1279 1280 define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind { 1281 ; CHECK-LABEL: test_mm256_permute_ps: 1282 ; CHECK: # %bb.0: 1283 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 1284 ; CHECK-NEXT: ret{{[l|q]}} 1285 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> 1286 ret <8 x float> %res 1287 } 1288 1289 define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 1290 ; CHECK-LABEL: test_mm256_permute2f128_pd: 1291 ; CHECK: # %bb.0: 1292 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1] 1293 ; CHECK-NEXT: ret{{[l|q]}} 1294 %res = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1295 ret <4 x double> %res 1296 } 1297 declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone 1298 1299 ; PR26667 1300 define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 1301 ; CHECK-LABEL: test_mm256_permute2f128_ps: 1302 ; CHECK: # %bb.0: 1303 ; CHECK-NEXT: vmovaps %ymm1, %ymm0 1304 ; CHECK-NEXT: ret{{[l|q]}} 1305 %res = shufflevector <8 x float> %a1, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> 1306 ret <8 x float> %res 1307 } 1308 declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone 1309 1310 define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { 1311 ; CHECK-LABEL: test_mm256_permute2f128_si256: 1312 ; CHECK: # %bb.0: 1313 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] 1314 ; CHECK-NEXT: ret{{[l|q]}} 1315 %1 = bitcast <4 x i64> %a0 to <8 x i32> 1316 %2 = bitcast <4 x i64> %a1 to <8 x i32> 1317 %res = shufflevector <8 x i32> %2, <8 x i32> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> 1318 %bc = bitcast <8 x i32> %res to <4 x i64> 1319 ret <4 x i64> %bc 1320 } 1321 declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone 1322 1323 define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind { 1324 ; CHECK-LABEL: test_mm_permutevar_pd: 1325 ; CHECK: # %bb.0: 1326 ; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 1327 ; CHECK-NEXT: ret{{[l|q]}} 1328 %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) 1329 ret <2 x double> %res 1330 } 1331 declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone 1332 1333 define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind { 1334 ; CHECK-LABEL: test_mm256_permutevar_pd: 1335 ; CHECK: # %bb.0: 1336 ; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 1337 ; CHECK-NEXT: ret{{[l|q]}} 1338 %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) 1339 ret <4 x double> %res 1340 } 1341 declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone 1342 1343 define <4 x float> @test_mm_permutevar_ps(<4 x 
float> %a0, <2 x i64> %a1) nounwind { 1344 ; CHECK-LABEL: test_mm_permutevar_ps: 1345 ; CHECK: # %bb.0: 1346 ; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0 1347 ; CHECK-NEXT: ret{{[l|q]}} 1348 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1349 %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1) 1350 ret <4 x float> %res 1351 } 1352 declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone 1353 1354 define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind { 1355 ; CHECK-LABEL: test_mm256_permutevar_ps: 1356 ; CHECK: # %bb.0: 1357 ; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0 1358 ; CHECK-NEXT: ret{{[l|q]}} 1359 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 1360 %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1) 1361 ret <8 x float> %res 1362 } 1363 declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone 1364 1365 define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind { 1366 ; CHECK-LABEL: test_mm256_rcp_ps: 1367 ; CHECK: # %bb.0: 1368 ; CHECK-NEXT: vrcpps %ymm0, %ymm0 1369 ; CHECK-NEXT: ret{{[l|q]}} 1370 %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) 1371 ret <8 x float> %res 1372 } 1373 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone 1374 1375 define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind { 1376 ; CHECK-LABEL: test_mm256_round_pd: 1377 ; CHECK: # %bb.0: 1378 ; CHECK-NEXT: vroundpd $4, %ymm0, %ymm0 1379 ; CHECK-NEXT: ret{{[l|q]}} 1380 %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4) 1381 ret <4 x double> %res 1382 } 1383 1384 define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind { 1385 ; CHECK-LABEL: test_mm256_round_ps: 1386 ; CHECK: # %bb.0: 1387 ; CHECK-NEXT: vroundps $4, %ymm0, %ymm0 1388 ; CHECK-NEXT: ret{{[l|q]}} 1389 %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4) 1390 ret <8 x float> %res 1391 } 1392 1393 define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind { 1394 ; CHECK-LABEL: test_mm256_rsqrt_ps: 1395 ; CHECK: # %bb.0: 1396 ; CHECK-NEXT: vrsqrtps %ymm0, %ymm0 1397 ; CHECK-NEXT: ret{{[l|q]}} 1398 %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) 1399 ret <8 x float> %res 1400 } 1401 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone 1402 1403 define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind { 1404 ; X86-LABEL: test_mm256_set_epi8: 1405 ; X86: # %bb.0: 1406 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1407 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx 1408 ; X86-NEXT: vmovd %ecx, %xmm0 1409 ; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 1410 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1411 ; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 1412 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1413 ; X86-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 1414 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1415 ; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 1416 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1417 ; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 1418 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1419 ; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 1420 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1421 ; X86-NEXT: 
vpinsrb $7, %eax, %xmm0, %xmm0 1422 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1423 ; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 1424 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1425 ; X86-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 1426 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1427 ; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 1428 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1429 ; X86-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 1430 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1431 ; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 1432 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1433 ; X86-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 1434 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1435 ; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 1436 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1437 ; X86-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 1438 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1439 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx 1440 ; X86-NEXT: vmovd %ecx, %xmm1 1441 ; X86-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 1442 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1443 ; X86-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 1444 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1445 ; X86-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 1446 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1447 ; X86-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 1448 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1449 ; X86-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 1450 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1451 ; X86-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 1452 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1453 ; X86-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 1454 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1455 ; X86-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 1456 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1457 ; X86-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 1458 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1459 ; X86-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 1460 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1461 ; X86-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 1462 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1463 ; X86-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 1464 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1465 ; X86-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 1466 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1467 ; X86-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 1468 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1469 ; X86-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 1470 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1471 ; X86-NEXT: retl 1472 ; 1473 ; X64-LABEL: test_mm256_set_epi8: 1474 ; X64: # %bb.0: 1475 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d 1476 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1477 ; X64-NEXT: vmovd %eax, %xmm0 1478 ; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 1479 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1480 ; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 1481 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1482 ; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 1483 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1484 ; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 1485 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1486 ; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 1487 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1488 ; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 1489 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1490 ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 1491 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1492 ; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 1493 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1494 ; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 1495 ; X64-NEXT: movzbl %r9b, %eax 1496 ; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 1497 ; X64-NEXT: movzbl %r8b, %eax 1498 ; X64-NEXT: vpinsrb $11, %eax, %xmm0, 
%xmm0 1499 ; X64-NEXT: movzbl %cl, %eax 1500 ; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 1501 ; X64-NEXT: movzbl %dl, %eax 1502 ; X64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 1503 ; X64-NEXT: movzbl %sil, %eax 1504 ; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 1505 ; X64-NEXT: movzbl %dil, %eax 1506 ; X64-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 1507 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1508 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx 1509 ; X64-NEXT: vmovd %ecx, %xmm1 1510 ; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 1511 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1512 ; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 1513 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1514 ; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 1515 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1516 ; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 1517 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1518 ; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 1519 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1520 ; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 1521 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1522 ; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 1523 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1524 ; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 1525 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1526 ; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 1527 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1528 ; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 1529 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1530 ; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 1531 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1532 ; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 1533 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1534 ; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 1535 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1536 ; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 1537 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1538 ; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 1539 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1540 ; X64-NEXT: retq 1541 %res0 = insertelement <32 x i8> undef, i8 %a31, i32 0 1542 %res1 = insertelement <32 x i8> %res0, i8 %a30, i32 1 1543 %res2 = insertelement <32 x i8> %res1, i8 %a29, i32 2 1544 %res3 = insertelement <32 x i8> %res2, i8 %a28, i32 3 1545 %res4 = insertelement <32 x i8> %res3, i8 %a27, i32 4 1546 %res5 = insertelement <32 x i8> %res4, i8 %a26, i32 5 1547 %res6 = insertelement <32 x i8> %res5, i8 %a25, i32 6 1548 %res7 = insertelement <32 x i8> %res6, i8 %a24, i32 7 1549 %res8 = insertelement <32 x i8> %res7, i8 %a23, i32 8 1550 %res9 = insertelement <32 x i8> %res8, i8 %a22, i32 9 1551 %res10 = insertelement <32 x i8> %res9, i8 %a21, i32 10 1552 %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11 1553 %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12 1554 %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13 1555 %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14 1556 %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15 1557 %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16 1558 %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17 1559 %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18 1560 %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19 1561 %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20 1562 %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21 1563 %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22 1564 %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23 1565 %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24 1566 %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25 1567 %res26 = insertelement <32 x i8> %res25, 
i8 %a5 , i32 26 1568 %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27 1569 %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28 1570 %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29 1571 %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30 1572 %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31 1573 %res = bitcast <32 x i8> %res31 to <4 x i64> 1574 ret <4 x i64> %res 1575 } 1576 1577 define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind { 1578 ; X86-LABEL: test_mm256_set_epi16: 1579 ; X86: # %bb.0: 1580 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1581 ; X86-NEXT: vmovd %eax, %xmm0 1582 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1583 ; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 1584 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1585 ; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 1586 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1587 ; X86-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 1588 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1589 ; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 1590 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1591 ; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 1592 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1593 ; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 1594 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1595 ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 1596 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1597 ; X86-NEXT: vmovd %eax, %xmm1 1598 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1599 ; X86-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 1600 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1601 ; X86-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 1602 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1603 ; X86-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 1604 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1605 ; X86-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 1606 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1607 ; X86-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 1608 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1609 ; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 1610 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1611 ; X86-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 1612 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1613 ; X86-NEXT: retl 1614 ; 1615 ; X64-LABEL: test_mm256_set_epi16: 1616 ; X64: # %bb.0: 1617 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1618 ; X64-NEXT: vmovd %eax, %xmm0 1619 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1620 ; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 1621 ; X64-NEXT: vpinsrw $2, %r9d, %xmm0, %xmm0 1622 ; X64-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 1623 ; X64-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 1624 ; X64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 1625 ; X64-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0 1626 ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 1627 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1628 ; X64-NEXT: vmovd %eax, %xmm1 1629 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1630 ; X64-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 1631 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1632 ; X64-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 1633 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1634 ; X64-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 1635 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1636 ; X64-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 1637 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1638 ; X64-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 1639 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1640 ; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 1641 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1642 ; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 1643 ; X64-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm0 1644 ; X64-NEXT: retq 1645 %res0 = insertelement <16 x i16> undef, i16 %a15, i32 0 1646 %res1 = insertelement <16 x i16> %res0, i16 %a14, i32 1 1647 %res2 = insertelement <16 x i16> %res1, i16 %a13, i32 2 1648 %res3 = insertelement <16 x i16> %res2, i16 %a12, i32 3 1649 %res4 = insertelement <16 x i16> %res3, i16 %a11, i32 4 1650 %res5 = insertelement <16 x i16> %res4, i16 %a10, i32 5 1651 %res6 = insertelement <16 x i16> %res5, i16 %a9 , i32 6 1652 %res7 = insertelement <16 x i16> %res6, i16 %a8 , i32 7 1653 %res8 = insertelement <16 x i16> %res7, i16 %a7 , i32 8 1654 %res9 = insertelement <16 x i16> %res8, i16 %a6 , i32 9 1655 %res10 = insertelement <16 x i16> %res9, i16 %a5 , i32 10 1656 %res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11 1657 %res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12 1658 %res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13 1659 %res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14 1660 %res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15 1661 %res = bitcast <16 x i16> %res15 to <4 x i64> 1662 ret <4 x i64> %res 1663 } 1664 1665 define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind { 1666 ; X86-LABEL: test_mm256_set_epi32: 1667 ; X86: # %bb.0: 1668 ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1669 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 1670 ; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 1671 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 1672 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1673 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 1674 ; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 1675 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 1676 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1677 ; X86-NEXT: retl 1678 ; 1679 ; X64-LABEL: test_mm256_set_epi32: 1680 ; X64: # %bb.0: 1681 ; X64-NEXT: vmovd %ecx, %xmm0 1682 ; X64-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 1683 ; X64-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 1684 ; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 1685 ; X64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1686 ; X64-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 1687 ; X64-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1 1688 ; X64-NEXT: vpinsrd $3, %r8d, %xmm1, %xmm1 1689 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1690 ; X64-NEXT: retq 1691 %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0 1692 %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1 1693 %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2 1694 %res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3 1695 %res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4 1696 %res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5 1697 %res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6 1698 %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7 1699 %res = bitcast <8 x i32> %res7 to <4 x i64> 1700 ret <4 x i64> %res 1701 } 1702 1703 define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind { 1704 ; X86-LABEL: test_mm256_set_epi64x: 1705 ; X86: # %bb.0: 1706 ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1707 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 1708 ; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 1709 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 1710 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1711 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 1712 ; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 
1713 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 1714 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1715 ; X86-NEXT: retl 1716 ; 1717 ; X64-LABEL: test_mm256_set_epi64x: 1718 ; X64: # %bb.0: 1719 ; X64-NEXT: vmovq %rdi, %xmm0 1720 ; X64-NEXT: vmovq %rsi, %xmm1 1721 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1722 ; X64-NEXT: vmovq %rdx, %xmm1 1723 ; X64-NEXT: vmovq %rcx, %xmm2 1724 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1725 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1726 ; X64-NEXT: retq 1727 %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0 1728 %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1 1729 %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2 1730 %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3 1731 ret <4 x i64> %res3 1732 } 1733 1734 define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind { 1735 ; CHECK-LABEL: test_mm256_set_m128: 1736 ; CHECK: # %bb.0: 1737 ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1738 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1739 ; CHECK-NEXT: ret{{[l|q]}} 1740 %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1741 ret <8 x float> %res 1742 } 1743 1744 define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind { 1745 ; CHECK-LABEL: test_mm256_set_m128d: 1746 ; CHECK: # %bb.0: 1747 ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1748 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1749 ; CHECK-NEXT: ret{{[l|q]}} 1750 %arg0 = bitcast <2 x double> %a0 to <4 x float> 1751 %arg1 = bitcast <2 x double> %a1 to <4 x float> 1752 %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1753 %bc = bitcast <8 x float> %res to <4 x double> 1754 ret <4 x double> %bc 1755 } 1756 1757 define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind { 1758 ; CHECK-LABEL: test_mm256_set_m128i: 1759 ; CHECK: # %bb.0: 1760 ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1761 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1762 ; CHECK-NEXT: ret{{[l|q]}} 1763 %arg0 = bitcast <2 x i64> %a0 to <4 x float> 1764 %arg1 = bitcast <2 x i64> %a1 to <4 x float> 1765 %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1766 %bc = bitcast <8 x float> %res to <4 x i64> 1767 ret <4 x i64> %bc 1768 } 1769 1770 define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind { 1771 ; X86-LABEL: test_mm256_set_pd: 1772 ; X86: # %bb.0: 1773 ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1774 ; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 1775 ; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1776 ; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 1777 ; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero 1778 ; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1779 ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1780 ; X86-NEXT: retl 1781 ; 1782 ; X64-LABEL: test_mm256_set_pd: 1783 ; X64: # %bb.0: 1784 ; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1785 ; X64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] 1786 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1787 ; X64-NEXT: retq 1788 %res0 = insertelement <4 x double> undef, double %a3, i32 0 1789 %res1 = insertelement <4 x double> %res0, double %a2, i32 1 1790 %res2 = insertelement <4 x double> %res1, double %a1, i32 2 1791 %res3 = 
insertelement <4 x double> %res2, double %a0, i32 3 1792 ret <4 x double> %res3 1793 } 1794 1795 define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind { 1796 ; X86-LABEL: test_mm256_set_ps: 1797 ; X86: # %bb.0: 1798 ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1799 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1800 ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] 1801 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1802 ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 1803 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1804 ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 1805 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1806 ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1807 ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] 1808 ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1809 ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 1810 ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1811 ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] 1812 ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1813 ; X86-NEXT: retl 1814 ; 1815 ; X64-LABEL: test_mm256_set_ps: 1816 ; X64: # %bb.0: 1817 ; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 1818 ; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 1819 ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1820 ; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3] 1821 ; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] 1822 ; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] 1823 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1824 ; X64-NEXT: retq 1825 %res0 = insertelement <8 x float> undef, float %a7, i32 0 1826 %res1 = insertelement <8 x float> %res0, float %a6, i32 1 1827 %res2 = insertelement <8 x float> %res1, float %a5, i32 2 1828 %res3 = insertelement <8 x float> %res2, float %a4, i32 3 1829 %res4 = insertelement <8 x float> %res3, float %a3, i32 4 1830 %res5 = insertelement <8 x float> %res4, float %a2, i32 5 1831 %res6 = insertelement <8 x float> %res5, float %a1, i32 6 1832 %res7 = insertelement <8 x float> %res6, float %a0, i32 7 1833 ret <8 x float> %res7 1834 } 1835 1836 define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind { 1837 ; X86-LABEL: test_mm256_set1_epi8: 1838 ; X86: # %bb.0: 1839 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1840 ; X86-NEXT: vmovd %eax, %xmm0 1841 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 1842 ; X86-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1843 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1844 ; X86-NEXT: retl 1845 ; 1846 ; X64-LABEL: test_mm256_set1_epi8: 1847 ; X64: # %bb.0: 1848 ; X64-NEXT: movzbl %dil, %eax 1849 ; X64-NEXT: vmovd %eax, %xmm0 1850 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 1851 ; X64-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1852 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1853 ; X64-NEXT: retq 1854 %res0 = insertelement <32 x i8> undef, i8 %a0, i32 0 1855 %res1 = insertelement <32 x i8> %res0, i8 %a0, i32 1 1856 %res2 = insertelement <32 x i8> %res1, i8 %a0, i32 2 1857 %res3 = insertelement <32 x i8> %res2, i8 %a0, i32 3 1858 %res4 = insertelement <32 x i8> %res3, i8 %a0, i32 4 1859 %res5 = insertelement <32 x i8> %res4, i8 %a0, i32 5 1860 %res6 = insertelement <32 x i8> %res5, i8 %a0, i32 6 1861 %res7 = insertelement <32 x i8> %res6, i8 %a0, i32 7 1862 %res8 = insertelement <32 x i8> %res7, i8 %a0, i32 8 1863 %res9 = 
insertelement <32 x i8> %res8, i8 %a0, i32 9 1864 %res10 = insertelement <32 x i8> %res9, i8 %a0, i32 10 1865 %res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11 1866 %res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12 1867 %res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13 1868 %res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14 1869 %res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15 1870 %res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16 1871 %res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17 1872 %res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18 1873 %res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19 1874 %res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20 1875 %res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21 1876 %res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22 1877 %res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23 1878 %res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24 1879 %res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25 1880 %res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26 1881 %res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27 1882 %res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28 1883 %res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29 1884 %res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30 1885 %res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31 1886 %res = bitcast <32 x i8> %res31 to <4 x i64> 1887 ret <4 x i64> %res 1888 } 1889 1890 define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind { 1891 ; X86-LABEL: test_mm256_set1_epi16: 1892 ; X86: # %bb.0: 1893 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1894 ; X86-NEXT: vmovd %eax, %xmm0 1895 ; X86-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] 1896 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1897 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1898 ; X86-NEXT: retl 1899 ; 1900 ; X64-LABEL: test_mm256_set1_epi16: 1901 ; X64: # %bb.0: 1902 ; X64-NEXT: vmovd %edi, %xmm0 1903 ; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] 1904 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1905 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1906 ; X64-NEXT: retq 1907 %res0 = insertelement <16 x i16> undef, i16 %a0, i32 0 1908 %res1 = insertelement <16 x i16> %res0, i16 %a0, i32 1 1909 %res2 = insertelement <16 x i16> %res1, i16 %a0, i32 2 1910 %res3 = insertelement <16 x i16> %res2, i16 %a0, i32 3 1911 %res4 = insertelement <16 x i16> %res3, i16 %a0, i32 4 1912 %res5 = insertelement <16 x i16> %res4, i16 %a0, i32 5 1913 %res6 = insertelement <16 x i16> %res5, i16 %a0, i32 6 1914 %res7 = insertelement <16 x i16> %res6, i16 %a0, i32 7 1915 %res8 = insertelement <16 x i16> %res7, i16 %a0, i32 8 1916 %res9 = insertelement <16 x i16> %res8, i16 %a0, i32 9 1917 %res10 = insertelement <16 x i16> %res9, i16 %a0, i32 10 1918 %res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11 1919 %res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12 1920 %res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13 1921 %res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14 1922 %res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15 1923 %res = bitcast <16 x i16> %res15 to <4 x i64> 1924 ret <4 x i64> %res 1925 } 1926 1927 define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind { 1928 ; X86-LABEL: test_mm256_set1_epi32: 1929 ; X86: # %bb.0: 1930 ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1931 ; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] 1932 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1933 ; X86-NEXT: 
retl 1934 ; 1935 ; X64-LABEL: test_mm256_set1_epi32: 1936 ; X64: # %bb.0: 1937 ; X64-NEXT: vmovd %edi, %xmm0 1938 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1939 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1940 ; X64-NEXT: retq 1941 %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0 1942 %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1 1943 %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2 1944 %res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3 1945 %res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4 1946 %res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5 1947 %res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6 1948 %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7 1949 %res = bitcast <8 x i32> %res7 to <4 x i64> 1950 ret <4 x i64> %res 1951 } 1952 1953 define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind { 1954 ; X86-LABEL: test_mm256_set1_epi64x: 1955 ; X86: # %bb.0: 1956 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1957 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 1958 ; X86-NEXT: vmovd %ecx, %xmm0 1959 ; X86-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 1960 ; X86-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 1961 ; X86-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 1962 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1963 ; X86-NEXT: retl 1964 ; 1965 ; X64-LABEL: test_mm256_set1_epi64x: 1966 ; X64: # %bb.0: 1967 ; X64-NEXT: vmovq %rdi, %xmm0 1968 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1969 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1970 ; X64-NEXT: retq 1971 %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0 1972 %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1 1973 %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2 1974 %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3 1975 ret <4 x i64> %res3 1976 } 1977 1978 define <4 x double> @test_mm256_set1_pd(double %a0) nounwind { 1979 ; X86-LABEL: test_mm256_set1_pd: 1980 ; X86: # %bb.0: 1981 ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1982 ; X86-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1983 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1984 ; X86-NEXT: retl 1985 ; 1986 ; X64-LABEL: test_mm256_set1_pd: 1987 ; X64: # %bb.0: 1988 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1989 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1990 ; X64-NEXT: retq 1991 %res0 = insertelement <4 x double> undef, double %a0, i32 0 1992 %res1 = insertelement <4 x double> %res0, double %a0, i32 1 1993 %res2 = insertelement <4 x double> %res1, double %a0, i32 2 1994 %res3 = insertelement <4 x double> %res2, double %a0, i32 3 1995 ret <4 x double> %res3 1996 } 1997 1998 define <8 x float> @test_mm256_set1_ps(float %a0) nounwind { 1999 ; X86-LABEL: test_mm256_set1_ps: 2000 ; X86: # %bb.0: 2001 ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2002 ; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] 2003 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2004 ; X86-NEXT: retl 2005 ; 2006 ; X64-LABEL: test_mm256_set1_ps: 2007 ; X64: # %bb.0: 2008 ; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] 2009 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2010 ; X64-NEXT: retq 2011 %res0 = insertelement <8 x float> undef, float %a0, i32 0 2012 %res1 = insertelement <8 x float> %res0, float %a0, i32 1 2013 %res2 = insertelement <8 x float> %res1, float %a0, i32 2 2014 %res3 = insertelement <8 x float> %res2, float %a0, i32 3 2015 %res4 = insertelement <8 x float> %res3, float %a0, i32 4 2016 %res5 = insertelement <8 x float> %res4, float %a0, i32 5 2017 %res6 = insertelement <8 x float> %res5, float %a0, i32 6 2018 %res7 = 
insertelement <8 x float> %res6, float %a0, i32 7 2019 ret <8 x float> %res7 2020 } 2021 2022 define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind { 2023 ; X86-LABEL: test_mm256_setr_epi8: 2024 ; X86: # %bb.0: 2025 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2026 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx 2027 ; X86-NEXT: vmovd %ecx, %xmm0 2028 ; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 2029 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2030 ; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 2031 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2032 ; X86-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 2033 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2034 ; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 2035 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2036 ; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 2037 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2038 ; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 2039 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2040 ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 2041 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2042 ; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 2043 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2044 ; X86-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 2045 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2046 ; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 2047 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2048 ; X86-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 2049 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2050 ; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 2051 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2052 ; X86-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 2053 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2054 ; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 2055 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2056 ; X86-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 2057 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2058 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx 2059 ; X86-NEXT: vmovd %ecx, %xmm1 2060 ; X86-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 2061 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2062 ; X86-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 2063 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2064 ; X86-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 2065 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2066 ; X86-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 2067 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2068 ; X86-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 2069 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2070 ; X86-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 2071 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2072 ; X86-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 2073 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2074 ; X86-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 2075 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2076 ; X86-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 2077 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2078 ; X86-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 2079 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2080 ; X86-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 2081 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2082 ; X86-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 2083 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2084 ; X86-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 2085 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2086 ; X86-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 2087 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2088 ; X86-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 2089 ; X86-NEXT: vinsertf128 $1, %xmm0, 
%ymm1, %ymm0 2090 ; X86-NEXT: retl 2091 ; 2092 ; X64-LABEL: test_mm256_setr_epi8: 2093 ; X64: # %bb.0: 2094 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d 2095 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2096 ; X64-NEXT: vmovd %eax, %xmm0 2097 ; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 2098 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2099 ; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 2100 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2101 ; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 2102 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2103 ; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 2104 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2105 ; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 2106 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2107 ; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 2108 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2109 ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 2110 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2111 ; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 2112 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2113 ; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 2114 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2115 ; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 2116 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2117 ; X64-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 2118 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2119 ; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 2120 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2121 ; X64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 2122 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2123 ; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 2124 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2125 ; X64-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 2126 ; X64-NEXT: movzbl %sil, %eax 2127 ; X64-NEXT: movzbl %dil, %esi 2128 ; X64-NEXT: vmovd %esi, %xmm1 2129 ; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 2130 ; X64-NEXT: movzbl %dl, %eax 2131 ; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 2132 ; X64-NEXT: movzbl %cl, %eax 2133 ; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 2134 ; X64-NEXT: movzbl %r8b, %eax 2135 ; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 2136 ; X64-NEXT: movzbl %r9b, %eax 2137 ; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 2138 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2139 ; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 2140 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2141 ; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 2142 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2143 ; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 2144 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2145 ; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 2146 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2147 ; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 2148 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2149 ; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 2150 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2151 ; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 2152 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2153 ; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 2154 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2155 ; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 2156 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2157 ; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 2158 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2159 ; X64-NEXT: retq 2160 %res0 = insertelement <32 x i8> undef, i8 %a0 , i32 0 2161 %res1 = insertelement <32 x i8> %res0, i8 %a1 , i32 1 2162 %res2 = insertelement <32 x i8> %res1, i8 %a2 , i32 2 2163 %res3 = insertelement <32 x i8> %res2, i8 %a3 , i32 3 2164 %res4 = insertelement <32 x i8> %res3, i8 %a4 , i32 4 2165 %res5 = insertelement <32 x i8> %res4, i8 %a5 , i32 5 2166 %res6 = insertelement <32 x i8> %res5, i8 %a6 , i32 6 2167 
%res7 = insertelement <32 x i8> %res6, i8 %a7 , i32 7 2168 %res8 = insertelement <32 x i8> %res7, i8 %a8 , i32 8 2169 %res9 = insertelement <32 x i8> %res8, i8 %a9 , i32 9 2170 %res10 = insertelement <32 x i8> %res9, i8 %a10, i32 10 2171 %res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11 2172 %res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12 2173 %res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13 2174 %res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14 2175 %res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15 2176 %res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16 2177 %res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17 2178 %res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18 2179 %res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19 2180 %res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20 2181 %res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21 2182 %res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22 2183 %res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23 2184 %res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24 2185 %res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25 2186 %res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26 2187 %res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27 2188 %res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28 2189 %res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29 2190 %res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30 2191 %res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31 2192 %res = bitcast <32 x i8> %res31 to <4 x i64> 2193 ret <4 x i64> %res 2194 } 2195 2196 define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind { 2197 ; X86-LABEL: test_mm256_setr_epi16: 2198 ; X86: # %bb.0: 2199 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2200 ; X86-NEXT: vmovd %eax, %xmm0 2201 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2202 ; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 2203 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2204 ; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 2205 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2206 ; X86-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 2207 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2208 ; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 2209 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2210 ; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 2211 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2212 ; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 2213 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2214 ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 2215 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2216 ; X86-NEXT: vmovd %eax, %xmm1 2217 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2218 ; X86-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 2219 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2220 ; X86-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 2221 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2222 ; X86-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 2223 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2224 ; X86-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 2225 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2226 ; X86-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 2227 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2228 ; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 2229 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2230 ; X86-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 2231 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2232 ; X86-NEXT: retl 2233 ; 2234 ; X64-LABEL: test_mm256_setr_epi16: 2235 ; X64: # %bb.0: 2236 ; X64-NEXT: movzwl 
{{[0-9]+}}(%rsp), %eax 2237 ; X64-NEXT: vmovd %eax, %xmm0 2238 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2239 ; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 2240 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2241 ; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 2242 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2243 ; X64-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 2244 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2245 ; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 2246 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2247 ; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 2248 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2249 ; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 2250 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2251 ; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 2252 ; X64-NEXT: vmovd %edi, %xmm1 2253 ; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1 2254 ; X64-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 2255 ; X64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 2256 ; X64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1 2257 ; X64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1 2258 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2259 ; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 2260 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2261 ; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 2262 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2263 ; X64-NEXT: retq 2264 %res0 = insertelement <16 x i16> undef, i16 %a0 , i32 0 2265 %res1 = insertelement <16 x i16> %res0, i16 %a1 , i32 1 2266 %res2 = insertelement <16 x i16> %res1, i16 %a2 , i32 2 2267 %res3 = insertelement <16 x i16> %res2, i16 %a3 , i32 3 2268 %res4 = insertelement <16 x i16> %res3, i16 %a4 , i32 4 2269 %res5 = insertelement <16 x i16> %res4, i16 %a5 , i32 5 2270 %res6 = insertelement <16 x i16> %res5, i16 %a6 , i32 6 2271 %res7 = insertelement <16 x i16> %res6, i16 %a7 , i32 7 2272 %res8 = insertelement <16 x i16> %res7, i16 %a8 , i32 8 2273 %res9 = insertelement <16 x i16> %res8, i16 %a9 , i32 9 2274 %res10 = insertelement <16 x i16> %res9, i16 %a10, i32 10 2275 %res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11 2276 %res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12 2277 %res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13 2278 %res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14 2279 %res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15 2280 %res = bitcast <16 x i16> %res15 to <4 x i64> 2281 ret <4 x i64> %res 2282 } 2283 2284 define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind { 2285 ; X86-LABEL: test_mm256_setr_epi32: 2286 ; X86: # %bb.0: 2287 ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2288 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 2289 ; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 2290 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 2291 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2292 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 2293 ; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 2294 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 2295 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2296 ; X86-NEXT: retl 2297 ; 2298 ; X64-LABEL: test_mm256_setr_epi32: 2299 ; X64: # %bb.0: 2300 ; X64-NEXT: vmovd %r8d, %xmm0 2301 ; X64-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0 2302 ; X64-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0 2303 ; X64-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0 2304 ; X64-NEXT: vmovd %edi, %xmm1 2305 ; X64-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 2306 ; X64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 2307 ; X64-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 2308 ; 
X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2309 ; X64-NEXT: retq 2310 %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0 2311 %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1 2312 %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2 2313 %res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3 2314 %res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4 2315 %res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5 2316 %res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6 2317 %res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7 2318 %res = bitcast <8 x i32> %res7 to <4 x i64> 2319 ret <4 x i64> %res 2320 } 2321 2322 define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind { 2323 ; X86-LABEL: test_mm256_setr_epi64x: 2324 ; X86: # %bb.0: 2325 ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2326 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 2327 ; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 2328 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 2329 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2330 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 2331 ; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 2332 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 2333 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2334 ; X86-NEXT: retl 2335 ; 2336 ; X64-LABEL: test_mm256_setr_epi64x: 2337 ; X64: # %bb.0: 2338 ; X64-NEXT: vmovq %rcx, %xmm0 2339 ; X64-NEXT: vmovq %rdx, %xmm1 2340 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2341 ; X64-NEXT: vmovq %rsi, %xmm1 2342 ; X64-NEXT: vmovq %rdi, %xmm2 2343 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 2344 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2345 ; X64-NEXT: retq 2346 %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0 2347 %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1 2348 %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2 2349 %res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3 2350 ret <4 x i64> %res3 2351 } 2352 2353 define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind { 2354 ; CHECK-LABEL: test_mm256_setr_m128: 2355 ; CHECK: # %bb.0: 2356 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 2357 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2358 ; CHECK-NEXT: ret{{[l|q]}} 2359 %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2360 ret <8 x float> %res 2361 } 2362 2363 define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind { 2364 ; CHECK-LABEL: test_mm256_setr_m128d: 2365 ; CHECK: # %bb.0: 2366 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 2367 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2368 ; CHECK-NEXT: ret{{[l|q]}} 2369 %arg0 = bitcast <2 x double> %a0 to <4 x float> 2370 %arg1 = bitcast <2 x double> %a1 to <4 x float> 2371 %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2372 %bc = bitcast <8 x float> %res to <4 x double> 2373 ret <4 x double> %bc 2374 } 2375 2376 define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind { 2377 ; CHECK-LABEL: test_mm256_setr_m128i: 2378 ; CHECK: # %bb.0: 2379 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 2380 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2381 ; CHECK-NEXT: ret{{[l|q]}} 2382 %arg0 = bitcast <2 x i64> %a0 to <4 x float> 2383 %arg1 = bitcast <2 x i64> %a1 to <4 x float> 2384 %res = shufflevector <4 x 
float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2385 %bc = bitcast <8 x float> %res to <4 x i64> 2386 ret <4 x i64> %bc 2387 } 2388 2389 define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind { 2390 ; X86-LABEL: test_mm256_setr_pd: 2391 ; X86: # %bb.0: 2392 ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 2393 ; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 2394 ; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2395 ; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 2396 ; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero 2397 ; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] 2398 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2399 ; X86-NEXT: retl 2400 ; 2401 ; X64-LABEL: test_mm256_setr_pd: 2402 ; X64: # %bb.0: 2403 ; X64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 2404 ; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2405 ; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2406 ; X64-NEXT: retq 2407 %res0 = insertelement <4 x double> undef, double %a0, i32 0 2408 %res1 = insertelement <4 x double> %res0, double %a1, i32 1 2409 %res2 = insertelement <4 x double> %res1, double %a2, i32 2 2410 %res3 = insertelement <4 x double> %res2, double %a3, i32 3 2411 ret <4 x double> %res3 2412 } 2413 2414 define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind { 2415 ; X86-LABEL: test_mm256_setr_ps: 2416 ; X86: # %bb.0: 2417 ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2418 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 2419 ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 2420 ; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 2421 ; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 2422 ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 2423 ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2424 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 2425 ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 2426 ; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 2427 ; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero 2428 ; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] 2429 ; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] 2430 ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] 2431 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2432 ; X86-NEXT: retl 2433 ; 2434 ; X64-LABEL: test_mm256_setr_ps: 2435 ; X64: # %bb.0: 2436 ; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] 2437 ; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] 2438 ; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] 2439 ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] 2440 ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] 2441 ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] 2442 ; X64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 2443 ; X64-NEXT: retq 2444 %res0 = insertelement <8 x float> undef, float %a0, i32 0 2445 %res1 = insertelement <8 x float> %res0, float %a1, i32 1 2446 %res2 = insertelement <8 x float> %res1, float %a2, i32 2 2447 %res3 = insertelement <8 x float> %res2, float %a3, i32 3 2448 %res4 = insertelement <8 x float> %res3, float %a4, i32 4 2449 %res5 = insertelement <8 x float> %res4, float %a5, i32 5 2450 %res6 = insertelement <8 x float> %res5, float %a6, i32 6 2451 %res7 = insertelement <8 x float> %res6, float %a7, i32 7 2452 ret <8 x float> %res7 
2453 } 2454 2455 define <4 x double> @test_mm256_setzero_pd() nounwind { 2456 ; CHECK-LABEL: test_mm256_setzero_pd: 2457 ; CHECK: # %bb.0: 2458 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 2459 ; CHECK-NEXT: ret{{[l|q]}} 2460 ret <4 x double> zeroinitializer 2461 } 2462 2463 define <8 x float> @test_mm256_setzero_ps() nounwind { 2464 ; CHECK-LABEL: test_mm256_setzero_ps: 2465 ; CHECK: # %bb.0: 2466 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 2467 ; CHECK-NEXT: ret{{[l|q]}} 2468 ret <8 x float> zeroinitializer 2469 } 2470 2471 define <4 x i64> @test_mm256_setzero_si256() nounwind { 2472 ; CHECK-LABEL: test_mm256_setzero_si256: 2473 ; CHECK: # %bb.0: 2474 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 2475 ; CHECK-NEXT: ret{{[l|q]}} 2476 ret <4 x i64> zeroinitializer 2477 } 2478 2479 define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 2480 ; CHECK-LABEL: test_mm256_shuffle_pd: 2481 ; CHECK: # %bb.0: 2482 ; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 2483 ; CHECK-NEXT: ret{{[l|q]}} 2484 %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 2485 ret <4 x double> %res 2486 } 2487 2488 define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 2489 ; CHECK-LABEL: test_mm256_shuffle_ps: 2490 ; CHECK: # %bb.0: 2491 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] 2492 ; CHECK-NEXT: ret{{[l|q]}} 2493 %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12> 2494 ret <8 x float> %res 2495 } 2496 2497 define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind { 2498 ; CHECK-LABEL: test_mm256_sqrt_pd: 2499 ; CHECK: # %bb.0: # %entry 2500 ; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 2501 ; CHECK-NEXT: ret{{[l|q]}} 2502 entry: 2503 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2 2504 ret <4 x double> %0 2505 } 2506 2507 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1 2508 2509 define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind { 2510 ; CHECK-LABEL: test_mm256_sqrt_ps: 2511 ; CHECK: # %bb.0: # %entry 2512 ; CHECK-NEXT: vsqrtps %ymm0, %ymm0 2513 ; CHECK-NEXT: ret{{[l|q]}} 2514 entry: 2515 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2 2516 ret <8 x float> %0 2517 } 2518 2519 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1 2520 2521 define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind { 2522 ; X86-LABEL: test_mm256_store_pd: 2523 ; X86: # %bb.0: 2524 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2525 ; X86-NEXT: vmovaps %ymm0, (%eax) 2526 ; X86-NEXT: vzeroupper 2527 ; X86-NEXT: retl 2528 ; 2529 ; X64-LABEL: test_mm256_store_pd: 2530 ; X64: # %bb.0: 2531 ; X64-NEXT: vmovaps %ymm0, (%rdi) 2532 ; X64-NEXT: vzeroupper 2533 ; X64-NEXT: retq 2534 %arg0 = bitcast double* %a0 to <4 x double>* 2535 store <4 x double> %a1, <4 x double>* %arg0, align 32 2536 ret void 2537 } 2538 2539 define void @test_mm256_store_ps(float* %a0, <8 x float> %a1) nounwind { 2540 ; X86-LABEL: test_mm256_store_ps: 2541 ; X86: # %bb.0: 2542 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2543 ; X86-NEXT: vmovaps %ymm0, (%eax) 2544 ; X86-NEXT: vzeroupper 2545 ; X86-NEXT: retl 2546 ; 2547 ; X64-LABEL: test_mm256_store_ps: 2548 ; X64: # %bb.0: 2549 ; X64-NEXT: vmovaps %ymm0, (%rdi) 2550 ; X64-NEXT: vzeroupper 2551 ; X64-NEXT: retq 2552 %arg0 = bitcast float* %a0 to <8 x float>* 2553 store <8 x float> %a1, <8 x float>* %arg0, align 32 2554 ret void 2555 } 2556 2557 
define void @test_mm256_store_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind { 2558 ; X86-LABEL: test_mm256_store_si256: 2559 ; X86: # %bb.0: 2560 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2561 ; X86-NEXT: vmovaps %ymm0, (%eax) 2562 ; X86-NEXT: vzeroupper 2563 ; X86-NEXT: retl 2564 ; 2565 ; X64-LABEL: test_mm256_store_si256: 2566 ; X64: # %bb.0: 2567 ; X64-NEXT: vmovaps %ymm0, (%rdi) 2568 ; X64-NEXT: vzeroupper 2569 ; X64-NEXT: retq 2570 store <4 x i64> %a1, <4 x i64>* %a0, align 32 2571 ret void 2572 } 2573 2574 define void @test_mm256_storeu_pd(double* %a0, <4 x double> %a1) nounwind { 2575 ; X86-LABEL: test_mm256_storeu_pd: 2576 ; X86: # %bb.0: 2577 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2578 ; X86-NEXT: vmovups %ymm0, (%eax) 2579 ; X86-NEXT: vzeroupper 2580 ; X86-NEXT: retl 2581 ; 2582 ; X64-LABEL: test_mm256_storeu_pd: 2583 ; X64: # %bb.0: 2584 ; X64-NEXT: vmovups %ymm0, (%rdi) 2585 ; X64-NEXT: vzeroupper 2586 ; X64-NEXT: retq 2587 %arg0 = bitcast double* %a0 to <4 x double>* 2588 store <4 x double> %a1, <4 x double>* %arg0, align 1 2589 ret void 2590 } 2591 2592 define void @test_mm256_storeu_ps(float* %a0, <8 x float> %a1) nounwind { 2593 ; X86-LABEL: test_mm256_storeu_ps: 2594 ; X86: # %bb.0: 2595 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2596 ; X86-NEXT: vmovups %ymm0, (%eax) 2597 ; X86-NEXT: vzeroupper 2598 ; X86-NEXT: retl 2599 ; 2600 ; X64-LABEL: test_mm256_storeu_ps: 2601 ; X64: # %bb.0: 2602 ; X64-NEXT: vmovups %ymm0, (%rdi) 2603 ; X64-NEXT: vzeroupper 2604 ; X64-NEXT: retq 2605 %arg0 = bitcast float* %a0 to <8 x float>* 2606 store <8 x float> %a1, <8 x float>* %arg0, align 1 2607 ret void 2608 } 2609 2610 define void @test_mm256_storeu_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind { 2611 ; X86-LABEL: test_mm256_storeu_si256: 2612 ; X86: # %bb.0: 2613 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2614 ; X86-NEXT: vmovups %ymm0, (%eax) 2615 ; X86-NEXT: vzeroupper 2616 ; X86-NEXT: retl 2617 ; 2618 ; X64-LABEL: test_mm256_storeu_si256: 2619 ; X64: # %bb.0: 2620 ; X64-NEXT: vmovups %ymm0, (%rdi) 2621 ; X64-NEXT: vzeroupper 2622 ; X64-NEXT: retq 2623 store <4 x i64> %a1, <4 x i64>* %a0, align 1 2624 ret void 2625 } 2626 2627 define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) nounwind { 2628 ; X86-LABEL: test_mm256_storeu2_m128: 2629 ; X86: # %bb.0: 2630 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2631 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 2632 ; X86-NEXT: vmovups %xmm0, (%ecx) 2633 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 2634 ; X86-NEXT: vmovups %xmm0, (%eax) 2635 ; X86-NEXT: vzeroupper 2636 ; X86-NEXT: retl 2637 ; 2638 ; X64-LABEL: test_mm256_storeu2_m128: 2639 ; X64: # %bb.0: 2640 ; X64-NEXT: vmovups %xmm0, (%rdi) 2641 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 2642 ; X64-NEXT: vmovups %xmm0, (%rsi) 2643 ; X64-NEXT: vzeroupper 2644 ; X64-NEXT: retq 2645 %arg0 = bitcast float* %a0 to <4 x float>* 2646 %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2647 store <4 x float> %lo, <4 x float>* %arg0, align 1 2648 %arg1 = bitcast float* %a1 to <4 x float>* 2649 %hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2650 store <4 x float> %hi, <4 x float>* %arg1, align 1 2651 ret void 2652 } 2653 2654 define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2) nounwind { 2655 ; X86-LABEL: test_mm256_storeu2_m128d: 2656 ; X86: # %bb.0: 2657 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2658 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 2659 ; X86-NEXT: vmovups %xmm0, (%ecx) 2660 ; 
X86-NEXT: vextractf128 $1, %ymm0, %xmm0 2661 ; X86-NEXT: vmovups %xmm0, (%eax) 2662 ; X86-NEXT: vzeroupper 2663 ; X86-NEXT: retl 2664 ; 2665 ; X64-LABEL: test_mm256_storeu2_m128d: 2666 ; X64: # %bb.0: 2667 ; X64-NEXT: vmovups %xmm0, (%rdi) 2668 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 2669 ; X64-NEXT: vmovups %xmm0, (%rsi) 2670 ; X64-NEXT: vzeroupper 2671 ; X64-NEXT: retq 2672 %arg0 = bitcast double* %a0 to <2 x double>* 2673 %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1> 2674 store <2 x double> %lo, <2 x double>* %arg0, align 1 2675 %arg1 = bitcast double* %a1 to <2 x double>* 2676 %hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3> 2677 store <2 x double> %hi, <2 x double>* %arg1, align 1 2678 ret void 2679 } 2680 2681 define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64> %a2) nounwind { 2682 ; X86-LABEL: test_mm256_storeu2_m128i: 2683 ; X86: # %bb.0: 2684 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2685 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 2686 ; X86-NEXT: vmovups %xmm0, (%ecx) 2687 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 2688 ; X86-NEXT: vmovups %xmm0, (%eax) 2689 ; X86-NEXT: vzeroupper 2690 ; X86-NEXT: retl 2691 ; 2692 ; X64-LABEL: test_mm256_storeu2_m128i: 2693 ; X64: # %bb.0: 2694 ; X64-NEXT: vmovups %xmm0, (%rdi) 2695 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 2696 ; X64-NEXT: vmovups %xmm0, (%rsi) 2697 ; X64-NEXT: vzeroupper 2698 ; X64-NEXT: retq 2699 %arg0 = bitcast <2 x i64>* %a0 to <2 x i64>* 2700 %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1> 2701 store <2 x i64> %lo, <2 x i64>* %arg0, align 1 2702 %arg1 = bitcast <2 x i64>* %a1 to <2 x i64>* 2703 %hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3> 2704 store <2 x i64> %hi, <2 x i64>* %arg1, align 1 2705 ret void 2706 } 2707 2708 define void @test_mm256_stream_pd(double *%a0, <4 x double> %a1) nounwind { 2709 ; X86-LABEL: test_mm256_stream_pd: 2710 ; X86: # %bb.0: 2711 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2712 ; X86-NEXT: vmovntps %ymm0, (%eax) 2713 ; X86-NEXT: vzeroupper 2714 ; X86-NEXT: retl 2715 ; 2716 ; X64-LABEL: test_mm256_stream_pd: 2717 ; X64: # %bb.0: 2718 ; X64-NEXT: vmovntps %ymm0, (%rdi) 2719 ; X64-NEXT: vzeroupper 2720 ; X64-NEXT: retq 2721 %arg0 = bitcast double* %a0 to <4 x double>* 2722 store <4 x double> %a1, <4 x double>* %arg0, align 32, !nontemporal !0 2723 ret void 2724 } 2725 2726 define void @test_mm256_stream_ps(float *%a0, <8 x float> %a1) nounwind { 2727 ; X86-LABEL: test_mm256_stream_ps: 2728 ; X86: # %bb.0: 2729 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2730 ; X86-NEXT: vmovntps %ymm0, (%eax) 2731 ; X86-NEXT: vzeroupper 2732 ; X86-NEXT: retl 2733 ; 2734 ; X64-LABEL: test_mm256_stream_ps: 2735 ; X64: # %bb.0: 2736 ; X64-NEXT: vmovntps %ymm0, (%rdi) 2737 ; X64-NEXT: vzeroupper 2738 ; X64-NEXT: retq 2739 %arg0 = bitcast float* %a0 to <8 x float>* 2740 store <8 x float> %a1, <8 x float>* %arg0, align 32, !nontemporal !0 2741 ret void 2742 } 2743 2744 define void @test_mm256_stream_si256(<4 x i64> *%a0, <4 x i64> %a1) nounwind { 2745 ; X86-LABEL: test_mm256_stream_si256: 2746 ; X86: # %bb.0: 2747 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2748 ; X86-NEXT: vmovntps %ymm0, (%eax) 2749 ; X86-NEXT: vzeroupper 2750 ; X86-NEXT: retl 2751 ; 2752 ; X64-LABEL: test_mm256_stream_si256: 2753 ; X64: # %bb.0: 2754 ; X64-NEXT: vmovntps %ymm0, (%rdi) 2755 ; X64-NEXT: vzeroupper 2756 ; X64-NEXT: retq 2757 store <4 x i64> %a1, <4 x i64>* %a0, align 32, !nontemporal !0 2758 ret void 
2759 } 2760 2761 define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 2762 ; CHECK-LABEL: test_mm256_sub_pd: 2763 ; CHECK: # %bb.0: 2764 ; CHECK-NEXT: vsubpd %ymm1, %ymm0, %ymm0 2765 ; CHECK-NEXT: ret{{[l|q]}} 2766 %res = fsub <4 x double> %a0, %a1 2767 ret <4 x double> %res 2768 } 2769 2770 define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 2771 ; CHECK-LABEL: test_mm256_sub_ps: 2772 ; CHECK: # %bb.0: 2773 ; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0 2774 ; CHECK-NEXT: ret{{[l|q]}} 2775 %res = fsub <8 x float> %a0, %a1 2776 ret <8 x float> %res 2777 } 2778 2779 define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind { 2780 ; CHECK-LABEL: test_mm_testc_pd: 2781 ; CHECK: # %bb.0: 2782 ; CHECK-NEXT: xorl %eax, %eax 2783 ; CHECK-NEXT: vtestpd %xmm1, %xmm0 2784 ; CHECK-NEXT: setb %al 2785 ; CHECK-NEXT: ret{{[l|q]}} 2786 %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) 2787 ret i32 %res 2788 } 2789 declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone 2790 2791 define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 2792 ; CHECK-LABEL: test_mm256_testc_pd: 2793 ; CHECK: # %bb.0: 2794 ; CHECK-NEXT: xorl %eax, %eax 2795 ; CHECK-NEXT: vtestpd %ymm1, %ymm0 2796 ; CHECK-NEXT: setb %al 2797 ; CHECK-NEXT: vzeroupper 2798 ; CHECK-NEXT: ret{{[l|q]}} 2799 %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) 2800 ret i32 %res 2801 } 2802 declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone 2803 2804 define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind { 2805 ; CHECK-LABEL: test_mm_testc_ps: 2806 ; CHECK: # %bb.0: 2807 ; CHECK-NEXT: xorl %eax, %eax 2808 ; CHECK-NEXT: vtestps %xmm1, %xmm0 2809 ; CHECK-NEXT: setb %al 2810 ; CHECK-NEXT: ret{{[l|q]}} 2811 %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) 2812 ret i32 %res 2813 } 2814 declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone 2815 2816 define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 2817 ; CHECK-LABEL: test_mm256_testc_ps: 2818 ; CHECK: # %bb.0: 2819 ; CHECK-NEXT: xorl %eax, %eax 2820 ; CHECK-NEXT: vtestps %ymm1, %ymm0 2821 ; CHECK-NEXT: setb %al 2822 ; CHECK-NEXT: vzeroupper 2823 ; CHECK-NEXT: ret{{[l|q]}} 2824 %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) 2825 ret i32 %res 2826 } 2827 declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone 2828 2829 define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2830 ; CHECK-LABEL: test_mm256_testc_si256: 2831 ; CHECK: # %bb.0: 2832 ; CHECK-NEXT: xorl %eax, %eax 2833 ; CHECK-NEXT: vptest %ymm1, %ymm0 2834 ; CHECK-NEXT: setb %al 2835 ; CHECK-NEXT: vzeroupper 2836 ; CHECK-NEXT: ret{{[l|q]}} 2837 %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) 2838 ret i32 %res 2839 } 2840 declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone 2841 2842 define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind { 2843 ; CHECK-LABEL: test_mm_testnzc_pd: 2844 ; CHECK: # %bb.0: 2845 ; CHECK-NEXT: xorl %eax, %eax 2846 ; CHECK-NEXT: vtestpd %xmm1, %xmm0 2847 ; CHECK-NEXT: seta %al 2848 ; CHECK-NEXT: ret{{[l|q]}} 2849 %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1) 2850 ret i32 %res 2851 } 2852 declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x 

define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %xmm1, %xmm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %ymm1, %ymm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %xmm1, %xmm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %ymm1, %ymm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vptest %ymm1, %ymm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
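
; NOTE: The testnzc tests above use seta (both CF and ZF clear), while the
; testz tests below use sete (ZF set).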

define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %xmm1, %xmm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %ymm1, %ymm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %xmm1, %xmm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %ymm1, %ymm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vptest %ymm1, %ymm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone

define <2 x double> @test_mm_undefined_pd() nounwind {
; CHECK-LABEL: test_mm_undefined_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
  ret <2 x double> undef
}

define <4 x double> @test_mm256_undefined_pd() nounwind {
; CHECK-LABEL: test_mm256_undefined_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
  ret <4 x double> undef
}

define <8 x float> @test_mm256_undefined_ps() nounwind {
; CHECK-LABEL: test_mm256_undefined_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
  ret <8 x float> undef
}

define <4 x i64> @test_mm256_undefined_si256() nounwind {
; CHECK-LABEL: test_mm256_undefined_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
  ret <4 x i64> undef
}

define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ret <8 x float> %res
}
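
; NOTE: The unpackhi/unpacklo shuffle masks above interleave elements within
; each 128-bit lane (e.g. <1,5,3,7> for unpackhi_pd), which is why they map
; directly onto the in-lane vunpckh*/vunpckl* instructions.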

define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = xor <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = xor <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define void @test_mm256_zeroall() nounwind {
; CHECK-LABEL: test_mm256_zeroall:
; CHECK: # %bb.0:
; CHECK-NEXT: vzeroall
; CHECK-NEXT: ret{{[l|q]}}
  call void @llvm.x86.avx.vzeroall()
  ret void
}
declare void @llvm.x86.avx.vzeroall() nounwind readnone

define void @test_mm256_zeroupper() nounwind {
; CHECK-LABEL: test_mm256_zeroupper:
; CHECK: # %bb.0:
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  call void @llvm.x86.avx.vzeroupper()
  ret void
}
declare void @llvm.x86.avx.vzeroupper() nounwind readnone

define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextpd128_pd256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextps128_ps256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_zextsi128_si256(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextsi128_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %res
}

!0 = !{i32 1}
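
; NOTE: !0 is the !nontemporal marker metadata referenced by the stream store
; tests. The zext*128 tests above rely on the VEX-encoded
; "vmovaps %xmm0, %xmm0" implicitly zeroing bits 255:128 of the ymm register.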