; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=SSE32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=SSE64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVXONLY32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVXONLY64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=KNL32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=KNL64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=SKX32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=SKX64
; Note that the *32 check prefixes correspond to the x86_64 RUN lines and the
; *64 prefixes to the i686 RUN lines.

define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
; ALL32-LABEL: test_store_32:
; ALL32:       # %bb.0: # %entry
; ALL32-NEXT:    movl %esi, (%rdi)
; ALL32-NEXT:    movl %esi, %eax
; ALL32-NEXT:    retq
;
; ALL64-LABEL: test_store_32:
; ALL64:       # %bb.0: # %entry
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT:    movl %eax, (%ecx)
; ALL64-NEXT:    retl
entry:
  store i32 %value, i32* %addr, align 1
  ret i32 %value
}

define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
; ALL32-LABEL: test_store_16:
; ALL32:       # %bb.0: # %entry
; ALL32-NEXT:    movw %si, (%rdi)
; ALL32-NEXT:    movl %esi, %eax
; ALL32-NEXT:    retq
;
; ALL64-LABEL: test_store_16:
; ALL64:       # %bb.0: # %entry
; ALL64-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT:    movw %ax, (%ecx)
; ALL64-NEXT:    retl
entry:
  store i16 %value, i16* %addr, align 1
  ret i16 %value
}

define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    paddd %xmm1, %xmm0
; SSE32-NEXT:    movdqu %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xi32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    paddd %xmm1, %xmm0
; SSE64-NEXT:    movdqu %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xi32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xi32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovdqu %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 1
  ret <4 x i32> %foo
}
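
; The vector tests below come in unaligned/aligned pairs: the unaligned
; variant stores with align 1 and must use unaligned moves
; (movdqu/movups/movupd), while the aligned variant may use the aligned
; forms (movdqa/movaps/movapd).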

define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    paddd %xmm1, %xmm0
; SSE32-NEXT:    movdqa %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xi32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    paddd %xmm1, %xmm0
; SSE64-NEXT:    movdqa %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xi32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xi32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovdqa %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 16
  ret <4 x i32> %foo
}

define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovups %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %xmm0, (%eax)
; AVX64-NEXT:    retl
  store <4 x float> %value, <4 x float>* %addr, align 1
  ret <4 x float> %value
}

define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovaps %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %xmm0, (%eax)
; AVX64-NEXT:    retl
  store <4 x float> %value, <4 x float>* %addr, align 16
  ret <4 x float> %value
}
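
; The fadd keeps the value in the double domain, so the stores below are
; expected to use the pd moves (movupd/movapd) rather than the ps forms.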

define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm1, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_2xf64:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd %xmm1, %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_2xf64:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovupd %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_2xf64:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovupd %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 1
  ret <2 x double> %foo
}

define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm1, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_2xf64_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd %xmm1, %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_2xf64_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovapd %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_2xf64_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovapd %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 16
  ret <2 x double> %foo
}
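
; 256-bit stores: with SSE2 the value lives in two xmm registers and is
; stored as two 128-bit moves; with AVX and above a single ymm move is used.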

define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xi32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xi32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovups %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xi32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x i32> %value, <8 x i32>* %addr, align 1
  ret <8 x i32> %value
}

define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xi32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xi32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xi32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x i32> %value, <8 x i32>* %addr, align 32
  ret <8 x i32> %value
}

define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xf32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovups %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xf32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x float> %value, <8 x float>* %addr, align 1
  ret <8 x float> %value
}

define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xf32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xf32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x float> %value, <8 x float>* %addr, align 32
  ret <8 x float> %value
}
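
; On i686 with SSE2, part of the <4 x double> %value2 argument is passed on
; the stack, so the SSE64 checks add it directly from {{[0-9]+}}(%esp).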

define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm3, %xmm1
; SSE32-NEXT:    addpd %xmm2, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf64:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd %xmm2, %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    movupd %xmm1, 16(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf64:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT:    vmovupd %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf64:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT:    vmovupd %ymm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 1
  ret <4 x double> %foo
}

define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm3, %xmm1
; SSE32-NEXT:    addpd %xmm2, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf64_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd %xmm2, %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    movapd %xmm1, 16(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT:    vmovapd %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf64_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT:    vmovapd %ymm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 32
  ret <4 x double> %foo
}
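
; 512-bit stores: SSE splits the value into four 128-bit moves, plain AVX
; into two 256-bit moves (AVXONLY32/AVXONLY64), and AVX-512 uses a single
; zmm move (AVX51232/AVX51264).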

define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    movups %xmm2, 32(%rdi)
; SSE32-NEXT:    movups %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xi32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    movups %xmm2, 32(%eax)
; SSE64-NEXT:    movups %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xi32:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xi32:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xi32:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xi32:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovups %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x i32> %value, <16 x i32>* %addr, align 1
  ret <16 x i32> %value
}

define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xi32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    movaps %xmm2, 32(%eax)
; SSE64-NEXT:    movaps %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xi32_aligned:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xi32_aligned:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xi32_aligned:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x i32> %value, <16 x i32>* %addr, align 64
  ret <16 x i32> %value
}

define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    movups %xmm2, 32(%rdi)
; SSE32-NEXT:    movups %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xf32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    movups %xmm2, 32(%eax)
; SSE64-NEXT:    movups %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xf32:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xf32:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xf32:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xf32:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovups %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x float> %value, <16 x float>* %addr, align 1
  ret <16 x float> %value
}

define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xf32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    movaps %xmm2, 32(%eax)
; SSE64-NEXT:    movaps %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xf32_aligned:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xf32_aligned:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xf32_aligned:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x float> %value, <16 x float>* %addr, align 64
  ret <16 x float> %value
}
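
; With plain AVX on i686, half of the <8 x double> %value2 argument arrives
; on the stack; the AVXONLY64 checks set up an aligned frame (andl $-32,
; %esp) and read it with vaddpd 40(%ebp).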

define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm7, %xmm3
; SSE32-NEXT:    addpd %xmm6, %xmm2
; SSE32-NEXT:    addpd %xmm5, %xmm1
; SSE32-NEXT:    addpd %xmm4, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
; SSE32-NEXT:    movupd %xmm2, 32(%rdi)
; SSE32-NEXT:    movupd %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf64:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    movupd %xmm1, 16(%eax)
; SSE64-NEXT:    movupd %xmm2, 32(%eax)
; SSE64-NEXT:    movupd %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_8xf64:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT:    vmovupd %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovupd %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_8xf64:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    pushl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
; AVXONLY64-NEXT:    movl %esp, %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT:    andl $-32, %esp
; AVXONLY64-NEXT:    subl $32, %esp
; AVXONLY64-NEXT:    movl 8(%ebp), %eax
; AVXONLY64-NEXT:    vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT:    vmovupd %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT:    movl %ebp, %esp
; AVXONLY64-NEXT:    popl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_8xf64:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT:    vmovupd %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_8xf64:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT:    vmovupd %zmm0, (%eax)
; AVX51264-NEXT:    retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 1
  ret <8 x double> %foo
}

define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm7, %xmm3
; SSE32-NEXT:    addpd %xmm6, %xmm2
; SSE32-NEXT:    addpd %xmm5, %xmm1
; SSE32-NEXT:    addpd %xmm4, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
; SSE32-NEXT:    movapd %xmm2, 32(%rdi)
; SSE32-NEXT:    movapd %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf64_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    movapd %xmm1, 16(%eax)
; SSE64-NEXT:    movapd %xmm2, 32(%eax)
; SSE64-NEXT:    movapd %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT:    vmovapd %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovapd %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_8xf64_aligned:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    pushl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
; AVXONLY64-NEXT:    movl %esp, %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT:    andl $-32, %esp
; AVXONLY64-NEXT:    subl $32, %esp
; AVXONLY64-NEXT:    movl 8(%ebp), %eax
; AVXONLY64-NEXT:    vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT:    vmovapd %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT:    movl %ebp, %esp
; AVXONLY64-NEXT:    popl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT:    vmovapd %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_8xf64_aligned:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT:    vmovapd %zmm0, (%eax)
; AVX51264-NEXT:    retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 64
  ret <8 x double> %foo
}