; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=SSE32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=SSE64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVXONLY32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVXONLY64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=KNL32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=KNL64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=SKX32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=SKX64

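; This test exercises FastISel store lowering for i32/i16 scalars and for
; 128-, 256-, and 512-bit integer and floating-point vectors, at unaligned
; (align 1) and naturally aligned alignments, across SSE2, AVX, and AVX-512
; feature levels. Note that the *32-suffixed check prefixes correspond to the
; x86_64 RUN lines and the *64-suffixed prefixes to the i686 RUN lines.
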
define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
; ALL32-LABEL: test_store_32:
; ALL32:       # %bb.0: # %entry
; ALL32-NEXT:    movl %esi, (%rdi)
; ALL32-NEXT:    movl %esi, %eax
; ALL32-NEXT:    retq
;
; ALL64-LABEL: test_store_32:
; ALL64:       # %bb.0: # %entry
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT:    movl %eax, (%ecx)
; ALL64-NEXT:    retl
entry:
  store i32 %value, i32* %addr, align 1
  ret i32 %value
}

define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
; ALL32-LABEL: test_store_16:
; ALL32:       # %bb.0: # %entry
; ALL32-NEXT:    movw %si, (%rdi)
; ALL32-NEXT:    movl %esi, %eax
; ALL32-NEXT:    retq
;
; ALL64-LABEL: test_store_16:
; ALL64:       # %bb.0: # %entry
; ALL64-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT:    movw %ax, (%ecx)
; ALL64-NEXT:    retl
entry:
  store i16 %value, i16* %addr, align 1
  ret i16 %value
}

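; 128-bit vector stores: unaligned stores select movdqu/movups/movupd
; (vmovdqu/vmovups/vmovupd with AVX), while 16-byte-aligned stores select the
; aligned movdqa/movaps/movapd forms.
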
define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    paddd %xmm1, %xmm0
; SSE32-NEXT:    movdqu %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xi32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    paddd %xmm1, %xmm0
; SSE64-NEXT:    movdqu %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xi32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xi32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovdqu %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 1
  ret <4 x i32> %foo
}

define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    paddd %xmm1, %xmm0
; SSE32-NEXT:    movdqa %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xi32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    paddd %xmm1, %xmm0
; SSE64-NEXT:    movdqa %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xi32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xi32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovdqa %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 16
  ret <4 x i32> %foo
}

define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovups %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %xmm0, (%eax)
; AVX64-NEXT:    retl
  store <4 x float> %value, <4 x float>* %addr, align 1
  ret <4 x float> %value
}

define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovaps %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %xmm0, (%eax)
; AVX64-NEXT:    retl
  store <4 x float> %value, <4 x float>* %addr, align 16
  ret <4 x float> %value
}

define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm1, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_2xf64:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd %xmm1, %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_2xf64:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovupd %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_2xf64:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovupd %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 1
  ret <2 x double> %foo
}

define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm1, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_2xf64_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd %xmm1, %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_2xf64_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovapd %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_2xf64_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovapd %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 16
  ret <2 x double> %foo
}

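; 256-bit vector stores: without AVX these are split into two 128-bit xmm
; stores; with AVX they become a single ymm store, unaligned or aligned
; depending on the IR alignment.
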
define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xi32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xi32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovups %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xi32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x i32> %value, <8 x i32>* %addr, align 1
  ret <8 x i32> %value
}

define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xi32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xi32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xi32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x i32> %value, <8 x i32>* %addr, align 32
  ret <8 x i32> %value
}

define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xf32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovups %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xf32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x float> %value, <8 x float>* %addr, align 1
  ret <8 x float> %value
}

define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xf32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xf32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x float> %value, <8 x float>* %addr, align 32
  ret <8 x float> %value
}

define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm3, %xmm1
; SSE32-NEXT:    addpd %xmm2, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf64:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd %xmm2, %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    movupd %xmm1, 16(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf64:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT:    vmovupd %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf64:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT:    vmovupd %ymm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 1
  ret <4 x double> %foo
}

define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm3, %xmm1
; SSE32-NEXT:    addpd %xmm2, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf64_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd %xmm2, %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    movapd %xmm1, 16(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT:    vmovapd %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf64_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT:    vmovapd %ymm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 32
  ret <4 x double> %foo
}

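; 512-bit vector stores: SSE splits these into four xmm stores, the AVX-only
; targets into two ymm stores, and AVX-512 selects a single zmm store.
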
define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    movups %xmm2, 32(%rdi)
; SSE32-NEXT:    movups %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xi32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    movups %xmm2, 32(%eax)
; SSE64-NEXT:    movups %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xi32:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xi32:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xi32:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xi32:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovups %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x i32> %value, <16 x i32>* %addr, align 1
  ret <16 x i32> %value
}

define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xi32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    movaps %xmm2, 32(%eax)
; SSE64-NEXT:    movaps %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xi32_aligned:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xi32_aligned:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xi32_aligned:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x i32> %value, <16 x i32>* %addr, align 64
  ret <16 x i32> %value
}

define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    movups %xmm2, 32(%rdi)
; SSE32-NEXT:    movups %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xf32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    movups %xmm2, 32(%eax)
; SSE64-NEXT:    movups %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xf32:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xf32:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xf32:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xf32:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovups %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x float> %value, <16 x float>* %addr, align 1
  ret <16 x float> %value
}

define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xf32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    movaps %xmm2, 32(%eax)
; SSE64-NEXT:    movaps %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xf32_aligned:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xf32_aligned:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xf32_aligned:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x float> %value, <16 x float>* %addr, align 64
  ret <16 x float> %value
}

define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm7, %xmm3
; SSE32-NEXT:    addpd %xmm6, %xmm2
; SSE32-NEXT:    addpd %xmm5, %xmm1
; SSE32-NEXT:    addpd %xmm4, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
; SSE32-NEXT:    movupd %xmm2, 32(%rdi)
; SSE32-NEXT:    movupd %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf64:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    movupd %xmm1, 16(%eax)
; SSE64-NEXT:    movupd %xmm2, 32(%eax)
; SSE64-NEXT:    movupd %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_8xf64:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT:    vmovupd %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovupd %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_8xf64:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    pushl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
; AVXONLY64-NEXT:    movl %esp, %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT:    andl $-32, %esp
; AVXONLY64-NEXT:    subl $32, %esp
; AVXONLY64-NEXT:    movl 8(%ebp), %eax
; AVXONLY64-NEXT:    vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT:    vmovupd %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT:    movl %ebp, %esp
; AVXONLY64-NEXT:    popl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_8xf64:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT:    vmovupd %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_8xf64:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT:    vmovupd %zmm0, (%eax)
; AVX51264-NEXT:    retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 1
  ret <8 x double> %foo
}

define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm7, %xmm3
; SSE32-NEXT:    addpd %xmm6, %xmm2
; SSE32-NEXT:    addpd %xmm5, %xmm1
; SSE32-NEXT:    addpd %xmm4, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
; SSE32-NEXT:    movapd %xmm2, 32(%rdi)
; SSE32-NEXT:    movapd %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf64_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    movapd %xmm1, 16(%eax)
; SSE64-NEXT:    movapd %xmm2, 32(%eax)
; SSE64-NEXT:    movapd %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT:    vmovapd %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovapd %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_8xf64_aligned:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    pushl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
; AVXONLY64-NEXT:    movl %esp, %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT:    andl $-32, %esp
; AVXONLY64-NEXT:    subl $32, %esp
; AVXONLY64-NEXT:    movl 8(%ebp), %eax
; AVXONLY64-NEXT:    vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT:    vmovapd %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT:    movl %ebp, %esp
; AVXONLY64-NEXT:    popl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT:    vmovapd %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_8xf64_aligned:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT:    vmovapd %zmm0, (%eax)
; AVX51264-NEXT:    retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 64
  ret <8 x double> %foo
}