; X86 SSE4.2 stack-folding tests (scraped code-viewer header removed; see RUN line below).
      1 ; RUN: llc -O3 -verify-machineinstrs -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 < %s | FileCheck %s
      2 
      3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
      4 target triple = "x86_64-unknown-unknown"
      5 
      6 ; Stack reload folding tests.
      7 ;
      8 ; By including a nop call with sideeffects we can force a partial register spill of the
      9 ; relevant registers and check that the reload is correctly folded into the instruction.
     10 
     11 define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
          ; The nop asm clobbers xmm2-xmm15, forcing a spill; expect addpd with the reload folded from (%rsp).
     12   ;CHECK-LABEL: stack_fold_addpd
     13   ;CHECK:       addpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     14   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     15   %2 = fadd <2 x double> %a0, %a1
     16   ret <2 x double> %2
     17 }
     18 
     19 define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
          ; xmm2-xmm15 clobber forces a spill; expect addps with a folded 16-byte reload.
     20   ;CHECK-LABEL: stack_fold_addps
     21   ;CHECK:       addps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     22   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     23   %2 = fadd <4 x float> %a0, %a1
     24   ret <4 x float> %2
     25 }
     26 
     27 define double @stack_fold_addsd(double %a0, double %a1) {
          ; Scalar double variant: expect addsd with an 8-byte folded reload after the xmm clobber.
     28   ;CHECK-LABEL: stack_fold_addsd
     29   ;CHECK:       addsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
     30   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     31   %2 = fadd double %a0, %a1
     32   ret double %2
     33 }
     34 
     35 define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
          ; Lane-0 extract/fadd/insert pattern; expect it to lower to addsd with a folded reload.
     36   ;CHECK-LABEL: stack_fold_addsd_int
     37   ;CHECK:       addsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     38   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     39   %2 = extractelement <2 x double> %a0, i32 0
     40   %3 = extractelement <2 x double> %a1, i32 0
     41   %4 = fadd double %2, %3
     42   %5 = insertelement <2 x double> %a0, double %4, i32 0
     43   ret <2 x double> %5
     44 }
     45 
     46 define float @stack_fold_addss(float %a0, float %a1) {
          ; Scalar float variant: expect addss with a 4-byte folded reload after the xmm clobber.
     47   ;CHECK-LABEL: stack_fold_addss
     48   ;CHECK:       addss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
     49   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     50   %2 = fadd float %a0, %a1
     51   ret float %2
     52 }
     53 
     54 define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
          ; Lane-0 extract/fadd/insert pattern; expect addss with a folded 16-byte reload.
     55   ;CHECK-LABEL: stack_fold_addss_int
     56   ;CHECK:       addss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     57   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     58   %2 = extractelement <4 x float> %a0, i32 0
     59   %3 = extractelement <4 x float> %a1, i32 0
     60   %4 = fadd float %2, %3
     61   %5 = insertelement <4 x float> %a0, float %4, i32 0
     62   ret <4 x float> %5
     63 }
     64 
     65 define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
          ; SSE3 addsub.pd intrinsic; expect addsubpd with a folded 16-byte reload.
     66   ;CHECK-LABEL: stack_fold_addsubpd
     67   ;CHECK:       addsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     68   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     69   %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
     70   ret <2 x double> %2
     71 }
     72 declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
     73 
     74 define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
          ; SSE3 addsub.ps intrinsic; expect addsubps with a folded 16-byte reload.
     75   ;CHECK-LABEL: stack_fold_addsubps
     76   ;CHECK:       addsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     77   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     78   %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
     79   ret <4 x float> %2
     80 }
     81 declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
     82 
     83 define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
          ; (~a0 & a1) expressed on i64 lanes; trailing fadd keeps the op in the FP domain so andnpd (not pandn) is emitted.
     84   ;CHECK-LABEL: stack_fold_andnpd
     85   ;CHECK:       andnpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     86   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     87   %2 = bitcast <2 x double> %a0 to <2 x i64>
     88   %3 = bitcast <2 x double> %a1 to <2 x i64>
     89   %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
     90   %5 = and <2 x i64> %4, %3
     91   %6 = bitcast <2 x i64> %5 to <2 x double>
     92   ; fadd forces execution domain
     93   %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
     94   ret <2 x double> %7
     95 }
     96 
     97 define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
          ; (~a0 & a1) on i64 lanes; trailing fadd keeps the op in the float domain so andnps is emitted.
     98   ;CHECK-LABEL: stack_fold_andnps
     99   ;CHECK:       andnps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    100   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    101   %2 = bitcast <4 x float> %a0 to <2 x i64>
    102   %3 = bitcast <4 x float> %a1 to <2 x i64>
    103   %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
    104   %5 = and <2 x i64> %4, %3
    105   %6 = bitcast <2 x i64> %5 to <4 x float>
    106   ; fadd forces execution domain
    107   %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
    108   ret <4 x float> %7
    109 }
    110 
    111 define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
          ; Bitwise AND on i64 lanes; trailing fadd pins the double domain so andpd is emitted.
    112   ;CHECK-LABEL: stack_fold_andpd
    113   ;CHECK:       andpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    114   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    115   %2 = bitcast <2 x double> %a0 to <2 x i64>
    116   %3 = bitcast <2 x double> %a1 to <2 x i64>
    117   %4 = and <2 x i64> %2, %3
    118   %5 = bitcast <2 x i64> %4 to <2 x double>
    119   ; fadd forces execution domain
    120   %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
    121   ret <2 x double> %6
    122 }
    123 
    124 define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
          ; Bitwise AND on i64 lanes; trailing fadd pins the float domain so andps is emitted.
    125   ;CHECK-LABEL: stack_fold_andps
    126   ;CHECK:       andps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    127   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    128   %2 = bitcast <4 x float> %a0 to <2 x i64>
    129   %3 = bitcast <4 x float> %a1 to <2 x i64>
    130   %4 = and <2 x i64> %2, %3
    131   %5 = bitcast <2 x i64> %4 to <4 x float>
    132   ; fadd forces execution domain
    133   %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
    134   ret <4 x float> %6
    135 }
    136 
    137 define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
          ; Constant-mask select <1,0> lowers to blendpd $2 with a folded reload; fadd pins the domain.
    138   ;CHECK-LABEL: stack_fold_blendpd
    139   ;CHECK:       blendpd $2, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    140   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    141   %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
    142   ; fadd forces execution domain
    143   %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
    144   ret <2 x double> %3
    145 }
    146 
    147 define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
          ; Constant-mask select <1,0,0,1> lowers to blendps $6 with a folded reload; fadd pins the domain.
    148   ;CHECK-LABEL: stack_fold_blendps
    149   ;CHECK:       blendps $6, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    150   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    151   %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
    152   ; fadd forces execution domain
    153   %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
    154   ret <4 x float> %3
    155 }
    156 
    157 define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
          ; Clobber starts at xmm3 so %a0 can stay live in xmm0 as the implicit blendvpd mask; expect a folded reload.
    158   ;CHECK-LABEL: stack_fold_blendvpd
    159   ;CHECK:       blendvpd %xmm0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    160   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    161   %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
    162   ret <2 x double> %2
    163 }
    164 declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
    165 
    166 define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
          ; Clobber starts at xmm3 so %a0 can stay live in xmm0 as the implicit blendvps mask; expect a folded reload.
    167   ;CHECK-LABEL: stack_fold_blendvps
    168   ;CHECK:       blendvps %xmm0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    169   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    170   %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
    171   ret <4 x float> %2
    172 }
    173 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
    174 
    175 define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
          ; cmp.pd with predicate imm 0 (EQ) should print as cmpeqpd with a folded reload.
    176   ;CHECK-LABEL: stack_fold_cmppd
    177   ;CHECK:       cmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    178   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    179   %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
    180   ret <2 x double> %2
    181 }
    182 declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
    183 
    184 define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
          ; cmp.ps with predicate imm 0 (EQ) should print as cmpeqps with a folded reload.
    185   ;CHECK-LABEL: stack_fold_cmpps
    186   ;CHECK:       cmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    187   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    188   %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
    189   ret <4 x float> %2
    190 }
    191 declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
    192 
    193 define i32 @stack_fold_cmpsd(double %a0, double %a1) {
          ; fcmp oeq on scalars should select cmpeqsd with an 8-byte folded reload.
    194   ;CHECK-LABEL: stack_fold_cmpsd
    195   ;CHECK:       cmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    196   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    197   %2 = fcmp oeq double %a0, %a1
    198   %3 = zext i1 %2 to i32
    199   ret i32 %3
    200 }
    201 
    202 define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
          ; cmp.sd intrinsic with predicate 0 (EQ); expect cmpeqsd with a folded 16-byte reload.
    203   ;CHECK-LABEL: stack_fold_cmpsd_int
    204   ;CHECK:       cmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    205   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    206   %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
    207   ret <2 x double> %2
    208 }
    209 declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
    210 
    211 define i32 @stack_fold_cmpss(float %a0, float %a1) {
          ; fcmp oeq on scalar floats should select cmpeqss with a 4-byte folded reload.
    212   ;CHECK-LABEL: stack_fold_cmpss
    213   ;CHECK:       cmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    214   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    215   %2 = fcmp oeq float %a0, %a1
    216   %3 = zext i1 %2 to i32
    217   ret i32 %3
    218 }
    219 
    220 define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
          ; cmp.ss intrinsic with predicate 0 (EQ); expect cmpeqss with a folded 16-byte reload.
    221   ;CHECK-LABEL: stack_fold_cmpss_int
    222   ;CHECK:       cmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    223   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    224   %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
    225   ret <4 x float> %2
    226 }
    227 declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
    228 
    229 ; TODO stack_fold_comisd
    230 
    231 define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
          ; comieq.sd intrinsic; expect comisd with a folded 16-byte reload.
    232   ;CHECK-LABEL: stack_fold_comisd_int
    233   ;CHECK:       comisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    234   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    235   %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
    236   ret i32 %2
    237 }
    238 declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
    239 
    240 ; TODO stack_fold_comiss
    241 
    242 define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
          ; comieq.ss intrinsic; expect comiss with a folded 16-byte reload.
    243   ;CHECK-LABEL: stack_fold_comiss_int
    244   ;CHECK:       comiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    245   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    246   %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
    247   ret i32 %2
    248 }
    249 declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
    250 
    251 define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
          ; Clobber includes xmm1 so the single arg spills; low-half sitofp should fold as cvtdq2pd from memory.
    252   ;CHECK-LABEL: stack_fold_cvtdq2pd
    253   ;CHECK:       cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    254   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    255   %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
    256   %3 = sitofp <2 x i32> %2 to <2 x double>
    257   ret <2 x double> %3
    258 }
    259 
    260 define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
          ; Same as above but the shuffle uses %a0 in both operands; still expects a folded cvtdq2pd.
    261   ;CHECK-LABEL: stack_fold_cvtdq2pd_int
    262   ;CHECK:       cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    263   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    264   %2 = shufflevector <4 x i32> %a0, <4 x i32> %a0, <2 x i32> <i32 0, i32 1>
    265   %cvt = sitofp <2 x i32> %2 to <2 x double>
    266   ret <2 x double> %cvt
    267 }
    268 
    269 define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
          ; xmm1-xmm15 clobber forces %a0 to spill; expect cvtdq2ps with a folded reload.
    270   ;CHECK-LABEL: stack_fold_cvtdq2ps
    271   ;CHECK:       cvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    272   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    273   %2 = sitofp <4 x i32> %a0 to <4 x float>
    274   ret <4 x float> %2
    275 }
    276 
    277 define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
          ; cvtpd2dq intrinsic; expect the reload folded into the conversion.
    278   ;CHECK-LABEL: stack_fold_cvtpd2dq
    279   ;CHECK:       cvtpd2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    280   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    281   %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
    282   ret <4 x i32> %2
    283 }
    284 declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
    285 
    286 define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
          ; fptrunc of the spilled vector should fold as cvtpd2ps from (%rsp).
    287   ;CHECK-LABEL: stack_fold_cvtpd2ps
    288   ;CHECK:       cvtpd2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    289   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    290   %2 = fptrunc <2 x double> %a0 to <2 x float>
    291   ret <2 x float> %2
    292 }
    293 
    294 define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
          ; cvtps2dq intrinsic; expect the reload folded into the conversion.
    295   ;CHECK-LABEL: stack_fold_cvtps2dq
    296   ;CHECK:       cvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    297   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    298   %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
    299   ret <4 x i32> %2
    300 }
    301 declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
    302 
    303 define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
          ; Low-half fpext should fold as cvtps2pd from the spill slot.
    304   ;CHECK-LABEL: stack_fold_cvtps2pd
    305   ;CHECK:       cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    306   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    307   %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
    308   %3 = fpext <2 x float> %2 to <2 x double>
    309   ret <2 x double> %3
    310 }
    311 
    312 define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
          ; Same as above with %a0 in both shuffle operands; still expects a folded cvtps2pd.
    313   ;CHECK-LABEL: stack_fold_cvtps2pd_int
    314   ;CHECK:       cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    315   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    316   %2 = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
    317   %cvtps2pd = fpext <2 x float> %2 to <2 x double>
    318   ret <2 x double> %cvtps2pd
    319 }
    320 
    321 ; TODO stack_fold_cvtsd2si
    322 
    323 define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
          ; Vector-to-GPR conversion; expect cvtsd2si reading directly from the spill slot into %eax.
    324   ;CHECK-LABEL: stack_fold_cvtsd2si_int
    325   ;CHECK:       cvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    326   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    327   %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
    328   ret i32 %2
    329 }
    330 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
    331 
    332 ; TODO stack_fold_cvtsd2si64
    333 
    334 define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
          ; 64-bit variant; expect cvtsd2si reading from the spill slot into %rax.
    335   ;CHECK-LABEL: stack_fold_cvtsd2si64_int
    336   ;CHECK:       cvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    337   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    338   %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
    339   ret i64 %2
    340 }
    341 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
    342 
    343 define float @stack_fold_cvtsd2ss(double %a0) minsize {
          ; minsize so the memory-operand form is chosen; expect cvtsd2ss with an 8-byte folded reload.
    344   ;CHECK-LABEL: stack_fold_cvtsd2ss
    345   ;CHECK:       cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    346   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    347   %2 = fptrunc double %a0 to float
    348   ret float %2
    349 }
    350 
    351 define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) optsize {
          ; Intrinsic form under optsize; expect cvtsd2ss with a folded 16-byte reload.
    352   ;CHECK-LABEL: stack_fold_cvtsd2ss_int
    353   ;CHECK:       cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    354   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    355   %2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
    356   ret <4 x float> %2
    357 }
    358 declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
    359 
    360 define double @stack_fold_cvtsi2sd(i32 %a0) minsize {
          ; GPR clobbers force the i32 arg to the stack; expect cvtsi2sdl converting straight from memory.
    361   ;CHECK-LABEL: stack_fold_cvtsi2sd
    362   ;CHECK:       cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    363   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    364   %2 = sitofp i32 %a0 to double
    365   ret double %2
    366 }
    367 
    368 define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0, <2 x double> %b0) {
          ; Convert-then-insert into lane 0; expect cvtsi2sdl with a 4-byte folded reload.
    369   ;CHECK-LABEL: stack_fold_cvtsi2sd_int
    370   ;CHECK:       cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    371   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    372   %2 = sitofp i32 %a0 to double
    373   %3 = insertelement <2 x double> %b0, double %2, i64 0
    374   ret <2 x double> %3
    375 }
    376 
    377 define double @stack_fold_cvtsi642sd(i64 %a0) optsize {
          ; i64 source; expect cvtsi2sdq with an 8-byte folded reload after GPR clobbers.
    378   ;CHECK-LABEL: stack_fold_cvtsi642sd
    379   ;CHECK:       cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    380   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    381   %2 = sitofp i64 %a0 to double
    382   ret double %2
    383 }
    384 
    385 define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0, <2 x double> %b0) {
          ; i64 convert-then-insert into lane 0; expect cvtsi2sdq with an 8-byte folded reload.
    386   ;CHECK-LABEL: stack_fold_cvtsi642sd_int
    387   ;CHECK:       cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    388   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    389   %2 = sitofp i64 %a0 to double
    390   %3 = insertelement <2 x double> %b0, double %2, i64 0
    391   ret <2 x double> %3
    392 }
    393 
    394 define float @stack_fold_cvtsi2ss(i32 %a0) minsize {
          ; GPR clobbers force the i32 arg to the stack; expect cvtsi2ssl converting from memory.
    395   ;CHECK-LABEL: stack_fold_cvtsi2ss
    396   ;CHECK:       cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    397   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    398   %2 = sitofp i32 %a0 to float
    399   ret float %2
    400 }
    401 
    402 define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0, <4 x float> %b0) {
          ; Convert-then-insert into lane 0; expect cvtsi2ssl with a 4-byte folded reload.
          ; (CHECK indentation normalized to match the rest of the file.)
    403   ;CHECK-LABEL: stack_fold_cvtsi2ss_int
    404   ;CHECK:       cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    405   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    406   %2 = sitofp i32 %a0 to float
    407   %3 = insertelement <4 x float> %b0, float %2, i64 0
    408   ret <4 x float> %3
    409 }
    410 
    411 define float @stack_fold_cvtsi642ss(i64 %a0) optsize {
          ; i64 source; expect cvtsi2ssq with an 8-byte folded reload after GPR clobbers.
    412   ;CHECK-LABEL: stack_fold_cvtsi642ss
    413   ;CHECK:       cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    414   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    415   %2 = sitofp i64 %a0 to float
    416   ret float %2
    417 }
    418 
    419 define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0, <4 x float> %b0) {
          ; i64 convert-then-insert into lane 0; expect cvtsi2ssq with an 8-byte folded reload.
          ; (CHECK indentation normalized to match the rest of the file.)
    420   ;CHECK-LABEL: stack_fold_cvtsi642ss_int
    421   ;CHECK:       cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    422   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    423   %2 = sitofp i64 %a0 to float
    424   %3 = insertelement <4 x float> %b0, float %2, i64 0
    425   ret <4 x float> %3
    426 }
    427 
    428 define double @stack_fold_cvtss2sd(float %a0) minsize {
          ; fpext under minsize; expect cvtss2sd with a 4-byte folded reload.
    429   ;CHECK-LABEL: stack_fold_cvtss2sd
    430   ;CHECK:       cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    431   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    432   %2 = fpext float %a0 to double
    433   ret double %2
    434 }
    435 
    436 define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) optsize {
          ; Lane-0 extract/fpext/insert-into-zero under optsize; expect cvtss2sd with a folded 16-byte reload.
    437   ;CHECK-LABEL: stack_fold_cvtss2sd_int
    438   ;CHECK:       cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    439   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    440   %2 = extractelement <4 x float> %a0, i64 0
    441   %3 = fpext float %2 to double
    442   %4 = insertelement <2 x double> zeroinitializer, double %3, i64 0
    443   ret <2 x double> %4
    444 }
    445 
    446 ; TODO stack_fold_cvtss2si
    447 
    448 define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
          ; Vector-to-GPR conversion; expect cvtss2si reading from the spill slot into %eax.
    449   ;CHECK-LABEL: stack_fold_cvtss2si_int
    450   ;CHECK:       cvtss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    451   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    452   %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
    453   ret i32 %2
    454 }
    455 declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
    456 
    457 ; TODO stack_fold_cvtss2si64
    458 
    459 define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
          ; 64-bit variant; expect cvtss2si reading from the spill slot into %rax.
    460   ;CHECK-LABEL: stack_fold_cvtss2si64_int
    461   ;CHECK:       cvtss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    462   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    463   %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
    464   ret i64 %2
    465 }
    466 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
    467 
    468 define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
          ; Truncating conversion intrinsic; expect cvttpd2dq with a folded 16-byte reload.
    469   ;CHECK-LABEL: stack_fold_cvttpd2dq
    470   ;CHECK:       cvttpd2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    471   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    472   %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
    473   ret <4 x i32> %2
    474 }
    475 declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
    476 
    477 define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
          ; fptosi on the full vector; expect cvttps2dq with a folded 16-byte reload.
    478   ;CHECK-LABEL: stack_fold_cvttps2dq
    479   ;CHECK:       cvttps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    480   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    481   %2 = fptosi <4 x float> %a0 to <4 x i32>
    482   ret <4 x i32> %2
    483 }
    484 
    485 define i32 @stack_fold_cvttsd2si(double %a0) {
          ; Scalar fptosi; expect cvttsd2si reading the 8-byte spill slot into %eax.
    486   ;CHECK-LABEL: stack_fold_cvttsd2si
    487   ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 8-byte Folded Reload
    488   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    489   %2 = fptosi double %a0 to i32
    490   ret i32 %2
    491 }
    492 
    493 define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
          ; Intrinsic form; expect cvttsd2si reading the 16-byte spill slot into %eax.
    494   ;CHECK-LABEL: stack_fold_cvttsd2si_int
    495   ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    496   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    497   %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
    498   ret i32 %2
    499 }
    500 declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
    501 
    502 define i64 @stack_fold_cvttsd2si64(double %a0) {
          ; Scalar fptosi to i64; expect cvttsd2si reading the 8-byte spill slot into %rax.
    503   ;CHECK-LABEL: stack_fold_cvttsd2si64
    504   ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
    505   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    506   %2 = fptosi double %a0 to i64
    507   ret i64 %2
    508 }
    509 
    510 define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
    511   ;CHECK-LABEL: stack_fold_cvttsd2si64_int
    512   ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    513   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    514   %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
    515   ret i64 %2
    516 }
    517 declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
    518 
    519 define i32 @stack_fold_cvttss2si(float %a0) {
    520   ;CHECK-LABEL: stack_fold_cvttss2si
    521   ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
    522   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    523   %2 = fptosi float %a0 to i32
    524   ret i32 %2
    525 }
    526 
    527 define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
    528   ;CHECK-LABEL: stack_fold_cvttss2si_int
    529   ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    530   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    531   %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
    532   ret i32 %2
    533 }
    534 declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
    535 
    536 define i64 @stack_fold_cvttss2si64(float %a0) {
    537   ;CHECK-LABEL: stack_fold_cvttss2si64
    538   ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
    539   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    540   %2 = fptosi float %a0 to i64
    541   ret i64 %2
    542 }
    543 
    544 define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
    545   ;CHECK-LABEL: stack_fold_cvttss2si64_int
    546   ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    547   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    548   %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
    549   ret i64 %2
    550 }
    551 declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
    552 
; Division folding tests (divpd/divps/divsd/divss). These are two-operand
; tests, so the clobber list starts at xmm2 and leaves xmm0/xmm1 (the two
; arguments) live; the second operand must spill and be folded as the
; instruction's memory source. The *_int variants build the scalar op via
; extract/fdiv/insert so isel selects the register form of divsd/divss
; with a 16-byte vector reload.
; NOTE(review): the llvm.x86.sse2.div.sd / llvm.x86.sse.div.ss declares
; below are not called by any visible test — presumably leftovers.
    553 define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
    554   ;CHECK-LABEL: stack_fold_divpd
    555   ;CHECK:       divpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    556   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    557   %2 = fdiv <2 x double> %a0, %a1
    558   ret <2 x double> %2
    559 }
    560 
    561 define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
    562   ;CHECK-LABEL: stack_fold_divps
    563   ;CHECK:       divps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    564   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    565   %2 = fdiv <4 x float> %a0, %a1
    566   ret <4 x float> %2
    567 }
    568 
    569 define double @stack_fold_divsd(double %a0, double %a1) {
    570   ;CHECK-LABEL: stack_fold_divsd
    571   ;CHECK:       divsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    572   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    573   %2 = fdiv double %a0, %a1
    574   ret double %2
    575 }
    576 
    577 define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
    578   ;CHECK-LABEL: stack_fold_divsd_int
    579   ;CHECK:       divsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    580   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    581   %2 = extractelement <2 x double> %a0, i32 0
    582   %3 = extractelement <2 x double> %a1, i32 0
    583   %4 = fdiv double %2, %3
    584   %5 = insertelement <2 x double> %a0, double %4, i32 0
    585   ret <2 x double> %5
    586 }
    587 declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
    588 
    589 define float @stack_fold_divss(float %a0, float %a1) {
    590   ;CHECK-LABEL: stack_fold_divss
    591   ;CHECK:       divss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    592   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    593   %2 = fdiv float %a0, %a1
    594   ret float %2
    595 }
    596 
    597 define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
    598   ;CHECK-LABEL: stack_fold_divss_int
    599   ;CHECK:       divss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    600   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    601   %2 = extractelement <4 x float> %a0, i32 0
    602   %3 = extractelement <4 x float> %a1, i32 0
    603   %4 = fdiv float %2, %3
    604   %5 = insertelement <4 x float> %a0, float %4, i32 0
    605   ret <4 x float> %5
    606 }
    607 declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
    608 
; SSE4.1 dot-product folding tests. The immediate mask (i8 7) must survive
; folding: CHECK expects "dppd $7" / "dpps $7" with a folded 16-byte reload.
    609 define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
    610   ;CHECK-LABEL: stack_fold_dppd
    611   ;CHECK:       dppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    612   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    613   %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
    614   ret <2 x double> %2
    615 }
    616 declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
    617 
    618 define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
    619   ;CHECK-LABEL: stack_fold_dpps
    620   ;CHECK:       dpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    621   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    622   %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
    623   ret <4 x float> %2
    624 }
    625 declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
    626 
; Spill-side test: here the inline asm clobbers all GPRs (not xmm regs), so
; the extracted i32 lane must be stored across the asm with extractps to a
; stack slot (Folded Spill) and read back with a plain movl (Reload).
    627 define i32 @stack_fold_extractps(<4 x float> %a0) {
    628   ;CHECK-LABEL: stack_fold_extractps
    629   ;CHECK:       extractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
    630   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
    631   ; fadd forces execution domain
    632   %1 = fadd <4 x float> %a0, <float 1.0, float 2.0, float 3.0, float 4.0>
    633   %2 = extractelement <4 x float> %1, i32 1
    634   %3 = bitcast float %2 to i32
    635   %4 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    636   ret i32 %3
    637 }
    638 
; SSE3 horizontal add/sub folding tests via the hadd/hsub intrinsics.
    639 define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
    640   ;CHECK-LABEL: stack_fold_haddpd
    641   ;CHECK:       haddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    642   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    643   %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
    644   ret <2 x double> %2
    645 }
    646 declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
    647 
    648 define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
    649   ;CHECK-LABEL: stack_fold_haddps
    650   ;CHECK:       haddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    651   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    652   %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
    653   ret <4 x float> %2
    654 }
    655 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
    656 
    657 define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
    658   ;CHECK-LABEL: stack_fold_hsubpd
    659   ;CHECK:       hsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    660   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    661   %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
    662   ret <2 x double> %2
    663 }
    664 declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
    665 
    666 define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
    667   ;CHECK-LABEL: stack_fold_hsubps
    668   ;CHECK:       hsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    669   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    670   %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
    671   ret <4 x float> %2
    672 }
    673 declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
    674 
; insertps folding test. The immediate 209 (0xD1) becomes $17 (0x11) when the
; source moves to memory, since the source-lane selector bits are rewritten
; for a memory operand; CHECK-NEXT pins the expected lane comment.
    675 define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
    676   ;CHECK-LABEL: stack_fold_insertps
    677   ;CHECK:       insertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    678   ;CHECK-NEXT:                                                        {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
    679   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    680   %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
    681   ret <4 x float> %2
    682 }
    683 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
    684 
; max* folding tests. Plain variants use the SSE max intrinsics or an
; fcmp ogt + select pattern that isel matches to maxsd/maxss.
; NOTE(review): attribute groups #0 and #1 are defined outside this chunk;
; the "_commutable" naming suggests #1 enables fast-math so the operands may
; commute when folding — confirm against the attribute definitions.
    685 define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
    686   ;CHECK-LABEL: stack_fold_maxpd
    687   ;CHECK:       maxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    688   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    689   %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
    690   ret <2 x double> %2
    691 }
    692 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
    693 
    694 define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
    695   ;CHECK-LABEL: stack_fold_maxpd_commutable
    696   ;CHECK:       maxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    697   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    698   %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
    699   ret <2 x double> %2
    700 }
    701 
    702 define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
    703   ;CHECK-LABEL: stack_fold_maxps
    704   ;CHECK:       maxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    705   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    706   %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
    707   ret <4 x float> %2
    708 }
    709 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
    710 
    711 define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
    712   ;CHECK-LABEL: stack_fold_maxps_commutable
    713   ;CHECK:       maxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    714   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    715   %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
    716   ret <4 x float> %2
    717 }
    718 
    719 define double @stack_fold_maxsd(double %a0, double %a1) #0 {
    720   ;CHECK-LABEL: stack_fold_maxsd
    721   ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    722   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    723   %2 = fcmp ogt double %a0, %a1
    724   %3 = select i1 %2, double %a0, double %a1
    725   ret double %3
    726 }
    727 
    728 define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
    729   ;CHECK-LABEL: stack_fold_maxsd_commutable
    730   ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    731   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    732   %2 = fcmp ogt double %a0, %a1
    733   %3 = select i1 %2, double %a0, double %a1
    734   ret double %3
    735 }
    736 
    737 define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
    738   ;CHECK-LABEL: stack_fold_maxsd_int
    739   ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    740   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    741   %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
    742   ret <2 x double> %2
    743 }
    744 declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
    745 
    746 define float @stack_fold_maxss(float %a0, float %a1) #0 {
    747   ;CHECK-LABEL: stack_fold_maxss
    748   ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    749   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    750   %2 = fcmp ogt float %a0, %a1
    751   %3 = select i1 %2, float %a0, float %a1
    752   ret float %3
    753 }
    754 
    755 define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
    756   ;CHECK-LABEL: stack_fold_maxss_commutable
    757   ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    758   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    759   %2 = fcmp ogt float %a0, %a1
    760   %3 = select i1 %2, float %a0, float %a1
    761   ret float %3
    762 }
    763 
    764 define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
    765   ;CHECK-LABEL: stack_fold_maxss_int
    766   ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    767   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    768   %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
    769   ret <4 x float> %2
    770 }
    771 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
    772 
; min* folding tests — mirror of the max* tests above but with fcmp olt and
; the SSE min intrinsics. Same #0 / #1 attribute split for the commutable
; variants (attribute groups defined outside this chunk).
    773 define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
    774   ;CHECK-LABEL: stack_fold_minpd
    775   ;CHECK:       minpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    776   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    777   %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
    778   ret <2 x double> %2
    779 }
    780 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
    781 
    782 define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
    783   ;CHECK-LABEL: stack_fold_minpd_commutable
    784   ;CHECK:       minpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    785   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    786   %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
    787   ret <2 x double> %2
    788 }
    789 
    790 define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
    791   ;CHECK-LABEL: stack_fold_minps
    792   ;CHECK:       minps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    793   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    794   %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
    795   ret <4 x float> %2
    796 }
    797 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
    798 
    799 define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
    800   ;CHECK-LABEL: stack_fold_minps_commutable
    801   ;CHECK:       minps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    802   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    803   %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
    804   ret <4 x float> %2
    805 }
    806 
    807 define double @stack_fold_minsd(double %a0, double %a1) #0 {
    808   ;CHECK-LABEL: stack_fold_minsd
    809   ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    810   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    811   %2 = fcmp olt double %a0, %a1
    812   %3 = select i1 %2, double %a0, double %a1
    813   ret double %3
    814 }
    815 
    816 define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
    817   ;CHECK-LABEL: stack_fold_minsd_commutable
    818   ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    819   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    820   %2 = fcmp olt double %a0, %a1
    821   %3 = select i1 %2, double %a0, double %a1
    822   ret double %3
    823 }
    824 
    825 define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
    826   ;CHECK-LABEL: stack_fold_minsd_int
    827   ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    828   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    829   %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
    830   ret <2 x double> %2
    831 }
    832 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
    833 
    834 define float @stack_fold_minss(float %a0, float %a1) #0 {
    835   ;CHECK-LABEL: stack_fold_minss
    836   ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    837   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    838   %2 = fcmp olt float %a0, %a1
    839   %3 = select i1 %2, float %a0, float %a1
    840   ret float %3
    841 }
    842 
    843 define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
    844   ;CHECK-LABEL: stack_fold_minss_commutable
    845   ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    846   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    847   %2 = fcmp olt float %a0, %a1
    848   %3 = select i1 %2, float %a0, float %a1
    849   ret float %3
    850 }
    851 
    852 define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
    853   ;CHECK-LABEL: stack_fold_minss_int
    854   ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    855   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    856   %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
    857   ret <4 x float> %2
    858 }
    859 declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
    860 
; Duplication-shuffle folding tests: shufflevector patterns that isel matches
; to movddup / movshdup / movsldup, loading directly from the spill slot.
; Single-operand tests, so the clobber list starts at xmm1.
    861 define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
    862   ;CHECK-LABEL: stack_fold_movddup
    863   ;CHECK:   movddup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    864   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    865   %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
    866   ret <2 x double> %2
    867 }
    868 ; TODO stack_fold_movhpd (load / store)
    869 ; TODO stack_fold_movhps (load / store)
    870 
    871 ; TODO stack_fold_movlpd (load / store)
    872 ; TODO stack_fold_movlps (load / store)
    873 
    874 define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
    875   ;CHECK-LABEL: stack_fold_movshdup
    876   ;CHECK:       movshdup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    877   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    878   %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
    879   ret <4 x float> %2
    880 }
    881 
    882 define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
    883   ;CHECK-LABEL: stack_fold_movsldup
    884   ;CHECK:       movsldup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    885   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    886   %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    887   ret <4 x float> %2
    888 }
    889 
; Multiplication folding tests (mulpd/mulps/mulsd/mulss). Same structure as
; the fdiv tests: plain fmul for the packed/scalar forms, extract/fmul/insert
; for the *_int variants (which fold a 16-byte vector reload).
    890 define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
    891   ;CHECK-LABEL: stack_fold_mulpd
    892   ;CHECK:       mulpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    893   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    894   %2 = fmul <2 x double> %a0, %a1
    895   ret <2 x double> %2
    896 }
    897 
    898 define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
    899   ;CHECK-LABEL: stack_fold_mulps
    900   ;CHECK:       mulps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    901   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    902   %2 = fmul <4 x float> %a0, %a1
    903   ret <4 x float> %2
    904 }
    905 
    906 define double @stack_fold_mulsd(double %a0, double %a1) {
    907   ;CHECK-LABEL: stack_fold_mulsd
    908   ;CHECK:       mulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    909   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    910   %2 = fmul double %a0, %a1
    911   ret double %2
    912 }
    913 
    914 define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
    915   ;CHECK-LABEL: stack_fold_mulsd_int
    916   ;CHECK:       mulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    917   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    918   %2 = extractelement <2 x double> %a0, i32 0
    919   %3 = extractelement <2 x double> %a1, i32 0
    920   %4 = fmul double %2, %3
    921   %5 = insertelement <2 x double> %a0, double %4, i32 0
    922   ret <2 x double> %5
    923 }
    924 
    925 define float @stack_fold_mulss(float %a0, float %a1) {
    926   ;CHECK-LABEL: stack_fold_mulss
    927   ;CHECK:       mulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    928   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    929   %2 = fmul float %a0, %a1
    930   ret float %2
    931 }
    932 
    933 define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
    934   ;CHECK-LABEL: stack_fold_mulss_int
    935   ;CHECK:       mulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    936   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    937   %2 = extractelement <4 x float> %a0, i32 0
    938   %3 = extractelement <4 x float> %a1, i32 0
    939   %4 = fmul float %2, %3
    940   %5 = insertelement <4 x float> %a0, float %4, i32 0
    941   ret <4 x float> %5
    942 }
    943 
; Bitwise-or folding tests. The or is done on <2 x i64> bitcasts; the
; trailing fadd with zero pins the execution domain to floating point so
; the or is emitted as orpd/orps rather than por.
    944 define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
    945   ;CHECK-LABEL: stack_fold_orpd
    946   ;CHECK:       orpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    947   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    948   %2 = bitcast <2 x double> %a0 to <2 x i64>
    949   %3 = bitcast <2 x double> %a1 to <2 x i64>
    950   %4 = or <2 x i64> %2, %3
    951   %5 = bitcast <2 x i64> %4 to <2 x double>
    952   ; fadd forces execution domain
    953   %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
    954   ret <2 x double> %6
    955 }
    956 
    957 define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
    958   ;CHECK-LABEL: stack_fold_orps
    959   ;CHECK:       orps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    960   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    961   %2 = bitcast <4 x float> %a0 to <2 x i64>
    962   %3 = bitcast <4 x float> %a1 to <2 x i64>
    963   %4 = or <2 x i64> %2, %3
    964   %5 = bitcast <2 x i64> %4 to <4 x float>
    965   ; fadd forces execution domain
    966   %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
    967   ret <4 x float> %6
    968 }
    969 
    970 ; TODO stack_fold_rcpps
    971 
; Reciprocal-approximation folding tests via the rcp intrinsics.
; stack_fold_rcpss_int is marked optsize — presumably folding the scalar
; rcpss load is only done when optimizing for size; confirm against the
; folding-table logic in X86InstrInfo.
    972 define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
    973   ;CHECK-LABEL: stack_fold_rcpps_int
    974   ;CHECK:       rcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    975   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    976   %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
    977   ret <4 x float> %2
    978 }
    979 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
    980 
    981 ; TODO stack_fold_rcpss
    982 
    983 define <4 x float> @stack_fold_rcpss_int(<4 x float> %a0, <4 x float> %a1) optsize {
    984   ;CHECK-LABEL: stack_fold_rcpss_int
    985   ;CHECK:       rcpss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    986   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    987   %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a1)
    988   %3 = extractelement <4 x float> %2, i32 0
    989   %4 = insertelement <4 x float> %a0, float %3, i32 0
    990   ret <4 x float> %4
    991 }
    992 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>)
    993 
; Clobber xmm1-xmm15 to spill %a0; the reload must fold into roundpd with the
; immediate rounding-control operand ($7) preserved.
define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_roundpd
  ;CHECK:       roundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
   1002 
; Clobber xmm1-xmm15 to spill %a0; the reload must fold into roundps with the
; immediate rounding-control operand ($7) preserved.
define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_roundps
  ;CHECK:       roundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
   1011 
; llvm.floor.f64 should lower to roundsd $9 (round-toward-negative-infinity)
; with the 8-byte spill slot of %a0 folded as the memory operand.
define double @stack_fold_roundsd(double %a0) optsize {
  ;CHECK-LABEL: stack_fold_roundsd
  ;CHECK:       roundsd $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call double @llvm.floor.f64(double %a0)
  ret double %2
}
declare double @llvm.floor.f64(double) nounwind readnone
   1020 
; Clobber xmm1-xmm15 to spill the intrinsic's second operand; the reload must
; fold into roundsd $7.
define <2 x double> @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) optsize {
  ;CHECK-LABEL: stack_fold_roundsd_int
  ;CHECK:       roundsd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
   1029 
; llvm.floor.f32 should lower to roundss $9 (round-toward-negative-infinity)
; with the 4-byte spill slot of %a0 folded as the memory operand.
define float @stack_fold_roundss(float %a0) minsize {
  ;CHECK-LABEL: stack_fold_roundss
  ;CHECK:       roundss $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call float @llvm.floor.f32(float %a0)
  ret float %2
}
declare float @llvm.floor.f32(float) nounwind readnone
   1038 
; Clobber xmm1-xmm15 to spill the intrinsic's second operand; the reload must
; fold into roundss $7.
define <4 x float> @stack_fold_roundss_int(<4 x float> %a0, <4 x float> %a1) optsize {
  ;CHECK-LABEL: stack_fold_roundss_int
  ;CHECK:       roundss $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
   1047 
   1048 ; TODO stack_fold_rsqrtps
   1049 
; Clobber xmm1-xmm15 to spill %a0; the reload must fold into rsqrtps.
define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rsqrtps_int
  ;CHECK:       rsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
   1058 
   1059 ; TODO stack_fold_rsqrtss
   1060 
; Clobber xmm1-xmm15 to spill %a1; the reload must fold into rsqrtss.
define <4 x float> @stack_fold_rsqrtss_int(<4 x float> %a0, <4 x float> %a1) optsize {
  ;CHECK-LABEL: stack_fold_rsqrtss_int
  ;CHECK:       rsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a1)
  ; Merge the scalar result into lane 0 of %a0 (rsqrtss-style lane behavior).
  %3 = extractelement <4 x float> %2, i32 0
  %4 = insertelement <4 x float> %a0, float %3, i32 0
  ret <4 x float> %4
}
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>)
   1071 
; Clobber xmm2-xmm15 (xmm0/xmm1 hold the args) to force a spill; the reload
; must fold into shufpd $1. Mask <1,2> selects a0[1], a1[0].
define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_shufpd
  ;CHECK:       shufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}
   1081 
; Clobber xmm2-xmm15 to force a spill; the reload must fold into shufps $200
; (mask 0b11001000 encodes the <0,2,4,7> lane selection).
define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_shufps
  ;CHECK:       shufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
  ret <4 x float> %2
}
   1089 
; Clobber xmm1-xmm15 to spill %a0; the reload must fold into sqrtpd.
define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtpd
  ;CHECK:       sqrtpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
   1098 
; Clobber xmm1-xmm15 to spill %a0; the reload must fold into sqrtps.
define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtps
  ;CHECK:       sqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
   1107 
; llvm.sqrt.f64 should lower to sqrtsd with the 8-byte spill slot of %a0
; folded as the memory operand.
define double @stack_fold_sqrtsd(double %a0) optsize {
  ;CHECK-LABEL: stack_fold_sqrtsd
  ;CHECK:       sqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call double @llvm.sqrt.f64(double %a0)
  ret double %2
}
declare double @llvm.sqrt.f64(double) nounwind readnone
   1116 
; Clobber xmm1-xmm15 to spill %a1; the reload must fold into sqrtsd.
define <2 x double> @stack_fold_sqrtsd_int(<2 x double> %a0, <2 x double> %a1) optsize {
  ;CHECK-LABEL: stack_fold_sqrtsd_int
  ;CHECK:       sqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1)
  ; Merge the scalar result into lane 0 of %a0 (sqrtsd-style lane behavior).
  %3 = extractelement <2 x double> %2, i32 0
  %4 = insertelement <2 x double> %a0, double %3, i32 0
  ret <2 x double> %4
}
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
   1127 
; llvm.sqrt.f32 should lower to sqrtss with the 4-byte spill slot of %a0
; folded as the memory operand.
define float @stack_fold_sqrtss(float %a0) minsize {
  ;CHECK-LABEL: stack_fold_sqrtss
  ;CHECK:       sqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call float @llvm.sqrt.f32(float %a0)
  ret float %2
}
declare float @llvm.sqrt.f32(float) nounwind readnone
   1136 
; Scalar sqrt expressed via extract/insert rather than an intrinsic; must
; still select sqrtss with the spilled vector folded as the memory operand.
define <4 x float> @stack_fold_sqrtss_int(<4 x float> %a0, <4 x float> %a1) optsize {
  ;CHECK-LABEL: stack_fold_sqrtss_int
  ;CHECK:       sqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ; sqrt of %a1's lane 0, re-inserted into %a1, then merged into %a0's lane 0.
  %2 = extractelement <4 x float> %a1, i64 0
  %3 = call float @llvm.sqrt.f32(float %2)
  %4 = insertelement <4 x float> %a1, float %3, i64 0
  %5 = extractelement <4 x float> %4, i32 0
  %6 = insertelement <4 x float> %a0, float %5, i32 0
  ret <4 x float> %6
}
   1148 
; Clobber xmm2-xmm15 to force a spill; the reload must fold into subpd.
define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_subpd
  ;CHECK:       subpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <2 x double> %a0, %a1
  ret <2 x double> %2
}
   1156 
; Clobber xmm2-xmm15 to force a spill; the reload must fold into subps.
define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_subps
  ;CHECK:       subps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <4 x float> %a0, %a1
  ret <4 x float> %2
}
   1164 
; Clobber xmm2-xmm15 to force a spill; the 8-byte reload must fold into subsd.
define double @stack_fold_subsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_subsd
  ;CHECK:       subsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub double %a0, %a1
  ret double %2
}
   1172 
; Scalar fsub via extract/insert on vector args; must still select subsd with
; the spilled vector folded as the memory operand.
define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_subsd_int
  ;CHECK:       subsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fsub double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}
   1183 
; Clobber xmm2-xmm15 to force a spill; the 4-byte reload must fold into subss.
define float @stack_fold_subss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_subss
  ;CHECK:       subss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub float %a0, %a1
  ret float %2
}
   1191 
; Scalar fsub via extract/insert on vector args; must still select subss with
; the spilled vector folded as the memory operand.
define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_subss_int
  ;CHECK:       subss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fsub float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}
   1202 
; fcmp ueq + select lowers to a ucomisd compare; the 8-byte reload of the
; spilled operand must fold into ucomisd.
define i32 @stack_fold_ucomisd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_ucomisd
  ;CHECK:       ucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq double %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}
   1211 
; Intrinsic form of the ucomisd compare; the reload must fold into ucomisd.
define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_ucomisd_int
  ;CHECK:       ucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
   1220 
; fcmp ueq + select lowers to a ucomiss compare; the 4-byte reload of the
; spilled operand must fold into ucomiss.
define i32 @stack_fold_ucomiss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_ucomiss
  ;CHECK:       ucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq float %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}
   1229 
; Intrinsic form of the ucomiss compare; the reload must fold into ucomiss.
define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_ucomiss_int
  ;CHECK:       ucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
   1238 
; Shuffle mask <1,3> selects the high element of each source — unpckhpd; the
; reload of the spilled operand must fold into it.
define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhpd
  ;CHECK:       unpckhpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}
   1248 
; Shuffle mask <2,6,3,7> interleaves the high halves — unpckhps; the reload of
; the spilled operand must fold into it.
define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhps
  ;CHECK:       unpckhps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}
   1258 
; Shuffle mask <0,2> selects the low element of each source — unpcklpd; the
; reload of the spilled operand must fold into it.
define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklpd
  ;CHECK:       unpcklpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}
   1268 
; Shuffle mask <0,4,1,5> interleaves the low halves — unpcklps; the reload of
; the spilled operand must fold into it.
define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklps
  ;CHECK:       unpcklps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}
   1278 
; Integer xor on bitcast doubles; the trailing fadd keeps the value in the FP
; domain so the folded reload selects xorpd rather than pxor.
define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_xorpd
  ;CHECK:       xorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}
   1291 
; Integer xor on bitcast floats; the trailing fadd keeps the value in the FP
; domain so the folded reload selects xorps rather than pxor.
define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_xorps
  ;CHECK:       xorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}
   1304 
; Attribute groups toggling unsafe-fp-math. NOTE(review): no function in this
; chunk references #0/#1 — presumably used elsewhere in the file; confirm
; before removing.
attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" }
   1307