; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
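;
; For example, in stack_fold_addpd below the asm clobbers xmm2-xmm15, leaving
; too few registers for both arguments to stay live across the call, so one of
; them is spilled. A plausible codegen sketch (exact offsets and register
; numbers will vary) is:
;
;   vmovapd %xmm1, -24(%rsp)          # 16-byte Spill
;   #APP
;   nop
;   #NO_APP
;   vaddpd  -24(%rsp), %xmm0, %xmm0   # 16-byte Folded Reload
;
; The CHECK lines assert only the folded reload, not the spill itself.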

define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addpd
  ;CHECK:       vaddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x double> @stack_fold_addpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addpd_ymm
  ;CHECK:       vaddpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <4 x double> %a0, %a1
  ret <4 x double> %2
}

define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addps
  ;CHECK:       vaddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <4 x float> %a0, %a1
  ret <4 x float> %2
}

define <8 x float> @stack_fold_addps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addps_ymm
  ;CHECK:       vaddps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <8 x float> %a0, %a1
  ret <8 x float> %2
}

define double @stack_fold_addsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_addsd
  ;CHECK:       vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addsd_int
  ;CHECK:       vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_addss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_addss
  ;CHECK:       vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addss_int
  ;CHECK:       vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone

define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addsubpd
  ;CHECK:       vaddsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_addsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addsubpd_ymm
  ;CHECK:       vaddsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addsubps
  ;CHECK:       vaddsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_addsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addsubps_ymm
  ;CHECK:       vaddsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

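; The bitwise tests below have no dedicated FP logic instruction in IR, so
; they are built from integer ops: xor with all-ones is a NOT, and (~a0 & a1)
; matches the vandnpd/vandnps semantics (NOT of the first source ANDed with
; the second). The trailing fadd with zero keeps the value in the
; floating-point execution domain so the FP form of the logic op is selected.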
define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andnpd
  ;CHECK:       vandnpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
  %5 = and <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <2 x double>
  ; fadd forces execution domain
  %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
  ret <2 x double> %7
}

define <4 x double> @stack_fold_andnpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andnpd_ymm
  ;CHECK:       vandnpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x double> %a0 to <4 x i64>
  %3 = bitcast <4 x double> %a1 to <4 x i64>
  %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
  %5 = and <4 x i64> %4, %3
  %6 = bitcast <4 x i64> %5 to <4 x double>
  ; fadd forces execution domain
  %7 = fadd <4 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %7
}

define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andnps
  ;CHECK:       vandnps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
  %5 = and <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <4 x float>
  ; fadd forces execution domain
  %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %7
}

define <8 x float> @stack_fold_andnps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andnps_ymm
  ;CHECK:       vandnps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
  %5 = and <4 x i64> %4, %3
  %6 = bitcast <4 x i64> %5 to <8 x float>
  ; fadd forces execution domain
  %7 = fadd <8 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %7
}

define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andpd
  ;CHECK:       vandpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andpd_ymm
  ;CHECK:       vandpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x double> %a0 to <4 x i64>
  %3 = bitcast <4 x double> %a1 to <4 x i64>
  %4 = and <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <4 x double>
  ; fadd forces execution domain
  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %6
}

define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andps
  ;CHECK:       vandps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}

define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andps_ymm
  ;CHECK:       vandps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = and <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <8 x float>
  ; fadd forces execution domain
  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %6
}

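; The blend tests use selects with constant masks, which lower to
; vblendpd/vblendps with the mask folded into the immediate (a set bit picks
; the corresponding element from the second source). The <i1 1, i1 0> mask
; becomes $2, <1,0,0,1> becomes $6 (0b0110), and the 8-wide repeat of that
; pattern becomes $102 (0b01100110).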
define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_blendpd
  ;CHECK:       vblendpd $2, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
  ret <2 x double> %2
}

define <4 x double> @stack_fold_blendpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_blendpd_ymm
  ;CHECK:       vblendpd $6, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %a0, <4 x double> %a1
  ret <4 x double> %2
}

define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_blendps
  ;CHECK:       vblendps $6, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
  ret <4 x float> %2
}

define <8 x float> @stack_fold_blendps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_blendps_ymm
  ;CHECK:       vblendps $102, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %a0, <8 x float> %a1
  ret <8 x float> %2
}

define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
  ;CHECK-LABEL: stack_fold_blendvpd
  ;CHECK:       vblendvpd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_blendvpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %c) {
  ;CHECK-LABEL: stack_fold_blendvpd_ymm
  ;CHECK:       vblendvpd {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a1, <4 x double> %c, <4 x double> %a0)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
  ;CHECK-LABEL: stack_fold_blendvps
  ;CHECK:       vblendvps {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_blendvps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %c) {
  ;CHECK-LABEL: stack_fold_blendvps_ymm
  ;CHECK:       vblendvps {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a1, <8 x float> %c, <8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone

define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_cmppd
  ;CHECK:       vcmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x double> @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_cmppd_ymm
  ;CHECK:       vcmpeqpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_cmpps
  ;CHECK:       vcmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x float> @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_cmpps_ymm
  ;CHECK:       vcmpeqps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define i32 @stack_fold_cmpsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_cmpsd
  ;CHECK:       vcmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp oeq double %a0, %a1
  %3 = zext i1 %2 to i32
  ret i32 %3
}

define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_cmpsd_int
  ;CHECK:       vcmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone

define i32 @stack_fold_cmpss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_cmpss
  ;CHECK:       vcmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp oeq float %a0, %a1
  %3 = zext i1 %2 to i32
  ret i32 %3
}

define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_cmpss_int
  ;CHECK:       vcmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

; TODO stack_fold_comisd

define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_comisd_int
  ;CHECK:       vcomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone

; TODO stack_fold_comiss

define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_comiss_int
  ;CHECK:       vcomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

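; The conversion tests take a single vector argument, so the asm clobbers
; xmm1-xmm15 rather than xmm2-xmm15: with only xmm0 left free for the asm
; result, the incoming %a0 cannot stay in a register across the call and has
; to be spilled.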
define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2pd
  ;CHECK:   vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone

define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm
  ;CHECK:   vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone

define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2ps
  ;CHECK:   vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sitofp <4 x i32> %a0 to <4 x float>
  ret <4 x float> %2
}

define <8 x float> @stack_fold_cvtdq2ps_ymm(<8 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2ps_ymm
  ;CHECK:   vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sitofp <8 x i32> %a0 to <8 x float>
  ret <8 x float> %2
}

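; vcvtpd2dq and vcvtpd2ps narrow the vector, so an xmm destination does not
; identify the source width; when the source is folded from memory, AT&T
; syntax needs the explicit x/y suffix (vcvtpd2dqx for a 128-bit load,
; vcvtpd2dqy for a 256-bit one), which is why the CHECK lines spell out the
; suffixed mnemonics.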
define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2dq
  ;CHECK:   vcvtpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone

define <4 x i32> @stack_fold_cvtpd2dq_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2dq_ymm
  ;CHECK:   vcvtpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone

define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2ps
  ;CHECK:   vcvtpd2psx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptrunc <2 x double> %a0 to <2 x float>
  ret <2 x float> %2
}

define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2ps_ymm
  ;CHECK:   vcvtpd2psy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptrunc <4 x double> %a0 to <4 x float>
  ret <4 x float> %2
}

define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) {
  ;CHECK-LABEL: stack_fold_cvtph2ps
  ;CHECK:   vcvtph2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly

define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) {
  ;CHECK-LABEL: stack_fold_cvtph2ps_ymm
  ;CHECK:   vcvtph2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly

define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2dq
  ;CHECK:  vcvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone

define <8 x i32> @stack_fold_cvtps2dq_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2dq_ymm
  ;CHECK:  vcvtps2dq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone

define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2pd
  ;CHECK:   vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone

define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2pd_ymm
  ;CHECK:   vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone

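; The cvtps2ph tests check a folded spill rather than a reload: the
; truncating conversion can store its result straight to the stack slot, and
; the asm call placed after the conversion forces the value to be kept in
; memory until the return.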
define <8 x i16> @stack_fold_cvtps2ph(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2ph
  ;CHECK:   vcvtps2ph $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
  %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ret <8 x i16> %1
}
declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly

define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2ph_ymm
  ;CHECK:   vcvtps2ph $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
  %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ret <8 x i16> %1
}
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly

; TODO stack_fold_cvtsd2si

define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtsd2si_int
  ;CHECK:  cvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone

; TODO stack_fold_cvtsd2si64

define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtsd2si64_int
  ;CHECK:  cvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone

; TODO stack_fold_cvtsd2ss

define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtsd2ss_int
  ;CHECK:  cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone

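; For the GPR-to-FP conversions the roles flip: the asm clobbers the integer
; registers (rax-r15) instead of the xmm registers, forcing the i32/i64
; argument itself to be spilled and then folded into the converting load.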
    556 define double @stack_fold_cvtsi2sd(i32 %a0) {
    557   ;CHECK-LABEL: stack_fold_cvtsi2sd
    558   ;CHECK:  cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    559   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    560   %2 = sitofp i32 %a0 to double
    561   ret double %2
    562 }
    563 
    564 define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
    565   ;CHECK-LABEL: stack_fold_cvtsi2sd_int
    566   ;CHECK:  cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    567   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    568   %2 = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 0x0, double 0x0>, i32 %a0)
    569   ret <2 x double> %2
    570 }
    571 declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
    572 
    573 define double @stack_fold_cvtsi642sd(i64 %a0) {
    574   ;CHECK-LABEL: stack_fold_cvtsi642sd
    575   ;CHECK:  cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    576   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    577   %2 = sitofp i64 %a0 to double
    578   ret double %2
    579 }
    580 
    581 define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) {
    582   ;CHECK-LABEL: stack_fold_cvtsi642sd_int
    583   ;CHECK:  cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    584   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    585   %2 = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> <double 0x0, double 0x0>, i64 %a0)
    586   ret <2 x double> %2
    587 }
    588 declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
    589 
    590 define float @stack_fold_cvtsi2ss(i32 %a0) {
    591   ;CHECK-LABEL: stack_fold_cvtsi2ss
    592   ;CHECK:  cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    593   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    594   %2 = sitofp i32 %a0 to float
    595   ret float %2
    596 }
    597 
    598 define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
    599   ;CHECK-LABEL: stack_fold_cvtsi2ss_int
    600   ;CHECK:  cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    601   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    602   %2 = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i32 %a0)
    603   ret <4 x float> %2
    604 }
    605 declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
    606 
    607 define float @stack_fold_cvtsi642ss(i64 %a0) {
    608   ;CHECK-LABEL: stack_fold_cvtsi642ss
    609   ;CHECK:  cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    610   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    611   %2 = sitofp i64 %a0 to float
    612   ret float %2
    613 }
    614 
    615 define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
    616   ;CHECK-LABEL: stack_fold_cvtsi642ss_int
    617   ;CHECK:  cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    618   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    619   %2 = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i64 %a0)
    620   ret <4 x float> %2
    621 }
    622 declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
    623 
    624 ; TODO stack_fold_cvtss2sd
    625 
    626 define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) {
    627   ;CHECK-LABEL: stack_fold_cvtss2sd_int
    628   ;CHECK:  cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    629   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    630   %2 = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %a0)
    631   ret <2 x double> %2
    632 }
    633 declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
    634 
    635 ; TODO stack_fold_cvtss2si
    636 
    637 define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
    638   ;CHECK-LABEL: stack_fold_cvtss2si_int
    639   ;CHECK:  vcvtss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    640   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    641   %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
    642   ret i32 %2
    643 }
    644 declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
    645 
    646 ; TODO stack_fold_cvtss2si64
    647 
    648 define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
    649   ;CHECK-LABEL: stack_fold_cvtss2si64_int
    650   ;CHECK:  vcvtss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    651   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    652   %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
    653   ret i64 %2
    654 }
    655 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
    656 
    657 define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
    658   ;CHECK-LABEL: stack_fold_cvttpd2dq
    659   ;CHECK:  vcvttpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    660   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    661   %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
    662   ret <4 x i32> %2
    663 }
    664 declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
    665 
    666 define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) {
    667   ;CHECK-LABEL: stack_fold_cvttpd2dq_ymm
    668   ;CHECK:  vcvttpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    669   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    670   %2 = fptosi <4 x double> %a0 to <4 x i32>
    671   ret <4 x i32> %2
    672 }
    673 
    674 define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
    675   ;CHECK-LABEL: stack_fold_cvttps2dq
    676   ;CHECK:  vcvttps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    677   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    678   %2 = fptosi <4 x float> %a0 to <4 x i32>
    679   ret <4 x i32> %2
    680 }
    681 
    682 define <8 x i32> @stack_fold_cvttps2dq_ymm(<8 x float> %a0) {
    683   ;CHECK-LABEL: stack_fold_cvttps2dq_ymm
    684   ;CHECK:  vcvttps2dq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    685   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    686   %2 = fptosi <8 x float> %a0 to <8 x i32>
    687   ret <8 x i32> %2
    688 }
    689 
    690 define i32 @stack_fold_cvttsd2si(double %a0) {
    691   ;CHECK-LABEL: stack_fold_cvttsd2si
    692   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 8-byte Folded Reload
    693   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    694   %2 = fptosi double %a0 to i32
    695   ret i32 %2
    696 }
    697 
    698 define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
    699   ;CHECK-LABEL: stack_fold_cvttsd2si_int
    700   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    701   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    702   %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
    703   ret i32 %2
    704 }
    705 declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
    706 
    707 define i64 @stack_fold_cvttsd2si64(double %a0) {
    708   ;CHECK-LABEL: stack_fold_cvttsd2si64
    709   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
    710   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    711   %2 = fptosi double %a0 to i64
    712   ret i64 %2
    713 }
    714 
    715 define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
    716   ;CHECK-LABEL: stack_fold_cvttsd2si64_int
    717   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    718   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    719   %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
    720   ret i64 %2
    721 }
    722 declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
    723 
    724 define i32 @stack_fold_cvttss2si(float %a0) {
    725   ;CHECK-LABEL: stack_fold_cvttss2si
    726   ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
    727   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    728   %2 = fptosi float %a0 to i32
    729   ret i32 %2
    730 }
    731 
    732 define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
    733   ;CHECK-LABEL: stack_fold_cvttss2si_int
    734   ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    735   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    736   %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
    737   ret i32 %2
    738 }
    739 declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
    740 
    741 define i64 @stack_fold_cvttss2si64(float %a0) {
    742   ;CHECK-LABEL: stack_fold_cvttss2si64
    743   ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
    744   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    745   %2 = fptosi float %a0 to i64
    746   ret i64 %2
    747 }
    748 
    749 define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
    750   ;CHECK-LABEL: stack_fold_cvttss2si64_int
    751   ;CHECK:  cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    752   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    753   %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
    754   ret i64 %2
    755 }
    756 declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
    757 
    758 define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
    759   ;CHECK-LABEL: stack_fold_divpd
    760   ;CHECK:       vdivpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    761   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    762   %2 = fdiv <2 x double> %a0, %a1
    763   ret <2 x double> %2
    764 }
    765 
    766 define <4 x double> @stack_fold_divpd_ymm(<4 x double> %a0, <4 x double> %a1) {
    767   ;CHECK-LABEL: stack_fold_divpd_ymm
    768   ;CHECK:       vdivpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    769   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    770   %2 = fdiv <4 x double> %a0, %a1
    771   ret <4 x double> %2
    772 }
    773 
    774 define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
    775   ;CHECK-LABEL: stack_fold_divps
    776   ;CHECK:       vdivps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    777   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    778   %2 = fdiv <4 x float> %a0, %a1
    779   ret <4 x float> %2
    780 }
    781 
    782 define <8 x float> @stack_fold_divps_ymm(<8 x float> %a0, <8 x float> %a1) {
    783   ;CHECK-LABEL: stack_fold_divps_ymm
    784   ;CHECK:       vdivps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    785   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    786   %2 = fdiv <8 x float> %a0, %a1
    787   ret <8 x float> %2
    788 }
    789 
    790 define double @stack_fold_divsd(double %a0, double %a1) {
    791   ;CHECK-LABEL: stack_fold_divsd
    792   ;CHECK:       vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_divsd_int
  ;CHECK:       vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_divss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_divss
  ;CHECK:       vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_divss_int
  ;CHECK:       vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone

define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_dppd
  ;CHECK:       vdppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_dpps
  ;CHECK:       vdpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_dpps_ymm
  ;CHECK:       vdpps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

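; Unlike the tests above, the two extract tests below fold the *store*
; direction: the extracted value is written straight to the stack slot, so
; their CHECK patterns look for a Folded Spill rather than a Folded Reload.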
define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_extractf128
  ;CHECK:       vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
  %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ret <4 x float> %1
}

define i32 @stack_fold_extractps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_extractps
  ;CHECK:       vextractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
  ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
  %1 = extractelement <4 x float> %a0, i32 1
  %2 = bitcast float %1 to i32
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %2
}

define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_haddpd
  ;CHECK:       vhaddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_haddpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_haddpd_ymm
  ;CHECK:       vhaddpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_haddps
  ;CHECK:       vhaddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_haddps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_haddps_ymm
  ;CHECK:       vhaddps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_hsubpd
  ;CHECK:       vhsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_hsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_hsubpd_ymm
  ;CHECK:       vhsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_hsubps
  ;CHECK:       vhsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_hsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_hsubps_ymm
  ;CHECK:       vhsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <8 x float> @stack_fold_insertf128(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_insertf128
  ;CHECK:       vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %2
}

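; Note: the insertps immediate in the IR below is 209 (0xD1: CountS=3,
; CountD=1, ZMask=0001), but the folded form prints $17 (0x11). Once the
; source operand comes from memory, the CountS bits no longer select a
; register element, so they are cleared when the load is folded.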
define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_insertps
  ;CHECK:       vinsertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  ;CHECK-NEXT:                                                                              {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone

define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_maxpd
  ;CHECK:       vmaxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_maxpd_ymm
  ;CHECK:       vmaxpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_maxps
  ;CHECK:       vmaxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_maxps_ymm
  ;CHECK:       vmaxps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone

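; The scalar max/min tests below use the fcmp+select idiom instead of an
; intrinsic: select(fcmp ogt %a0, %a1), %a0, %a1 matches vmaxsd's operand
; semantics (the second operand is returned when the compare is false or
; unordered), which is what lets the backend lower it to a single vmaxsd.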
define double @stack_fold_maxsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_maxsd
  ;CHECK:       vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_maxsd_int
  ;CHECK:       vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_maxss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_maxss
  ;CHECK:       vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_maxss_int
  ;CHECK:       vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone

define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_minpd
  ;CHECK:       vminpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_minpd_ymm
  ;CHECK:       vminpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_minps
  ;CHECK:       vminps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_minps_ymm
  ;CHECK:       vminps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone

define double @stack_fold_minsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_minsd
  ;CHECK:       vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_minsd_int
  ;CHECK:       vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_minss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_minss
  ;CHECK:       vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_minss_int
  ;CHECK:       vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

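; Single-input tests also clobber xmm1: with xmm1-xmm15 all clobbered, the
; asm result ends up in xmm0 and the lone vector argument is forced out to
; the stack, from where the shuffle must reload it.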
define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_movddup
  ;CHECK:       vmovddup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %2
}

define <4 x double> @stack_fold_movddup_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_movddup_ymm
  ;CHECK:       vmovddup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %2
}

; TODO stack_fold_movhpd (load / store)
; TODO stack_fold_movhps (load / store)

; TODO stack_fold_movlpd (load / store)
; TODO stack_fold_movlps (load / store)
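; A minimal sketch (not enabled) of what the first TODO above could look
; like; the scalar-argument signature, insertelement pattern and CHECK line
; are assumptions modelled on the two-input tests in this file:
;   define <2 x double> @stack_fold_movhpd(<2 x double> %a0, double %a1) {
;     ;CHECK: vmovhpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
;     %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
;     %2 = insertelement <2 x double> %a0, double %a1, i32 1
;     ret <2 x double> %2
;   }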

define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movshdup
  ;CHECK:       vmovshdup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %2
}

define <8 x float> @stack_fold_movshdup_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movshdup_ymm
  ;CHECK:       vmovshdup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %2
}

define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movsldup
  ;CHECK:       vmovsldup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %2
}

define <8 x float> @stack_fold_movsldup_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movsldup_ymm
  ;CHECK:       vmovsldup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %2
}

define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulpd
  ;CHECK:       vmulpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x double> @stack_fold_mulpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulpd_ymm
  ;CHECK:       vmulpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <4 x double> %a0, %a1
  ret <4 x double> %2
}

define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulps
  ;CHECK:       vmulps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <4 x float> %a0, %a1
  ret <4 x float> %2
}

define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulps_ymm
  ;CHECK:       vmulps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <8 x float> %a0, %a1
  ret <8 x float> %2
}

define double @stack_fold_mulsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_mulsd
  ;CHECK:       vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulsd_int
  ;CHECK:       vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_mulss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_mulss
  ;CHECK:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulss_int
  ;CHECK:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone

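; The bitwise tests below have to bitcast to integer vectors for the or, so
; a trailing fadd with +0.0 is used to pin the result to the floating-point
; execution domain; without it the backend would be free to lower the or in
; the integer domain (vpor) and the vorpd/vorps checks could not match.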
define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_orpd
  ;CHECK:       vorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_orpd_ymm
  ;CHECK:       vorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x double> %a0 to <4 x i64>
  %3 = bitcast <4 x double> %a1 to <4 x i64>
  %4 = or <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <4 x double>
  ; fadd forces execution domain
  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %6
}

define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_orps
  ;CHECK:       vorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}

define <8 x float> @stack_fold_orps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_orps_ymm
  ;CHECK:       vorps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = or <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <8 x float>
  ; fadd forces execution domain
  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %6
}

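; Immediate 33 (0x21) selects lane 1 of %a0 (elements 4-7) and lane 0 of the
; reloaded %a1 (elements 8-11), matching the shuffle mask below.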
define <8 x float> @stack_fold_perm2f128(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_perm2f128
  ;CHECK:       vperm2f128 $33, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %2
}

define <2 x double> @stack_fold_permilpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_permilpd
  ;CHECK:       vpermilpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x double> %2
}

define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_permilpd_ymm
  ;CHECK:       vpermilpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %2
}

define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {
  ;CHECK-LABEL: stack_fold_permilpdvar
  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone

define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {
  ;CHECK-LABEL: stack_fold_permilpdvar_ymm
  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone

define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_permilps
  ;CHECK:       vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %2
}

define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_permilps_ymm
  ;CHECK:       vpermilps $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x float> %2
}

define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {
  ;CHECK-LABEL: stack_fold_permilpsvar
  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone

define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {
  ;CHECK-LABEL: stack_fold_permilpsvar_ymm
  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone

; TODO stack_fold_rcpps
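; The non-intrinsic rcp/rsqrt variants remain TODO, presumably because there
; is no generic IR instruction for the approximate reciprocal forms to drive
; them with; only the intrinsic (_int) tests exist for now.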

define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rcpps_int
  ;CHECK:       vrcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

; TODO stack_fold_rcpps_ymm

define <8 x float> @stack_fold_rcpps_ymm_int(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rcpps_ymm_int
  ;CHECK:       vrcpps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone

; TODO stack_fold_rcpss

define <4 x float> @stack_fold_rcpss_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rcpss_int
  ;CHECK:       vrcpss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_roundpd
  ;CHECK:       vroundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <4 x double> @stack_fold_roundpd_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_roundpd_ymm
  ;CHECK:       vroundpd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_roundps
  ;CHECK:       vroundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_roundps_ymm
  ;CHECK:       vroundps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

define double @stack_fold_roundsd(double %a0) optsize {
  ;CHECK-LABEL: stack_fold_roundsd
  ;CHECK:       vroundsd $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call double @llvm.floor.f64(double %a0)
  ret double %2
}
declare double @llvm.floor.f64(double) nounwind readnone

; TODO stack_fold_roundsd_int
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
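; A sketch of the missing roundsd_int test, following the shape of the other
; _int tests and the declare above (an assumption, not enabled):
;   define <2 x double> @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) {
;     ;CHECK: vroundsd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
;     %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
;     %2 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
;     ret <2 x double> %2
;   }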
   1431 
   1432 define float @stack_fold_roundss(float %a0) optsize {
   1433   ;CHECK-LABEL: stack_fold_roundss
   1434   ;CHECK:       vroundss $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
   1435   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1436   %2 = call float @llvm.floor.f32(float %a0)
   1437   ret float %2
   1438 }
   1439 declare float @llvm.floor.f32(float) nounwind readnone
   1440 
   1441 ; TODO stack_fold_roundss_int
   1442 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
   1443 
   1444 ; TODO stack_fold_rsqrtps
   1445 
   1446 define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
   1447   ;CHECK-LABEL: stack_fold_rsqrtps_int
   1448   ;CHECK:       vrsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1449   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1450   %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
   1451   ret <4 x float> %2
   1452 }
   1453 declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
   1454 
   1455 ; TODO stack_fold_rsqrtps_ymm
   1456 
   1457 define <8 x float> @stack_fold_rsqrtps_ymm_int(<8 x float> %a0) {
   1458   ;CHECK-LABEL: stack_fold_rsqrtps_ymm_int
   1459   ;CHECK:       vrsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1460   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1461   %2 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
   1462   ret <8 x float> %2
   1463 }
   1464 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
   1465 
   1466 ; TODO stack_fold_rsqrtss
   1467 
   1468 define <4 x float> @stack_fold_rsqrtss_int(<4 x float> %a0) {
   1469   ;CHECK-LABEL: stack_fold_rsqrtss_int
   1470   ;CHECK:       vrsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1471   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1472   %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
   1473   ret <4 x float> %2
   1474 }
   1475 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
   1476 
   1477 define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
   1478   ;CHECK-LABEL: stack_fold_shufpd
   1479   ;CHECK:       vshufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1480   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1481   %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
   1482   ret <2 x double> %2
   1483 }
   1484 
   1485 define <4 x double> @stack_fold_shufpd_ymm(<4 x double> %a0, <4 x double> %a1) {
   1486   ;CHECK-LABEL: stack_fold_shufpd_ymm
   1487   ;CHECK:       vshufpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1488   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1489   %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
   1490   ret <4 x double> %2
   1491 }
   1492 
   1493 define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
   1494   ;CHECK-LABEL: stack_fold_shufps
   1495   ;CHECK:       vshufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1496   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1497   %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
   1498   ret <4 x float> %2
   1499 }
   1500 
   1501 define <8 x float> @stack_fold_shufps_ymm(<8 x float> %a0, <8 x float> %a1) {
   1502   ;CHECK-LABEL: stack_fold_shufps_ymm
   1503   ;CHECK:       vshufps $148, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1504   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1505   %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 13, i32 14>
   1506   ret <8 x float> %2
   1507 }
   1508 
   1509 define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
   1510   ;CHECK-LABEL: stack_fold_sqrtpd
   1511   ;CHECK:       vsqrtpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1512   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1513   %2 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
   1514   ret <2 x double> %2
   1515 }
   1516 declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
   1517 
   1518 define <4 x double> @stack_fold_sqrtpd_ymm(<4 x double> %a0) {
   1519   ;CHECK-LABEL: stack_fold_sqrtpd_ymm
   1520   ;CHECK:       vsqrtpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1521   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1522   %2 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
   1523   ret <4 x double> %2
   1524 }
   1525 declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
   1526 
   1527 define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
   1528   ;CHECK-LABEL: stack_fold_sqrtps
   1529   ;CHECK:       vsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1530   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1531   %2 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
   1532   ret <4 x float> %2
   1533 }
   1534 declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
   1535 
   1536 define <8 x float> @stack_fold_sqrtps_ymm(<8 x float> %a0) {
   1537   ;CHECK-LABEL: stack_fold_sqrtps_ymm
   1538   ;CHECK:       vsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1539   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1540   %2 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
   1541   ret <8 x float> %2
   1542 }
   1543 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
   1544 
   1545 define double @stack_fold_sqrtsd(double %a0) {
   1546   ;CHECK-LABEL: stack_fold_sqrtsd
   1547   ;CHECK:       vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
   1548   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1549   %2 = call double @llvm.sqrt.f64(double %a0)
   1550   ret double %2
   1551 }
   1552 declare double @llvm.sqrt.f64(double) nounwind readnone
   1553 
   1554 define <2 x double> @stack_fold_sqrtsd_int(<2 x double> %a0) {
   1555   ;CHECK-LABEL: stack_fold_sqrtsd_int
   1556   ;CHECK:       vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1557   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1558   %2 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
   1559   ret <2 x double> %2
   1560 }
   1561 declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
   1562 
   1563 define float @stack_fold_sqrtss(float %a0) {
   1564   ;CHECK-LABEL: stack_fold_sqrtss
   1565   ;CHECK:       vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
   1566   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1567   %2 = call float @llvm.sqrt.f32(float %a0)
   1568   ret float %2
   1569 }
   1570 declare float @llvm.sqrt.f32(float) nounwind readnone
   1571 
   1572 define <4 x float> @stack_fold_sqrtss_int(<4 x float> %a0) {
   1573   ;CHECK-LABEL: stack_fold_sqrtss_int
   1574   ;CHECK:       vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1575   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1576   %2 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
   1577   ret <4 x float> %2
   1578 }
   1579 declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
   1580 
   1581 define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
   1582   ;CHECK-LABEL: stack_fold_subpd
   1583   ;CHECK:       vsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1584   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1585   %2 = fsub <2 x double> %a0, %a1
   1586   ret <2 x double> %2
   1587 }
   1588 
   1589 define <4 x double> @stack_fold_subpd_ymm(<4 x double> %a0, <4 x double> %a1) {
   1590   ;CHECK-LABEL: stack_fold_subpd_ymm
   1591   ;CHECK:       vsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1592   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1593   %2 = fsub <4 x double> %a0, %a1
   1594   ret <4 x double> %2
   1595 }
   1596 
   1597 define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
   1598   ;CHECK-LABEL: stack_fold_subps
   1599   ;CHECK:       vsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1600   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1601   %2 = fsub <4 x float> %a0, %a1
   1602   ret <4 x float> %2
   1603 }
   1604 
   1605 define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) {
   1606   ;CHECK-LABEL: stack_fold_subps_ymm
   1607   ;CHECK:       vsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1608   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1609   %2 = fsub <8 x float> %a0, %a1
   1610   ret <8 x float> %2
   1611 }
   1612 
   1613 define double @stack_fold_subsd(double %a0, double %a1) {
   1614   ;CHECK-LABEL: stack_fold_subsd
   1615   ;CHECK:       vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
   1616   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1617   %2 = fsub double %a0, %a1
   1618   ret double %2
   1619 }
   1620 
   1621 define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
   1622   ;CHECK-LABEL: stack_fold_subsd_int
   1623   ;CHECK:       vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1624   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1625   %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
   1626   ret <2 x double> %2
   1627 }
   1628 declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
   1629 
   1630 define float @stack_fold_subss(float %a0, float %a1) {
   1631   ;CHECK-LABEL: stack_fold_subss
   1632   ;CHECK:       vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
   1633   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1634   %2 = fsub float %a0, %a1
   1635   ret float %2
   1636 }
   1637 
   1638 define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
   1639   ;CHECK-LABEL: stack_fold_subss_int
   1640   ;CHECK:       vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1641   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1642   %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)
   1643   ret <4 x float> %2
   1644 }
   1645 declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
   1646 
define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_testpd
  ;CHECK:       vtestpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @stack_fold_testpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_testpd_ymm
  ;CHECK:       vtestpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_testps
  ;CHECK:       vtestps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @stack_fold_testps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_testps_ymm
  ;CHECK:       vtestps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone

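; VUCOMISD/VUCOMISS tests. The scalar forms synthesize the comparison with
; fcmp ueq plus a select, while the *_int variants call the
; llvm.x86.sse*.ucomieq.* intrinsics directly; in both cases the RHS should
; be reloaded as part of the compare.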
define i32 @stack_fold_ucomisd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_ucomisd
  ;CHECK:       vucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq double %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}

define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_ucomisd_int
  ;CHECK:       vucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone

define i32 @stack_fold_ucomiss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_ucomiss
  ;CHECK:       vucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq float %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}

define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_ucomiss_int
  ;CHECK:       vucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone

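; VUNPCKHP*/VUNPCKLP* tests, expressed as shufflevectors. For example, the
; <2 x i32> mask <i32 1, i32 3> selects the high element of each source,
; which is exactly VUNPCKHPD. The trailing fadd with zero keeps the result
; in the floating-point execution domain so the folded instruction stays a
; vunpck* rather than an integer-domain shuffle.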
define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhpd
  ;CHECK:       vunpckhpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}

define <4 x double> @stack_fold_unpckhpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhpd_ymm
  ;CHECK:       vunpckhpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhps
  ;CHECK:       vunpckhps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}

define <8 x float> @stack_fold_unpckhps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhps_ymm
  ;CHECK:       vunpckhps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  ; fadd forces execution domain
  %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %3
}

define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklpd
  ;CHECK:       vunpcklpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}

define <4 x double> @stack_fold_unpcklpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklpd_ymm
  ;CHECK:       vunpcklpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklps
  ;CHECK:       vunpcklps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}

define <8 x float> @stack_fold_unpcklps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklps_ymm
  ;CHECK:       vunpcklps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ; fadd forces execution domain
  %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %3
}

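; VXORPD/VXORPS tests. LLVM IR has no floating-point xor, so the inputs are
; bitcast to integer vectors, xor'd, and bitcast back; the fadd with zero
; again pins the floating-point execution domain so that vxorp* is selected
; rather than vpxor.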
define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_xorpd
  ;CHECK:       vxorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_xorpd_ymm
  ;CHECK:       vxorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x double> %a0 to <4 x i64>
  %3 = bitcast <4 x double> %a1 to <4 x i64>
  %4 = xor <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <4 x double>
  ; fadd forces execution domain
  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %6
}

define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_xorps
  ;CHECK:       vxorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}

define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_xorps_ymm
  ;CHECK:       vxorps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = xor <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <8 x float>
  ; fadd forces execution domain
  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %6
}