; (extraction artifact: code-browser navigation header — "Home | History | Annotate | Download | only in X86")
      1 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c < %s | FileCheck %s
      2 
      3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
      4 target triple = "x86_64-unknown-unknown"
      5 
      6 ; Stack reload folding tests.
      7 ;
      8 ; By including a nop call with sideeffects we can force a partial register spill of the
      9 ; relevant registers and check that the reload is correctly folded into the instruction.
     10 
; --- vadd{pd,ps,sd,ss} folding tests. The inline asm "nop" clobbers
; --- xmm2-xmm15 (and flags), forcing one fadd operand to be spilled; the
; --- CHECK lines then verify the reload is folded into the add itself.
     11 define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
     12   ;CHECK-LABEL: stack_fold_addpd
     13   ;CHECK:       vaddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     14   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     15   %2 = fadd <2 x double> %a0, %a1
     16   ret <2 x double> %2
     17 }
     18 
     19 define <4 x double> @stack_fold_addpd_ymm(<4 x double> %a0, <4 x double> %a1) {
     20   ;CHECK-LABEL: stack_fold_addpd_ymm
     21   ;CHECK:       vaddpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
     22   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     23   %2 = fadd <4 x double> %a0, %a1
     24   ret <4 x double> %2
     25 }
     26 
     27 define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
     28   ;CHECK-LABEL: stack_fold_addps
     29   ;CHECK:       vaddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     30   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     31   %2 = fadd <4 x float> %a0, %a1
     32   ret <4 x float> %2
     33 }
     34 
     35 define <8 x float> @stack_fold_addps_ymm(<8 x float> %a0, <8 x float> %a1) {
     36   ;CHECK-LABEL: stack_fold_addps_ymm
     37   ;CHECK:       vaddps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
     38   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     39   %2 = fadd <8 x float> %a0, %a1
     40   ret <8 x float> %2
     41 }
     42 
; Scalar folds: the checked spill slot is element-sized (8-byte for sd,
; 4-byte for ss), unlike the 16/32-byte packed reloads above.
     43 define double @stack_fold_addsd(double %a0, double %a1) {
     44   ;CHECK-LABEL: stack_fold_addsd
     45   ;CHECK:       vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
     46   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     47   %2 = fadd double %a0, %a1
     48   ret double %2
     49 }
     50 
; _int variants use the target intrinsic form, which carries full <2 x double>
; / <4 x float> operands, so the folded reload is vector-sized (16-byte).
     51 define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
     52   ;CHECK-LABEL: stack_fold_addsd_int
     53   ;CHECK:       vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     54   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     55   %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
     56   ret <2 x double> %2
     57 }
     58 declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
     59 
     60 define float @stack_fold_addss(float %a0, float %a1) {
     61   ;CHECK-LABEL: stack_fold_addss
     62   ;CHECK:       vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
     63   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     64   %2 = fadd float %a0, %a1
     65   ret float %2
     66 }
     67 
     68 define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
     69   ;CHECK-LABEL: stack_fold_addss_int
     70   ;CHECK:       vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     71   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     72   %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)
     73   ret <4 x float> %2
     74 }
     75 declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
     76 
; --- vaddsub{pd,ps} folding tests (SSE3/AVX horizontal add-sub intrinsics).
; --- Same spill-forcing pattern as above: clobber xmm2-xmm15, then check
; --- that the reload is folded into the vaddsub instruction.
     77 define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
     78   ;CHECK-LABEL: stack_fold_addsubpd
     79   ;CHECK:       vaddsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     80   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     81   %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
     82   ret <2 x double> %2
     83 }
     84 declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
     85 
     86 define <4 x double> @stack_fold_addsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
     87   ;CHECK-LABEL: stack_fold_addsubpd_ymm
     88   ;CHECK:       vaddsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
     89   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     90   %2 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
     91   ret <4 x double> %2
     92 }
     93 declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
     94 
     95 define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
     96   ;CHECK-LABEL: stack_fold_addsubps
     97   ;CHECK:       vaddsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     98   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     99   %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
    100   ret <4 x float> %2
    101 }
    102 declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
    103 
    104 define <8 x float> @stack_fold_addsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
    105   ;CHECK-LABEL: stack_fold_addsubps_ymm
    106   ;CHECK:       vaddsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    107   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    108   %2 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
    109   ret <8 x float> %2
    110 }
    111 declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
    112 
; --- vandn{pd,ps} folding tests. The IR spells ANDN as (xor x, -1) followed
; --- by (and ..., y) on bitcast integer vectors; the CHECK lines confirm it
; --- selects the single vandnpd/vandnps with a folded stack operand. The
; --- trailing fadd with zero keeps the result in the FP execution domain
; --- (otherwise the integer and/xor could select vpandn instead).
    113 define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
    114   ;CHECK-LABEL: stack_fold_andnpd
    115   ;CHECK:       vandnpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    116   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    117   %2 = bitcast <2 x double> %a0 to <2 x i64>
    118   %3 = bitcast <2 x double> %a1 to <2 x i64>
    119   %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
    120   %5 = and <2 x i64> %4, %3
    121   %6 = bitcast <2 x i64> %5 to <2 x double>
    122   ; fadd forces execution domain
    123   %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
    124   ret <2 x double> %7
    125 }
    126 
    127 define <4 x double> @stack_fold_andnpd_ymm(<4 x double> %a0, <4 x double> %a1) {
    128   ;CHECK-LABEL: stack_fold_andnpd_ymm
    129   ;CHECK:       vandnpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    130   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    131   %2 = bitcast <4 x double> %a0 to <4 x i64>
    132   %3 = bitcast <4 x double> %a1 to <4 x i64>
    133   %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
    134   %5 = and <4 x i64> %4, %3
    135   %6 = bitcast <4 x i64> %5 to <4 x double>
    136   ; fadd forces execution domain
    137   %7 = fadd <4 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0>
    138   ret <4 x double> %7
    139 }
    140 
    141 define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
    142   ;CHECK-LABEL: stack_fold_andnps
    143   ;CHECK:       vandnps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    144   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    145   %2 = bitcast <4 x float> %a0 to <2 x i64>
    146   %3 = bitcast <4 x float> %a1 to <2 x i64>
    147   %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
    148   %5 = and <2 x i64> %4, %3
    149   %6 = bitcast <2 x i64> %5 to <4 x float>
    150   ; fadd forces execution domain
    151   %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
    152   ret <4 x float> %7
    153 }
    154 
    155 define <8 x float> @stack_fold_andnps_ymm(<8 x float> %a0, <8 x float> %a1) {
    156   ;CHECK-LABEL: stack_fold_andnps_ymm
    157   ;CHECK:       vandnps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    158   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    159   %2 = bitcast <8 x float> %a0 to <4 x i64>
    160   %3 = bitcast <8 x float> %a1 to <4 x i64>
    161   %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
    162   %5 = and <4 x i64> %4, %3
    163   %6 = bitcast <4 x i64> %5 to <8 x float>
    164   ; fadd forces execution domain
    165   %7 = fadd <8 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
    166   ret <8 x float> %7
    167 }
    168 
; --- vand{pd,ps} folding tests. Bitwise and on bitcast integer vectors,
; --- with a trailing fadd-with-zero to pin the FP execution domain so the
; --- vandpd/vandps form (not vpand) is selected and checked.
    169 define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
    170   ;CHECK-LABEL: stack_fold_andpd
    171   ;CHECK:       vandpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    172   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    173   %2 = bitcast <2 x double> %a0 to <2 x i64>
    174   %3 = bitcast <2 x double> %a1 to <2 x i64>
    175   %4 = and <2 x i64> %2, %3
    176   %5 = bitcast <2 x i64> %4 to <2 x double>
    177   ; fadd forces execution domain
    178   %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
    179   ret <2 x double> %6
    180 }
    181 
    182 define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) {
    183   ;CHECK-LABEL: stack_fold_andpd_ymm
    184   ;CHECK:       vandpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    185   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    186   %2 = bitcast <4 x double> %a0 to <4 x i64>
    187   %3 = bitcast <4 x double> %a1 to <4 x i64>
    188   %4 = and <4 x i64> %2, %3
    189   %5 = bitcast <4 x i64> %4 to <4 x double>
    190   ; fadd forces execution domain
    191   %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
    192   ret <4 x double> %6
    193 }
    194 
    195 define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
    196   ;CHECK-LABEL: stack_fold_andps
    197   ;CHECK:       vandps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    198   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    199   %2 = bitcast <4 x float> %a0 to <2 x i64>
    200   %3 = bitcast <4 x float> %a1 to <2 x i64>
    201   %4 = and <2 x i64> %2, %3
    202   %5 = bitcast <2 x i64> %4 to <4 x float>
    203   ; fadd forces execution domain
    204   %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
    205   ret <4 x float> %6
    206 }
    207 
    208 define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) {
    209   ;CHECK-LABEL: stack_fold_andps_ymm
    210   ;CHECK:       vandps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    211   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    212   %2 = bitcast <8 x float> %a0 to <4 x i64>
    213   %3 = bitcast <8 x float> %a1 to <4 x i64>
    214   %4 = and <4 x i64> %2, %3
    215   %5 = bitcast <4 x i64> %4 to <8 x float>
    216   ; fadd forces execution domain
    217   %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
    218   ret <8 x float> %6
    219 }
    220 
; --- vblend{pd,ps} (immediate blend) folding tests. Each IR select with a
; --- constant condition vector lowers to an immediate-controlled blend; the
; --- CHECK lines pin both the folded reload and the exact immediate
; --- ($2, $6, $6, $102) derived from the select masks.
    221 define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
    222   ;CHECK-LABEL: stack_fold_blendpd
    223   ;CHECK:       vblendpd $2, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    224   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    225   %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
    226   ret <2 x double> %2
    227 }
    228 
    229 define <4 x double> @stack_fold_blendpd_ymm(<4 x double> %a0, <4 x double> %a1) {
    230   ;CHECK-LABEL: stack_fold_blendpd_ymm
    231   ;CHECK:       vblendpd $6, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    232   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    233   %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %a0, <4 x double> %a1
    234   ret <4 x double> %2
    235 }
    236 
    237 define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
    238   ;CHECK-LABEL: stack_fold_blendps
    239   ;CHECK:       vblendps $6, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    240   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    241   %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
    242   ret <4 x float> %2
    243 }
    244 
    245 define <8 x float> @stack_fold_blendps_ymm(<8 x float> %a0, <8 x float> %a1) {
    246   ;CHECK-LABEL: stack_fold_blendps_ymm
    247   ;CHECK:       vblendps $102, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    248   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    249   %2 = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %a0, <8 x float> %a1
    250   ret <8 x float> %2
    251 }
    252 
; --- vblendv{pd,ps} (variable blend) folding tests. These take three vector
; --- arguments, so the inline asm clobber list starts at xmm3 rather than
; --- xmm2 (presumably leaving xmm0-xmm2 for the three incoming args —
; --- NOTE(review): confirm against the SysV vector-arg assignment).
    253 define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
    254   ;CHECK-LABEL: stack_fold_blendvpd
    255   ;CHECK:       vblendvpd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    256   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    257   %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
    258   ret <2 x double> %2
    259 }
    260 declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
    261 
    262 define <4 x double> @stack_fold_blendvpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %c) {
    263   ;CHECK-LABEL: stack_fold_blendvpd_ymm
    264   ;CHECK:       vblendvpd {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    265   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    266   %2 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a1, <4 x double> %c, <4 x double> %a0)
    267   ret <4 x double> %2
    268 }
    269 declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
    270 
    271 define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
    272   ;CHECK-LABEL: stack_fold_blendvps
    273   ;CHECK:       vblendvps {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    274   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    275   %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
    276   ret <4 x float> %2
    277 }
    278 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
    279 
    280 define <8 x float> @stack_fold_blendvps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %c) {
    281   ;CHECK-LABEL: stack_fold_blendvps_ymm
    282   ;CHECK:       vblendvps {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    283   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    284   %2 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a1, <8 x float> %c, <8 x float> %a0)
    285   ret <8 x float> %2
    286 }
    287 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
    288 
; --- vcmp{pd,ps} (packed compare) folding tests. The intrinsics are called
; --- with predicate immediate 0 (EQ), so the checked mnemonic is the
; --- vcmpeqpd/vcmpeqps alias with the reload folded as the second source.
    289 define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
    290   ;CHECK-LABEL: stack_fold_cmppd
    291   ;CHECK:       vcmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    292   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    293   %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
    294   ret <2 x double> %2
    295 }
    296 declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
    297 
    298 define <4 x double> @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
    299   ;CHECK-LABEL: stack_fold_cmppd_ymm
    300   ;CHECK:       vcmpeqpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    301   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    302   %2 = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
    303   ret <4 x double> %2
    304 }
    305 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
    306 
    307 define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
    308   ;CHECK-LABEL: stack_fold_cmpps
    309   ;CHECK:       vcmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    310   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    311   %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
    312   ret <4 x float> %2
    313 }
    314 declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
    315 
    316 define <8 x float> @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
    317   ;CHECK-LABEL: stack_fold_cmpps_ymm
    318   ;CHECK:       vcmpeqps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    319   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    320   %2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0)
    321   ret <8 x float> %2
    322 }
    323 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
    324 
; --- vcmp{sd,ss} (scalar compare) folding tests. The plain variants use an
; --- IR fcmp oeq zext'd to i32 (element-sized reload); the _int variants use
; --- the vector intrinsics with predicate 0 (EQ, 16-byte reload).
    325 define i32 @stack_fold_cmpsd(double %a0, double %a1) {
    326   ;CHECK-LABEL: stack_fold_cmpsd
    327   ;CHECK:       vcmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    328   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    329   %2 = fcmp oeq double %a0, %a1
    330   %3 = zext i1 %2 to i32
    331   ret i32 %3
    332 }
    333 
    334 define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
    335   ;CHECK-LABEL: stack_fold_cmpsd_int
    336   ;CHECK:       vcmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    337   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    338   %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
    339   ret <2 x double> %2
    340 }
    341 declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
    342 
    343 define i32 @stack_fold_cmpss(float %a0, float %a1) {
    344   ;CHECK-LABEL: stack_fold_cmpss
    345   ;CHECK:       vcmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    346   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    347   %2 = fcmp oeq float %a0, %a1
    348   %3 = zext i1 %2 to i32
    349   ret i32 %3
    350 }
    351 
    352 define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
    353   ;CHECK-LABEL: stack_fold_cmpss_int
    354   ;CHECK:       vcmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    355   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    356   %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
    357   ret <4 x float> %2
    358 }
    359 declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
    360 
; --- vcomis{d,s} folding tests. Only the intrinsic (comieq) forms are
; --- covered; the plain fcmp-based variants are still TODO below. vcomis*
; --- is a two-operand compare, so the checked line has no third register.
    361 ; TODO stack_fold_comisd
    362 
    363 define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
    364   ;CHECK-LABEL: stack_fold_comisd_int
    365   ;CHECK:       vcomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    366   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    367   %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
    368   ret i32 %2
    369 }
    370 declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
    371 
    372 ; TODO stack_fold_comiss
    373 
    374 define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
    375   ;CHECK-LABEL: stack_fold_comiss_int
    376   ;CHECK:       vcomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    377   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    378   %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
    379   ret i32 %2
    380 }
    381 declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
    382 
; --- vcvtdq2{pd,ps} (int->fp conversion) folding tests. These are unary, so
; --- the clobber list also includes xmm1 to force the single argument to
; --- spill. Note the ymm cvtdq2pd forms widen <4 x i32> -> <4 x double>, so
; --- the folded reload stays 16-byte even though the result is a ymm.
    383 define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
    384   ;CHECK-LABEL: stack_fold_cvtdq2pd
    385   ;CHECK:   vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    386   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    387   %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
    388   %3 = sitofp <2 x i32> %2 to <2 x double>
    389   ret <2 x double> %3
    390 }
    391 define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
    392   ;CHECK-LABEL: stack_fold_cvtdq2pd_int
    393   ;CHECK:   vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    394   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    395   %2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
    396   ret <2 x double> %2
    397 }
    398 declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
    399 
    400 define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
    401   ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm
    402   ;CHECK:   vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    403   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    404   %2 = sitofp <4 x i32> %a0 to <4 x double>
    405   ret <4 x double> %2
    406 }
    407 
    408 define <4 x double> @stack_fold_cvtdq2pd_ymm_int(<4 x i32> %a0) {
    409   ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm_int
    410   ;CHECK:   vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    411   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    412   %2 = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0)
    413   ret <4 x double> %2
    414 }
    415 declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
    416 
    417 define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
    418   ;CHECK-LABEL: stack_fold_cvtdq2ps
    419   ;CHECK:   vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    420   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    421   %2 = sitofp <4 x i32> %a0 to <4 x float>
    422   ret <4 x float> %2
    423 }
    424 
    425 define <8 x float> @stack_fold_cvtdq2ps_ymm(<8 x i32> %a0) {
    426   ;CHECK-LABEL: stack_fold_cvtdq2ps_ymm
    427   ;CHECK:   vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    428   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    429   %2 = sitofp <8 x i32> %a0 to <8 x float>
    430   ret <8 x float> %2
    431 }
    432 
; --- vcvtpd2{dq,ps} (narrowing fp conversion) folding tests. With a memory
; --- source the operand width is ambiguous, so the checked mnemonics carry
; --- the x/y suffix (x = 16-byte/128-bit source, y = 32-byte/256-bit source).
    433 define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
    434   ;CHECK-LABEL: stack_fold_cvtpd2dq
    435   ;CHECK:   vcvtpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    436   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    437   %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
    438   ret <4 x i32> %2
    439 }
    440 declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
    441 
    442 define <4 x i32> @stack_fold_cvtpd2dq_ymm(<4 x double> %a0) {
    443   ;CHECK-LABEL: stack_fold_cvtpd2dq_ymm
    444   ;CHECK:   vcvtpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    445   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    446   %2 = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
    447   ret <4 x i32> %2
    448 }
    449 declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
    450 
    451 define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
    452   ;CHECK-LABEL: stack_fold_cvtpd2ps
    453   ;CHECK:   vcvtpd2psx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    454   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    455   %2 = fptrunc <2 x double> %a0 to <2 x float>
    456   ret <2 x float> %2
    457 }
    458 
    459 define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
    460   ;CHECK-LABEL: stack_fold_cvtpd2ps_ymm
    461   ;CHECK:   vcvtpd2psy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    462   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    463   %2 = fptrunc <4 x double> %a0 to <4 x float>
    464   ret <4 x float> %2
    465 }
    466 
        ; F16C half->float conversion folding. The source is <8 x i16> (16 bytes)
        ; in both cases, so even the ymm-destination form expects a
        ; "16-byte Folded Reload" annotation.
    467 define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) {
    468   ;CHECK-LABEL: stack_fold_cvtph2ps
    469   ;CHECK:   vcvtph2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    470   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    471   %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)
    472   ret <4 x float> %2
    473 }
    474 declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
    475 
    476 define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) {
    477   ;CHECK-LABEL: stack_fold_cvtph2ps_ymm
    478   ;CHECK:   vcvtph2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    479   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    480   %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)
    481   ret <8 x float> %2
    482 }
    483 declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
    484 
        ; Reload folding for float->i32 (round-to-nearest) conversion intrinsics,
        ; 128-bit (16-byte reload) and 256-bit (32-byte reload) forms.
    485 define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
    486   ;CHECK-LABEL: stack_fold_cvtps2dq
    487   ;CHECK:  vcvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    488   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    489   %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
    490   ret <4 x i32> %2
    491 }
    492 declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
    493 
    494 define <8 x i32> @stack_fold_cvtps2dq_ymm(<8 x float> %a0) {
    495   ;CHECK-LABEL: stack_fold_cvtps2dq_ymm
    496   ;CHECK:  vcvtps2dq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    497   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    498   %2 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
    499   ret <8 x i32> %2
    500 }
    501 declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
    502 
        ; Reload folding for float->double extension, covering both generic IR
        ; (shufflevector+fpext / fpext) and intrinsic forms. vcvtps2pd reads only
        ; the low half of its source, so every variant - including the ymm ones -
        ; folds a 16-byte reload.
    503 define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
    504   ;CHECK-LABEL: stack_fold_cvtps2pd
    505   ;CHECK:   vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    506   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    507   %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
    508   %3 = fpext <2 x float> %2 to <2 x double>
    509   ret <2 x double> %3
    510 }
    511 
    512 define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
    513   ;CHECK-LABEL: stack_fold_cvtps2pd_int
    514   ;CHECK:   vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    515   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    516   %2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
    517   ret <2 x double> %2
    518 }
    519 declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
    520 
    521 define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {
    522   ;CHECK-LABEL: stack_fold_cvtps2pd_ymm
    523   ;CHECK:   vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    524   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    525   %2 = fpext <4 x float> %a0 to <4 x double>
    526   ret <4 x double> %2
    527 }
    528 
    529 define <4 x double> @stack_fold_cvtps2pd_ymm_int(<4 x float> %a0) {
    530   ;CHECK-LABEL: stack_fold_cvtps2pd_ymm_int
    531   ;CHECK:   vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    532   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    533   %2 = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0)
    534   ret <4 x double> %2
    535 }
    536 declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
    537 
        ; F16C float->half tests fold in the opposite direction: the conversion
        ; RESULT is spilled, so vcvtps2ph must store straight to the stack slot
        ; ("Folded Spill"). The nop comes after the conversion here.
    538 define <8 x i16> @stack_fold_cvtps2ph(<4 x float> %a0) {
    539   ;CHECK-LABEL: stack_fold_cvtps2ph
    540   ;CHECK:   vcvtps2ph $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
    541   %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)
    542   %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    543   ret <8 x i16> %1
    544 }
    545 declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
    546 
    547 define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) {
    548   ;CHECK-LABEL: stack_fold_cvtps2ph_ymm
    549   ;CHECK:   vcvtps2ph $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
    550   %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
    551   %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    552   ret <8 x i16> %1
    553 }
    554 declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
    555 
        ; Scalar double conversions (intrinsic forms only; the generic-IR
        ; variants are still TODO). The CHECK mnemonics here omit the AVX 'v'
        ; prefix; FileCheck matches substrings, so they still match vcvtsd2si
        ; etc. emitted under -mattr=+avx.
    556 ; TODO stack_fold_cvtsd2si
    557 
    558 define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
    559   ;CHECK-LABEL: stack_fold_cvtsd2si_int
    560   ;CHECK:  cvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    561   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    562   %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
    563   ret i32 %2
    564 }
    565 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
    566 
    567 ; TODO stack_fold_cvtsd2si64
    568 
    569 define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
    570   ;CHECK-LABEL: stack_fold_cvtsd2si64_int
    571   ;CHECK:  cvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    572   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    573   %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
    574   ret i64 %2
    575 }
    576 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
    577 
    578 ; TODO stack_fold_cvtsd2ss
    579 
    580 define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) {
    581   ;CHECK-LABEL: stack_fold_cvtsd2ss_int
    582   ;CHECK:  cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    583   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    584   %2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
    585   ret <4 x float> %2
    586 }
    587 declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
    588 
        ; int->double conversions. Here the spilled operand is the integer
        ; argument, so the nop clobbers all GPRs instead of xmm registers, and
        ; the folded reload is 4 bytes (i32) or 8 bytes (i64).
    589 define double @stack_fold_cvtsi2sd(i32 %a0) {
    590   ;CHECK-LABEL: stack_fold_cvtsi2sd
    591   ;CHECK:  cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    592   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    593   %2 = sitofp i32 %a0 to double
    594   ret double %2
    595 }
    596 
    597 define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
    598   ;CHECK-LABEL: stack_fold_cvtsi2sd_int
    599   ;CHECK:  cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    600   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    601   %2 = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 0x0, double 0x0>, i32 %a0)
    602   ret <2 x double> %2
    603 }
    604 declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
    605 
    606 define double @stack_fold_cvtsi642sd(i64 %a0) {
    607   ;CHECK-LABEL: stack_fold_cvtsi642sd
    608   ;CHECK:  cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    609   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    610   %2 = sitofp i64 %a0 to double
    611   ret double %2
    612 }
    613 
    614 define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) {
    615   ;CHECK-LABEL: stack_fold_cvtsi642sd_int
    616   ;CHECK:  cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    617   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    618   %2 = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> <double 0x0, double 0x0>, i64 %a0)
    619   ret <2 x double> %2
    620 }
    621 declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
    622 
        ; int->float conversions; GPR clobbers force the integer argument to
        ; spill, and the l/q mnemonic suffix must match the operand width.
    623 define float @stack_fold_cvtsi2ss(i32 %a0) {
    624   ;CHECK-LABEL: stack_fold_cvtsi2ss
    625   ;CHECK:  cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    626   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    627   %2 = sitofp i32 %a0 to float
    628   ret float %2
    629 }
    630 
    631 define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
    632   ;CHECK-LABEL: stack_fold_cvtsi2ss_int
    633   ;CHECK:  cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    634   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    635   %2 = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i32 %a0)
    636   ret <4 x float> %2
    637 }
    638 declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
    639 
    640 define float @stack_fold_cvtsi642ss(i64 %a0) {
    641   ;CHECK-LABEL: stack_fold_cvtsi642ss
    642   ;CHECK:  cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    643   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    644   %2 = sitofp i64 %a0 to float
    645   ret float %2
    646 }
    647 
    648 define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
    649   ;CHECK-LABEL: stack_fold_cvtsi642ss_int
    650   ;CHECK:  cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    651   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    652   %2 = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i64 %a0)
    653   ret <4 x float> %2
    654 }
    655 declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
    656 
        ; Scalar float conversions (intrinsic forms; generic-IR variants TODO).
        ; The intrinsic takes the full <4 x float> vector, hence 16-byte reloads.
    657 ; TODO stack_fold_cvtss2sd
    658 
    659 define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) {
    660   ;CHECK-LABEL: stack_fold_cvtss2sd_int
    661   ;CHECK:  cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    662   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    663   %2 = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %a0)
    664   ret <2 x double> %2
    665 }
    666 declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
    667 
    668 ; TODO stack_fold_cvtss2si
    669 
    670 define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
    671   ;CHECK-LABEL: stack_fold_cvtss2si_int
    672   ;CHECK:  vcvtss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    673   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    674   %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
    675   ret i32 %2
    676 }
    677 declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
    678 
    679 ; TODO stack_fold_cvtss2si64
    680 
    681 define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
    682   ;CHECK-LABEL: stack_fold_cvtss2si64_int
    683   ;CHECK:  vcvtss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    684   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    685   %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
    686   ret i64 %2
    687 }
    688 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
    689 
        ; Truncating (round-toward-zero) packed conversions. vcvttpd2dq needs
        ; the x/y suffix to disambiguate memory-operand width; vcvttps2dq does
        ; not, since its source width matches its destination register class.
    690 define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
    691   ;CHECK-LABEL: stack_fold_cvttpd2dq
    692   ;CHECK:  vcvttpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    693   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    694   %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
    695   ret <4 x i32> %2
    696 }
    697 declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
    698 
    699 define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) {
    700   ;CHECK-LABEL: stack_fold_cvttpd2dq_ymm
    701   ;CHECK:  vcvttpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    702   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    703   %2 = fptosi <4 x double> %a0 to <4 x i32>
    704   ret <4 x i32> %2
    705 }
    706 
    707 define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
    708   ;CHECK-LABEL: stack_fold_cvttps2dq
    709   ;CHECK:  vcvttps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    710   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    711   %2 = fptosi <4 x float> %a0 to <4 x i32>
    712   ret <4 x i32> %2
    713 }
    714 
    715 define <8 x i32> @stack_fold_cvttps2dq_ymm(<8 x float> %a0) {
    716   ;CHECK-LABEL: stack_fold_cvttps2dq_ymm
    717   ;CHECK:  vcvttps2dq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    718   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    719   %2 = fptosi <8 x float> %a0 to <8 x i32>
    720   ret <8 x i32> %2
    721 }
    722 
        ; Truncating double->int scalar conversions: generic-IR forms spill just
        ; the scalar (8-byte reload), intrinsic forms spill the whole vector
        ; (16-byte reload).
    723 define i32 @stack_fold_cvttsd2si(double %a0) {
    724   ;CHECK-LABEL: stack_fold_cvttsd2si
    725   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 8-byte Folded Reload
    726   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    727   %2 = fptosi double %a0 to i32
    728   ret i32 %2
    729 }
    730 
    731 define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
    732   ;CHECK-LABEL: stack_fold_cvttsd2si_int
    733   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    734   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    735   %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
    736   ret i32 %2
    737 }
    738 declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
    739 
    740 define i64 @stack_fold_cvttsd2si64(double %a0) {
    741   ;CHECK-LABEL: stack_fold_cvttsd2si64
    742   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
    743   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    744   %2 = fptosi double %a0 to i64
    745   ret i64 %2
    746 }
    747 
    748 define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
    749   ;CHECK-LABEL: stack_fold_cvttsd2si64_int
    750   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    751   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    752   %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
    753   ret i64 %2
    754 }
    755 declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
    756 
        ; Truncating float->int scalar conversions: generic-IR forms spill the
        ; 4-byte scalar, the intrinsic form spills the full 16-byte vector.
    757 define i32 @stack_fold_cvttss2si(float %a0) {
    758   ;CHECK-LABEL: stack_fold_cvttss2si
    759   ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
    760   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    761   %2 = fptosi float %a0 to i32
    762   ret i32 %2
    763 }
    764 
    765 define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
    766   ;CHECK-LABEL: stack_fold_cvttss2si_int
    767   ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    768   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    769   %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
    770   ret i32 %2
    771 }
    772 declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
    773 
    774 define i64 @stack_fold_cvttss2si64(float %a0) {
    775   ;CHECK-LABEL: stack_fold_cvttss2si64
    776   ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
    777   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    778   %2 = fptosi float %a0 to i64
    779   ret i64 %2
    780 }
    781 
    782 define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
    783   ;CHECK-LABEL: stack_fold_cvttss2si64_int
    784   ;CHECK:  cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    785   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    786   %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
    787   ret i64 %2
    788 }
    789 declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
    790 
        ; Packed fdiv folding. Binary ops leave xmm0/xmm1 free for the two
        ; arguments, so the nop clobbers only xmm2-xmm15; the second operand's
        ; reload must fold into vdivpd/vdivps.
    791 define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
    792   ;CHECK-LABEL: stack_fold_divpd
    793   ;CHECK:       vdivpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    794   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    795   %2 = fdiv <2 x double> %a0, %a1
    796   ret <2 x double> %2
    797 }
    798 
    799 define <4 x double> @stack_fold_divpd_ymm(<4 x double> %a0, <4 x double> %a1) {
    800   ;CHECK-LABEL: stack_fold_divpd_ymm
    801   ;CHECK:       vdivpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    802   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    803   %2 = fdiv <4 x double> %a0, %a1
    804   ret <4 x double> %2
    805 }
    806 
    807 define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
    808   ;CHECK-LABEL: stack_fold_divps
    809   ;CHECK:       vdivps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    810   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    811   %2 = fdiv <4 x float> %a0, %a1
    812   ret <4 x float> %2
    813 }
    814 
    815 define <8 x float> @stack_fold_divps_ymm(<8 x float> %a0, <8 x float> %a1) {
    816   ;CHECK-LABEL: stack_fold_divps_ymm
    817   ;CHECK:       vdivps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    818   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    819   %2 = fdiv <8 x float> %a0, %a1
    820   ret <8 x float> %2
    821 }
    822 
        ; Scalar fdiv folding: scalar-typed forms reload 8/4 bytes, the _int
        ; (whole-vector intrinsic) forms reload 16 bytes.
    823 define double @stack_fold_divsd(double %a0, double %a1) {
    824   ;CHECK-LABEL: stack_fold_divsd
    825   ;CHECK:       vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    826   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    827   %2 = fdiv double %a0, %a1
    828   ret double %2
    829 }
    830 
    831 define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
    832   ;CHECK-LABEL: stack_fold_divsd_int
    833   ;CHECK:       vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    834   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    835   %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
    836   ret <2 x double> %2
    837 }
    838 declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
    839 
    840 define float @stack_fold_divss(float %a0, float %a1) {
    841   ;CHECK-LABEL: stack_fold_divss
    842   ;CHECK:       vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    843   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    844   %2 = fdiv float %a0, %a1
    845   ret float %2
    846 }
    847 
    848 define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
    849   ;CHECK-LABEL: stack_fold_divss_int
    850   ;CHECK:       vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    851   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    852   %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)
    853   ret <4 x float> %2
    854 }
    855 declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
    856 
        ; Dot-product (SSE4.1/AVX) folding: the immediate mask ($7) must be
        ; preserved while the second source folds from the stack.
    857 define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
    858   ;CHECK-LABEL: stack_fold_dppd
    859   ;CHECK:       vdppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    860   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    861   %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
    862   ret <2 x double> %2
    863 }
    864 declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
    865 
    866 define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
    867   ;CHECK-LABEL: stack_fold_dpps
    868   ;CHECK:       vdpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    869   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    870   %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
    871   ret <4 x float> %2
    872 }
    873 declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
    874 
    875 define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) {
    876   ;CHECK-LABEL: stack_fold_dpps_ymm
    877   ;CHECK:       vdpps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    878   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    879   %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
    880   ret <8 x float> %2
    881 }
    882 declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
    883 
        ; Extract tests fold in the spill direction: the extracted value is
        ; written directly to the stack slot (Folded Spill) before the nop,
        ; then (for extractps) reloaded into a GPR.
    884 define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {
    885   ;CHECK-LABEL: stack_fold_extractf128
    886   ;CHECK:       vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
    887   %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    888   %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    889   ret <4 x float> %1
    890 }
    891 
    892 define i32 @stack_fold_extractps(<4 x float> %a0) {
    893   ;CHECK-LABEL: stack_fold_extractps
    894   ;CHECK:       vextractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
    895   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
    896   %1 = extractelement <4 x float> %a0, i32 1
    897   %2 = bitcast float %1 to i32
    898   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    899   ret i32 %2
    900 }
    901 
        ; Horizontal-add (SSE3/AVX) folding, 128- and 256-bit, pd and ps.
    902 define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
    903   ;CHECK-LABEL: stack_fold_haddpd
    904   ;CHECK:       vhaddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    905   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    906   %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
    907   ret <2 x double> %2
    908 }
    909 declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
    910 
    911 define <4 x double> @stack_fold_haddpd_ymm(<4 x double> %a0, <4 x double> %a1) {
    912   ;CHECK-LABEL: stack_fold_haddpd_ymm
    913   ;CHECK:       vhaddpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    914   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    915   %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
    916   ret <4 x double> %2
    917 }
    918 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
    919 
    920 define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
    921   ;CHECK-LABEL: stack_fold_haddps
    922   ;CHECK:       vhaddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    923   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    924   %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
    925   ret <4 x float> %2
    926 }
    927 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
    928 
    929 define <8 x float> @stack_fold_haddps_ymm(<8 x float> %a0, <8 x float> %a1) {
    930   ;CHECK-LABEL: stack_fold_haddps_ymm
    931   ;CHECK:       vhaddps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    932   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    933   %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
    934   ret <8 x float> %2
    935 }
    936 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
    937 
        ; Horizontal-subtract (SSE3/AVX) folding; mirrors the hadd tests above.
    938 define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
    939   ;CHECK-LABEL: stack_fold_hsubpd
    940   ;CHECK:       vhsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    941   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    942   %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
    943   ret <2 x double> %2
    944 }
    945 declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
    946 
    947 define <4 x double> @stack_fold_hsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
    948   ;CHECK-LABEL: stack_fold_hsubpd_ymm
    949   ;CHECK:       vhsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    950   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    951   %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
    952   ret <4 x double> %2
    953 }
    954 declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
    955 
    956 define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
    957   ;CHECK-LABEL: stack_fold_hsubps
    958   ;CHECK:       vhsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    959   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    960   %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
    961   ret <4 x float> %2
    962 }
    963 declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
    964 
    965 define <8 x float> @stack_fold_hsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
    966   ;CHECK-LABEL: stack_fold_hsubps_ymm
    967   ;CHECK:       vhsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    968   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    969   %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
    970   ret <8 x float> %2
    971 }
    972 declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
    973 
; The concat-vectors shuffle selects as vinsertf128 $1; the spilled %a1 should
; be folded as the instruction's 16-byte memory source.
define <8 x float> @stack_fold_insertf128(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_insertf128
  ;CHECK:       vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %2
}

; vinsertps with imm 209 (0xD1); the CHECK expects $17 after folding —
; NOTE(review): presumably the source-select bits are rewritten for a memory
; operand, which the shuffle comment on the next CHECK line reflects.
define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_insertps
  ;CHECK:       vinsertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  ;CHECK-NEXT:                                                                              {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
    991 
; vmaxpd, packed double/128-bit intrinsic: 16-byte folded reload.
define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_maxpd
  ;CHECK:       vmaxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone

; vmaxpd, packed double/256-bit intrinsic: 32-byte folded reload.
define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_maxpd_ymm
  ;CHECK:       vmaxpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone

; vmaxps, packed float/128-bit intrinsic: 16-byte folded reload.
define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_maxps
  ;CHECK:       vmaxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone

; vmaxps, packed float/256-bit intrinsic: 32-byte folded reload.
define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_maxps_ymm
  ;CHECK:       vmaxps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone

; Scalar max from IR fcmp ogt + select (not the intrinsic): folds to vmaxsd
; with an 8-byte reload.
define double @stack_fold_maxsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_maxsd
  ;CHECK:       vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

; Scalar max via the sse2 intrinsic: full 16-byte vector spill slot.
define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_maxsd_int
  ;CHECK:       vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone

; Scalar float max from fcmp ogt + select: 4-byte folded reload into vmaxss.
define float @stack_fold_maxss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_maxss
  ;CHECK:       vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

; Scalar float max via the sse intrinsic: 16-byte vector spill slot.
define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_maxss_int
  ;CHECK:       vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
   1063 
; vminpd, packed double/128-bit intrinsic: 16-byte folded reload.
define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_minpd
  ;CHECK:       vminpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

; vminpd, packed double/256-bit intrinsic: 32-byte folded reload.
define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_minpd_ymm
  ;CHECK:       vminpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

; vminps, packed float/128-bit intrinsic: 16-byte folded reload.
define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_minps
  ;CHECK:       vminps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

; vminps, packed float/256-bit intrinsic: 32-byte folded reload.
define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_minps_ymm
  ;CHECK:       vminps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone

; Scalar min from IR fcmp olt + select (not the intrinsic): folds to vminsd
; with an 8-byte reload.
define double @stack_fold_minsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_minsd
  ;CHECK:       vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

; Scalar min via the sse2 intrinsic: 16-byte vector spill slot.
define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_minsd_int
  ;CHECK:       vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

; Scalar float min from fcmp olt + select: 4-byte folded reload into vminss.
define float @stack_fold_minss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_minss
  ;CHECK:       vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

; Scalar float min via the sse intrinsic: 16-byte vector spill slot.
define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_minss_int
  ;CHECK:       vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
   1135 
; Single-input test: the nop asm clobbers xmm1-xmm15 (one more than the
; two-input tests) so %a0 itself spills; the <0,0> shuffle selects vmovddup
; with the reload folded as its memory source.
define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_movddup
  ;CHECK:   vmovddup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %2
}

; 256-bit variant: <0,0,2,2> shuffle selects the ymm vmovddup, 32-byte reload.
define <4 x double> @stack_fold_movddup_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_movddup_ymm
  ;CHECK:   vmovddup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %2
}
   1151 
   1152 ; TODO stack_fold_movhpd (load / store)
   1153 ; TODO stack_fold_movhps (load / store)
   1154 
   1155 ; TODO stack_fold_movlpd (load / store)
   1156 ; TODO stack_fold_movlps (load / store)
   1157 
; Duplicate-odd-elements shuffle <1,1,3,3> selects vmovshdup; single-input
; spill (xmm1-xmm15 clobbered), 16-byte folded reload.
define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movshdup
  ;CHECK:   vmovshdup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %2
}

; 256-bit vmovshdup: per-lane odd-element duplicate, 32-byte folded reload.
define <8 x float> @stack_fold_movshdup_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movshdup_ymm
  ;CHECK:   vmovshdup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %2
}

; Duplicate-even-elements shuffle <0,0,2,2> selects vmovsldup, 16-byte reload.
define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movsldup
  ;CHECK:   vmovsldup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %2
}

; 256-bit vmovsldup: per-lane even-element duplicate, 32-byte folded reload.
define <8 x float> @stack_fold_movsldup_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movsldup_ymm
  ;CHECK:   vmovsldup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %2
}
   1189 
; IR fmul on <2 x double> selects vmulpd; 16-byte folded reload.
define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulpd
  ;CHECK:       vmulpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <2 x double> %a0, %a1
  ret <2 x double> %2
}

; IR fmul on <4 x double> selects the ymm vmulpd; 32-byte folded reload.
define <4 x double> @stack_fold_mulpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulpd_ymm
  ;CHECK:       vmulpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <4 x double> %a0, %a1
  ret <4 x double> %2
}

; IR fmul on <4 x float> selects vmulps; 16-byte folded reload.
define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulps
  ;CHECK:       vmulps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <4 x float> %a0, %a1
  ret <4 x float> %2
}

; IR fmul on <8 x float> selects the ymm vmulps; 32-byte folded reload.
define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulps_ymm
  ;CHECK:       vmulps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <8 x float> %a0, %a1
  ret <8 x float> %2
}

; Scalar fmul on double: vmulsd with an 8-byte folded reload.
define double @stack_fold_mulsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_mulsd
  ;CHECK:       vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul double %a0, %a1
  ret double %2
}

; Scalar mul via the sse2 intrinsic: 16-byte vector spill slot.
define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulsd_int
  ;CHECK:       vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone

; Scalar fmul on float: vmulss with a 4-byte folded reload.
define float @stack_fold_mulss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_mulss
  ;CHECK:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul float %a0, %a1
  ret float %2
}

; Scalar mul via the sse intrinsic: 16-byte vector spill slot.
define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulss_int
  ;CHECK:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
   1255 
; Integer 'or' bitcast to/from double, with a trailing fadd that pins the FP
; execution domain so codegen picks vorpd rather than vpor; 16-byte reload.
define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_orpd
  ;CHECK:       vorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

; 256-bit variant of the same domain-pinning pattern: ymm vorpd, 32-byte reload.
define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_orpd_ymm
  ;CHECK:       vorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x double> %a0 to <4 x i64>
  %3 = bitcast <4 x double> %a1 to <4 x i64>
  %4 = or <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <4 x double>
  ; fadd forces execution domain
  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %6
}

; Float flavor: the fadd domain pin makes codegen choose vorps; 16-byte reload.
define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_orps
  ;CHECK:       vorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}

; 256-bit float flavor: ymm vorps with a 32-byte folded reload.
define <8 x float> @stack_fold_orps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_orps_ymm
  ;CHECK:       vorps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = or <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <8 x float>
  ; fadd forces execution domain
  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %6
}
   1307 
; Cross-lane shuffle <4..7, 8..11> (hi half of %a0, lo half of %a1) selects
; vperm2f128 $33 (0x21); the spilled operand folds as a 32-byte memory source.
define <8 x float> @stack_fold_perm2f128(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_perm2f128
  ;CHECK:   vperm2f128 $33, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %2
}
   1315 
; Swap shuffle <1,0> selects immediate-form vpermilpd $1; single-input spill
; (xmm1-xmm15 clobbered), 16-byte folded reload.
define <2 x double> @stack_fold_permilpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_permilpd
  ;CHECK:   vpermilpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x double> %2
}

; Per-lane swap <1,0,3,2> selects ymm vpermilpd $5; 32-byte folded reload.
define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_permilpd_ymm
  ;CHECK:   vpermilpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %2
}

; Variable-control vpermilpd (two inputs, so xmm2-xmm15 clobber variant):
; the spilled control vector %a1 folds as the memory operand.
define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {
  ;CHECK-LABEL: stack_fold_permilpdvar
  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone

; 256-bit variable-control vpermilpd: 32-byte folded reload.
define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {
  ;CHECK-LABEL: stack_fold_permilpdvar_ymm
  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone

; Reverse shuffle <3,2,1,0> selects immediate-form vpermilps $27 (0x1B).
define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_permilps
  ;CHECK:   vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %2
}

; Per-lane reverse selects ymm vpermilps $27; 32-byte folded reload.
define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_permilps_ymm
  ;CHECK:   vpermilps $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x float> %2
}

; Variable-control vpermilps: control vector reload folds, 16 bytes.
define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {
  ;CHECK-LABEL: stack_fold_permilpsvar
  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone

; 256-bit variable-control vpermilps: 32-byte folded reload.
define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {
  ;CHECK-LABEL: stack_fold_permilpsvar_ymm
  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
   1383 
   1384 ; TODO stack_fold_rcpps
   1385 
   1386 define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
   1387   ;CHECK-LABEL: stack_fold_rcpps_int
   1388   ;CHECK:       vrcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1389   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1390   %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
   1391   ret <4 x float> %2
   1392 }
   1393 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
   1394 
   1395 ; TODO stack_fold_rcpps_ymm
   1396 
; Same as stack_fold_rcpps_int but for the 256-bit intrinsic: the reload must
; fold into VRCPPS (ymm, 32-byte memory operand).
define <8 x float> @stack_fold_rcpps_ymm_int(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rcpps_ymm_int
  ;CHECK:       vrcpps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
   1405 
   1406 ; TODO stack_fold_rcpss
   1407 
; Scalar variant: the spilled %a0 must be reloaded via the folded memory form
; of VRCPSS.
define <4 x float> @stack_fold_rcpss_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rcpss_int
  ;CHECK:       vrcpss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
   1416 
; Rounding-mode immediate 7 must be preserved while the reload folds into
; VROUNDPD's memory operand.
define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_roundpd
  ;CHECK:  vroundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
   1425 
; 256-bit variant: reload folds into VROUNDPD (ymm) with immediate 7 intact.
define <4 x double> @stack_fold_roundpd_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_roundpd_ymm
  ;CHECK:  vroundpd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
   1434 
; Reload folds into VROUNDPS (xmm) with rounding immediate 7 intact.
define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_roundps
  ;CHECK:  vroundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
   1443 
; 256-bit variant: reload folds into VROUNDPS (ymm) with immediate 7 intact.
define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_roundps_ymm
  ;CHECK:  vroundps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
   1452 
; llvm.floor.f64 is expected to lower to VROUNDSD $9 (round toward -inf) under
; optsize; the spilled scalar must fold as an 8-byte memory operand.
define double @stack_fold_roundsd(double %a0) optsize {
  ;CHECK-LABEL: stack_fold_roundsd
  ;CHECK:       vroundsd $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call double @llvm.floor.f64(double %a0)
  ret double %2
}
declare double @llvm.floor.f64(double) nounwind readnone
   1461 
   1462 ; TODO stack_fold_roundsd_int
   1463 declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
   1464 
; llvm.floor.f32 is expected to lower to VROUNDSS $9 under optsize; the spilled
; scalar must fold as a 4-byte memory operand.
define float @stack_fold_roundss(float %a0) optsize {
  ;CHECK-LABEL: stack_fold_roundss
  ;CHECK:       vroundss $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call float @llvm.floor.f32(float %a0)
  ret float %2
}
declare float @llvm.floor.f32(float) nounwind readnone
   1473 
   1474 ; TODO stack_fold_roundss_int
   1475 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
   1476 
   1477 ; TODO stack_fold_rsqrtps
   1478 
; Reload of the spilled %a0 must fold into VRSQRTPS (xmm memory operand).
define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rsqrtps_int
  ;CHECK:       vrsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
   1487 
   1488 ; TODO stack_fold_rsqrtps_ymm
   1489 
; 256-bit variant: reload must fold into VRSQRTPS (ymm, 32-byte memory operand).
define <8 x float> @stack_fold_rsqrtps_ymm_int(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rsqrtps_ymm_int
  ;CHECK:       vrsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
   1498 
   1499 ; TODO stack_fold_rsqrtss
   1500 
; Scalar variant: the spilled %a0 must fold into VRSQRTSS's memory operand.
define <4 x float> @stack_fold_rsqrtss_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rsqrtss_int
  ;CHECK:       vrsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
   1509 
; The shuffle mask <1,2> selects a0[1] and a1[0], matching VSHUFPD $1; the
; spilled operand must fold as a 16-byte memory operand.
define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_shufpd
  ;CHECK:       vshufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
  ret <2 x double> %2
}
   1517 
; 256-bit variant: per-lane mask <1,4,3,6> corresponds to VSHUFPD $5 (ymm).
define <4 x double> @stack_fold_shufpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_shufpd_ymm
  ;CHECK:       vshufpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
  ret <4 x double> %2
}
   1525 
; Mask <0,2,4,7> corresponds to VSHUFPS $200; spilled operand folds from stack.
define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_shufps
  ;CHECK:       vshufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
  ret <4 x float> %2
}
   1533 
; 256-bit variant: the per-lane mask corresponds to VSHUFPS $148 (ymm).
define <8 x float> @stack_fold_shufps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_shufps_ymm
  ;CHECK:       vshufps $148, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 13, i32 14>
  ret <8 x float> %2
}
   1541 
; Reload of the spilled %a0 must fold into VSQRTPD (xmm memory operand).
define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtpd
  ;CHECK:       vsqrtpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
   1550 
; 256-bit variant: reload must fold into VSQRTPD (ymm, 32-byte memory operand).
define <4 x double> @stack_fold_sqrtpd_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtpd_ymm
  ;CHECK:       vsqrtpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
   1559 
; Reload of the spilled %a0 must fold into VSQRTPS (xmm memory operand).
define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtps
  ;CHECK:       vsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
   1568 
; 256-bit variant: reload must fold into VSQRTPS (ymm, 32-byte memory operand).
define <8 x float> @stack_fold_sqrtps_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtps_ymm
  ;CHECK:       vsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
   1577 
; Scalar llvm.sqrt.f64 lowers to VSQRTSD; the spilled double must fold as an
; 8-byte memory operand.
define double @stack_fold_sqrtsd(double %a0) {
  ;CHECK-LABEL: stack_fold_sqrtsd
  ;CHECK:       vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call double @llvm.sqrt.f64(double %a0)
  ret double %2
}
declare double @llvm.sqrt.f64(double) nounwind readnone
   1586 
; Intrinsic variant of VSQRTSD: the whole <2 x double> spill slot (16 bytes)
; folds into the instruction.
define <2 x double> @stack_fold_sqrtsd_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtsd_int
  ;CHECK:       vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
   1595 
; Scalar llvm.sqrt.f32 lowers to VSQRTSS; the spilled float must fold as a
; 4-byte memory operand.
define float @stack_fold_sqrtss(float %a0) {
  ;CHECK-LABEL: stack_fold_sqrtss
  ;CHECK:       vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call float @llvm.sqrt.f32(float %a0)
  ret float %2
}
declare float @llvm.sqrt.f32(float) nounwind readnone
   1604 
; Intrinsic variant of VSQRTSS: the whole <4 x float> spill slot (16 bytes)
; folds into the instruction.
define <4 x float> @stack_fold_sqrtss_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtss_int
  ;CHECK:       vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
   1613 
; fsub lowers to VSUBPD; the spilled second operand folds from the stack slot.
define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_subpd
  ;CHECK:       vsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <2 x double> %a0, %a1
  ret <2 x double> %2
}
   1621 
; 256-bit variant: fsub lowers to VSUBPD (ymm) with a 32-byte folded reload.
define <4 x double> @stack_fold_subpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_subpd_ymm
  ;CHECK:       vsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <4 x double> %a0, %a1
  ret <4 x double> %2
}
   1629 
; fsub lowers to VSUBPS; the spilled second operand folds from the stack slot.
define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_subps
  ;CHECK:       vsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <4 x float> %a0, %a1
  ret <4 x float> %2
}
   1637 
; 256-bit variant: fsub lowers to VSUBPS (ymm) with a 32-byte folded reload.
define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_subps_ymm
  ;CHECK:       vsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <8 x float> %a0, %a1
  ret <8 x float> %2
}
   1645 
; Scalar fsub lowers to VSUBSD with an 8-byte folded reload of %a1.
define double @stack_fold_subsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_subsd
  ;CHECK:       vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub double %a0, %a1
  ret double %2
}
   1653 
; Intrinsic variant of VSUBSD: the full 16-byte vector spill slot folds.
define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_subsd_int
  ;CHECK:       vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
   1662 
; Scalar fsub lowers to VSUBSS with a 4-byte folded reload of %a1.
define float @stack_fold_subss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_subss
  ;CHECK:       vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub float %a0, %a1
  ret float %2
}
   1670 
; Intrinsic variant of VSUBSS: the full 16-byte vector spill slot folds.
define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_subss_int
  ;CHECK:       vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
   1679 
; vtestc intrinsic lowers to VTESTPD; the spilled operand folds from the stack.
define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_testpd
  ;CHECK:       vtestpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
   1688 
; 256-bit variant: VTESTPD (ymm) with a 32-byte folded reload.
define i32 @stack_fold_testpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_testpd_ymm
  ;CHECK:       vtestpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
   1697 
; vtestc intrinsic lowers to VTESTPS; the spilled operand folds from the stack.
define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_testps
  ;CHECK:       vtestps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
   1706 
; 256-bit variant: VTESTPS (ymm) with a 32-byte folded reload.
define i32 @stack_fold_testps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_testps_ymm
  ;CHECK:       vtestps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
   1715 
; The fcmp ueq + select lowers to VUCOMISD; the spilled scalar must fold as an
; 8-byte memory operand.
define i32 @stack_fold_ucomisd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_ucomisd
  ;CHECK:       vucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq double %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}
   1724 
; Intrinsic variant: ucomieq.sd folds the 16-byte vector spill slot into
; VUCOMISD.
define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_ucomisd_int
  ;CHECK:       vucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
   1733 
; The fcmp ueq + select lowers to VUCOMISS; the spilled scalar must fold as a
; 4-byte memory operand.
define i32 @stack_fold_ucomiss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_ucomiss
  ;CHECK:       vucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq float %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}
   1742 
; Intrinsic variant: ucomieq.ss folds the 16-byte vector spill slot into
; VUCOMISS.
define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_ucomiss_int
  ;CHECK:       vucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
   1751 
; Mask <1,3> is the VUNPCKHPD pattern; the spilled operand folds from stack.
define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhpd
  ;CHECK:       vunpckhpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}
   1761 
; 256-bit variant: per-lane mask <1,5,3,7> is the VUNPCKHPD (ymm) pattern.
define <4 x double> @stack_fold_unpckhpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhpd_ymm
  ;CHECK:       vunpckhpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}
   1771 
; Mask <2,6,3,7> is the VUNPCKHPS pattern; the spilled operand folds from stack.
define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhps
  ;CHECK:       vunpckhps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}
   1781 
; 256-bit variant: per-lane high-unpack mask is the VUNPCKHPS (ymm) pattern.
define <8 x float> @stack_fold_unpckhps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhps_ymm
  ;CHECK:       vunpckhps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  ; fadd forces execution domain
  %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %3
}
   1791 
; Mask <0,2> is the VUNPCKLPD pattern; the spilled operand folds from stack.
define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklpd
  ;CHECK:       vunpcklpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}
   1801 
; 256-bit variant: per-lane mask <0,4,2,6> is the VUNPCKLPD (ymm) pattern.
define <4 x double> @stack_fold_unpcklpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklpd_ymm
  ;CHECK:       vunpcklpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}
   1811 
; Mask <0,4,1,5> is the VUNPCKLPS pattern; the spilled operand folds from stack.
define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklps
  ;CHECK:       vunpcklps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}
   1821 
; 256-bit variant: per-lane low-unpack mask is the VUNPCKLPS (ymm) pattern.
define <8 x float> @stack_fold_unpcklps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklps_ymm
  ;CHECK:       vunpcklps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ; fadd forces execution domain
  %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %3
}
   1831 
; Integer xor bitcast through <2 x double> must select the FP-domain VXORPD
; (not VPXOR) and fold the spilled operand from the stack.
define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_xorpd
  ;CHECK:       vxorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}
   1844 
; 256-bit variant: xor bitcast through <4 x double> selects VXORPD (ymm) with a
; 32-byte folded reload.
define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_xorpd_ymm
  ;CHECK:       vxorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x double> %a0 to <4 x i64>
  %3 = bitcast <4 x double> %a1 to <4 x i64>
  %4 = xor <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <4 x double>
  ; fadd forces execution domain
  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %6
}
   1857 
; Integer xor bitcast through <4 x float> must select the FP-domain VXORPS
; (not VPXOR) and fold the spilled operand from the stack.
define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_xorps
  ;CHECK:       vxorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}
   1870 
; 256-bit variant: xor bitcast through <8 x float> selects VXORPS (ymm) with a
; 32-byte folded reload.
define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_xorps_ymm
  ;CHECK:       vxorps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = xor <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <8 x float>
  ; fadd forces execution domain
  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %6
}
   1883