; RUN: llc -O3 -verify-machineinstrs -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with side effects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
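;
; Each test follows the same pattern: an inline asm "nop" clobbers nearly every
; XMM (or GPR) register, leaving the register allocator too few registers to
; keep all live values, so one of the arguments is spilled across the call.
; The CHECK line then verifies that the subsequent reload is folded into the
; memory operand of the instruction under test (e.g. a "Folded Reload" of the
; form vaddpd (%rsp), %xmm1, %xmm0) rather than emitted as a separate load.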

define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addpd
  ;CHECK:       vaddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x double> @stack_fold_addpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addpd_ymm
  ;CHECK:       vaddpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <4 x double> %a0, %a1
  ret <4 x double> %2
}

define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addps
  ;CHECK:       vaddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <4 x float> %a0, %a1
  ret <4 x float> %2
}

define <8 x float> @stack_fold_addps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addps_ymm
  ;CHECK:       vaddps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <8 x float> %a0, %a1
  ret <8 x float> %2
}

define double @stack_fold_addsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_addsd
  ;CHECK:       vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addsd_int
  ;CHECK:       vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fadd double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}
declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_addss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_addss
  ;CHECK:       vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addss_int
  ;CHECK:       vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fadd float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}
declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone

define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addsubpd
  ;CHECK:       vaddsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_addsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addsubpd_ymm
  ;CHECK:       vaddsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addsubps
  ;CHECK:       vaddsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_addsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addsubps_ymm
  ;CHECK:       vaddsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andnpd
  ;CHECK:       vandnpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
  %5 = and <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <2 x double>
  ; fadd forces execution domain (without an FP use the logic op could be selected in the integer domain, e.g. as vpandn)
  %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
  ret <2 x double> %7
}

define <4 x double> @stack_fold_andnpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andnpd_ymm
  ;CHECK:       vandnpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x double> %a0 to <4 x i64>
  %3 = bitcast <4 x double> %a1 to <4 x i64>
  %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
  %5 = and <4 x i64> %4, %3
  %6 = bitcast <4 x i64> %5 to <4 x double>
  ; fadd forces execution domain
  %7 = fadd <4 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %7
}

define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andnps
  ;CHECK:       vandnps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
  %5 = and <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <4 x float>
  ; fadd forces execution domain
  %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %7
}

define <8 x float> @stack_fold_andnps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andnps_ymm
  ;CHECK:       vandnps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
  %5 = and <4 x i64> %4, %3
  %6 = bitcast <4 x i64> %5 to <8 x float>
  ; fadd forces execution domain
  %7 = fadd <8 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %7
}

define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andpd
  ;CHECK:       vandpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andpd_ymm
  ;CHECK:       vandpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x double> %a0 to <4 x i64>
  %3 = bitcast <4 x double> %a1 to <4 x i64>
  %4 = and <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <4 x double>
  ; fadd forces execution domain
  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %6
}

define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andps
  ;CHECK:       vandps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}

define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andps_ymm
  ;CHECK:       vandps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = and <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <8 x float>
  ; fadd forces execution domain
  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %6
}

define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_blendpd
  ;CHECK:       vblendpd $2, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}

define <4 x double> @stack_fold_blendpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_blendpd_ymm
  ;CHECK:       vblendpd $6, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %a0, <4 x double> %a1
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_blendps
  ;CHECK:       vblendps $6, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}

define <8 x float> @stack_fold_blendps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_blendps_ymm
  ;CHECK:       vblendps $102, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %a0, <8 x float> %a1
  ; fadd forces execution domain
  %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %3
}

define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
  ;CHECK-LABEL: stack_fold_blendvpd
  ;CHECK:       vblendvpd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_blendvpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %c) {
  ;CHECK-LABEL: stack_fold_blendvpd_ymm
  ;CHECK:       vblendvpd {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a1, <4 x double> %c, <4 x double> %a0)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
  ;CHECK-LABEL: stack_fold_blendvps
  ;CHECK:       vblendvps {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_blendvps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %c) {
  ;CHECK-LABEL: stack_fold_blendvps_ymm
  ;CHECK:       vblendvps {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a1, <8 x float> %c, <8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone

define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_cmppd
  ;CHECK:       vcmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x double> @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_cmppd_ymm
  ;CHECK:       vcmpeqpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_cmpps
  ;CHECK:       vcmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x float> @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_cmpps_ymm
  ;CHECK:       vcmpeqps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define i32 @stack_fold_cmpsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_cmpsd
  ;CHECK:       vcmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp oeq double %a0, %a1
  %3 = zext i1 %2 to i32
  ret i32 %3
}

define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_cmpsd_int
  ;CHECK:       vcmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone

define i32 @stack_fold_cmpss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_cmpss
  ;CHECK:       vcmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp oeq float %a0, %a1
  %3 = zext i1 %2 to i32
  ret i32 %3
}

define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_cmpss_int
  ;CHECK:       vcmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

; TODO stack_fold_comisd

define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_comisd_int
  ;CHECK:       vcomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone

; TODO stack_fold_comiss

define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_comiss_int
  ;CHECK:       vcomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2pd
  ;CHECK:       vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %3 = sitofp <2 x i32> %2 to <2 x double>
  ret <2 x double> %3
}

define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2pd_int
  ;CHECK:       vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a0, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i32> %2 to <2 x double>
  ret <2 x double> %cvt
}

define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm
  ;CHECK:       vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sitofp <4 x i32> %a0 to <4 x double>
  ret <4 x double> %2
}

define <4 x double> @stack_fold_cvtdq2pd_ymm_int(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm_int
  ;CHECK:       vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %cvt = sitofp <4 x i32> %a0 to <4 x double>
  ret <4 x double> %cvt
}

define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2ps
  ;CHECK:       vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sitofp <4 x i32> %a0 to <4 x float>
  ret <4 x float> %2
}

define <8 x float> @stack_fold_cvtdq2ps_ymm(<8 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2ps_ymm
  ;CHECK:       vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sitofp <8 x i32> %a0 to <8 x float>
  ret <8 x float> %2
}

define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2dq
  ;CHECK:       vcvtpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone

define <4 x i32> @stack_fold_cvtpd2dq_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2dq_ymm
  ;CHECK:       vcvtpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone

define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2ps
  ;CHECK:       vcvtpd2psx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptrunc <2 x double> %a0 to <2 x float>
  ret <2 x float> %2
}

define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2ps_ymm
  ;CHECK:       vcvtpd2psy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptrunc <4 x double> %a0 to <4 x float>
  ret <4 x float> %2
}

define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) {
  ;CHECK-LABEL: stack_fold_cvtph2ps
  ;CHECK:       vcvtph2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly

define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) {
  ;CHECK-LABEL: stack_fold_cvtph2ps_ymm
  ;CHECK:       vcvtph2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly

define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2dq
  ;CHECK:       vcvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone

define <8 x i32> @stack_fold_cvtps2dq_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2dq_ymm
  ;CHECK:       vcvtps2dq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone

define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2pd
  ;CHECK:       vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %3 = fpext <2 x float> %2 to <2 x double>
  ret <2 x double> %3
}

define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2pd_int
  ;CHECK:       vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
  %cvtps2pd = fpext <2 x float> %2 to <2 x double>
  ret <2 x double> %cvtps2pd
}

define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2pd_ymm
  ;CHECK:       vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fpext <4 x float> %a0 to <4 x double>
  ret <4 x double> %2
}

define <4 x double> @stack_fold_cvtps2pd_ymm_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2pd_ymm_int
  ;CHECK:       vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %cvtps2pd = fpext <4 x float> %a0 to <4 x double>
  ret <4 x double> %cvtps2pd
}

define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2ph_ymm
  ;CHECK:       vcvtps2ph $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
  %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ret <8 x i16> %1
}
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly

; TODO stack_fold_cvtsd2si

define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtsd2si_int
  ;CHECK:       vcvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone

; TODO stack_fold_cvtsd2si64

define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtsd2si64_int
  ;CHECK:       vcvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone

; TODO: This fold shouldn't require optsize. Not folding doesn't prevent reading an undef register since the registers are a mix of XMM and GPR.
define double @stack_fold_cvtsi2sd(i32 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi2sd
  ;CHECK:       vcvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to double
  ret double %2
}

; TODO: This fold shouldn't require optsize. Not folding doesn't prevent reading an undef register since the registers are a mix of XMM and GPR.
define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi2sd_int
  ;CHECK:       vcvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to double
  %3 = insertelement <2 x double> zeroinitializer, double %2, i64 0
  ret <2 x double> %3
}

; TODO: This fold shouldn't require optsize. Not folding doesn't prevent reading an undef register since the registers are a mix of XMM and GPR.
define double @stack_fold_cvtsi642sd(i64 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi642sd
  ;CHECK:       vcvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to double
  ret double %2
}

; TODO: This fold shouldn't require optsize. Not folding doesn't prevent reading an undef register since the registers are a mix of XMM and GPR.
define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi642sd_int
  ;CHECK:       vcvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to double
  %3 = insertelement <2 x double> zeroinitializer, double %2, i64 0
  ret <2 x double> %3
}

; TODO: This fold shouldn't require optsize. Not folding doesn't prevent reading an undef register since the registers are a mix of XMM and GPR.
define float @stack_fold_cvtsi2ss(i32 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi2ss
  ;CHECK:       vcvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to float
  ret float %2
}

; TODO: This fold shouldn't require optsize. Not folding doesn't prevent reading an undef register since the registers are a mix of XMM and GPR.
define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi2ss_int
  ;CHECK:       vcvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to float
  %3 = insertelement <4 x float> zeroinitializer, float %2, i64 0
  ret <4 x float> %3
}

; TODO: This fold shouldn't require optsize. Not folding doesn't prevent reading an undef register since the registers are a mix of XMM and GPR.
define float @stack_fold_cvtsi642ss(i64 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi642ss
  ;CHECK:       vcvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to float
  ret float %2
}

; TODO: This fold shouldn't require optsize. Not folding doesn't prevent reading an undef register since the registers are a mix of XMM and GPR.
define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi642ss_int
  ;CHECK:       vcvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to float
  %3 = insertelement <4 x float> zeroinitializer, float %2, i64 0
  ret <4 x float> %3
}

; TODO stack_fold_cvtss2si

define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtss2si_int
  ;CHECK:       vcvtss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

; TODO stack_fold_cvtss2si64

define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtss2si64_int
  ;CHECK:       vcvtss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone

define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvttpd2dq
  ;CHECK:       vcvttpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone

define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvttpd2dq_ymm
  ;CHECK:       vcvttpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi <4 x double> %a0 to <4 x i32>
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvttps2dq
  ;CHECK:       vcvttps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi <4 x float> %a0 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_cvttps2dq_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvttps2dq_ymm
  ;CHECK:       vcvttps2dq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi <8 x float> %a0 to <8 x i32>
  ret <8 x i32> %2
}
    710 
    711 define i32 @stack_fold_cvttsd2si(double %a0) {
    712   ;CHECK-LABEL: stack_fold_cvttsd2si
    713   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 8-byte Folded Reload
    714   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    715   %2 = fptosi double %a0 to i32
    716   ret i32 %2
    717 }
    718 
    719 define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
    720   ;CHECK-LABEL: stack_fold_cvttsd2si_int
    721   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    722   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    723   %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
    724   ret i32 %2
    725 }
    726 declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
    727 
    728 define i64 @stack_fold_cvttsd2si64(double %a0) {
    729   ;CHECK-LABEL: stack_fold_cvttsd2si64
    730   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
    731   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    732   %2 = fptosi double %a0 to i64
    733   ret i64 %2
    734 }
    735 
    736 define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
    737   ;CHECK-LABEL: stack_fold_cvttsd2si64_int
    738   ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    739   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    740   %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
    741   ret i64 %2
    742 }
    743 declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
    744 
    745 define i32 @stack_fold_cvttss2si(float %a0) {
    746   ;CHECK-LABEL: stack_fold_cvttss2si
    747   ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
    748   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    749   %2 = fptosi float %a0 to i32
    750   ret i32 %2
    751 }
    752 
    753 define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
    754   ;CHECK-LABEL: stack_fold_cvttss2si_int
    755   ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
    756   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    757   %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
    758   ret i32 %2
    759 }
    760 declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
    761 
    762 define i64 @stack_fold_cvttss2si64(float %a0) {
    763   ;CHECK-LABEL: stack_fold_cvttss2si64
    764   ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
    765   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    766   %2 = fptosi float %a0 to i64
    767   ret i64 %2
    768 }
    769 
    770 define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
    771   ;CHECK-LABEL: stack_fold_cvttss2si64_int
    772   ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
    773   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    774   %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
    775   ret i64 %2
    776 }
    777 declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
    778 
    779 define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
    780   ;CHECK-LABEL: stack_fold_divpd
    781   ;CHECK:       vdivpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    782   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    783   %2 = fdiv <2 x double> %a0, %a1
    784   ret <2 x double> %2
    785 }
    786 
    787 define <4 x double> @stack_fold_divpd_ymm(<4 x double> %a0, <4 x double> %a1) {
    788   ;CHECK-LABEL: stack_fold_divpd_ymm
    789   ;CHECK:       vdivpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    790   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    791   %2 = fdiv <4 x double> %a0, %a1
    792   ret <4 x double> %2
    793 }
    794 
    795 define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
    796   ;CHECK-LABEL: stack_fold_divps
    797   ;CHECK:       vdivps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    798   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    799   %2 = fdiv <4 x float> %a0, %a1
    800   ret <4 x float> %2
    801 }
    802 
    803 define <8 x float> @stack_fold_divps_ymm(<8 x float> %a0, <8 x float> %a1) {
    804   ;CHECK-LABEL: stack_fold_divps_ymm
    805   ;CHECK:       vdivps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    806   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    807   %2 = fdiv <8 x float> %a0, %a1
    808   ret <8 x float> %2
    809 }
    810 
    811 define double @stack_fold_divsd(double %a0, double %a1) {
    812   ;CHECK-LABEL: stack_fold_divsd
    813   ;CHECK:       vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    814   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    815   %2 = fdiv double %a0, %a1
    816   ret double %2
    817 }
    818 
define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_divsd_int
  ;CHECK:       vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fdiv double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_divss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_divss
  ;CHECK:       vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_divss_int
  ;CHECK:       vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fdiv float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_dppd
  ;CHECK:       vdppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_dpps
  ;CHECK:       vdpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_dpps_ymm
  ;CHECK:       vdpps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_extractf128
  ;CHECK:       vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
  %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
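  ; The asm below also clobbers xmm1, so the extracted upper half of %a0 must
  ; survive in a stack slot; here the vextractf128 folds into the spill store
  ; itself rather than into a reload.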
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ret <4 x float> %1
}

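; Moving from xmm to a GPR: the asm clobbers every general-purpose register,
; so the extracted i32 lives in a stack slot, written by a folded vextractps
; store and reloaded into %eax with a plain movl.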
define i32 @stack_fold_extractps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_extractps
  ;CHECK:       vextractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
  ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
  ; fadd forces execution domain
  %1 = fadd <4 x float> %a0, <float 1.0, float 2.0, float 3.0, float 4.0>
  %2 = extractelement <4 x float> %1, i32 1
  %3 = bitcast float %2 to i32
  %4 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %3
}

define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_haddpd
  ;CHECK:       vhaddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_haddpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_haddpd_ymm
  ;CHECK:       vhaddpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_haddps
  ;CHECK:       vhaddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_haddps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_haddps_ymm
  ;CHECK:       vhaddps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_hsubpd
  ;CHECK:       vhsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x double> @stack_fold_hsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_hsubpd_ymm
  ;CHECK:       vhsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_hsubps
  ;CHECK:       vhsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @stack_fold_hsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_hsubps_ymm
  ;CHECK:       vhsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <8 x float> @stack_fold_insertf128(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_insertf128
  ;CHECK:       vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %2
}

define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_insertps
  ;CHECK:       vinsertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  ;CHECK-NEXT:                                                                              {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
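  ; i8 209 = 0b11010001: source lane 3, destination lane 1, zero-mask 0001.
  ; When the operand is folded to a stack reload the selected lane is loaded
  ; directly, so the source-select bits are rewritten to 0, which is why the
  ; expected immediate above is $17 (0b00010001).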
  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone

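; x86 max/min are not commutative: on NaN or signed-zero ties they return the
; second operand, so the plain #0 tests must keep the spilled value on the
; memory side. The "_commutable" twins presumably run with a fast-math style
; attribute (#1, defined at the end of the file) that lets the folder commute
; the operands. The scalar tests use the fcmp ogt/olt + select pattern that
; the backend matches to vmaxsd/vminsd and friends.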
define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxpd
  ;CHECK:       vmaxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone

define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
  ;CHECK-LABEL: stack_fold_maxpd_commutable
  ;CHECK:       vmaxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}

define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxpd_ymm
  ;CHECK:       vmaxpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
  ;CHECK-LABEL: stack_fold_maxpd_ymm_commutable
  ;CHECK:       vmaxpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}

define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxps
  ;CHECK:       vmaxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
  ;CHECK-LABEL: stack_fold_maxps_commutable
  ;CHECK:       vmaxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}

define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxps_ymm
  ;CHECK:       vmaxps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
  ;CHECK-LABEL: stack_fold_maxps_ymm_commutable
  ;CHECK:       vmaxps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}

define double @stack_fold_maxsd(double %a0, double %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxsd
  ;CHECK:       vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
  ;CHECK-LABEL: stack_fold_maxsd_commutable
  ;CHECK:       vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxsd_int
  ;CHECK:       vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_maxss(float %a0, float %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxss
  ;CHECK:       vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
  ;CHECK-LABEL: stack_fold_maxss_commutable
  ;CHECK:       vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxss_int
  ;CHECK:       vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone

define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
  ;CHECK-LABEL: stack_fold_minpd
  ;CHECK:       vminpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
  ;CHECK-LABEL: stack_fold_minpd_commutable
  ;CHECK:       vminpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}

define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
  ;CHECK-LABEL: stack_fold_minpd_ymm
  ;CHECK:       vminpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
  ;CHECK-LABEL: stack_fold_minpd_ymm_commutable
  ;CHECK:       vminpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %2
}

define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
  ;CHECK-LABEL: stack_fold_minps
  ;CHECK:       vminps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
  ;CHECK-LABEL: stack_fold_minps_commutable
  ;CHECK:       vminps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}

define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
  ;CHECK-LABEL: stack_fold_minps_ymm
  ;CHECK:       vminps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
  ;CHECK-LABEL: stack_fold_minps_ymm_commutable
  ;CHECK:       vminps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %2
}

define double @stack_fold_minsd(double %a0, double %a1) #0 {
  ;CHECK-LABEL: stack_fold_minsd
  ;CHECK:       vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
  ;CHECK-LABEL: stack_fold_minsd_commutable
  ;CHECK:       vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_minsd_int
  ;CHECK:       vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_minss(float %a0, float %a1) #0 {
  ;CHECK-LABEL: stack_fold_minss
  ;CHECK:       vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
  ;CHECK-LABEL: stack_fold_minss_commutable
  ;CHECK:       vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
  ;CHECK-LABEL: stack_fold_minss_int
  ;CHECK:       vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

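; Single-input shuffles: these tests also clobber xmm1, so %a0 itself is
; spilled and the duplicating shuffle folds as a full reload of the slot.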
define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_movddup
  ;CHECK:   vmovddup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %2
}

define <4 x double> @stack_fold_movddup_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_movddup_ymm
  ;CHECK:   vmovddup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %2
}

; TODO stack_fold_movhpd (load / store)
; TODO stack_fold_movhps (load / store)

; TODO stack_fold_movlpd (load / store)
; TODO stack_fold_movlps (load / store)

define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movshdup
  ;CHECK:   vmovshdup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %2
}

define <8 x float> @stack_fold_movshdup_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movshdup_ymm
  ;CHECK:   vmovshdup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %2
}

define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movsldup
  ;CHECK:   vmovsldup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %2
}

define <8 x float> @stack_fold_movsldup_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movsldup_ymm
  ;CHECK:   vmovsldup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %2
}

define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulpd
  ;CHECK:       vmulpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x double> @stack_fold_mulpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulpd_ymm
  ;CHECK:       vmulpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <4 x double> %a0, %a1
  ret <4 x double> %2
}

define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulps
  ;CHECK:       vmulps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <4 x float> %a0, %a1
  ret <4 x float> %2
}

define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulps_ymm
  ;CHECK:       vmulps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <8 x float> %a0, %a1
  ret <8 x float> %2
}

define double @stack_fold_mulsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_mulsd
  ;CHECK:       vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulsd_int
  ;CHECK:       vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fmul double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_mulss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_mulss
  ;CHECK:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulss_int
  ;CHECK:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fmul float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

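; Logic ops on FP vectors go through integer bitcasts; the trailing fadd
; pins the result to the float execution domain, so the folded instruction
; must be vorpd/vorps rather than the integer-domain vpor.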
define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_orpd
  ;CHECK:       vorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_orpd_ymm
  ;CHECK:       vorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x double> %a0 to <4 x i64>
  %3 = bitcast <4 x double> %a1 to <4 x i64>
  %4 = or <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <4 x double>
  ; fadd forces execution domain
  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %6
}

define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_orps
  ;CHECK:       vorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}

define <8 x float> @stack_fold_orps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_orps_ymm
  ;CHECK:       vorps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = or <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <8 x float>
  ; fadd forces execution domain
  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %6
}

define <8 x float> @stack_fold_perm2f128(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_perm2f128
  ;CHECK:   vperm2f128 $33, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
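  ; $33 = 0x21: bits [1:0] = 1 select %a0's high half for the low lane and
  ; bits [5:4] = 2 select %a1's low half for the high lane, i.e. <4..7,8..11>.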
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %2
}

define <2 x double> @stack_fold_permilpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_permilpd
  ;CHECK:   vpermilpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x double> %2
}

define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_permilpd_ymm
  ;CHECK:   vpermilpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
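  ; $5 = 0b0101: swap the two doubles within each 128-bit lane, i.e. <1,0,3,2>.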
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %2
}

define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {
  ;CHECK-LABEL: stack_fold_permilpdvar
  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone

define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {
  ;CHECK-LABEL: stack_fold_permilpdvar_ymm
  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone

define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_permilps
  ;CHECK:   vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %2
}

define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_permilps_ymm
  ;CHECK:   vpermilps $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x float> %2
}

define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {
  ;CHECK-LABEL: stack_fold_permilpsvar
  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone

define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {
  ;CHECK-LABEL: stack_fold_permilpsvar_ymm
  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone

; TODO stack_fold_rcpps

define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rcpps_int
  ;CHECK:       vrcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

; TODO stack_fold_rcpps_ymm

define <8 x float> @stack_fold_rcpps_ymm_int(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rcpps_ymm_int
  ;CHECK:       vrcpps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone

; TODO stack_fold_rcpss
; TODO stack_fold_rcpss_int

define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_roundpd
  ;CHECK:  vroundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <4 x double> @stack_fold_roundpd_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_roundpd_ymm
  ;CHECK:  vroundpd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_roundps
  ;CHECK:  vroundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_roundps_ymm
  ;CHECK:  vroundps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

define double @stack_fold_roundsd(double %a0) optsize {
  ;CHECK-LABEL: stack_fold_roundsd
  ;CHECK:       vroundsd $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
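  ; llvm.floor.f64 lowers to vroundsd $9: imm bits [1:0] = 01 round toward
  ; -infinity and bit 3 suppresses the precision exception. The optsize
  ; attribute matters here: without it the backend avoids folding the load
  ; to sidestep the scalar op's partial-register-update dependency.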
  %2 = call double @llvm.floor.f64(double %a0)
  ret double %2
}
declare double @llvm.floor.f64(double) nounwind readnone

define <2 x double> @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) optsize {
  ;CHECK-LABEL: stack_fold_roundsd_int
  ;CHECK:       vroundsd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

define float @stack_fold_roundss(float %a0) optsize {
  ;CHECK-LABEL: stack_fold_roundss
  ;CHECK:       vroundss $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call float @llvm.floor.f32(float %a0)
  ret float %2
}
declare float @llvm.floor.f32(float) nounwind readnone

define <4 x float> @stack_fold_roundss_int(<4 x float> %a0, <4 x float> %a1) optsize {
  ;CHECK-LABEL: stack_fold_roundss_int
  ;CHECK:       vroundss $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

; TODO stack_fold_rsqrtps

define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rsqrtps_int
  ;CHECK:       vrsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

; TODO stack_fold_rsqrtps_ymm

define <8 x float> @stack_fold_rsqrtps_ymm_int(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rsqrtps_ymm_int
  ;CHECK:       vrsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone

; TODO stack_fold_rsqrtss
; TODO stack_fold_rsqrtss_int

define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_shufpd
  ;CHECK:       vshufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}

define <4 x double> @stack_fold_shufpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_shufpd_ymm
  ;CHECK:       vshufpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_shufps
  ;CHECK:       vshufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
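  ; $200 = 0b11001000: picks a0[0], a0[2], a1[0], a1[3], i.e. mask <0,2,4,7>.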
   1628   %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
   1629   ret <4 x float> %2
   1630 }
   1631 
   1632 define <8 x float> @stack_fold_shufps_ymm(<8 x float> %a0, <8 x float> %a1) {
   1633   ;CHECK-LABEL: stack_fold_shufps_ymm
   1634   ;CHECK:       vshufps $148, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 13, i32 14>
  ret <8 x float> %2
}

define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtpd
  ;CHECK:       vsqrtpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

define <4 x double> @stack_fold_sqrtpd_ymm(<4 x double> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtpd_ymm
  ;CHECK:       vsqrtpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0)
  ret <4 x double> %2
}
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtps
  ;CHECK:       vsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

define <8 x float> @stack_fold_sqrtps_ymm(<8 x float> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtps_ymm
  ;CHECK:       vsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)

define double @stack_fold_sqrtsd(double %a0) optsize {
  ;CHECK-LABEL: stack_fold_sqrtsd
  ;CHECK:       vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
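  ; Note: optsize is assumed to be what permits the fold here (and in stack_fold_sqrtss
  ; below); scalar sqrt normally keeps the load unfolded to avoid the partial-register-
  ; update stall workaround that the folded form would trigger.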
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call double @llvm.sqrt.f64(double %a0)
  ret double %2
}
declare double @llvm.sqrt.f64(double) nounwind readnone

; TODO stack_fold_sqrtsd_int

define float @stack_fold_sqrtss(float %a0) optsize {
  ;CHECK-LABEL: stack_fold_sqrtss
  ;CHECK:       vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call float @llvm.sqrt.f32(float %a0)
  ret float %2
}
declare float @llvm.sqrt.f32(float) nounwind readnone

; TODO stack_fold_sqrtss_int

define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_subpd
  ;CHECK:       vsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x double> @stack_fold_subpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_subpd_ymm
  ;CHECK:       vsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <4 x double> %a0, %a1
  ret <4 x double> %2
}

define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_subps
  ;CHECK:       vsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <4 x float> %a0, %a1
  ret <4 x float> %2
}

define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_subps_ymm
  ;CHECK:       vsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <8 x float> %a0, %a1
  ret <8 x float> %2
}

define double @stack_fold_subsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_subsd
  ;CHECK:       vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_subsd_int
  ;CHECK:       vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fsub double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_subss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_subss
  ;CHECK:       vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_subss_int
  ;CHECK:       vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fsub float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_testpd
  ;CHECK:       vtestpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
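  ; Note: the vtestc.* intrinsics used in these tests are understood to return the CF
  ; flag computed by vtestpd/vtestps, so the flag-setting instruction is the fold target.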
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @stack_fold_testpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_testpd_ymm
  ;CHECK:       vtestpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_testps
  ;CHECK:       vtestps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @stack_fold_testps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_testps_ymm
  ;CHECK:       vtestps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @stack_fold_ucomisd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_ucomisd
  ;CHECK:       vucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
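  ; Note: ueq should be decidable from ZF alone after vucomisd (an unordered result also
  ; sets ZF), so the select of 1 or -1 needs no extra parity check.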
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq double %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}

define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_ucomisd_int
  ;CHECK:       vucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone

define i32 @stack_fold_ucomiss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_ucomiss
  ;CHECK:       vucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq float %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}

define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_ucomiss_int
  ;CHECK:       vucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone

define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhpd
  ;CHECK:       vunpckhpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}

define <4 x double> @stack_fold_unpckhpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhpd_ymm
  ;CHECK:       vunpckhpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhps
  ;CHECK:       vunpckhps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}

define <8 x float> @stack_fold_unpckhps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhps_ymm
  ;CHECK:       vunpckhps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  ; fadd forces execution domain
  %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %3
}

define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklpd
  ;CHECK:       vunpcklpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}

define <4 x double> @stack_fold_unpcklpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklpd_ymm
  ;CHECK:       vunpcklpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklps
  ;CHECK:       vunpcklps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}

define <8 x float> @stack_fold_unpcklps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklps_ymm
  ;CHECK:       vunpcklps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ; fadd forces execution domain
  %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %3
}

define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_xorpd
  ;CHECK:       vxorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
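  ; Note: the xor below is expressed on integer bitcasts; the trailing zero fadd is
  ; what is expected to keep selection in the FP domain (vxorpd rather than vpxor).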
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
  ;CHECK-LABEL: stack_fold_xorpd_ymm
  ;CHECK:       vxorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x double> %a0 to <4 x i64>
  %3 = bitcast <4 x double> %a1 to <4 x i64>
  %4 = xor <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <4 x double>
  ; fadd forces execution domain
  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %6
}

define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_xorps
  ;CHECK:       vxorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}

define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
  ;CHECK-LABEL: stack_fold_xorps_ymm
  ;CHECK:       vxorps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x float> %a0 to <4 x i64>
  %3 = bitcast <8 x float> %a1 to <4 x i64>
  %4 = xor <4 x i64> %2, %3
  %5 = bitcast <4 x i64> %4 to <8 x float>
  ; fadd forces execution domain
  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %6
}

attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" }