Home | History | Annotate | Download | only in X86
      1 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 < %s | FileCheck %s
      2 
      3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
      4 target triple = "x86_64-unknown-unknown"
      5 
      6 ; Stack reload folding tests.
      7 ;
      8 ; By including a nop call with sideeffects we can force a partial register spill of the
      9 ; relevant registers and check that the reload is correctly folded into the instruction.
     10 
     11 define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) {
     12   ;CHECK-LABEL: stack_fold_broadcastsd_ymm
     13   ;CHECK:       vbroadcastsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     14   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     15   %2 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
     16   ; fadd forces execution domain
     17   %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
     18   ret <4 x double> %3
     19 }
     20 
     21 define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) {
     22   ;CHECK-LABEL: stack_fold_broadcastss
     23   ;CHECK:       vbroadcastss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     24   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     25   %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
     26   ; fadd forces execution domain
     27   %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
     28   ret <4 x float> %3
     29 }
     30 
     31 define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) {
     32   ;CHECK-LABEL: stack_fold_broadcastss_ymm
     33   ;CHECK:       vbroadcastss {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     34   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     35   %2 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
     36   ; fadd forces execution domain
     37   %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
     38   ret <8 x float> %3
     39 }
     40 
     41 define <4 x i32> @stack_fold_extracti128(<8 x i32> %a0, <8 x i32> %a1) {
     42   ;CHECK-LABEL: stack_fold_extracti128
     43   ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
     44   ; add forces execution domain
     45   %1 = add <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
     46   %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     47   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     48   ret <4 x i32> %2
     49 }
     50 
     51 define <8 x i32> @stack_fold_inserti128(<4 x i32> %a0, <4 x i32> %a1) {
     52   ;CHECK-LABEL: stack_fold_inserti128
     53   ;CHECK:       vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     54   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     55   %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
     56   ; add forces execution domain
     57   %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
     58   ret <8 x i32> %3
     59 }
     60 
     61 define <16 x i16> @stack_fold_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
     62   ;CHECK-LABEL: stack_fold_mpsadbw
     63   ;CHECK:       vmpsadbw $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
     64   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     65   %2 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7)
     66   ret <16 x i16> %2
     67 }
     68 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
     69 
     70 define <32 x i8> @stack_fold_pabsb(<32 x i8> %a0) {
     71   ;CHECK-LABEL: stack_fold_pabsb
     72   ;CHECK:       vpabsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
     73   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     74   %2 = icmp sgt <32 x i8> %a0, zeroinitializer
     75   %3 = sub <32 x i8> zeroinitializer, %a0
     76   %4 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %3
     77   ret <32 x i8> %4
     78 }
     79 
     80 define <8 x i32> @stack_fold_pabsd(<8 x i32> %a0) {
     81   ;CHECK-LABEL: stack_fold_pabsd
     82   ;CHECK:       vpabsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
     83   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     84   %2 = icmp sgt <8 x i32> %a0, zeroinitializer
     85   %3 = sub <8 x i32> zeroinitializer, %a0
     86   %4 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %3
     87   ret <8 x i32> %4
     88 }
     89 
     90 define <16 x i16> @stack_fold_pabsw(<16 x i16> %a0) {
     91   ;CHECK-LABEL: stack_fold_pabsw
     92   ;CHECK:       vpabsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
     93   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     94   %2 = icmp sgt <16 x i16> %a0, zeroinitializer
     95   %3 = sub <16 x i16> zeroinitializer, %a0
     96   %4 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %3
     97   ret <16 x i16> %4
     98 }
     99 
    100 define <16 x i16> @stack_fold_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
    101   ;CHECK-LABEL: stack_fold_packssdw
    102   ;CHECK:       vpackssdw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    103   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    104   %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
    105   ret <16 x i16> %2
    106 }
    107 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
    108 
    109 define <32 x i8> @stack_fold_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
    110   ;CHECK-LABEL: stack_fold_packsswb
    111   ;CHECK:       vpacksswb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    112   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    113   %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
    114   ret <32 x i8> %2
    115 }
    116 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
    117 
    118 define <16 x i16> @stack_fold_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
    119   ;CHECK-LABEL: stack_fold_packusdw
    120   ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    121   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    122   %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
    123   ret <16 x i16> %2
    124 }
    125 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
    126 
    127 define <32 x i8> @stack_fold_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
    128   ;CHECK-LABEL: stack_fold_packuswb
    129   ;CHECK:       vpackuswb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    130   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    131   %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
    132   ret <32 x i8> %2
    133 }
    134 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
    135 
    136 define <32 x i8> @stack_fold_paddb(<32 x i8> %a0, <32 x i8> %a1) {
    137   ;CHECK-LABEL: stack_fold_paddb
    138   ;CHECK:       vpaddb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    139   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    140   %2 = add <32 x i8> %a0, %a1
    141   ret <32 x i8> %2
    142 }
    143 
    144 define <8 x i32> @stack_fold_paddd(<8 x i32> %a0, <8 x i32> %a1) {
    145   ;CHECK-LABEL: stack_fold_paddd
    146   ;CHECK:       vpaddd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    147   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    148   %2 = add <8 x i32> %a0, %a1
    149   ret <8 x i32> %2
    150 }
    151 
    152 define <4 x i64> @stack_fold_paddq(<4 x i64> %a0, <4 x i64> %a1) {
    153   ;CHECK-LABEL: stack_fold_paddq
    154   ;CHECK:       vpaddq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    155   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    156   %2 = add <4 x i64> %a0, %a1
    157   ret <4 x i64> %2
    158 }
    159 
    160 define <32 x i8> @stack_fold_paddsb(<32 x i8> %a0, <32 x i8> %a1) {
    161   ;CHECK-LABEL: stack_fold_paddsb
    162   ;CHECK:       vpaddsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    163   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    164   %2 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1)
    165   ret <32 x i8> %2
    166 }
    167 declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
    168 
    169 define <16 x i16> @stack_fold_paddsw(<16 x i16> %a0, <16 x i16> %a1) {
    170   ;CHECK-LABEL: stack_fold_paddsw
    171   ;CHECK:       vpaddsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    172   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    173   %2 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1)
    174   ret <16 x i16> %2
    175 }
    176 declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
    177 
    178 define <32 x i8> @stack_fold_paddusb(<32 x i8> %a0, <32 x i8> %a1) {
    179   ;CHECK-LABEL: stack_fold_paddusb
    180   ;CHECK:       vpaddusb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    181   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    182   %2 = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1)
    183   ret <32 x i8> %2
    184 }
    185 declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
    186 
    187 define <16 x i16> @stack_fold_paddusw(<16 x i16> %a0, <16 x i16> %a1) {
    188   ;CHECK-LABEL: stack_fold_paddusw
    189   ;CHECK:       vpaddusw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    190   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    191   %2 = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1)
    192   ret <16 x i16> %2
    193 }
    194 declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
    195 
    196 define <16 x i16> @stack_fold_paddw(<16 x i16> %a0, <16 x i16> %a1) {
    197   ;CHECK-LABEL: stack_fold_paddw
    198   ;CHECK:       vpaddw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    199   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    200   %2 = add <16 x i16> %a0, %a1
    201   ret <16 x i16> %2
    202 }
    203 
    204 define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) {
    205   ;CHECK-LABEL: stack_fold_palignr
    206   ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    207   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    208   %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
    209   ret <32 x i8> %2
    210 }
    211 
    212 define <32 x i8> @stack_fold_pand(<32 x i8> %a0, <32 x i8> %a1) {
    213   ;CHECK-LABEL: stack_fold_pand
    214   ;CHECK:       vpand {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    215   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    216   %2 = and <32 x i8> %a0, %a1
    217   ; add forces execution domain
    218   %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
    219   ret <32 x i8> %3
    220 }
    221 
    222 define <32 x i8> @stack_fold_pandn(<32 x i8> %a0, <32 x i8> %a1) {
    223   ;CHECK-LABEL: stack_fold_pandn
    224   ;CHECK:       vpandn {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    225   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    226   %2 = xor <32 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
    227   %3 = and <32 x i8> %2, %a1
    228   ; add forces execution domain
    229   %4 = add <32 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
    230   ret <32 x i8> %4
    231 }
    232 
    233 define <32 x i8> @stack_fold_pavgb(<32 x i8> %a0, <32 x i8> %a1) {
    234   ;CHECK-LABEL: stack_fold_pavgb
    235   ;CHECK:       vpavgb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    236   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    237   %2 = zext <32 x i8> %a0 to <32 x i16>
    238   %3 = zext <32 x i8> %a1 to <32 x i16>
    239   %4 = add <32 x i16> %2, %3
    240   %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    241   %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    242   %7 = trunc <32 x i16> %6 to <32 x i8>
    243   ret <32 x i8> %7
    244 }
    245 
    246 define <16 x i16> @stack_fold_pavgw(<16 x i16> %a0, <16 x i16> %a1) {
    247   ;CHECK-LABEL: stack_fold_pavgw
    248   ;CHECK:       vpavgw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    249   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    250   %2 = zext <16 x i16> %a0 to <16 x i32>
    251   %3 = zext <16 x i16> %a1 to <16 x i32>
    252   %4 = add <16 x i32> %2, %3
    253   %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    254   %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    255   %7 = trunc <16 x i32> %6 to <16 x i16>
    256   ret <16 x i16> %7
    257 }
    258 
    259 define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) {
    260   ;CHECK-LABEL: stack_fold_pblendd
    261   ;CHECK:       vpblendd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    262   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    263   %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
    264   ; add forces execution domain
    265   %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
    266   ret <4 x i32> %3
    267 }
    268 
    269 define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
    270   ;CHECK-LABEL: stack_fold_pblendd_ymm
    271   ;CHECK:       vpblendd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    272   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    273   %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
    274   ; add forces execution domain
    275   %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    276   ret <8 x i32> %3
    277 }
    278 
    279 define <32 x i8> @stack_fold_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %c) {
    280   ;CHECK-LABEL: stack_fold_pblendvb
    281   ;CHECK:       vpblendvb {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    282   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    283   %2 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a1, <32 x i8> %c, <32 x i8> %a0)
    284   ret <32 x i8> %2
    285 }
    286 declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
    287 
    288 define <16 x i16> @stack_fold_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
    289   ;CHECK-LABEL: stack_fold_pblendw
    290   ;CHECK:       vpblendw $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    291   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    292   %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
    293   ret <16 x i16> %2
    294 }
    295 
    296 define <16 x i8> @stack_fold_pbroadcastb(<16 x i8> %a0) {
    297   ;CHECK-LABEL: stack_fold_pbroadcastb
    298   ;CHECK:       vpbroadcastb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    299   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    300   %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer
    301   ret <16 x i8> %2
    302 }
    303 
    304 define <32 x i8> @stack_fold_pbroadcastb_ymm(<16 x i8> %a0) {
    305   ;CHECK-LABEL: stack_fold_pbroadcastb_ymm
    306   ;CHECK:       vpbroadcastb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    307   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    308   %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> zeroinitializer
    309   ret <32 x i8> %2
    310 }
    311 
    312 define <4 x i32> @stack_fold_pbroadcastd(<4 x i32> %a0) {
    313   ;CHECK-LABEL: stack_fold_pbroadcastd
    314   ;CHECK:       vpbroadcastd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    315   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    316   %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> zeroinitializer
    317   ; add forces execution domain
    318   %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
    319   ret <4 x i32> %3
    320 }
    321 
    322 define <8 x i32> @stack_fold_pbroadcastd_ymm(<4 x i32> %a0) {
    323   ;CHECK-LABEL: stack_fold_pbroadcastd_ymm
    324   ;CHECK:       vpbroadcastd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    325   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    326   %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> zeroinitializer
    327   ; add forces execution domain
    328   %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    329   ret <8 x i32> %3
    330 }
    331 
    332 define <2 x i64> @stack_fold_pbroadcastq(<2 x i64> %a0) {
    333   ;CHECK-LABEL: stack_fold_pbroadcastq
    334   ;CHECK:       vpbroadcastq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    335   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    336   %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
    337   ; add forces execution domain
    338   %3 = add <2 x i64> %2, <i64 1, i64 1>
    339   ret <2 x i64> %3
    340 }
    341 
    342 define <4 x i64> @stack_fold_pbroadcastq_ymm(<2 x i64> %a0) {
    343   ;CHECK-LABEL: stack_fold_pbroadcastq_ymm
    344   ;CHECK:       vpbroadcastq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    345   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    346   %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
    347   ; add forces execution domain
    348   %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
    349   ret <4 x i64> %3
    350 }
    351 
    352 define <8 x i16> @stack_fold_pbroadcastw(<8 x i16> %a0) {
    353   ;CHECK-LABEL: stack_fold_pbroadcastw
    354   ;CHECK:       vpbroadcastw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    355   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    356   %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
    357   ret <8 x i16> %2
    358 }
    359 
    360 define <16 x i16> @stack_fold_pbroadcastw_ymm(<8 x i16> %a0) {
    361   ;CHECK-LABEL: stack_fold_pbroadcastw_ymm
    362   ;CHECK:       vpbroadcastw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    363   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    364   %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> zeroinitializer
    365   ret <16 x i16> %2
    366 }
    367 
    368 define <32 x i8> @stack_fold_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1) {
    369   ;CHECK-LABEL: stack_fold_pcmpeqb
    370   ;CHECK:       vpcmpeqb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    371   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    372   %2 = icmp eq <32 x i8> %a0, %a1
    373   %3 = sext <32 x i1> %2 to <32 x i8>
    374   ret <32 x i8> %3
    375 }
    376 
    377 define <8 x i32> @stack_fold_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1) {
    378   ;CHECK-LABEL: stack_fold_pcmpeqd
    379   ;CHECK:       vpcmpeqd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    380   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    381   %2 = icmp eq <8 x i32> %a0, %a1
    382   %3 = sext <8 x i1> %2 to <8 x i32>
    383   ret <8 x i32> %3
    384 }
    385 
    386 define <4 x i64> @stack_fold_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1) {
    387   ;CHECK-LABEL: stack_fold_pcmpeqq
    388   ;CHECK:       vpcmpeqq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    389   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    390   %2 = icmp eq <4 x i64> %a0, %a1
    391   %3 = sext <4 x i1> %2 to <4 x i64>
    392   ret <4 x i64> %3
    393 }
    394 
    395 define <16 x i16> @stack_fold_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1) {
    396   ;CHECK-LABEL: stack_fold_pcmpeqw
    397   ;CHECK:       vpcmpeqw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    398   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    399   %2 = icmp eq <16 x i16> %a0, %a1
    400   %3 = sext <16 x i1> %2 to <16 x i16>
    401   ret <16 x i16> %3
    402 }
    403 
    404 define <32 x i8> @stack_fold_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1) {
    405   ;CHECK-LABEL: stack_fold_pcmpgtb
    406   ;CHECK:       vpcmpgtb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    407   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    408   %2 = icmp sgt <32 x i8> %a0, %a1
    409   %3 = sext <32 x i1> %2 to <32 x i8>
    410   ret <32 x i8> %3
    411 }
    412 
    413 define <8 x i32> @stack_fold_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1) {
    414   ;CHECK-LABEL: stack_fold_pcmpgtd
    415   ;CHECK:       vpcmpgtd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    416   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    417   %2 = icmp sgt <8 x i32> %a0, %a1
    418   %3 = sext <8 x i1> %2 to <8 x i32>
    419   ret <8 x i32> %3
    420 }
    421 
    422 define <4 x i64> @stack_fold_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1) {
    423   ;CHECK-LABEL: stack_fold_pcmpgtq
    424   ;CHECK:       vpcmpgtq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    425   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    426   %2 = icmp sgt <4 x i64> %a0, %a1
    427   %3 = sext <4 x i1> %2 to <4 x i64>
    428   ret <4 x i64> %3
    429 }
    430 
    431 define <16 x i16> @stack_fold_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1) {
    432   ;CHECK-LABEL: stack_fold_pcmpgtw
    433   ;CHECK:       vpcmpgtw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    434   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    435   %2 = icmp sgt <16 x i16> %a0, %a1
    436   %3 = sext <16 x i1> %2 to <16 x i16>
    437   ret <16 x i16> %3
    438 }
    439 
    440 define <8 x i32> @stack_fold_perm2i128(<8 x i32> %a0, <8 x i32> %a1) {
    441   ;CHECK-LABEL: stack_fold_perm2i128
    442   ;CHECK:   vperm2i128 $33, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    443   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    444   %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
    445   ; add forces execution domain
    446   %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    447   ret <8 x i32> %3
    448 }
    449 
    450 define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
    451   ;CHECK-LABEL: stack_fold_permd
    452   ;CHECK:   vpermd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    453   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    454   %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
    455   ; add forces execution domain
    456   %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    457   ret <8 x i32> %3
    458 }
    459 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
    460 
    461 define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
    462   ;CHECK-LABEL: stack_fold_permpd
    463   ;CHECK:   vpermpd $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    464   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    465   %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
    466   ; fadd forces execution domain
    467   %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
    468   ret <4 x double> %3
    469 }
    470 
    471 define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) {
    472   ;CHECK-LABEL: stack_fold_permps
    473   ;CHECK:       vpermps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    474   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    475   %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
    476   ret <8 x float> %2
    477 }
    478 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
    479 
    480 define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
    481   ;CHECK-LABEL: stack_fold_permq
    482   ;CHECK:   vpermq $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    483   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    484   %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
    485   ; add forces execution domain
    486   %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
    487   ret <4 x i64> %3
    488 }
    489 
    490 define <8 x i32> @stack_fold_phaddd(<8 x i32> %a0, <8 x i32> %a1) {
    491   ;CHECK-LABEL: stack_fold_phaddd
    492   ;CHECK:       vphaddd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    493   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    494   %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
    495   ret <8 x i32> %2
    496 }
    497 declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
    498 
    499 define <16 x i16> @stack_fold_phaddsw(<16 x i16> %a0, <16 x i16> %a1) {
    500   ;CHECK-LABEL: stack_fold_phaddsw
    501   ;CHECK:       vphaddsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    502   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    503   %2 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1)
    504   ret <16 x i16> %2
    505 }
    506 declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
    507 
    508 define <16 x i16> @stack_fold_phaddw(<16 x i16> %a0, <16 x i16> %a1) {
    509   ;CHECK-LABEL: stack_fold_phaddw
    510   ;CHECK:       vphaddw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    511   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    512   %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
    513   ret <16 x i16> %2
    514 }
    515 declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
    516 
    517 define <8 x i32> @stack_fold_phsubd(<8 x i32> %a0, <8 x i32> %a1) {
    518   ;CHECK-LABEL: stack_fold_phsubd
    519   ;CHECK:       vphsubd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    520   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    521   %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
    522   ret <8 x i32> %2
    523 }
    524 declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
    525 
    526 define <16 x i16> @stack_fold_phsubsw(<16 x i16> %a0, <16 x i16> %a1) {
    527   ;CHECK-LABEL: stack_fold_phsubsw
    528   ;CHECK:       vphsubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    529   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    530   %2 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1)
    531   ret <16 x i16> %2
    532 }
    533 declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
    534 
    535 define <16 x i16> @stack_fold_phsubw(<16 x i16> %a0, <16 x i16> %a1) {
    536   ;CHECK-LABEL: stack_fold_phsubw
    537   ;CHECK:       vphsubw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    538   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    539   %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
    540   ret <16 x i16> %2
    541 }
    542 declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
    543 
    544 define <16 x i16> @stack_fold_pmaddubsw(<32 x i8> %a0, <32 x i8> %a1) {
    545   ;CHECK-LABEL: stack_fold_pmaddubsw
    546   ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    547   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    548   %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
    549   ret <16 x i16> %2
    550 }
    551 declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
    552 
    553 define <8 x i32> @stack_fold_pmaddwd(<16 x i16> %a0, <16 x i16> %a1) {
    554   ;CHECK-LABEL: stack_fold_pmaddwd
    555   ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    556   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    557   %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
    558   ret <8 x i32> %2
    559 }
    560 declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
    561 
    562 define <32 x i8> @stack_fold_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
    563   ;CHECK-LABEL: stack_fold_pmaxsb
    564   ;CHECK:       vpmaxsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    565   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    566   %2 = icmp sgt <32 x i8> %a0, %a1
    567   %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
    568   ret <32 x i8> %3
    569 }
    570 
    571 define <8 x i32> @stack_fold_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
    572   ;CHECK-LABEL: stack_fold_pmaxsd
    573   ;CHECK:       vpmaxsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    574   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    575   %2 = icmp sgt <8 x i32> %a0, %a1
    576   %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
    577   ret <8 x i32> %3
    578 }
    579 
    580 define <16 x i16> @stack_fold_pmaxsw(<16 x i16> %a0, <16 x i16> %a1) {
    581   ;CHECK-LABEL: stack_fold_pmaxsw
    582   ;CHECK:       vpmaxsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    583   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    584   %2 = icmp sgt <16 x i16> %a0, %a1
    585   %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
    586   ret <16 x i16> %3
    587 }
    588 
    589 define <32 x i8> @stack_fold_pmaxub(<32 x i8> %a0, <32 x i8> %a1) {
    590   ;CHECK-LABEL: stack_fold_pmaxub
    591   ;CHECK:       vpmaxub {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    592   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    593   %2 = icmp ugt <32 x i8> %a0, %a1
    594   %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
    595   ret <32 x i8> %3
    596 }
    597 
    598 define <8 x i32> @stack_fold_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
    599   ;CHECK-LABEL: stack_fold_pmaxud
    600   ;CHECK:       vpmaxud {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    601   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    602   %2 = icmp ugt <8 x i32> %a0, %a1
    603   %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
    604   ret <8 x i32> %3
    605 }
    606 
    607 define <16 x i16> @stack_fold_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
    608   ;CHECK-LABEL: stack_fold_pmaxuw
    609   ;CHECK:       vpmaxuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    610   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    611   %2 = icmp ugt <16 x i16> %a0, %a1
    612   %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
    613   ret <16 x i16> %3
    614 }
    615 
    616 define <32 x i8> @stack_fold_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
    617   ;CHECK-LABEL: stack_fold_pminsb
    618   ;CHECK:       vpminsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    619   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    620   %2 = icmp slt <32 x i8> %a0, %a1
    621   %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
    622   ret <32 x i8> %3
    623 }
    624 
    625 define <8 x i32> @stack_fold_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
    626   ;CHECK-LABEL: stack_fold_pminsd
    627   ;CHECK:       vpminsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    628   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    629   %2 = icmp slt <8 x i32> %a0, %a1
    630   %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
    631   ret <8 x i32> %3
    632 }
    633 
    634 define <16 x i16> @stack_fold_pminsw(<16 x i16> %a0, <16 x i16> %a1) {
    635   ;CHECK-LABEL: stack_fold_pminsw
    636   ;CHECK:       vpminsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    637   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    638   %2 = icmp slt <16 x i16> %a0, %a1
    639   %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
    640   ret <16 x i16> %3
    641 }
    642 
    643 define <32 x i8> @stack_fold_pminub(<32 x i8> %a0, <32 x i8> %a1) {
    644   ;CHECK-LABEL: stack_fold_pminub
    645   ;CHECK:       vpminub {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    646   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    647   %2 = icmp ult <32 x i8> %a0, %a1
    648   %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
    649   ret <32 x i8> %3
    650 }
    651 
    652 define <8 x i32> @stack_fold_pminud(<8 x i32> %a0, <8 x i32> %a1) {
    653   ;CHECK-LABEL: stack_fold_pminud
    654   ;CHECK:       vpminud {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    655   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    656   %2 = icmp ult <8 x i32> %a0, %a1
    657   %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
    658   ret <8 x i32> %3
    659 }
    660 
    661 define <16 x i16> @stack_fold_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
    662   ;CHECK-LABEL: stack_fold_pminuw
    663   ;CHECK:       vpminuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    664   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    665   %2 = icmp ult <16 x i16> %a0, %a1
    666   %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
    667   ret <16 x i16> %3
    668 }
    669 
    670 define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
    671   ;CHECK-LABEL: stack_fold_pmovsxbd
    672   ;CHECK:       vpmovsxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    673   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    674   %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    675   %3 = sext <8 x i8> %2 to <8 x i32>
    676   ret <8 x i32> %3
    677 }
    678 
    679 define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
    680   ;CHECK-LABEL: stack_fold_pmovsxbq
    681   ;CHECK:       pmovsxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    682   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    683   %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    684   %3 = sext <4 x i8> %2 to <4 x i64>
    685   ret <4 x i64> %3
    686 }
    687 
    688 define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
    689   ;CHECK-LABEL: stack_fold_pmovsxbw
    690   ;CHECK:       vpmovsxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    691   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    692   %2 = sext <16 x i8> %a0 to <16 x i16>
    693   ret <16 x i16> %2
    694 }
    695 
    696 define <4 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
    697   ;CHECK-LABEL: stack_fold_pmovsxdq
    698   ;CHECK:       vpmovsxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    699   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    700   %2 = sext <4 x i32> %a0 to <4 x i64>
    701   ret <4 x i64> %2
    702 }
    703 
    704 define <8 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
    705   ;CHECK-LABEL: stack_fold_pmovsxwd
    706   ;CHECK:       vpmovsxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    707   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    708   %2 = sext <8 x i16> %a0 to <8 x i32>
    709   ret <8 x i32> %2
    710 }
    711 
    712 define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
    713   ;CHECK-LABEL: stack_fold_pmovsxwq
    714   ;CHECK:       vpmovsxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    715   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    716   %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    717   %3 = sext <4 x i16> %2 to <4 x i64>
    718   ret <4 x i64> %3
    719 }
    720 
    721 define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
    722   ;CHECK-LABEL: stack_fold_pmovzxbd
    723   ;CHECK:       vpmovzxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    724   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    725   %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    726   %3 = zext <8 x i8> %2 to <8 x i32>
    727   ret <8 x i32> %3
    728 }
    729 
    730 define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
    731   ;CHECK-LABEL: stack_fold_pmovzxbq
    732   ;CHECK:       vpmovzxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    733   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    734   %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    735   %3 = zext <4 x i8> %2 to <4 x i64>
    736   ret <4 x i64> %3
    737 }
    738 
    739 define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
    740   ;CHECK-LABEL: stack_fold_pmovzxbw
    741   ;CHECK:       vpmovzxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    742   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    743   %2 = zext <16 x i8> %a0 to <16 x i16>
    744   ret <16 x i16> %2
    745 }
    746 
    747 define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
    748   ;CHECK-LABEL: stack_fold_pmovzxdq
    749   ;CHECK:       vpmovzxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    750   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    751   %2 = zext <4 x i32> %a0 to <4 x i64>
    752   ret <4 x i64> %2
    753 }
    754 
    755 define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
    756   ;CHECK-LABEL: stack_fold_pmovzxwd
    757   ;CHECK:       vpmovzxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    758   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    759   %2 = zext <8 x i16> %a0 to <8 x i32>
    760   ret <8 x i32> %2
    761 }
    762 
    763 define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
    764   ;CHECK-LABEL: stack_fold_pmovzxwq
    765   ;CHECK:       vpmovzxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    766   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    767   %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    768   %3 = zext <4 x i16> %2 to <4 x i64>
    769   ret <4 x i64> %3
    770 }
    771 
    772 define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) {
    773   ;CHECK-LABEL: stack_fold_pmuldq
    774   ;CHECK:       vpmuldq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    775   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    776   %2 = bitcast <8 x i32> %a0 to <4 x i64>
    777   %3 = bitcast <8 x i32> %a1 to <4 x i64>
    778   %4 = shl <4 x i64> %2, <i64 32, i64 32, i64 32, i64 32>
    779   %5 = ashr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
    780   %6 = shl <4 x i64> %3, <i64 32, i64 32, i64 32, i64 32>
    781   %7 = ashr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
    782   %8 = mul <4 x i64> %5, %7
    783   ret <4 x i64> %8
    784 }
    785 
    786 define <16 x i16> @stack_fold_pmulhrsw(<16 x i16> %a0, <16 x i16> %a1) {
    787   ;CHECK-LABEL: stack_fold_pmulhrsw
    788   ;CHECK:       vpmulhrsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    789   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    790   %2 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1)
    791   ret <16 x i16> %2
    792 }
    793 declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
    794 
    795 define <16 x i16> @stack_fold_pmulhuw(<16 x i16> %a0, <16 x i16> %a1) {
    796   ;CHECK-LABEL: stack_fold_pmulhuw
    797   ;CHECK:       vpmulhuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    798   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    799   %2 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1)
    800   ret <16 x i16> %2
    801 }
    802 declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
    803 
    804 define <16 x i16> @stack_fold_pmulhw(<16 x i16> %a0, <16 x i16> %a1) {
    805   ;CHECK-LABEL: stack_fold_pmulhw
    806   ;CHECK:       vpmulhw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    807   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    808   %2 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1)
    809   ret <16 x i16> %2
    810 }
    811 declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
    812 
    813 define <8 x i32> @stack_fold_pmulld(<8 x i32> %a0, <8 x i32> %a1) {
    814   ;CHECK-LABEL: stack_fold_pmulld
    815   ;CHECK:       vpmulld {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    816   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    817   %2 = mul <8 x i32> %a0, %a1
    818   ret <8 x i32> %2
    819 }
    820 
    821 define <16 x i16> @stack_fold_pmullw(<16 x i16> %a0, <16 x i16> %a1) {
    822   ;CHECK-LABEL: stack_fold_pmullw
    823   ;CHECK:       vpmullw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    824   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    825   %2 = mul <16 x i16> %a0, %a1
    826   ret <16 x i16> %2
    827 }
    828 
    829 define <4 x i64> @stack_fold_pmuludq(<8 x i32> %a0, <8 x i32> %a1) {
    830   ;CHECK-LABEL: stack_fold_pmuludq
    831   ;CHECK:       vpmuludq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    832   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    833   %2 = bitcast <8 x i32> %a0 to <4 x i64>
    834   %3 = bitcast <8 x i32> %a1 to <4 x i64>
    835   %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
    836   %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
    837   %6 = mul <4 x i64> %4, %5
    838   ret <4 x i64> %6
    839 }
    840 
    841 define <32 x i8> @stack_fold_por(<32 x i8> %a0, <32 x i8> %a1) {
    842   ;CHECK-LABEL: stack_fold_por
    843   ;CHECK:       vpor {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    844   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    845   %2 = or <32 x i8> %a0, %a1
    846   ; add forces execution domain
    847   %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
    848   ret <32 x i8> %3
    849 }
    850 
    851 define <4 x i64> @stack_fold_psadbw(<32 x i8> %a0, <32 x i8> %a1) {
    852   ;CHECK-LABEL: stack_fold_psadbw
    853   ;CHECK:       vpsadbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    854   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    855   %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
    856   ret <4 x i64> %2
    857 }
    858 declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
    859 
    860 define <32 x i8> @stack_fold_pshufb(<32 x i8> %a0, <32 x i8> %a1) {
    861   ;CHECK-LABEL: stack_fold_pshufb
    862   ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    863   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    864   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
    865   ret <32 x i8> %2
    866 }
    867 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
    868 
    869 define <8 x i32> @stack_fold_pshufd(<8 x i32> %a0) {
    870   ;CHECK-LABEL: stack_fold_pshufd
    871   ;CHECK:       vpshufd $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    872   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    873   %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
    874   ; add forces execution domain
    875   %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    876   ret <8 x i32> %3
    877 }
    878 
    879 define <16 x i16> @stack_fold_vpshufhw(<16 x i16> %a0) {
    880   ;CHECK-LABEL: stack_fold_vpshufhw
    881   ;CHECK:       vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    882   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    883   %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
    884   ret <16 x i16> %2
    885 }
    886 
    887 define <16 x i16> @stack_fold_vpshuflw(<16 x i16> %a0) {
    888   ;CHECK-LABEL: stack_fold_vpshuflw
    889   ;CHECK:       vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    890   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    891   %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
    892   ret <16 x i16> %2
    893 }
    894 
    895 define <32 x i8> @stack_fold_psignb(<32 x i8> %a0, <32 x i8> %a1) {
    896   ;CHECK-LABEL: stack_fold_psignb
    897   ;CHECK:       vpsignb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    898   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    899   %2 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1)
    900   ret <32 x i8> %2
    901 }
    902 declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
    903 
    904 define <8 x i32> @stack_fold_psignd(<8 x i32> %a0, <8 x i32> %a1) {
    905   ;CHECK-LABEL: stack_fold_psignd
    906   ;CHECK:       vpsignd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    907   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    908   %2 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1)
    909   ret <8 x i32> %2
    910 }
    911 declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
    912 
    913 define <16 x i16> @stack_fold_psignw(<16 x i16> %a0, <16 x i16> %a1) {
    914   ;CHECK-LABEL: stack_fold_psignw
    915   ;CHECK:       vpsignw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    916   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    917   %2 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1)
    918   ret <16 x i16> %2
    919 }
    920 declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
    921 
    922 define <8 x i32> @stack_fold_pslld(<8 x i32> %a0, <4 x i32> %a1) {
    923   ;CHECK-LABEL: stack_fold_pslld
    924   ;CHECK:       vpslld {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    925   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    926   %2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
    927   ret <8 x i32> %2
    928 }
    929 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
    930 
    931 define <4 x i64> @stack_fold_psllq(<4 x i64> %a0, <2 x i64> %a1) {
    932   ;CHECK-LABEL: stack_fold_psllq
    933   ;CHECK:       vpsllq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    934   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    935   %2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
    936   ret <4 x i64> %2
    937 }
    938 declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
    939 
    940 define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) {
    941   ;CHECK-LABEL: stack_fold_psllvd
    942   ;CHECK:       vpsllvd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    943   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    944   %2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
    945   ret <4 x i32> %2
    946 }
    947 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
    948 
    949 define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
    950   ;CHECK-LABEL: stack_fold_psllvd_ymm
    951   ;CHECK:       vpsllvd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    952   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    953   %2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
    954   ret <8 x i32> %2
    955 }
    956 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
    957 
    958 define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) {
    959   ;CHECK-LABEL: stack_fold_psllvq
    960   ;CHECK:       vpsllvq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    961   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    962   %2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
    963   ret <2 x i64> %2
    964 }
    965 declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
    966 
    967 define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
    968   ;CHECK-LABEL: stack_fold_psllvq_ymm
    969   ;CHECK:       vpsllvq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
    970   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    971   %2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
    972   ret <4 x i64> %2
    973 }
    974 declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
    975 
    976 define <16 x i16> @stack_fold_psllw(<16 x i16> %a0, <8 x i16> %a1) {
    977   ;CHECK-LABEL: stack_fold_psllw
    978   ;CHECK:       vpsllw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    979   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    980   %2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
    981   ret <16 x i16> %2
    982 }
    983 declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
    984 
    985 define <8 x i32> @stack_fold_psrad(<8 x i32> %a0, <4 x i32> %a1) {
    986   ;CHECK-LABEL: stack_fold_psrad
    987   ;CHECK:       vpsrad {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    988   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    989   %2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
    990   ret <8 x i32> %2
    991 }
    992 declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
    993 
    994 define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) {
    995   ;CHECK-LABEL: stack_fold_psravd
    996   ;CHECK:       vpsravd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    997   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    998   %2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
    999   ret <4 x i32> %2
   1000 }
   1001 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
   1002 
   1003 define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
   1004   ;CHECK-LABEL: stack_fold_psravd_ymm
   1005   ;CHECK:       vpsravd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1006   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1007   %2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
   1008   ret <8 x i32> %2
   1009 }
   1010 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
   1011 
   1012 define <16 x i16> @stack_fold_psraw(<16 x i16> %a0, <8 x i16> %a1) {
   1013   ;CHECK-LABEL: stack_fold_psraw
   1014   ;CHECK:       vpsraw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1015   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1016   %2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
   1017   ret <16 x i16> %2
   1018 }
   1019 declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
   1020 
   1021 define <8 x i32> @stack_fold_psrld(<8 x i32> %a0, <4 x i32> %a1) {
   1022   ;CHECK-LABEL: stack_fold_psrld
   1023   ;CHECK:       vpsrld {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1024   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1025   %2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
   1026   ret <8 x i32> %2
   1027 }
   1028 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
   1029 
   1030 define <4 x i64> @stack_fold_psrlq(<4 x i64> %a0, <2 x i64> %a1) {
   1031   ;CHECK-LABEL: stack_fold_psrlq
   1032   ;CHECK:       vpsrlq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1033   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1034   %2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
   1035   ret <4 x i64> %2
   1036 }
   1037 declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
   1038 
   1039 define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) {
   1040   ;CHECK-LABEL: stack_fold_psrlvd
   1041   ;CHECK:       vpsrlvd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1042   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1043   %2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
   1044   ret <4 x i32> %2
   1045 }
   1046 declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
   1047 
   1048 define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
   1049   ;CHECK-LABEL: stack_fold_psrlvd_ymm
   1050   ;CHECK:       vpsrlvd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1051   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1052   %2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
   1053   ret <8 x i32> %2
   1054 }
   1055 declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
   1056 
   1057 define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) {
   1058   ;CHECK-LABEL: stack_fold_psrlvq
   1059   ;CHECK:       vpsrlvq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1060   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1061   %2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
   1062   ret <2 x i64> %2
   1063 }
   1064 declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
   1065 
   1066 define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
   1067   ;CHECK-LABEL: stack_fold_psrlvq_ymm
   1068   ;CHECK:       vpsrlvq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1069   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1070   %2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
   1071   ret <4 x i64> %2
   1072 }
   1073 declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
   1074 
   1075 define <16 x i16> @stack_fold_psrlw(<16 x i16> %a0, <8 x i16> %a1) {
   1076   ;CHECK-LABEL: stack_fold_psrlw
   1077   ;CHECK:       vpsrlw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1078   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1079   %2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
   1080   ret <16 x i16> %2
   1081 }
   1082 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
   1083 
   1084 define <32 x i8> @stack_fold_psubb(<32 x i8> %a0, <32 x i8> %a1) {
   1085   ;CHECK-LABEL: stack_fold_psubb
   1086   ;CHECK:       vpsubb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1087   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1088   %2 = sub <32 x i8> %a0, %a1
   1089   ret <32 x i8> %2
   1090 }
   1091 
   1092 define <8 x i32> @stack_fold_psubd(<8 x i32> %a0, <8 x i32> %a1) {
   1093   ;CHECK-LABEL: stack_fold_psubd
   1094   ;CHECK:       vpsubd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1095   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1096   %2 = sub <8 x i32> %a0, %a1
   1097   ret <8 x i32> %2
   1098 }
   1099 
   1100 define <4 x i64> @stack_fold_psubq(<4 x i64> %a0, <4 x i64> %a1) {
   1101   ;CHECK-LABEL: stack_fold_psubq
   1102   ;CHECK:       vpsubq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1103   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1104   %2 = sub <4 x i64> %a0, %a1
   1105   ret <4 x i64> %2
   1106 }
   1107 
   1108 define <32 x i8> @stack_fold_psubsb(<32 x i8> %a0, <32 x i8> %a1) {
   1109   ;CHECK-LABEL: stack_fold_psubsb
   1110   ;CHECK:       vpsubsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1111   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1112   %2 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1)
   1113   ret <32 x i8> %2
   1114 }
   1115 declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
   1116 
   1117 define <16 x i16> @stack_fold_psubsw(<16 x i16> %a0, <16 x i16> %a1) {
   1118   ;CHECK-LABEL: stack_fold_psubsw
   1119   ;CHECK:       vpsubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1120   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1121   %2 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1)
   1122   ret <16 x i16> %2
   1123 }
   1124 declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
   1125 
   1126 define <32 x i8> @stack_fold_psubusb(<32 x i8> %a0, <32 x i8> %a1) {
   1127   ;CHECK-LABEL: stack_fold_psubusb
   1128   ;CHECK:       vpsubusb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1129   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1130   %2 = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1)
   1131   ret <32 x i8> %2
   1132 }
   1133 declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
   1134 
   1135 define <16 x i16> @stack_fold_psubusw(<16 x i16> %a0, <16 x i16> %a1) {
   1136   ;CHECK-LABEL: stack_fold_psubusw
   1137   ;CHECK:       vpsubusw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1138   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1139   %2 = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1)
   1140   ret <16 x i16> %2
   1141 }
   1142 declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
   1143 
   1144 define <16 x i16> @stack_fold_psubw(<16 x i16> %a0, <16 x i16> %a1) {
   1145   ;CHECK-LABEL: stack_fold_psubw
   1146   ;CHECK:       vpsubw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1147   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1148   %2 = sub <16 x i16> %a0, %a1
   1149   ret <16 x i16> %2
   1150 }
   1151 
   1152 define <32 x i8> @stack_fold_punpckhbw(<32 x i8> %a0, <32 x i8> %a1) {
   1153   ;CHECK-LABEL: stack_fold_punpckhbw
   1154   ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1155   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1156   %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   1157   ret <32 x i8> %2
   1158 }
   1159 
   1160 define <8 x i32> @stack_fold_punpckhdq(<8 x i32> %a0, <8 x i32> %a1) {
   1161   ;CHECK-LABEL: stack_fold_punpckhdq
   1162   ;CHECK:       vpunpckhdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1163   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1164   %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   1165   ; add forces execution domain
   1166   %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1167   ret <8 x i32> %3
   1168 }
   1169 
   1170 define <4 x i64> @stack_fold_punpckhqdq(<4 x i64> %a0, <4 x i64> %a1) {
   1171   ;CHECK-LABEL: stack_fold_punpckhqdq
   1172   ;CHECK:       vpunpckhqdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1173   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1174   %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   1175   ; add forces execution domain
   1176   %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
   1177   ret <4 x i64> %3
   1178 }
   1179 
   1180 define <16 x i16> @stack_fold_punpckhwd(<16 x i16> %a0, <16 x i16> %a1) {
   1181   ;CHECK-LABEL: stack_fold_punpckhwd
   1182   ;CHECK:       vpunpckhwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1183   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1184   %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   1185   ret <16 x i16> %2
   1186 }
   1187 
   1188 define <32 x i8> @stack_fold_punpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
   1189   ;CHECK-LABEL: stack_fold_punpcklbw
   1190   ;CHECK:       vpunpcklbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1191   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1192   %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
   1193   ret <32 x i8> %2
   1194 }
   1195 
   1196 define <8 x i32> @stack_fold_punpckldq(<8 x i32> %a0, <8 x i32> %a1) {
   1197   ;CHECK-LABEL: stack_fold_punpckldq
   1198   ;CHECK:       vpunpckldq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1199   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1200   %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   1201   ; add forces execution domain
   1202   %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1203   ret <8 x i32> %3
   1204 }
   1205 
   1206 define <4 x i64> @stack_fold_punpcklqdq(<4 x i64> %a0, <4 x i64> %a1) {
   1207   ;CHECK-LABEL: stack_fold_punpcklqdq
   1208   ;CHECK:       vpunpcklqdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1209   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1210   %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   1211   ; add forces execution domain
   1212   %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
   1213   ret <4 x i64> %3
   1214 }
   1215 
   1216 define <16 x i16> @stack_fold_punpcklwd(<16 x i16> %a0, <16 x i16> %a1) {
   1217   ;CHECK-LABEL: stack_fold_punpcklwd
   1218   ;CHECK:       vpunpcklwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1219   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1220   %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
   1221   ret <16 x i16> %2
   1222 }
   1223 
   1224 define <32 x i8> @stack_fold_pxor(<32 x i8> %a0, <32 x i8> %a1) {
   1225   ;CHECK-LABEL: stack_fold_pxor
   1226   ;CHECK:       vpxor {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   1227   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1228   %2 = xor <32 x i8> %a0, %a1
   1229   ; add forces execution domain
   1230   %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   1231   ret <32 x i8> %3
   1232 }
   1233