Home | History | Annotate | Download | only in X86
      1 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+aes,+pclmul < %s | FileCheck %s
      2 
      3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
      4 target triple = "x86_64-unknown-unknown"
      5 
      6 ; Stack reload folding tests.
      7 ;
      8 ; By including a nop call with sideeffects we can force a partial register spill of the
      9 ; relevant registers and check that the reload is correctly folded into the instruction.
     10 
     11 define <2 x i64> @stack_fold_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
     12   ;CHECK-LABEL: stack_fold_aesdec
     13   ;CHECK:       aesdec {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     14   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     15   %2 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1)
     16   ret <2 x i64> %2
     17 }
     18 declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
     19 
     20 define <2 x i64> @stack_fold_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
     21   ;CHECK-LABEL: stack_fold_aesdeclast
     22   ;CHECK:       aesdeclast {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     23   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     24   %2 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1)
     25   ret <2 x i64> %2
     26 }
     27 declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone
     28 
     29 define <2 x i64> @stack_fold_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
     30   ;CHECK-LABEL: stack_fold_aesenc
     31   ;CHECK:       aesenc {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     32   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     33   %2 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
     34   ret <2 x i64> %2
     35 }
     36 declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
     37 
     38 define <2 x i64> @stack_fold_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
     39   ;CHECK-LABEL: stack_fold_aesenclast
     40   ;CHECK:       aesenclast {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     41   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     42   %2 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1)
     43   ret <2 x i64> %2
     44 }
     45 declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone
     46 
     47 define <2 x i64> @stack_fold_aesimc(<2 x i64> %a0) {
     48   ;CHECK-LABEL: stack_fold_aesimc
     49   ;CHECK:       aesimc {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     50   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     51   %2 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0)
     52   ret <2 x i64> %2
     53 }
     54 declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
     55 
     56 define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
     57   ;CHECK-LABEL: stack_fold_aeskeygenassist
     58   ;CHECK:       aeskeygenassist $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
     59   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
     60   %2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
     61   ret <2 x i64> %2
     62 }
     63 declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
     64 
     65 define i32 @stack_fold_crc32_32_8(i32 %a0, i8 %a1) {
     66   ;CHECK-LABEL: stack_fold_crc32_32_8
     67   ;CHECK:       crc32b {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 1-byte Folded Reload
     68   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
     69   %2 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
     70   ret i32 %2
     71 }
     72 declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
     73 
     74 define i32 @stack_fold_crc32_32_16(i32 %a0, i16 %a1) {
     75   ;CHECK-LABEL: stack_fold_crc32_32_16
     76   ;CHECK:       crc32w {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 2-byte Folded Reload
     77   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
     78   %2 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
     79   ret i32 %2
     80 }
     81 declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
     82 
     83 define i32 @stack_fold_crc32_32_32(i32 %a0, i32 %a1) {
     84   ;CHECK-LABEL: stack_fold_crc32_32_32
     85   ;CHECK:       crc32l {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
     86   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
     87   %2 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
     88   ret i32 %2
     89 }
     90 declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
     91 
     92 define i64 @stack_fold_crc32_64_64(i64 %a0, i64 %a1) {
     93   ;CHECK-LABEL: stack_fold_crc32_64_64
     94   ;CHECK:       crc32q {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
     95   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
     96   %2 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
     97   ret i64 %2
     98 }
     99 declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
    100 
    101 define <4 x i32> @stack_fold_movd_load(i32 %a0) {
    102   ;CHECK-LABEL: stack_fold_movd_load
    103   ;CHECK:       movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    104   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    105   %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
    106   ; add forces execution domain
    107   %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
    108   ret <4 x i32> %3
    109 }
    110 
    111 define i32 @stack_fold_movd_store(<4 x i32> %a0) {
    112   ;CHECK-LABEL: stack_fold_movd_store
    113   ;CHECK:       movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
    114   ; add forces execution domain
    115   %1 = add <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
    116   %2 = extractelement <4 x i32> %1, i32 0
    117   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    118   ret i32 %2
    119 }
    120 
    121 define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
    122   ;CHECK-LABEL: stack_fold_movq_load
    123   ;CHECK:       movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    124   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    125   %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
    126   ; add forces execution domain
    127   %3 = add <2 x i64> %2, <i64 1, i64 1>
    128   ret <2 x i64> %3
    129 }
    130 
    131 define i64 @stack_fold_movq_store(<2 x i64> %a0) {
    132   ;CHECK-LABEL: stack_fold_movq_store
    133   ;CHECK:       movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
    134   ; add forces execution domain
    135   %1 = add <2 x i64> %a0, <i64 1, i64 1>
    136   %2 = extractelement <2 x i64> %1, i32 0
    137   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    138   ret i64 %2
    139 }
    140 
    141 define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
    142   ;CHECK-LABEL: stack_fold_mpsadbw
    143   ;CHECK:       mpsadbw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    144   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    145   %2 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
    146   ret <8 x i16> %2
    147 }
    148 declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
    149 
    150 define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
    151   ;CHECK-LABEL: stack_fold_pabsb
    152   ;CHECK:       pabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    153   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    154   %2 = icmp sgt <16 x i8> %a0, zeroinitializer
    155   %3 = sub <16 x i8> zeroinitializer, %a0
    156   %4 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %3
    157   ret <16 x i8> %4
    158 }
    159 
    160 define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
    161   ;CHECK-LABEL: stack_fold_pabsd
    162   ;CHECK:       pabsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    163   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    164   %2 = icmp sgt <4 x i32> %a0, zeroinitializer
    165   %3 = sub <4 x i32> zeroinitializer, %a0
    166   %4 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %3
    167   ret <4 x i32> %4
    168 }
    169 
    170 define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
    171   ;CHECK-LABEL: stack_fold_pabsw
    172   ;CHECK:       pabsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    173   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    174   %2 = icmp sgt <8 x i16> %a0, zeroinitializer
    175   %3 = sub <8 x i16> zeroinitializer, %a0
    176   %4 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %3
    177   ret <8 x i16> %4
    178 }
    179 
    180 define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
    181   ;CHECK-LABEL: stack_fold_packssdw
    182   ;CHECK:       packssdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    183   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    184   %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
    185   ret <8 x i16> %2
    186 }
    187 declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
    188 
    189 define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
    190   ;CHECK-LABEL: stack_fold_packsswb
    191   ;CHECK:       packsswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    192   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    193   %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
    194   ret <16 x i8> %2
    195 }
    196 declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
    197 
    198 define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
    199   ;CHECK-LABEL: stack_fold_packusdw
    200   ;CHECK:       packusdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    201   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    202   %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
    203   ret <8 x i16> %2
    204 }
    205 declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
    206 
    207 define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
    208   ;CHECK-LABEL: stack_fold_packuswb
    209   ;CHECK:       packuswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    210   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    211   %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
    212   ret <16 x i8> %2
    213 }
    214 declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
    215 
    216 define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
    217   ;CHECK-LABEL: stack_fold_paddb
    218   ;CHECK:       paddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    219   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    220   %2 = add <16 x i8> %a0, %a1
    221   ret <16 x i8> %2
    222 }
    223 
    224 define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
    225   ;CHECK-LABEL: stack_fold_paddd
    226   ;CHECK:       paddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    227   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    228   %2 = add <4 x i32> %a0, %a1
    229   ret <4 x i32> %2
    230 }
    231 
    232 define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
    233   ;CHECK-LABEL: stack_fold_paddq
    234   ;CHECK:       paddq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    235   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    236   %2 = add <2 x i64> %a0, %a1
    237   ret <2 x i64> %2
    238 }
    239 
    240 define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
    241   ;CHECK-LABEL: stack_fold_paddsb
    242   ;CHECK:       paddsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    243   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    244   %2 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1)
    245   ret <16 x i8> %2
    246 }
    247 declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
    248 
    249 define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
    250   ;CHECK-LABEL: stack_fold_paddsw
    251   ;CHECK:       paddsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    252   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    253   %2 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1)
    254   ret <8 x i16> %2
    255 }
    256 declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
    257 
    258 define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
    259   ;CHECK-LABEL: stack_fold_paddusb
    260   ;CHECK:       paddusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    261   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    262   %2 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1)
    263   ret <16 x i8> %2
    264 }
    265 declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
    266 
    267 define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
    268   ;CHECK-LABEL: stack_fold_paddusw
    269   ;CHECK:       paddusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    270   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    271   %2 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1)
    272   ret <8 x i16> %2
    273 }
    274 declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
    275 
    276 define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
    277   ;CHECK-LABEL: stack_fold_paddw
    278   ;CHECK:       paddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    279   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    280   %2 = add <8 x i16> %a0, %a1
    281   ret <8 x i16> %2
    282 }
    283 
    284 define <16 x i8> @stack_fold_palignr(<16 x i8> %a0, <16 x i8> %a1) {
    285   ;CHECK-LABEL: stack_fold_palignr
    286   ;CHECK:       palignr $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    287   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    288   %2 = shufflevector <16 x i8> %a1, <16 x i8> %a0, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
    289   ret <16 x i8> %2
    290 }
    291 
    292 define <16 x i8> @stack_fold_pand(<16 x i8> %a0, <16 x i8> %a1) {
    293   ;CHECK-LABEL: stack_fold_pand
    294   ;CHECK:       pand {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    295   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    296   %2 = and <16 x i8> %a0, %a1
    297   ; add forces execution domain
    298   %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
    299   ret <16 x i8> %3
    300 }
    301 
    302 define <16 x i8> @stack_fold_pandn(<16 x i8> %a0, <16 x i8> %a1) {
    303   ;CHECK-LABEL: stack_fold_pandn
    304   ;CHECK:       pandn {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    305   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    306   %2 = xor <16 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
    307   %3 = and <16 x i8> %2, %a1
    308   ; add forces execution domain
    309   %4 = add <16 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
    310   ret <16 x i8> %4
    311 }
    312 
    313 define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
    314   ;CHECK-LABEL: stack_fold_pavgb
    315   ;CHECK:       pavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    316   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    317   %2 = zext <16 x i8> %a0 to <16 x i16>
    318   %3 = zext <16 x i8> %a1 to <16 x i16>
    319   %4 = add <16 x i16> %2, %3
    320   %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    321   %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    322   %7 = trunc <16 x i16> %6 to <16 x i8>
    323   ret <16 x i8> %7
    324 }
    325 
    326 define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
    327   ;CHECK-LABEL: stack_fold_pavgw
    328   ;CHECK:       pavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    329   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    330   %2 = zext <8 x i16> %a0 to <8 x i32>
    331   %3 = zext <8 x i16> %a1 to <8 x i32>
    332   %4 = add <8 x i32> %2, %3
    333   %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    334   %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    335   %7 = trunc <8 x i32> %6 to <8 x i16>
    336   ret <8 x i16> %7
    337 }
    338 
    339 define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
    340   ;CHECK-LABEL: stack_fold_pblendvb
    341   ;CHECK:       pblendvb %xmm0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    342   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    343   %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
    344   ret <16 x i8> %2
    345 }
    346 declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
    347 
    348 define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
    349   ;CHECK-LABEL: stack_fold_pblendw
    350   ;CHECK:       pblendw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    351   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    352   %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
    353   ret <8 x i16> %2
    354 }
    355 
    356 define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
    357   ;CHECK-LABEL: stack_fold_pclmulqdq
    358   ;CHECK:       pclmulqdq $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    359   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    360   %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
    361   ret <2 x i64> %2
    362 }
    363 declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
    364 
    365 define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
    366   ;CHECK-LABEL: stack_fold_pcmpeqb
    367   ;CHECK:       pcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    368   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    369   %2 = icmp eq <16 x i8> %a0, %a1
    370   %3 = sext <16 x i1> %2 to <16 x i8>
    371   ret <16 x i8> %3
    372 }
    373 
    374 define <4 x i32> @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) {
    375   ;CHECK-LABEL: stack_fold_pcmpeqd
    376   ;CHECK:       pcmpeqd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    377   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    378   %2 = icmp eq <4 x i32> %a0, %a1
    379   %3 = sext <4 x i1> %2 to <4 x i32>
    380   ret <4 x i32> %3
    381 }
    382 
    383 define <2 x i64> @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) {
    384   ;CHECK-LABEL: stack_fold_pcmpeqq
    385   ;CHECK:       pcmpeqq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    386   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    387   %2 = icmp eq <2 x i64> %a0, %a1
    388   %3 = sext <2 x i1> %2 to <2 x i64>
    389   ret <2 x i64> %3
    390 }
    391 
    392 define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
    393   ;CHECK-LABEL: stack_fold_pcmpeqw
    394   ;CHECK:       pcmpeqw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    395   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    396   %2 = icmp eq <8 x i16> %a0, %a1
    397   %3 = sext <8 x i1> %2 to <8 x i16>
    398   ret <8 x i16> %3
    399 }
    400 
    401 define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
    402   ;CHECK-LABEL: stack_fold_pcmpestri
    403   ;CHECK:       pcmpestri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    404   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
    405   %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
    406   ret i32 %2
    407 }
    408 declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
    409 
    410 define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
    411   ;CHECK-LABEL: stack_fold_pcmpestrm
    412   ;CHECK:       pcmpestrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    413   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
    414   %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
    415   ret <16 x i8> %2
    416 }
    417 declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
    418 
    419 define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
    420   ;CHECK-LABEL: stack_fold_pcmpgtb
    421   ;CHECK:       pcmpgtb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    422   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    423   %2 = icmp sgt <16 x i8> %a0, %a1
    424   %3 = sext <16 x i1> %2 to <16 x i8>
    425   ret <16 x i8> %3
    426 }
    427 
    428 define <4 x i32> @stack_fold_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1) {
    429   ;CHECK-LABEL: stack_fold_pcmpgtd
    430   ;CHECK:       pcmpgtd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    431   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    432   %2 = icmp sgt <4 x i32> %a0, %a1
    433   %3 = sext <4 x i1> %2 to <4 x i32>
    434   ret <4 x i32> %3
    435 }
    436 
    437 define <2 x i64> @stack_fold_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1) {
    438   ;CHECK-LABEL: stack_fold_pcmpgtq
    439   ;CHECK:       pcmpgtq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    440   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    441   %2 = icmp sgt <2 x i64> %a0, %a1
    442   %3 = sext <2 x i1> %2 to <2 x i64>
    443   ret <2 x i64> %3
    444 }
    445 
    446 define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
    447   ;CHECK-LABEL: stack_fold_pcmpgtw
    448   ;CHECK:       pcmpgtw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    449   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    450   %2 = icmp sgt <8 x i16> %a0, %a1
    451   %3 = sext <8 x i1> %2 to <8 x i16>
    452   ret <8 x i16> %3
    453 }
    454 
    455 define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
    456   ;CHECK-LABEL: stack_fold_pcmpistri
    457   ;CHECK:       pcmpistri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    458   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    459   %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
    460   ret i32 %2
    461 }
    462 declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
    463 
    464 define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
    465   ;CHECK-LABEL: stack_fold_pcmpistrm
    466   ;CHECK:       pcmpistrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    467   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    468   %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
    469   ret <16 x i8> %2
    470 }
    471 declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
    472 
    473 ; TODO stack_fold_pextrb
    474 
    475 ; We can't naively fold pextrw as it only writes to a 16-bit memory location
    476 ; even though it can store to a 32-bit register.
    477 define i16 @stack_fold_pextrw(<8 x i16> %a0) {
    478 ; CHECK-LABEL: stack_fold_pextrw
    479 ; CHECK:       pextrw $1, {{%xmm[0-9][0-9]*}}, %[[GPR32:(e[a-z]+|r[0-9]+d)]]
    480 ; CHECK:       movl %[[GPR32]], {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Spill
    481 ; CHECK:       movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
    482 entry:
    483 ; add forces execution domain
    484   %add = add <8 x i16> %a0, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
    485   %extract = extractelement <8 x i16> %add, i32 1
    486   %asm = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    487   ret i16 %extract
    488 }
    489 
    490 define i32 @stack_fold_pextrd(<4 x i32> %a0) {
    491   ;CHECK-LABEL: stack_fold_pextrd
    492   ;CHECK:       pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
    493   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
    494   ; add forces execution domain
    495   %1 = add <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
    496   %2 = extractelement <4 x i32> %1, i32 1
    497   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    498   ret i32 %2
    499 }
    500 
    501 define i64 @stack_fold_pextrq(<2 x i64> %a0) {
    502   ;CHECK-LABEL: stack_fold_pextrq
    503   ;CHECK:       pextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
    504   ;CHECK:       movq    {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
    505   %1 = extractelement <2 x i64> %a0, i32 1
    506   %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    507   ret i64 %1
    508 }
    509 
    510 define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
    511   ;CHECK-LABEL: stack_fold_phaddd
    512   ;CHECK:       phaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    513   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    514   %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
    515   ret <4 x i32> %2
    516 }
    517 declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone
    518 
    519 define <8 x i16> @stack_fold_phaddsw(<8 x i16> %a0, <8 x i16> %a1) {
    520   ;CHECK-LABEL: stack_fold_phaddsw
    521   ;CHECK:       phaddsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    522   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    523   %2 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
    524   ret <8 x i16> %2
    525 }
    526 declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
    527 
    528 define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) {
    529   ;CHECK-LABEL: stack_fold_phaddw
    530   ;CHECK:       phaddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    531   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    532   %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
    533   ret <8 x i16> %2
    534 }
    535 declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
    536 
    537 define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
    538   ;CHECK-LABEL: stack_fold_phminposuw
    539   ;CHECK:       phminposuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    540   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    541   %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
    542   ret <8 x i16> %2
    543 }
    544 declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
    545 
    546 define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
    547   ;CHECK-LABEL: stack_fold_phsubd
    548   ;CHECK:       phsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    549   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    550   %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
    551   ret <4 x i32> %2
    552 }
    553 declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone
    554 
    555 define <8 x i16> @stack_fold_phsubsw(<8 x i16> %a0, <8 x i16> %a1) {
    556   ;CHECK-LABEL: stack_fold_phsubsw
    557   ;CHECK:       phsubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    558   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    559   %2 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
    560   ret <8 x i16> %2
    561 }
    562 declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
    563 
    564 define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) {
    565   ;CHECK-LABEL: stack_fold_phsubw
    566   ;CHECK:       phsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    567   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    568   %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
    569   ret <8 x i16> %2
    570 }
    571 declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
    572 
    573 define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
    574   ;CHECK-LABEL: stack_fold_pinsrb
    575   ;CHECK:       pinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    576   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    577   %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
    578   ret <16 x i8> %2
    579 }
    580 
    581 define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
    582   ;CHECK-LABEL: stack_fold_pinsrd
    583   ;CHECK:       pinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    584   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    585   %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
    586   ret <4 x i32> %2
    587 }
    588 
    589 define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
    590   ;CHECK-LABEL: stack_fold_pinsrq
    591   ;CHECK:       pinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
    592   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    593   %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
    594   ret <2 x i64> %2
    595 }
    596 
    597 define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
    598   ;CHECK-LABEL: stack_fold_pinsrw
    599   ;CHECK:       pinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
    600   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
    601   %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
    602   ret <8 x i16> %2
    603 }
    604 
    605 define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
    606   ;CHECK-LABEL: stack_fold_pmaddubsw
    607   ;CHECK:       pmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    608   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    609   %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
    610   ret <8 x i16> %2
    611 }
    612 declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
    613 
    614 define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
    615   ;CHECK-LABEL: stack_fold_pmaddwd
    616   ;CHECK:       pmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    617   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    618   %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
    619   ret <4 x i32> %2
    620 }
    621 declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
    622 
    623 define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
    624   ;CHECK-LABEL: stack_fold_pmaxsb
    625   ;CHECK:       pmaxsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    626   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    627   %2 = icmp sgt <16 x i8> %a0, %a1
    628   %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
    629   ret <16 x i8> %3
    630 }
    631 
    632 define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
    633   ;CHECK-LABEL: stack_fold_pmaxsd
    634   ;CHECK:       pmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    635   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    636   %2 = icmp sgt <4 x i32> %a0, %a1
    637   %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
    638   ret <4 x i32> %3
    639 }
    640 
    641 define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
    642   ;CHECK-LABEL: stack_fold_pmaxsw
    643   ;CHECK:       pmaxsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    644   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    645   %2 = icmp sgt <8 x i16> %a0, %a1
    646   %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
    647   ret <8 x i16> %3
    648 }
    649 
    650 define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
    651   ;CHECK-LABEL: stack_fold_pmaxub
    652   ;CHECK:       pmaxub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    653   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    654   %2 = icmp ugt <16 x i8> %a0, %a1
    655   %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
    656   ret <16 x i8> %3
    657 }
    658 
    659 define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
    660   ;CHECK-LABEL: stack_fold_pmaxud
    661   ;CHECK:       pmaxud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    662   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    663   %2 = icmp ugt <4 x i32> %a0, %a1
    664   %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
    665   ret <4 x i32> %3
    666 }
    667 
    668 define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
    669   ;CHECK-LABEL: stack_fold_pmaxuw
    670   ;CHECK:       pmaxuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    671   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    672   %2 = icmp ugt <8 x i16> %a0, %a1
    673   %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
    674   ret <8 x i16> %3
    675 }
    676 
    677 define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
    678   ;CHECK-LABEL: stack_fold_pminsb
    679   ;CHECK:       pminsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    680   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    681   %2 = icmp slt <16 x i8> %a0, %a1
    682   %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
    683   ret <16 x i8> %3
    684 }
    685 
    686 define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
    687   ;CHECK-LABEL: stack_fold_pminsd
    688   ;CHECK:       pminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    689   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    690   %2 = icmp slt <4 x i32> %a0, %a1
    691   %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
    692   ret <4 x i32> %3
    693 }
    694 
    695 define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
    696   ;CHECK-LABEL: stack_fold_pminsw
    697   ;CHECK:       pminsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    698   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    699   %2 = icmp slt <8 x i16> %a0, %a1
    700   %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
    701   ret <8 x i16> %3
    702 }
    703 
    704 define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
    705   ;CHECK-LABEL: stack_fold_pminub
    706   ;CHECK:       pminub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    707   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    708   %2 = icmp ult <16 x i8> %a0, %a1
    709   %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
    710   ret <16 x i8> %3
    711 }
    712 
    713 define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
    714   ;CHECK-LABEL: stack_fold_pminud
    715   ;CHECK:       pminud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    716   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    717   %2 = icmp ult <4 x i32> %a0, %a1
    718   %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
    719   ret <4 x i32> %3
    720 }
    721 
    722 define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
    723   ;CHECK-LABEL: stack_fold_pminuw
    724   ;CHECK:       pminuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    725   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    726   %2 = icmp ult <8 x i16> %a0, %a1
    727   %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
    728   ret <8 x i16> %3
    729 }
    730 
    731 define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
    732   ;CHECK-LABEL: stack_fold_pmovsxbd
    733   ;CHECK:       pmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    734   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    735   %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    736   %3 = sext <4 x i8> %2 to <4 x i32>
    737   ret <4 x i32> %3
    738 }
    739 
    740 define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
    741   ;CHECK-LABEL: stack_fold_pmovsxbq
    742   ;CHECK:       pmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    743   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    744   %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
    745   %3 = sext <2 x i8> %2 to <2 x i64>
    746   ret <2 x i64> %3
    747 }
    748 
    749 define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
    750   ;CHECK-LABEL: stack_fold_pmovsxbw
    751   ;CHECK:       pmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    752   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    753   %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    754   %3 = sext <8 x i8> %2 to <8 x i16>
    755   ret <8 x i16> %3
    756 }
    757 
    758 define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
    759   ;CHECK-LABEL: stack_fold_pmovsxdq
    760   ;CHECK:       pmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    761   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    762   %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
    763   %3 = sext <2 x i32> %2 to <2 x i64>
    764   ret <2 x i64> %3
    765 }
    766 
    767 define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
    768   ;CHECK-LABEL: stack_fold_pmovsxwd
    769   ;CHECK:       pmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    770   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    771   %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    772   %3 = sext <4 x i16> %2 to <4 x i32>
    773   ret <4 x i32> %3
    774 }
    775 
    776 define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
    777   ;CHECK-LABEL: stack_fold_pmovsxwq
    778   ;CHECK:       pmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    779   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    780   %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
    781   %3 = sext <2 x i16> %2 to <2 x i64>
    782   ret <2 x i64> %3
    783 }
    784 
    785 define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
    786   ;CHECK-LABEL: stack_fold_pmovzxbd
    787   ;CHECK:       pmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    788   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    789   %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27>
    790   %3 = bitcast <16 x i8> %2 to <4 x i32>
    791   ret <4 x i32> %3
    792 }
    793 
    794 define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
    795   ;CHECK-LABEL: stack_fold_pmovzxbq
    796   ;CHECK:       pmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    797   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    798   %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28>
    799   %3 = bitcast <16 x i8> %2 to <2 x i64>
    800   ret <2 x i64> %3
    801 }
    802 
    803 define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
    804   ;CHECK-LABEL: stack_fold_pmovzxbw
    805   ;CHECK:       pmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    806   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    807   %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
    808   %3 = bitcast <16 x i8> %2 to <8 x i16>
    809   ret <8 x i16> %3
    810 }
    811 
    812 define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
    813   ;CHECK-LABEL: stack_fold_pmovzxdq
    814   ;CHECK:       pmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    815   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    816   %2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
    817   %3 = bitcast <4 x i32> %2 to <2 x i64>
    818   ret <2 x i64> %3
    819 }
    820 
    821 define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
    822   ;CHECK-LABEL: stack_fold_pmovzxwd
    823   ;CHECK:       pmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    824   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    825   %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
    826   %3 = bitcast <8 x i16> %2 to <4 x i32>
    827   ret <4 x i32> %3
    828 }
    829 
    830 define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
    831   ;CHECK-LABEL: stack_fold_pmovzxwq
    832   ;CHECK:       pmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    833   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    834   %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13>
    835   %3 = bitcast <8 x i16> %2 to <2 x i64>
    836   ret <2 x i64> %3
    837 }
    838 
    839 define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
    840   ;CHECK-LABEL: stack_fold_pmuldq
    841   ;CHECK:       pmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    842   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    843   %2 = bitcast <4 x i32> %a0 to <2 x i64>
    844   %3 = bitcast <4 x i32> %a1 to <2 x i64>
    845   %4 = shl <2 x i64> %2, <i64 32, i64 32>
    846   %5 = ashr <2 x i64> %4, <i64 32, i64 32>
    847   %6 = shl <2 x i64> %3, <i64 32, i64 32>
    848   %7 = ashr <2 x i64> %6, <i64 32, i64 32>
    849   %8 = mul <2 x i64> %5, %7
    850   ret <2 x i64> %8
    851 }
    852 
    853 define <8 x i16> @stack_fold_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1) {
    854   ;CHECK-LABEL: stack_fold_pmulhrsw
    855   ;CHECK:       pmulhrsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    856   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    857   %2 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
    858   ret <8 x i16> %2
    859 }
    860 declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
    861 
    862 define <8 x i16> @stack_fold_pmulhuw(<8 x i16> %a0, <8 x i16> %a1) {
    863   ;CHECK-LABEL: stack_fold_pmulhuw
    864   ;CHECK:       pmulhuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    865   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    866   %2 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1)
    867   ret <8 x i16> %2
    868 }
    869 declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
    870 
    871 define <8 x i16> @stack_fold_pmulhw(<8 x i16> %a0, <8 x i16> %a1) {
    872   ;CHECK-LABEL: stack_fold_pmulhw
    873   ;CHECK:       pmulhw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    874   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    875   %2 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1)
    876   ret <8 x i16> %2
    877 }
    878 declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
    879 
    880 define <4 x i32> @stack_fold_pmulld(<4 x i32> %a0, <4 x i32> %a1) {
    881   ;CHECK-LABEL: stack_fold_pmulld
    882   ;CHECK:       pmulld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    883   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    884   %2 = mul <4 x i32> %a0, %a1
    885   ret <4 x i32> %2
    886 }
    887 
    888 define <8 x i16> @stack_fold_pmullw(<8 x i16> %a0, <8 x i16> %a1) {
    889   ;CHECK-LABEL: stack_fold_pmullw
    890   ;CHECK:       pmullw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    891   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    892   %2 = mul <8 x i16> %a0, %a1
    893   ret <8 x i16> %2
    894 }
    895 
    896 define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
    897   ;CHECK-LABEL: stack_fold_pmuludq
    898   ;CHECK:       pmuludq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    899   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    900   %2 = bitcast <4 x i32> %a0 to <2 x i64>
    901   %3 = bitcast <4 x i32> %a1 to <2 x i64>
    902   %4 = and <2 x i64> %2, <i64 4294967295, i64 4294967295>
    903   %5 = and <2 x i64> %3, <i64 4294967295, i64 4294967295>
    904   %6 = mul <2 x i64> %4, %5
    905   ret <2 x i64> %6
    906 }
    907 
    908 define <16 x i8> @stack_fold_por(<16 x i8> %a0, <16 x i8> %a1) {
    909   ;CHECK-LABEL: stack_fold_por
    910   ;CHECK:       por {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    911   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    912   %2 = or <16 x i8> %a0, %a1
    913   ; add forces execution domain
    914   %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
    915   ret <16 x i8> %3
    916 }
    917 
    918 define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
    919   ;CHECK-LABEL: stack_fold_psadbw
    920   ;CHECK:       psadbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    921   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    922   %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
    923   ret <2 x i64> %2
    924 }
    925 declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
    926 
    927 define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
    928   ;CHECK-LABEL: stack_fold_pshufb
    929   ;CHECK:       pshufb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    930   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    931   %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
    932   ret <16 x i8> %2
    933 }
    934 declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
    935 
    936 define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
    937   ;CHECK-LABEL: stack_fold_pshufd
    938   ;CHECK:       pshufd $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    939   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    940   %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
    941   ret <4 x i32> %2
    942 }
    943 
    944 define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
    945   ;CHECK-LABEL: stack_fold_pshufhw
    946   ;CHECK:       pshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    947   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    948   %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
    949   ret <8 x i16> %2
    950 }
    951 
    952 define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
    953   ;CHECK-LABEL: stack_fold_pshuflw
    954   ;CHECK:       pshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    955   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    956   %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
    957   ret <8 x i16> %2
    958 }
    959 
    960 define <16 x i8> @stack_fold_psignb(<16 x i8> %a0, <16 x i8> %a1) {
    961   ;CHECK-LABEL: stack_fold_psignb
    962   ;CHECK:       psignb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    963   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    964   %2 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
    965   ret <16 x i8> %2
    966 }
    967 declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone
    968 
    969 define <4 x i32> @stack_fold_psignd(<4 x i32> %a0, <4 x i32> %a1) {
    970   ;CHECK-LABEL: stack_fold_psignd
    971   ;CHECK:       psignd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    972   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    973   %2 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
    974   ret <4 x i32> %2
    975 }
    976 declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone
    977 
    978 define <8 x i16> @stack_fold_psignw(<8 x i16> %a0, <8 x i16> %a1) {
    979   ;CHECK-LABEL: stack_fold_psignw
    980   ;CHECK:       psignw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    981   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    982   %2 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
    983   ret <8 x i16> %2
    984 }
    985 declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone
    986 
    987 define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) {
    988   ;CHECK-LABEL: stack_fold_pslld
    989   ;CHECK:       pslld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    990   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
    991   %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
    992   ret <4 x i32> %2
    993 }
    994 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
    995 
    996 define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) {
    997   ;CHECK-LABEL: stack_fold_psllq
    998   ;CHECK:       psllq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
    999   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1000   %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
   1001   ret <2 x i64> %2
   1002 }
   1003 declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
   1004 
   1005 define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) {
   1006   ;CHECK-LABEL: stack_fold_psllw
   1007   ;CHECK:       psllw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1008   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1009   %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
   1010   ret <8 x i16> %2
   1011 }
   1012 declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
   1013 
   1014 define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) {
   1015   ;CHECK-LABEL: stack_fold_psrad
   1016   ;CHECK:       psrad {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1017   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1018   %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
   1019   ret <4 x i32> %2
   1020 }
   1021 declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
   1022 
   1023 define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) {
   1024   ;CHECK-LABEL: stack_fold_psraw
   1025   ;CHECK:       psraw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1026   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1027   %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
   1028   ret <8 x i16> %2
   1029 }
   1030 declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
   1031 
   1032 define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) {
   1033   ;CHECK-LABEL: stack_fold_psrld
   1034   ;CHECK:       psrld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1035   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1036   %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
   1037   ret <4 x i32> %2
   1038 }
   1039 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
   1040 
   1041 define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
   1042   ;CHECK-LABEL: stack_fold_psrlq
   1043   ;CHECK:       psrlq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1044   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1045   %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
   1046   ret <2 x i64> %2
   1047 }
   1048 declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
   1049 
   1050 define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
   1051   ;CHECK-LABEL: stack_fold_psrlw
   1052   ;CHECK:       psrlw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1053   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1054   %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
   1055   ret <8 x i16> %2
   1056 }
   1057 declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
   1058 
   1059 define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
   1060   ;CHECK-LABEL: stack_fold_psubb
   1061   ;CHECK:       psubb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1062   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1063   %2 = sub <16 x i8> %a0, %a1
   1064   ret <16 x i8> %2
   1065 }
   1066 
   1067 define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
   1068   ;CHECK-LABEL: stack_fold_psubd
   1069   ;CHECK:       psubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1070   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1071   %2 = sub <4 x i32> %a0, %a1
   1072   ret <4 x i32> %2
   1073 }
   1074 
   1075 define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
   1076   ;CHECK-LABEL: stack_fold_psubq
   1077   ;CHECK:       psubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1078   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1079   %2 = sub <2 x i64> %a0, %a1
   1080   ret <2 x i64> %2
   1081 }
   1082 
   1083 define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
   1084   ;CHECK-LABEL: stack_fold_psubsb
   1085   ;CHECK:       psubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1086   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1087   %2 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)
   1088   ret <16 x i8> %2
   1089 }
   1090 declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
   1091 
   1092 define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
   1093   ;CHECK-LABEL: stack_fold_psubsw
   1094   ;CHECK:       psubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1095   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1096   %2 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
   1097   ret <8 x i16> %2
   1098 }
   1099 declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
   1100 
   1101 define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
   1102   ;CHECK-LABEL: stack_fold_psubusb
   1103   ;CHECK:       psubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1104   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1105   %2 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
   1106   ret <16 x i8> %2
   1107 }
   1108 declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
   1109 
   1110 define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
   1111   ;CHECK-LABEL: stack_fold_psubusw
   1112   ;CHECK:       psubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1113   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1114   %2 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
   1115   ret <8 x i16> %2
   1116 }
   1117 declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
   1118 
   1119 define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
   1120   ;CHECK-LABEL: stack_fold_psubw
   1121   ;CHECK:       psubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1122   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1123   %2 = sub <8 x i16> %a0, %a1
   1124   ret <8 x i16> %2
   1125 }
   1126 
   1127 define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
   1128   ;CHECK-LABEL: stack_fold_ptest
   1129   ;CHECK:       ptest {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1130   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1131   %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
   1132   ret i32 %2
   1133 }
   1134 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
   1135 
   1136 define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
   1137   ;CHECK-LABEL: stack_fold_punpckhbw
   1138   ;CHECK:       punpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1139   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1140   %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   1141   ret <16 x i8> %2
   1142 }
   1143 
   1144 define <4 x i32> @stack_fold_punpckhdq(<4 x i32> %a0, <4 x i32> %a1) {
   1145   ;CHECK-LABEL: stack_fold_punpckhdq
   1146   ;CHECK:       punpckhdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1147   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1148   %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   1149   ; add forces execution domain
   1150   %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
   1151   ret <4 x i32> %3
   1152 }
   1153 
   1154 define <2 x i64> @stack_fold_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1) {
   1155   ;CHECK-LABEL: stack_fold_punpckhqdq
   1156   ;CHECK:       punpckhqdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1157   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1158   %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
   1159   ; add forces execution domain
   1160   %3 = add <2 x i64> %2, <i64 1, i64 1>
   1161   ret <2 x i64> %3
   1162 }
   1163 
   1164 define <8 x i16> @stack_fold_punpckhwd(<8 x i16> %a0, <8 x i16> %a1) {
   1165   ;CHECK-LABEL: stack_fold_punpckhwd
   1166   ;CHECK:       punpckhwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1167   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1168   %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   1169   ret <8 x i16> %2
   1170 }
   1171 
   1172 define <16 x i8> @stack_fold_punpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
   1173   ;CHECK-LABEL: stack_fold_punpcklbw
   1174   ;CHECK:       punpcklbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1175   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1176   %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   1177   ret <16 x i8> %2
   1178 }
   1179 
   1180 define <4 x i32> @stack_fold_punpckldq(<4 x i32> %a0, <4 x i32> %a1) {
   1181   ;CHECK-LABEL: stack_fold_punpckldq
   1182   ;CHECK:       punpckldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1183   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1184   %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   1185   ; add forces execution domain
   1186   %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
   1187   ret <4 x i32> %3
   1188 }
   1189 
   1190 define <2 x i64> @stack_fold_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1) {
   1191   ;CHECK-LABEL: stack_fold_punpcklqdq
   1192   ;CHECK:       punpcklqdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1193   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1194   %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
   1195   ; add forces execution domain
   1196   %3 = add <2 x i64> %2, <i64 1, i64 1>
   1197   ret <2 x i64> %3
   1198 }
   1199 
   1200 define <8 x i16> @stack_fold_punpcklwd(<8 x i16> %a0, <8 x i16> %a1) {
   1201   ;CHECK-LABEL: stack_fold_punpcklwd
   1202   ;CHECK:       punpcklwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1203   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1204   %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   1205   ret <8 x i16> %2
   1206 }
   1207 
   1208 define <16 x i8> @stack_fold_pxor(<16 x i8> %a0, <16 x i8> %a1) {
   1209   ;CHECK-LABEL: stack_fold_pxor
   1210   ;CHECK:       pxor {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   1211   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   1212   %2 = xor <16 x i8> %a0, %a1
   1213   ; add forces execution domain
   1214   %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   1215   ret <16 x i8> %3
   1216 }
   1217