; RUN: llc -O3 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+xop < %s | FileCheck %s

; Check that loads are folded into these commutable XOP instructions by
; commuting their operands; for the VPCOM family the immediate comparison
; predicate must also be flipped (e.g. "lt" becomes "gt").

define <16 x i8> @commute_fold_vpcomb(<16 x i8>* %a0, <16 x i8> %a1) {
  ;CHECK-LABEL: commute_fold_vpcomb
  ;CHECK: vpcomgtb (%rdi), %xmm0, %xmm0
  %1 = load <16 x i8>, <16 x i8>* %a0
  %2 = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %1, <16 x i8> %a1, i8 0) ; vpcomltb
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone

define <4 x i32> @commute_fold_vpcomd(<4 x i32>* %a0, <4 x i32> %a1) {
  ;CHECK-LABEL: commute_fold_vpcomd
  ;CHECK: vpcomged (%rdi), %xmm0, %xmm0
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %1, <4 x i32> %a1, i8 1) ; vpcomled
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone

define <2 x i64> @commute_fold_vpcomq(<2 x i64>* %a0, <2 x i64> %a1) {
  ;CHECK-LABEL: commute_fold_vpcomq
  ;CHECK: vpcomltq (%rdi), %xmm0, %xmm0
  %1 = load <2 x i64>, <2 x i64>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %1, <2 x i64> %a1, i8 2) ; vpcomgtq
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone

define <16 x i8> @commute_fold_vpcomub(<16 x i8>* %a0, <16 x i8> %a1) {
  ;CHECK-LABEL: commute_fold_vpcomub
  ;CHECK: vpcomleub (%rdi), %xmm0, %xmm0
  %1 = load <16 x i8>, <16 x i8>* %a0
  %2 = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %1, <16 x i8> %a1, i8 3) ; vpcomgeub
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone

define <4 x i32> @commute_fold_vpcomud(<4 x i32>* %a0, <4 x i32> %a1) {
  ;CHECK-LABEL: commute_fold_vpcomud
  ;CHECK: vpcomequd (%rdi), %xmm0, %xmm0
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %1, <4 x i32> %a1, i8 4) ; vpcomequd
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone

define <2 x i64> @commute_fold_vpcomuq(<2 x i64>* %a0, <2 x i64> %a1) {
  ;CHECK-LABEL: commute_fold_vpcomuq
  ;CHECK: vpcomnequq (%rdi), %xmm0, %xmm0
  %1 = load <2 x i64>, <2 x i64>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %1, <2 x i64> %a1, i8 5) ; vpcomnequq
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone

define <8 x i16> @commute_fold_vpcomuw(<8 x i16>* %a0, <8 x i16> %a1) {
  ;CHECK-LABEL: commute_fold_vpcomuw
  ;CHECK: vpcomfalseuw (%rdi), %xmm0, %xmm0
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %1, <8 x i16> %a1, i8 6) ; vpcomfalseuw
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone

define <8 x i16> @commute_fold_vpcomw(<8 x i16>* %a0, <8 x i16> %a1) {
  ;CHECK-LABEL: commute_fold_vpcomw
  ;CHECK: vpcomtruew (%rdi), %xmm0, %xmm0
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %1, <8 x i16> %a1, i8 7) ; vpcomtruew
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone

define <4 x i32> @commute_fold_vpmacsdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
  ;CHECK-LABEL: commute_fold_vpmacsdd
  ;CHECK: vpmacsdd %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

define <2 x i64> @commute_fold_vpmacsdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
  ;CHECK-LABEL: commute_fold_vpmacsdqh
  ;CHECK: vpmacsdqh %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone

define <2 x i64> @commute_fold_vpmacsdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
  ;CHECK-LABEL: commute_fold_vpmacsdql
  ;CHECK: vpmacsdql %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone

define <4 x i32> @commute_fold_vpmacssdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
  ;CHECK-LABEL: commute_fold_vpmacssdd
  ;CHECK: vpmacssdd %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

define <2 x i64> @commute_fold_vpmacssdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
  ;CHECK-LABEL: commute_fold_vpmacssdqh
  ;CHECK: vpmacssdqh %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone

define <2 x i64> @commute_fold_vpmacssdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
  ;CHECK-LABEL: commute_fold_vpmacssdql
  ;CHECK: vpmacssdql %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <4 x i32>, <4 x i32>* %a0
  %2 = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone

define <4 x i32> @commute_fold_vpmacsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
  ;CHECK-LABEL: commute_fold_vpmacsswd
  ;CHECK: vpmacsswd %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone

define <8 x i16> @commute_fold_vpmacssww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
  ;CHECK-LABEL: commute_fold_vpmacssww
  ;CHECK: vpmacssww %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone

define <4 x i32> @commute_fold_vpmacswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
  ;CHECK-LABEL: commute_fold_vpmacswd
  ;CHECK: vpmacswd %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone

define <8 x i16> @commute_fold_vpmacsww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
  ;CHECK-LABEL: commute_fold_vpmacsww
  ;CHECK: vpmacsww %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone

define <4 x i32> @commute_fold_vpmadcsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
  ;CHECK-LABEL: commute_fold_vpmadcsswd
  ;CHECK: vpmadcsswd %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone

define <4 x i32> @commute_fold_vpmadcswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
  ;CHECK-LABEL: commute_fold_vpmadcswd
  ;CHECK: vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone