; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F

; 'signum' test cases (PR13248)

;
; generic implementation for 128-bit vectors
;

define void @signum32a(<4 x float>*) {
; AVX-LABEL: signum32a:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vcvtdq2ps %xmm2, %xmm2
; AVX-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
entry:
  %1 = load <4 x float>, <4 x float>* %0
  %2 = fcmp olt <4 x float> %1, zeroinitializer
  %3 = sitofp <4 x i1> %2 to <4 x float>
  %4 = fcmp ogt <4 x float> %1, zeroinitializer
  %5 = sitofp <4 x i1> %4 to <4 x float>
  %6 = fsub <4 x float> %3, %5
  store <4 x float> %6, <4 x float>* %0
  ret void
}

define void @signum64a(<2 x double>*) {
; AVX-LABEL: signum64a:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovapd (%rdi), %xmm0
; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpltpd %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX-NEXT:    vcvtdq2pd %xmm2, %xmm2
; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    vsubpd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmovapd %xmm0, (%rdi)
; AVX-NEXT:    retq
entry:
  %1 = load <2 x double>, <2 x double>* %0
  %2 = fcmp olt <2 x double> %1, zeroinitializer
  %3 = sitofp <2 x i1> %2 to <2 x double>
  %4 = fcmp ogt <2 x double> %1, zeroinitializer
  %5 = sitofp <2 x i1> %4 to <2 x double>
  %6 = fsub <2 x double> %3, %5
  store <2 x double> %6, <2 x double>* %0
  ret void
}

;
; generic implementation for 256-bit vectors
;

define void @signum32b(<8 x float>*) {
; AVX-LABEL: signum32b:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vcvtdq2ps %ymm2, %ymm2
; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT:    vsubps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vmovaps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
entry:
  %1 = load <8 x float>, <8 x float>* %0
  %2 = fcmp olt <8 x float> %1, zeroinitializer
  %3 = sitofp <8 x i1> %2 to <8 x float>
  %4 = fcmp ogt <8 x float> %1, zeroinitializer
  %5 = sitofp <8 x i1> %4 to <8 x float>
  %6 = fsub <8 x float> %3, %5
  store <8 x float> %6, <8 x float>* %0
  ret void
}

define void @signum64b(<4 x double>*) {
; AVX1-LABEL: signum64b:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovapd (%rdi), %ymm0
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vcvtdq2pd %xmm2, %ymm2
; AVX1-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vsubpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovapd %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: signum64b:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovapd (%rdi), %ymm0
; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vcvtdq2pd %xmm2, %ymm2
; AVX2-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vsubpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovapd %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: signum64b:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vmovapd (%rdi), %ymm0
; AVX512F-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmovqd %zmm2, %ymm2
; AVX512F-NEXT:    vcvtdq2pd %xmm2, %ymm2
; AVX512F-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT:    vsubpd %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vmovapd %ymm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
entry:
  %1 = load <4 x double>, <4 x double>* %0
  %2 = fcmp olt <4 x double> %1, zeroinitializer
  %3 = sitofp <4 x i1> %2 to <4 x double>
  %4 = fcmp ogt <4 x double> %1, zeroinitializer
  %5 = sitofp <4 x i1> %4 to <4 x double>
  %6 = fsub <4 x double> %3, %5
  store <4 x double> %6, <4 x double>* %0
  ret void
}

;
; implementation using AVX intrinsics for 256-bit vectors
;

define void @signum32c(<8 x float>*) {
; AVX-LABEL: signum32c:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vcvtdq2ps %ymm2, %ymm2
; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT:    vsubps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vmovaps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
entry:
  %1 = load <8 x float>, <8 x float>* %0
  %2 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %1, <8 x float> zeroinitializer, i8 1)
  %3 = bitcast <8 x float> %2 to <8 x i32>
  %4 = sitofp <8 x i32> %3 to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %1, i8 1)
  %6 = bitcast <8 x float> %5 to <8 x i32>
  %7 = sitofp <8 x i32> %6 to <8 x float>
  %8 = fsub <8 x float> %4, %7
  store <8 x float> %8, <8 x float>* %0
  ret void
}

define void @signum64c(<4 x double>*) {
; AVX1-LABEL: signum64c:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovapd (%rdi), %ymm0
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX1-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: signum64c:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovapd (%rdi), %ymm0
; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: signum64c:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vmovapd (%rdi), %ymm0
; AVX512F-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT:    vmovaps %ymm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
entry:
  %x = load <4 x double>, <4 x double>* %0
  %xgt = tail call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %x, <4 x double> zeroinitializer, i8 1)
  %igt = bitcast <4 x double> %xgt to <8 x i32>
  %xlt = tail call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %x, i8 1)
  %ilt = bitcast <4 x double> %xlt to <8 x i32>
  ; it is important to use %igt twice as source in order to make LLVM use a shuffle operation
  %isign = sub <8 x i32> %igt, %ilt
  %ssign = shufflevector <8 x i32> %isign, <8 x i32> %isign, <4 x i32> <i32 0, i32 2, i32 12, i32 14>
  %sign = tail call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %ssign)
  store <4 x double> %sign, <4 x double>* %0
  ret void
}

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone