; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512vl -mcpu=skx | FileCheck %s --check-prefix=AVX

define double @t1(float* nocapture %x) nounwind readonly ssp {
entry:
; SSE-LABEL: t1:
; SSE: movss ([[A0:%rdi|%rcx]]), %xmm0
; SSE: cvtss2sd %xmm0, %xmm0

  %0 = load float, float* %x, align 4
  %1 = fpext float %0 to double
  ret double %1
}

define float @t2(double* nocapture %x) nounwind readonly ssp optsize {
entry:
; SSE-LABEL: t2:
; SSE: cvtsd2ss ([[A0]]), %xmm0
  %0 = load double, double* %x, align 8
  %1 = fptrunc double %0 to float
  ret float %1
}

define float @squirtf(float* %x) nounwind {
entry:
; SSE-LABEL: squirtf:
; SSE: movss ([[A0]]), %xmm0
; SSE: sqrtss %xmm0, %xmm0
  %z = load float, float* %x
  %t = call float @llvm.sqrt.f32(float %z)
  ret float %t
}

define double @squirt(double* %x) nounwind {
entry:
; SSE-LABEL: squirt:
; SSE: movsd ([[A0]]), %xmm0
; SSE: sqrtsd %xmm0, %xmm0
  %z = load double, double* %x
  %t = call double @llvm.sqrt.f64(double %z)
  ret double %t
}

define float @squirtf_size(float* %x) nounwind optsize {
entry:
; SSE-LABEL: squirtf_size:
; SSE: sqrtss ([[A0]]), %xmm0
  %z = load float, float* %x
  %t = call float @llvm.sqrt.f32(float %z)
  ret float %t
}

define double @squirt_size(double* %x) nounwind optsize {
entry:
; SSE-LABEL: squirt_size:
; SSE: sqrtsd ([[A0]]), %xmm0
  %z = load double, double* %x
  %t = call double @llvm.sqrt.f64(double %z)
  ret double %t
}

declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)

; SSE-LABEL: loopdep1
; SSE: for.body{{$}}
;
; This loop contains two cvtsi2ss instructions that update the same xmm
; register. Verify that the break false dependency fix pass breaks those
; dependencies by inserting xorps instructions.
;
; If the register allocator chooses different registers for the two cvtsi2ss
; instructions, they are still dependent on themselves.
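;
; A minimal illustrative sketch of the pattern (register names here are
; placeholders, not the exact codegen): cvtsi2ssl writes only the low 32 bits
; of its xmm destination, so on its own it inherits a false dependence on
; whatever last wrote that register:
;
;   cvtsi2ssl %eax, %xmm1          # merges into the old %xmm1 contents
;
; versus the dependence-breaking form the pass is expected to emit:
;
;   xorps     %xmm1, %xmm1         # zero idiom, kills the old value
;   cvtsi2ssl %eax, %xmm1          # no longer waits on the prior write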
; SSE: xorps [[XMM1:%xmm[0-9]+]], [[XMM1]]
; SSE: cvtsi2ssl %{{.*}}, [[XMM1]]
; SSE: xorps [[XMM2:%xmm[0-9]+]], [[XMM2]]
; SSE: cvtsi2ssl %{{.*}}, [[XMM2]]
;
define float @loopdep1(i32 %m) nounwind uwtable readnone ssp {
entry:
  %tobool3 = icmp eq i32 %m, 0
  br i1 %tobool3, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ]
  %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
  %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ]
  %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ]
  %conv = sitofp i32 %n.04 to float
  %add = fadd float %s1.06, %conv
  %conv1 = sitofp i32 %m.addr.07 to float
  %add2 = fadd float %s2.05, %conv1
  %inc = add nsw i32 %n.04, 1
  %dec = add nsw i32 %m.addr.07, -1
  %tobool = icmp eq i32 %dec, 0
  br i1 %tobool, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ]
  %sub = fsub float %s1.0.lcssa, %s2.0.lcssa
  ret float %sub
}

; rdar:15221834 False AVX register dependencies cause 5x slowdown on
; flops-6. Make sure the unused register read by vcvtsi2sdq is zeroed
; to avoid cyclic dependence on a write to the same register in a
; previous iteration.

; AVX-LABEL: loopdep2:
; AVX-LABEL: %loop
; AVX: vxorps %[[REG:xmm.]], %{{xmm.}}, %{{xmm.}}
; AVX: vcvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]], %{{xmm.}}
; SSE-LABEL: loopdep2:
; SSE-LABEL: %loop
; SSE: xorps %[[REG:xmm.]], %[[REG]]
; SSE: cvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]]
define i64 @loopdep2(i64* nocapture %x, double* nocapture %y) nounwind {
entry:
  %vx = load i64, i64* %x
  br label %loop
loop:
  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
  %fi = sitofp i64 %i to double
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  %vy = load double, double* %y
  %fipy = fadd double %fi, %vy
  %iipy = fptosi double %fipy to i64
  %s2 = add i64 %s1, %iipy
  %inc = add nsw i64 %i, 1
  %exitcond = icmp eq i64 %inc, 156250000
  br i1 %exitcond, label %ret, label %loop
ret:
  ret i64 %s2
}

; This loop contains a cvtsi2sd instruction that has a loop-carried
; false dependency on an xmm register that is modified by other scalar
; instructions that follow it in the loop. Additionally, the source of the
; convert is a memory operand. Verify that the break false dependency fix
; pass breaks this dependency by inserting an xor before the convert.
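;
; Illustrative sketch of the expected fix (SSE form, operands are
; placeholders): the integer source of the convert stays folded as a memory
; operand, and the checks below expect a zeroing xor immediately before it:
;
;   xorps     %xmm0, %xmm0         # break the loop-carried dependence
;   cvtsi2sdl <mem>, %xmm0         # convert straight from the memory operand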
@x = common global [1024 x double] zeroinitializer, align 16
@y = common global [1024 x double] zeroinitializer, align 16
@z = common global [1024 x double] zeroinitializer, align 16
@w = common global [1024 x double] zeroinitializer, align 16
@v = common global [1024 x i32] zeroinitializer, align 16

define void @loopdep3() {
entry:
  br label %for.cond1.preheader

for.cond1.preheader:                              ; preds = %for.inc14, %entry
  %i.025 = phi i32 [ 0, %entry ], [ %inc15, %for.inc14 ]
  br label %for.body3

for.body3:
  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @v, i64 0, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %conv = sitofp i32 %0 to double
  %arrayidx5 = getelementptr inbounds [1024 x double], [1024 x double]* @x, i64 0, i64 %indvars.iv
  %1 = load double, double* %arrayidx5, align 8
  %mul = fmul double %conv, %1
  %arrayidx7 = getelementptr inbounds [1024 x double], [1024 x double]* @y, i64 0, i64 %indvars.iv
  %2 = load double, double* %arrayidx7, align 8
  %mul8 = fmul double %mul, %2
  %arrayidx10 = getelementptr inbounds [1024 x double], [1024 x double]* @z, i64 0, i64 %indvars.iv
  %3 = load double, double* %arrayidx10, align 8
  %mul11 = fmul double %mul8, %3
  %arrayidx13 = getelementptr inbounds [1024 x double], [1024 x double]* @w, i64 0, i64 %indvars.iv
  store double %mul11, double* %arrayidx13, align 8
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  br i1 %exitcond, label %for.inc14, label %for.body3

for.inc14:                                        ; preds = %for.body3
  %inc15 = add nsw i32 %i.025, 1
  %exitcond26 = icmp eq i32 %inc15, 100000
  br i1 %exitcond26, label %for.end16, label %for.cond1.preheader

for.end16:                                        ; preds = %for.inc14
  ret void

;SSE-LABEL:@loopdep3
;SSE: xorps [[XMM0:%xmm[0-9]+]], [[XMM0]]
;SSE-NEXT: cvtsi2sdl {{.*}}, [[XMM0]]
;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
;SSE-NEXT: movsd [[XMM0]],
;AVX-LABEL:@loopdep3
;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]]
;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], {{%xmm[0-9]+}}
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
;AVX-NEXT: vmovsd [[XMM0]],
}

define double @inlineasmdep(i64 %arg) {
top:
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  %tmp1 = sitofp i64 %arg to double
  ret double %tmp1
;AVX-LABEL:@inlineasmdep
;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]]
;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}}
}

; Make sure we are making a smart choice regarding undef registers and
; hiding the false dependency behind a true dependency
define double @truedeps(float %arg) {
top:
  tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  %tmp1 = fpext float %arg to double
  ret double %tmp1
;AVX-LABEL:@truedeps
;AVX-NOT: vxorps
;AVX: vcvtss2sd [[XMM0:%xmm[0-9]+]], [[XMM0]], {{%xmm[0-9]+}}
}

; Make sure we are making a smart choice regarding undef registers and
; choosing the register with the highest clearance
define double @clearence(i64 %arg) {
top:
  tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  %tmp1 = sitofp i64 %arg to double
  ret double %tmp1
;AVX-LABEL:@clearence
;AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]]
;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}}
}

; Make sure we are making a smart choice regarding undef registers in order to
; avoid a cyclic dependence on a write to the same register in a previous
; iteration, especially when we cannot zero out the undef register because it
; is alive.
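;
; Rough sketch of the hazard, with placeholder registers rather than the real
; allocation: vcvtsi2sdq merges into the xmm register chosen for its undef
; source operand, so if that register is also written later in the loop body,
; each iteration's convert waits on the previous iteration:
;
;   loop:
;     vcvtsi2sdq %rax, %xmm0, %xmm1   # undef read of %xmm0
;     ...
;     vaddsd     %xmm2, %xmm3, %xmm0  # %xmm0 redefined -> next iteration's
;                                     # convert depends on this write
;
; Choosing an xmm register that nothing else in the loop writes (one of
; xmm4-xmm7 here, which the clobbers leave untouched) removes the cycle, and
; because the chosen register may be alive, it cannot simply be zeroed first.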
define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind {
entry:
  %vx = load i64, i64* %x
  br label %loop
loop:
  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
  %fi = sitofp i64 %i to double
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  %vy = load double, double* %y
  %fipy = fadd double %fi, %vy
  %iipy = fptosi double %fipy to i64
  %s2 = add i64 %s1, %iipy
  %inc = add nsw i64 %i, 1
  %exitcond = icmp eq i64 %inc, 156250000
  br i1 %exitcond, label %ret, label %loop
ret:
  ret i64 %s2
;AVX-LABEL:@loopclearence
;Registers 4-7 are not used and therefore one of them should be chosen
;AVX-NOT: {{%xmm[4-7]}}
;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}}
;AVX-NOT: [[XMM4_7]]
}

; Make sure we are making a smart choice regarding undef registers even for more
; complicated loop structures. This example is the inner loop from
; julia> a = falses(10000); a[1:4:end] = true
; julia> linspace(1.0,2.0,10000)[a]
define void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) {
entry:
  tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  br label %loop

loop:
  %phi_i = phi i64 [ 1, %entry ], [ %nexti, %loop_end ]
  %phi_j = phi i64 [ 1, %entry ], [ %nextj, %loop_end ]
  %phi_k = phi i64 [ 0, %entry ], [ %nextk, %loop_end ]
  br label %inner_loop

inner_loop:
  %phi = phi i64 [ %phi_k, %loop ], [ %nextk, %inner_loop ]
  %idx = lshr i64 %phi, 6
  %inputptr = getelementptr i64, i64* %x, i64 %idx
  %input = load i64, i64* %inputptr, align 8
  %masked = and i64 %phi, 63
  %shiftedmasked = shl i64 1, %masked
  %maskedinput = and i64 %input, %shiftedmasked
  %cmp = icmp eq i64 %maskedinput, 0
  %nextk = add i64 %phi, 1
  br i1 %cmp, label %inner_loop, label %loop_end

loop_end:
  %nexti = add i64 %phi_i, 1
  %nextj = add i64 %phi_j, 1
  ; Register use, plus our clobbering of xmm7-15 above, basically forces xmm6
  ; here as the only reasonable choice. The primary thing we care about is
  ; that it's not one of the registers used in the loop (e.g. not the output
  ; reg here)
  ;AVX-NOT: %xmm6
  ;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}}
  ;AVX-NOT: %xmm6
  %nexti_f = sitofp i64 %nexti to double
  %sub = fsub double %c1, %nexti_f
  %mul = fmul double %sub, %c2
  ;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}}
  ;AVX-NOT: %xmm6
  %phi_f = sitofp i64 %phi to double
  %mul2 = fmul double %phi_f, %c3
  %add2 = fadd double %mul, %mul2
  %div = fdiv double %add2, %c4
  %prev_j = add i64 %phi_j, -1
  %outptr = getelementptr double, double* %y, i64 %prev_j
  store double %div, double* %outptr, align 8
  %done = icmp slt i64 %size, %nexti
  br i1 %done, label %loopdone, label %loop

loopdone:
  ret void
}