; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This test checks that the x86-cmov-converter optimization does not transform
;; a CMOV instruction when the gain (in cycles) of converting it to a branch is
;; less than a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
;;
;; Test was created using the following command line:
;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
;; Where foo.c is:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;int bar(int *a, int *b, int n) {
;;  int sum = 0;
;;  for (int i = 0; i < n; ++i) {
;;    int x = a[i] * a[i+1] * a[i+2];
;;    int y = b[i] * b[i+1];
;;    sum += y > x ? x : 0;
;;  }
;;  return sum;
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Adding a test to the above function shows that the code with CMOV is 25%
;; faster than the code with a branch.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;#define N 10000
;;int A[N];
;;int B[N];
;;
;;
;;
;;int main () {
;;  for (int i=0; i< N; ++i) {
;;    A[i] = i%4;
;;    B[i] = i%5;
;;  }
;;  int sum = 0;
;;  for (int i=0; i< N*10; ++i)
;;    sum += bar(A, B, N);
;;  return sum;
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Expected output: the select in the loop stays a conditional move (cmovle),
;; and no "jg" branch appears in the output ahead of that cmovle.
; CHECK-NOT: jg
; CHECK: cmovle
;; @bar is the IR form of the C function bar() shown in the header comment:
;; for each i in [0, n) it adds x = a[i]*a[i+1]*a[i+2] to the running sum when
;; y = b[i]*b[i+1] is greater than x, and adds 0 otherwise.
define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {
entry:
  ;; Skip the loop entirely (returning 0) when n <= 0.
  %cmp30 = icmp sgt i32 %n, 0
  br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ;; Load a[0], a[1] and b[0] once up front; the loop carries them forward
  ;; through phis instead of reloading them on every iteration.
  %.pre = load i32, i32* %a, align 4
  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1
  %.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4
  %.pre35 = load i32, i32* %b, align 4
  ;; Loop bound: n zero-extended to the 64-bit induction variable's type.
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ;; Result is 0 when the loop was skipped, otherwise the final sum.
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
  ret i32 %sum.0.lcssa

for.body:                                         ; preds = %for.body, %for.body.preheader
  ;; %0 = b[i], %1 = a[i+1], %2 = a[i]: values loaded before the loop or by
  ;; the previous iteration.
  %0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
  %1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
  %2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ;; x = a[i] * a[i+1] * a[i+2]; only a[i+2] is loaded in this iteration.
  %mul = mul nsw i32 %1, %2
  %3 = add nuw nsw i64 %indvars.iv, 2
  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3
  %4 = load i32, i32* %arrayidx5, align 4
  %mul6 = mul nsw i32 %mul, %4
  ;; y = b[i+1] * b[i]; only b[i+1] is loaded in this iteration.
  %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
  %5 = load i32, i32* %arrayidx11, align 4
  %mul12 = mul nsw i32 %5, %0
  ;; sum += (y > x) ? x : 0 -- this select is the one the directives above
  ;; expect to be emitted as a cmov rather than as a branch.
  %cmp13 = icmp sgt i32 %mul12, %mul6
  %cond = select i1 %cmp13, i32 %mul6, i32 0
  %add14 = add nsw i32 %cond, %sum.032
  ;; Leave the loop once the incremented induction variable reaches n.
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

;; @bar is compiled with skylake as its target CPU.
attributes #0 = {"target-cpu"="skylake"}

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 5.0.0 (trunk)"}
