1 ; RUN: opt %s -analyze -divergence | FileCheck %s 2 3 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" 4 target triple = "nvptx64-nvidia-cuda" 5 6 ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) 7 define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { 8 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge' 9 entry: 10 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 11 %cond = icmp slt i32 %n, 0 12 br i1 %cond, label %then, label %else ; uniform 13 ; CHECK-NOT: DIVERGENT: br i1 %cond, 14 then: 15 %a1 = add i32 %a, %tid 16 br label %merge 17 else: 18 %b2 = add i32 %b, %tid 19 br label %merge 20 merge: 21 %c = phi i32 [ %a1, %then ], [ %b2, %else ] 22 ret i32 %c 23 } 24 25 ; c = a; 26 ; if (threadIdx.x < 5) // divergent: data dependent 27 ; c = b; 28 ; return c; // c is divergent: sync dependent 29 define i32 @sync(i32 %a, i32 %b) { 30 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync' 31 bb1: 32 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() 33 %cond = icmp slt i32 %tid, 5 34 br i1 %cond, label %bb2, label %bb3 35 ; CHECK: DIVERGENT: br i1 %cond, 36 bb2: 37 br label %bb3 38 bb3: 39 %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid 40 ; CHECK: DIVERGENT: %c = 41 ret i32 %c 42 } 43 44 ; c = 0; 45 ; if (threadIdx.x >= 5) { // divergent 46 ; c = (n < 0 ? a : b); // c here is uniform because n is uniform 47 ; } 48 ; // c here is divergent because it is sync dependent on threadIdx.x >= 5 49 ; return c; 50 define i32 @mixed(i32 %n, i32 %a, i32 %b) { 51 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed' 52 bb1: 53 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() 54 %cond = icmp slt i32 %tid, 5 55 br i1 %cond, label %bb6, label %bb2 56 ; CHECK: DIVERGENT: br i1 %cond, 57 bb2: 58 %cond2 = icmp slt i32 %n, 0 59 br i1 %cond2, label %bb4, label %bb3 60 bb3: 61 br label %bb5 62 bb4: 63 br label %bb5 64 bb5: 65 %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ] 66 ; CHECK-NOT: DIVERGENT: %c = 67 br label %bb6 68 bb6: 69 %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ] 70 ; CHECK: DIVERGENT: %c2 = 71 ret i32 %c2 72 } 73 74 ; We conservatively treats all parameters of a __device__ function as divergent. 75 define i32 @device(i32 %n, i32 %a, i32 %b) { 76 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device' 77 ; CHECK: DIVERGENT: i32 %n 78 ; CHECK: DIVERGENT: i32 %a 79 ; CHECK: DIVERGENT: i32 %b 80 entry: 81 %cond = icmp slt i32 %n, 0 82 br i1 %cond, label %then, label %else 83 ; CHECK: DIVERGENT: br i1 %cond, 84 then: 85 br label %merge 86 else: 87 br label %merge 88 merge: 89 %c = phi i32 [ %a, %then ], [ %b, %else ] 90 ret i32 %c 91 } 92 93 ; int i = 0; 94 ; do { 95 ; i++; // i here is uniform 96 ; } while (i < laneid); 97 ; return i == 10 ? 0 : 1; // i here is divergent 98 ; 99 ; The i defined in the loop is used outside. 100 define i32 @loop() { 101 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop' 102 entry: 103 %laneid = call i32 @llvm.ptx.read.laneid() 104 br label %loop 105 loop: 106 %i = phi i32 [ 0, %entry ], [ %i1, %loop ] 107 ; CHECK-NOT: DIVERGENT: %i = 108 %i1 = add i32 %i, 1 109 %exit_cond = icmp sge i32 %i1, %laneid 110 br i1 %exit_cond, label %loop_exit, label %loop 111 loop_exit: 112 %cond = icmp eq i32 %i, 10 113 br i1 %cond, label %then, label %else 114 ; CHECK: DIVERGENT: br i1 %cond, 115 then: 116 ret i32 0 117 else: 118 ret i32 1 119 } 120 121 ; Same as @loop, but the loop is in the LCSSA form. 122 define i32 @lcssa() { 123 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa' 124 entry: 125 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 126 br label %loop 127 loop: 128 %i = phi i32 [ 0, %entry ], [ %i1, %loop ] 129 ; CHECK-NOT: DIVERGENT: %i = 130 %i1 = add i32 %i, 1 131 %exit_cond = icmp sge i32 %i1, %tid 132 br i1 %exit_cond, label %loop_exit, label %loop 133 loop_exit: 134 %i.lcssa = phi i32 [ %i, %loop ] 135 ; CHECK: DIVERGENT: %i.lcssa = 136 %cond = icmp eq i32 %i.lcssa, 10 137 br i1 %cond, label %then, label %else 138 ; CHECK: DIVERGENT: br i1 %cond, 139 then: 140 ret i32 0 141 else: 142 ret i32 1 143 } 144 145 ; This test contains an unstructured loop. 146 ; +-------------- entry ----------------+ 147 ; | | 148 ; V V 149 ; i1 = phi(0, i3) i2 = phi(0, i3) 150 ; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 151 ; ^ | ^ 152 ; | V | 153 ; +-------- switch (tid / i3) ----------+ 154 ; | 155 ; V 156 ; if (i3 == 5) // divergent 157 ; because sync dependent on (tid / i3). 158 define i32 @unstructured_loop(i1 %entry_cond) { 159 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop' 160 entry: 161 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 162 br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 163 loop_entry_1: 164 %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] 165 %j1 = add i32 %i1, 1 166 br label %loop_body 167 loop_entry_2: 168 %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] 169 %j2 = add i32 %i2, 2 170 br label %loop_body 171 loop_body: 172 %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] 173 br label %loop_latch 174 loop_latch: 175 %div = sdiv i32 %tid, %i3 176 switch i32 %div, label %branch [ i32 1, label %loop_entry_1 177 i32 2, label %loop_entry_2 ] 178 branch: 179 %cmp = icmp eq i32 %i3, 5 180 br i1 %cmp, label %then, label %else 181 ; CHECK: DIVERGENT: br i1 %cmp, 182 then: 183 ret i32 0 184 else: 185 ret i32 1 186 } 187 188 ; Verifies sync-dependence is computed correctly in the absense of loops. 189 define i32 @sync_no_loop(i32 %arg) { 190 entry: 191 %0 = add i32 %arg, 1 192 %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 193 %1 = icmp sge i32 %tid, 10 194 br i1 %1, label %bb1, label %bb2 195 196 bb1: 197 br label %bb3 198 199 bb2: 200 br label %bb3 201 202 bb3: 203 %2 = add i32 %0, 2 204 ; CHECK-NOT: DIVERGENT: %2 205 ret i32 %2 206 } 207 208 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() 209 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() 210 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() 211 declare i32 @llvm.ptx.read.laneid() 212 213 !nvvm.annotations = !{!0, !1, !2, !3, !4, !5} 214 !0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1} 215 !1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1} 216 !2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1} 217 !3 = !{i32 ()* @loop, !"kernel", i32 1} 218 !4 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1} 219 !5 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} 220