; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Add extra verifier runs. There were some cases where invalid IR
; was produced but happened to be fixed by the later passes.

; Make sure divergent control flow with multiple exits from a region
; is properly handled. UnifyFunctionExitNodes should be run before
; StructurizeCFG.

; IR-LABEL: @multi_divergent_region_exit_ret_ret(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
; IR: %2 = extractvalue { i1, i64 } %1, 0
; IR: %3 = extractvalue { i1, i64 } %1, 1
; IR: br i1 %2, label %LeafBlock1, label %Flow

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
; IR: %7 = extractvalue { i1, i64 } %6, 0
; IR: %8 = extractvalue { i1, i64 } %6, 1
; IR: br i1 %7, label %LeafBlock, label %Flow1

; IR: LeafBlock:
; IR: br label %Flow1

; IR: LeafBlock1:
; IR: br label %Flow{{$}}

; IR: Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: %13 = extractvalue { i1, i64 } %12, 0
; IR: %14 = extractvalue { i1, i64 } %12, 1
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock

; IR: Flow1:
; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR: store volatile i32 17, i32 addrspace(3)* undef
; IR: br label %Flow2

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf(i64 %14)
; IR: ret void


; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
; GCN: v_cmp_lt_i32_e32 vcc, 1
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64


; FIXME: Why is this compare essentially repeated?
; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc

; GCN: ; %Flow4
; GCN-NEXT: s_or_b64 exec, exec
; GCN: v_cmp_ne_u32_e32 vcc, 0

; GCN: ; %exit1
; GCN: ds_write_b32

; GCN: %Flow5
; GCN-NEXT: s_or_b64 exec, exec
; GCN: v_cmp_ne_u32_e32 vcc, 0
; GCN-NEXT: s_and_saveexec_b64

; GCN: ; %exit0
; GCN: buffer_store_dword

; GCN: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)

; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock


; IR: UnifiedUnreachableBlock:
; IR-NEXT: unreachable


; FIXME: Probably should insert an s_endpgm anyway.
; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
; GCN: ; %UnifiedUnreachableBlock
; GCN-NEXT: .Lfunc_end
define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  unreachable
}

; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
; IR: llvm.amdgcn.if
; IR: br i1

; IR: {{^}}Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
; IR: br i1 %7, label %LeafBlock, label %Flow1

; IR: {{^}}LeafBlock:
; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
; IR: %9 = xor i1 %divergent.cond1, true
; IR: br label %Flow1

; IR: LeafBlock1:
; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
; IR: %10 = xor i1 %uniform.cond0, true
; IR: br label %Flow

; IR: Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock

; IR: {{^}}Flow1:
; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR: store volatile i32 17, i32 addrspace(3)* undef
; IR: br label %Flow2

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf(i64 %14)
; IR: ret void
define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %divergent.cond0 = icmp slt i32 %tmp16, 2
  br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %divergent.cond1 = icmp eq i32 %tmp16, 1
  br i1 %divergent.cond1, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %uniform.cond0 = icmp eq i32 %arg3, 2
  br i1 %uniform.cond0, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
; IR: br i1 %2, label %LeafBlock1, label %Flow

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)

define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %arg3, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
; IR: Flow2:
; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %20)

; IR: UnifiedReturnBlock:
; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %15)
; IR: ret float %UnifiedRetVal
define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
entry:
  %Pivot = icmp slt i32 %vgpr, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %vgpr, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %vgpr, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store i32 9, i32 addrspace(1)* undef
  ret float 1.0

exit1: ; preds = %LeafBlock, %LeafBlock1
  store i32 17, i32 addrspace(3)* undef
  ret float 2.0
}

; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(

; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
; GCN: s_cmp_gt_i32 s0, 1
; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]

; GCN: v_cmp_ne_u32_e32 vcc, 7, v0

; GCN: {{^}}[[FLOW]]:
; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]

; GCN: v_mov_b32_e32 v0, 2.0
; GCN: s_or_b64 exec, exec
; GCN-NOT: s_and_b64 exec, exec
; GCN: v_mov_b32_e32 v0, 1.0

; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ; return

define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
entry:
  %uniform.cond = icmp slt i32 %sgpr, 2
  br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %divergent.cond0 = icmp eq i32 %vgpr, 3
  br i1 %divergent.cond0, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %divergent.cond1 = icmp eq i32 %vgpr, 7
  br i1 %divergent.cond1, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store i32 9, i32 addrspace(1)* undef
  ret float 1.0

exit1: ; preds = %LeafBlock, %LeafBlock1
  store i32 17, i32 addrspace(3)* undef
  ret float 2.0
}

; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
; IR-NEXT: br label %UnifiedReturnBlock

; IR: Flow1:
; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
; IR-NEXT: ret void
define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; The non-uniformity of the branch to the exiting blocks requires
; looking at transitive predecessors.

; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(

; IR: exit0: ; preds = %Flow2
; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
; IR-NEXT: br label %UnifiedReturnBlock


; IR: indirect.exit1:
; IR: %load = load volatile i32, i32 addrspace(1)* undef
; IR: store volatile i32 %load, i32 addrspace(1)* undef
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
; IR-NEXT: ret void
define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %indirect.exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

indirect.exit1:
  %load = load volatile i32, i32 addrspace(1)* undef
  store volatile i32 %load, i32 addrspace(1)* undef
  br label %exit1

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; IR-LABEL: @multi_divergent_region_exit_ret_switch(
define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  switch i32 %tmp16, label %exit1
    [ i32 1, label %LeafBlock
      i32 2, label %LeafBlock1
      i32 3, label %exit0 ]

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1

divergent.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

divergent.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

uniform.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1

divergent.if:
  %vgpr0 = load volatile float, float addrspace(1)* undef
  %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
  br i1 %divergent.cond1, label %divergent.then, label %divergent.endif

divergent.then:
  %vgpr1 = load volatile float, float addrspace(1)* undef
  %divergent.cond2 = fcmp olt float %vgpr1, 4.0
  store volatile i32 33, i32 addrspace(1)* undef
  br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif

divergent.endif:
  store volatile i32 38, i32 addrspace(1)* undef
  br label %divergent.ret0

divergent.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

divergent.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

uniform.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
; IR: br i1 %8, label %uniform.if, label %Flow2

; IR: Flow: ; preds = %uniform.then, %uniform.if
; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
; IR: br i1 %11, label %uniform.endif, label %uniform.ret0

; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
; IR-NEXT: ret void
define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
entry:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret

uniform.multi.exit.region:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1

uniform.if:
  %sgpr0 = load volatile i32, i32 addrspace(4)* undef
  %uniform.cond1 = icmp slt i32 %sgpr0, 1
  br i1 %uniform.cond1, label %uniform.then, label %uniform.endif

uniform.then:
  %sgpr1 = load volatile i32, i32 addrspace(4)* undef
  %uniform.cond2 = icmp sge i32 %sgpr1, 4
  store volatile i32 33, i32 addrspace(1)* undef
  br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif

uniform.endif:
  store volatile i32 38, i32 addrspace(1)* undef
  br label %uniform.ret0

uniform.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

uniform.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

divergent.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @multi_divergent_unreachable_exit(
; IR: UnifiedUnreachableBlock:
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %UnifiedReturnBlock

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
; IR-NEXT: ret void
define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  switch i32 %tmp, label %bb3 [
    i32 2, label %bb1
    i32 0, label %bb2
  ]

bb1: ; preds = %bb
  unreachable

bb2: ; preds = %bb
  unreachable

bb3: ; preds = %bb
  switch i32 undef, label %bb5 [
    i32 2, label %bb4
  ]

bb4: ; preds = %bb3
  ret void

bb5: ; preds = %bb3
  unreachable
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }