; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Add extra verifier runs. There were some cases where invalid IR
; was produced but happened to be fixed by later passes.

; Make sure divergent control flow with multiple exits from a region
; is properly handled. UnifyFunctionExitNodes should be run before
; StructurizeCFG.
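;
; After UnifyFunctionExitNodes the region's returns (and unreachables) are
; funneled into a single UnifiedReturnBlock / UnifiedUnreachableBlock, and
; SIAnnotateControlFlow then wraps the divergent branches with
; llvm.amdgcn.if/else and closes them with llvm.amdgcn.end.cf, which is
; what the IR checks below verify.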

; IR-LABEL: @multi_divergent_region_exit_ret_ret(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
; IR: %2 = extractvalue { i1, i64 } %1, 0
; IR: %3 = extractvalue { i1, i64 } %1, 1
; IR: br i1 %2, label %LeafBlock1, label %Flow

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
; IR: %7 = extractvalue { i1, i64 } %6, 0
; IR: %8 = extractvalue { i1, i64 } %6, 1
; IR: br i1 %7, label %LeafBlock, label %Flow1

; IR: LeafBlock:
; IR: br label %Flow1

; IR: LeafBlock1:
; IR: br label %Flow{{$}}

; IR:  Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: %13 = extractvalue { i1, i64 } %12, 0
; IR: %14 = extractvalue { i1, i64 } %12, 1
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock

; IR: Flow1:
; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR: store volatile i32 17, i32 addrspace(3)* undef
; IR:  br label %Flow2

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf(i64 %14)
; IR: ret void


; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
; GCN: v_cmp_lt_i32_e32 vcc, 1
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64


; FIXME: Why is this compare essentially repeated?
; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc

; GCN: ; %Flow4
; GCN-NEXT: s_or_b64 exec, exec
; GCN: v_cmp_ne_u32_e32 vcc, 0

; GCN: ; %exit1
; GCN: ds_write_b32

; GCN: %Flow5
; GCN-NEXT: s_or_b64 exec, exec
; GCN: v_cmp_ne_u32_e32 vcc, 0
; GCN-NEXT: s_and_saveexec_b64

; GCN: ; %exit0
; GCN: buffer_store_dword

; GCN: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)

; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock


; IR: UnifiedUnreachableBlock:
; IR-NEXT: unreachable


; FIXME: Probably should insert an s_endpgm anyway.
; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
; GCN: ; %UnifiedUnreachableBlock
; GCN-NEXT: .Lfunc_end
define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  unreachable
}

; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
; IR: llvm.amdgcn.if
; IR: br i1

; IR: {{^}}Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
; IR: br i1 %7, label %LeafBlock, label %Flow1

; IR: {{^}}LeafBlock:
; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
; IR: %9 = xor i1 %divergent.cond1, true
; IR: br label %Flow1

; IR: LeafBlock1:
; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
; IR: %10 = xor i1 %uniform.cond0, true
; IR: br label %Flow

; IR: Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock

; IR: {{^}}Flow1:
; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR: store volatile i32 17, i32 addrspace(3)* undef
; IR: br label %Flow2

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf(i64 %14)
; IR: ret void
define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %divergent.cond0 = icmp slt i32 %tmp16, 2
  br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %divergent.cond1 = icmp eq i32 %tmp16, 1
  br i1 %divergent.cond1, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %uniform.cond0 = icmp eq i32 %arg3, 2
  br i1 %uniform.cond0, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
; IR: br i1 %2, label %LeafBlock1, label %Flow

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)

define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %arg3, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

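; Same multi-exit region as the first test, but as an amdgpu_ps function
; returning a float, so the unified exit must also merge the two return
; values into %UnifiedRetVal.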
; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
; IR: Flow2:
; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %20)

; IR: UnifiedReturnBlock:
; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %15)
; IR: ret float %UnifiedRetVal
define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
entry:
  %Pivot = icmp slt i32 %vgpr, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %vgpr, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %vgpr, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store i32 9, i32 addrspace(1)* undef
  ret float 1.0

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store i32 17, i32 addrspace(3)* undef
  ret float 2.0
}

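; As above, but the branch in %entry is uniform (it tests an SGPR
; argument); only the leaf blocks branch divergently to the two
; value-returning exits.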
; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(

; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
; GCN: s_cmp_gt_i32 s0, 1
; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]

; GCN: v_cmp_ne_u32_e32 vcc, 7, v0

; GCN: {{^}}[[FLOW]]:
; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]

; GCN: v_mov_b32_e32 v0, 2.0
; GCN: s_or_b64 exec, exec
; GCN-NOT: s_and_b64 exec, exec
; GCN: v_mov_b32_e32 v0, 1.0

; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ; return

define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
entry:
  %uniform.cond = icmp slt i32 %sgpr, 2
  br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %divergent.cond0 = icmp eq i32 %vgpr, 3
  br i1 %divergent.cond0, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %divergent.cond1 = icmp eq i32 %vgpr, 7
  br i1 %divergent.cond1, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store i32 9, i32 addrspace(1)* undef
  ret float 1.0

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store i32 17, i32 addrspace(3)* undef
  ret float 2.0
}

; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
; IR-NEXT: br label %UnifiedReturnBlock

; IR: Flow1:
; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
; IR-NEXT: ret void
define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; The non-uniformity of the branch to the exiting blocks requires
; looking at transitive predecessors.
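; In @indirect_multi_divergent_region_exit_ret_unreachable, %exit1 is only
; reached through %indirect.exit1, which branches to it unconditionally;
; the divergence comes from the conditional branches in %LeafBlock and
; %LeafBlock1 further up the CFG.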

; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(

; IR: exit0:                                            ; preds = %Flow2
; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
; IR-NEXT: br label %UnifiedReturnBlock


; IR: indirect.exit1:
; IR: %load = load volatile i32, i32 addrspace(1)* undef
; IR: store volatile i32 %load, i32 addrspace(1)* undef
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock:                               ; preds = %exit0, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
; IR-NEXT: ret void
define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %indirect.exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

indirect.exit1:
  %load = load volatile i32, i32 addrspace(1)* undef
  store volatile i32 %load, i32 addrspace(1)* undef
  br label %exit1

exit1:                                     ; preds = %indirect.exit1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

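; The divergent switch in %entry can reach the returning exit (%exit0) and
; the unreachable exit (%exit1) both directly and through the leaf blocks.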
; IR-LABEL: @multi_divergent_region_exit_ret_switch(
define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  switch i32 %tmp16, label %exit1
    [ i32 1, label %LeafBlock
      i32 2, label %LeafBlock1
      i32 3, label %exit0 ]

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %entry, %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

exit1:                                     ; preds = %entry, %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

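; A uniform branch selects between a uniform return and a region that
; contains two divergent returns.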
; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1

divergent.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

divergent.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

uniform.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

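; As above, but the divergent region contains further nested divergent
; control flow (an if/then/endif) before reaching its two returns.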
; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1

divergent.if:
  %vgpr0 = load volatile float, float addrspace(1)* undef
  %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
  br i1 %divergent.cond1, label %divergent.then, label %divergent.endif

divergent.then:
  %vgpr1 = load volatile float, float addrspace(1)* undef
  %divergent.cond2 = fcmp olt float %vgpr1, 4.0
  store volatile i32 33, i32 addrspace(1)* undef
  br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif

divergent.endif:
  store volatile i32 38, i32 addrspace(1)* undef
  br label %divergent.ret0

divergent.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

divergent.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

uniform.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

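; Mirror image of the previous test: the multi-exit region with uniform
; control flow is nested inside a divergent branch.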
; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
; IR: Flow1:                                            ; preds = %uniform.ret1, %uniform.multi.exit.region
; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
; IR: br i1 %8, label %uniform.if, label %Flow2

; IR: Flow:                                             ; preds = %uniform.then, %uniform.if
; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
; IR: br i1 %11, label %uniform.endif, label %uniform.ret0

; IR: UnifiedReturnBlock:                               ; preds = %Flow3, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
; IR-NEXT: ret void
define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
entry:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret

uniform.multi.exit.region:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1

uniform.if:
  %sgpr0 = load volatile i32, i32 addrspace(4)* undef
  %uniform.cond1 = icmp slt i32 %sgpr0, 1
  br i1 %uniform.cond1, label %uniform.then, label %uniform.endif

uniform.then:
  %sgpr1 = load volatile i32, i32 addrspace(4)* undef
  %uniform.cond2 = icmp sge i32 %sgpr1, 4
  store volatile i32 33, i32 addrspace(1)* undef
  br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif

uniform.endif:
  store volatile i32 38, i32 addrspace(1)* undef
  br label %uniform.ret0

uniform.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

uniform.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

divergent.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @multi_divergent_unreachable_exit(
; IR: UnifiedUnreachableBlock:
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %UnifiedReturnBlock

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
; IR-NEXT: ret void
define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  switch i32 %tmp, label %bb3 [
    i32 2, label %bb1
    i32 0, label %bb2
  ]

bb1:                                              ; preds = %bb
  unreachable

bb2:                                              ; preds = %bb
  unreachable

bb3:                                              ; preds = %bb
  switch i32 undef, label %bb5 [
    i32 2, label %bb4
  ]

bb4:                                              ; preds = %bb3
  ret void

bb5:                                              ; preds = %bb3
  unreachable
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }