; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -check-prefix=GCN %s


; FIXME: We should use llvm-mc for this, but we can't even parse our own output.
;        See PR33579.
; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -o %t.o -filetype=obj %s
; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=OBJ %s

; OBJ:       Relocations [
; OBJ-NEXT: ]

; Restrict maximum branch to between +7 and -8 dwords

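; With -amdgpu-s-branch-bits=4 the scalar branch immediate is treated as a
; signed 4-bit dword offset, so any target more than +7/-8 dwords away must be
; relaxed. As the checks below expect, an out-of-range s_cbranch is inverted to
; jump over an indirect long jump (s_getpc_b64 + s_add_u32/s_addc_u32 +
; s_setpc_b64) that computes the destination PC-relatively.
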
; Used to emit an instruction that is always 4 bytes. Inline asm always assumes
; each instruction is the maximum size.
declare void @llvm.amdgcn.s.sleep(i32) #0

declare i32 @llvm.amdgcn.workitem.id.x() #1

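; Note: the "24 bytes"/"32 bytes" comments on the inline asm blocks below assume
; each v_nop_e64 is a full 8-byte (64-bit encoded) instruction, matching the
; conservative worst-case size assumed for inline asm.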

; GCN-LABEL: {{^}}uniform_conditional_max_short_forward_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]
; GCN: s_cmp_eq_u32 [[CND]], 0
; GCN-NEXT: s_cbranch_scc1 [[BB3:BB[0-9]+_[0-9]+]]


; GCN-NEXT: ; %bb.1: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_sleep 0

; GCN-NEXT: [[BB3]]: ; %bb3
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
; 24 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  call void @llvm.amdgcn.s.sleep(i32 0)
  br label %bb3

bb3:
  store volatile i32 %cnd, i32 addrspace(1)* %arg
  ret void
}

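; For the minimum long forward branch, the s_cbranch is inverted (scc1 -> scc0)
; and falls through to a long-jump block: s_getpc_b64 reads the address of the
; following instruction (hence the "+4" in the label arithmetic), the 64-bit
; add applies the forward offset to reach the destination block, and
; s_setpc_b64 performs the jump. vcc is used as the scratch register pair.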
; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]
; GCN: s_cmp_eq_u32 [[CND]], 0
; GCN-NEXT: s_cbranch_scc0 [[LONGBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[LONGBB]]:
; GCN-NEXT: ;;#ASMSTART
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[ENDBB]]:
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb0:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch

bb2:
; 32 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile i32 %cnd, i32 addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN-DAG: v_cmp_eq_f32_e64 [[UNMASKED:s\[[0-9]+:[0-9]+\]]], [[CND]], 0
; GCN-DAG: s_and_b64 vcc, exec, [[UNMASKED]]
; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[LONGBB]]:
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64

; GCN: [[ENDBB]]:
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
bb0:
  %cmp = fcmp oeq float %cnd, 0.0
  br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile float %cnd, float addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}min_long_forward_vbranch:

; GCN: buffer_load_dword
; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc

; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64

; GCN: s_or_b64 exec, exec, [[SAVE]]
; GCN: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = zext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid.ext
  %load = load volatile i32, i32 addrspace(1)* %gep
  %cmp = icmp eq i32 %load, 0
  br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile i32 %load, i32 addrspace(1)* %gep
  ret void
}

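; For out-of-range backward branches the same getpc/setpc sequence is used, but
; the (negative) offset is applied with s_sub_u32/s_subb_u32; the "+4" again
; accounts for s_getpc_b64 returning the address of the next instruction.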
; GCN-LABEL: {{^}}long_backward_sbranch:
; GCN: s_mov_b32 [[LOOPIDX:s[0-9]+]], 0{{$}}

; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_add_i32 [[INC:s[0-9]+]], [[LOOPIDX]], 1
; GCN-NEXT: s_cmp_lt_i32 [[INC]], 10

; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: s_cbranch_scc0 [[ENDBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONG_JUMP]]+4)-[[LOOPBB]]
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[ENDBB]]:
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
bb:
  br label %bb2

bb2:
  %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
   ; 24 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  %inc = add nsw i32 %loop.idx, 1 ; add cost 4
  %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
  br i1 %cmp, label %bb2, label %bb3 ; -

bb3:
  ret void
}

; Requires expansion of the unconditional branch from %bb2 to %bb4 (and
; expansion of the conditional branch from %bb to %bb3).
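; The unconditional case has no condition to invert, so the s_branch is simply
; replaced by the getpc/add/setpc sequence; relaxing it grows %bb2, which pushes
; the earlier conditional branch out of range as well, so both end up expanded,
; as the checks below expect.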

; GCN-LABEL: {{^}}uniform_unconditional_min_long_forward_branch:
; GCN: s_cmp_eq_u32
; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP0:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]_[0-9]+]]-([[LONG_JUMP0]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[BB2]]: ; %bb2
; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
; GCN: buffer_store_dword [[BB2_K]]

; GCN-NEXT: [[LONG_JUMP1:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB4:BB[0-9]_[0-9]+]]-([[LONG_JUMP1]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN: [[BB3]]: ; %bb3
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND

; GCN-NEXT: [[BB4]]: ; %bb4
; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63
; GCN: buffer_store_dword [[BB4_K]]
; GCN-NEXT: s_endpgm
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
bb0:
  %tmp = icmp ne i32 %arg1, 0
  br i1 %tmp, label %bb2, label %bb3

bb2:
  store volatile i32 17, i32 addrspace(1)* undef
  br label %bb4

bb3:
  ; 32 byte asm
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb4

bb4:
  store volatile i32 63, i32 addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}uniform_unconditional_min_long_backward_branch:
; GCN-NEXT: ; %bb.0: ; %entry

; GCN-NEXT: [[LOOP:BB[0-9]_[0-9]+]]: ; %loop
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP]]
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
entry:
  br label %loop

loop:
  ; 32 byte asm
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %loop
}

; Expansion of the branch from %bb1 to %bb3 introduces the need to expand the
; branch from %bb0 to %bb2.

; GCN-LABEL: {{^}}expand_requires_expand:
; GCN-NEXT: ; %bb.0: ; %bb0
; GCN: s_load_dword
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 0{{$}}
; GCN-NEXT: s_cbranch_scc0 [[BB1:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONGBB0:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB0]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[BB1]]: ; %bb1
; GCN-NEXT: s_load_dword
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s{{[0-9]+}}, 3{{$}}
; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]_[0-9]+]]

; GCN-NEXT: [[LONGBB1:BB[0-9]+_[0-9]+]]: ; %bb1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]+_[0-9]+]]-([[LONGBB1]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[BB2]]: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[BB3]]: ; %bb3
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
bb0:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %cmp0 = icmp slt i32 %cond0, 0
  br i1 %cmp0, label %bb2, label %bb1

bb1:
  %val = load volatile i32, i32 addrspace(4)* undef
  %cmp1 = icmp eq i32 %val, 3
  br i1 %cmp1, label %bb3, label %bb2

bb2:
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
; These NOPs prevent tail-duplication-based outlining
; from firing, which defeats the need to expand the branches and this test.
  call void asm sideeffect
   "v_nop_e64", ""() #0
  call void asm sideeffect
   "v_nop_e64", ""() #0
  ret void
}

; Requires expansion of the required skip branch.
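; When the divergent "if" region is long, a skip branch over it is inserted
; (taken when no lanes remain active); here that skip branch itself is out of
; range and is expanded into the getpc/setpc sequence emitted from %entry.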

; GCN-LABEL: {{^}}uniform_inside_divergent:
; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %entry
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[IF]]: ; %if
; GCN: buffer_store_dword
; GCN: s_cmp_lg_u32
; GCN: s_cbranch_scc1 [[ENDIF]]

; GCN-NEXT: ; %bb.2: ; %if_uniform
; GCN: buffer_store_dword

; GCN-NEXT: [[ENDIF]]: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN-NEXT: s_sleep 5
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %d_cmp = icmp ult i32 %tid, 16
  br i1 %d_cmp, label %if, label %endif

if:
  store i32 0, i32 addrspace(1)* %out
  %u_cmp = icmp eq i32 %cond, 0
  br i1 %u_cmp, label %if_uniform, label %endif

if_uniform:
  store i32 1, i32 addrspace(1)* %out
  br label %endif

endif:
  ; layout can remove the split branch if it can copy the return block.
  ; This call makes the return block long enough that it doesn't get copied.
  call void @llvm.amdgcn.s.sleep(i32 5);
  ret void
}

; si_mask_branch
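; The "mask branch" comments in the checked output come from the SI_MASK_BRANCH
; pseudo, which skips over a divergent region when exec is zero; this test
; exercises branch relaxation around blocks containing it.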

; GCN-LABEL: {{^}}analyze_mask_branch:
; GCN: v_cmp_nlt_f32_e32 vcc
; GCN-NEXT: s_and_saveexec_b64 [[TEMP_MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_xor_b64  [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[TEMP_MASK]]
; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]

; GCN: [[FLOW]]: ; %Flow
; GCN-NEXT: s_or_saveexec_b64 [[TEMP_MASK1:s\[[0-9]+:[0-9]+\]]], [[MASK]]
; GCN-NEXT: s_xor_b64 exec, exec, [[TEMP_MASK1]]
; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]

; GCN: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop
; GCN: ;;#ASMSTART
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND
; GCN: s_cbranch_vccz [[RET]]

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP_BODY]]
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @analyze_mask_branch() #0 {
entry:
  %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
  %cmp0 = fcmp ogt float %reg, 0.000000e+00
  br i1 %cmp0, label %loop, label %ret

loop:
  %phi = phi float [ 0.000000e+00, %loop_body ], [ 1.000000e+00, %entry ]
  call void asm sideeffect
    "v_nop_e64
     v_nop_e64", ""() #0
  %cmp1 = fcmp olt float %phi, 8.0
  br i1 %cmp1, label %loop_body, label %ret

loop_body:
  call void asm sideeffect
  "v_nop_e64
   v_nop_e64
   v_nop_e64
   v_nop_e64", ""() #0
  br label %loop

ret:
  store volatile i32 7, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}long_branch_hang:
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
; GCN: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:

; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
; GCN: s_setpc_b64

; GCN-NEXT: [[LONG_BR_0]]:
; GCN-DAG: v_cmp_lt_i32
; GCN-DAG: v_cmp_gt_i32
; GCN: s_cbranch_vccnz

; GCN: s_setpc_b64
; GCN: s_setpc_b64

; GCN: [[LONG_BR_DEST0]]
; GCN: s_cbranch_vccz
; GCN: s_setpc_b64

; GCN: s_endpgm
define amdgpu_kernel void @long_branch_hang(i32 addrspace(1)* nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 {
bb:
  %tmp = icmp slt i32 %arg2, 9
  %tmp6 = icmp eq i32 %arg1, 0
  %tmp7 = icmp sgt i32 %arg4, 0
  %tmp8 = icmp sgt i32 %arg4, 5
  br i1 %tmp8, label %bb9, label %bb13

bb9:                                              ; preds = %bb
  %tmp10 = and i1 %tmp7, %tmp
  %tmp11 = icmp slt i32 %arg3, %arg4
  %tmp12 = or i1 %tmp11, %tmp7
  br i1 %tmp12, label %bb19, label %bb14

bb13:                                             ; preds = %bb
  call void asm sideeffect
  "v_nop_e64
   v_nop_e64
   v_nop_e64
   v_nop_e64", ""() #0
  br i1 %tmp6, label %bb19, label %bb14

bb14:                                             ; preds = %bb13, %bb9
  %tmp15 = icmp slt i32 %arg3, %arg4
  %tmp16 = or i1 %tmp15, %tmp
  %tmp17 = and i1 %tmp6, %tmp16
  %tmp18 = zext i1 %tmp17 to i32
  br label %bb19

bb19:                                             ; preds = %bb14, %bb13, %bb9
  %tmp20 = phi i32 [ undef, %bb9 ], [ undef, %bb13 ], [ %tmp18, %bb14 ]
  %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %arg5
  store i32 %tmp20, i32 addrspace(1)* %tmp21, align 4
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }