; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; SI-LABEL: {{^}}test_if:
; Make sure the i1 values created by the CFG structurizer pass are
; moved using VALU instructions.
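; (A divergent i1 must live in a VGPR, since each lane may hold a different
; value; copying it with a scalar s_mov_b64 would be incorrect, hence the
; checks below for a v_mov and the absence of an s_mov.)
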
; The waitcnt should be inserted after the exec modification.
; SI: v_cmp_lt_i32_e32 vcc, 0,
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
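; (In this pattern s_and_saveexec_b64 saves the old exec mask in SAVE1 and
; ands exec with vcc; the s_xor_b64 then derives the complementary lane
; mask for the else path in SAVE2.)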

; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
; SI: s_and_saveexec_b64
; SI-NEXT: ; mask branch

; The v_mov should come after the exec modification.
; SI: [[FLOW_BB]]:
; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
; SI-NEXT: ; mask branch
;
define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  switch i32 %tid, label %default [
    i32 0, label %case0
    i32 1, label %case1
  ]

case0:
  %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 13, i32 addrspace(1)* %arrayidx1, align 4
  br label %end

case1:
  %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 17, i32 addrspace(1)* %arrayidx5, align 4
  br label %end

default:
  %cmp8 = icmp eq i32 %tid, 2
  %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  br i1 %cmp8, label %if, label %else

if:
  store i32 19, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

else:
  store i32 21, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

end:
  ret void
}

; SI-LABEL: {{^}}simple_test_v_if:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]

; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  br label %exit

exit:
  ret void
}

; FIXME: It would be better to emit s_endpgm in the then block.
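; (Both paths only return, so terminating the wavefront early in the then
; block would be safe; currently it branches to the unified exit instead.)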

; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]

; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  ret void

exit:
  ret void
}

; The final block has more than a ret to execute. This was miscompiled
; before function exit blocks were unified, since the s_endpgm in the then
; block would terminate the wavefront before the exit block's store was
; reached.
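; (s_endpgm ends the program for the whole wavefront, so an early endpgm in
; the then block would have skipped the ds_write for the lanes that took
; the exit path.)
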
; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]

; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
; SI: ds_write_b32

; SI-NEXT: {{^}}[[FLOW]]:
; SI-NEXT: s_or_saveexec_b64
; SI-NEXT: s_xor_b64 exec, exec
; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]

; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
; SI: s_waitcnt
; SI-NEXT: buffer_store_dword

; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  ret void

exit:
  store volatile i32 7, i32 addrspace(3)* undef
  ret void
}

; SI-LABEL: {{^}}simple_test_v_loop:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]

; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}

; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword
; SI-DAG: buffer_store_dword
; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100
; SI: s_cbranch_vccz [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
; SI: s_endpgm

define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  %limit = add i32 %tid, 64
  br i1 %is.0, label %loop, label %exit

loop:
  %i = phi i32 [%tid, %entry], [%i.inc, %loop]
  %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
  %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
  %load = load i32, i32 addrspace(1)* %src
  store i32 %load, i32 addrspace(1)* %gep.dst
  %i.inc = add nsw i32 %i, 1
  %cmp = icmp eq i32 %limit, %i.inc
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}

; SI-LABEL: {{^}}multi_vcond_loop:

; Load the loop limit from the buffer.
; Branch to exit if the loop is uniformly not taken.
; SI: ; %bb.0:
; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
; SI: v_cmp_lt_i32_e32 vcc
; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]

; Initialize the inner condition to false.
; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader
; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
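; (COND_STATE acts as the loop's break mask: each iteration ORs in the
; lanes whose exit condition has become true, and the s_andn2_b64 below
; removes them from exec until no active lanes remain.)
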
; Clear the exec bit for any workitem that loads a -1.
; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword [[B:v[0-9]+]]
; SI: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
; SI: s_xor_b64 [[ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG2]]
; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]

; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
; SI: buffer_store_dword
; SI: v_cmp_ge_i64_e{{32|64}} [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
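
; The flow block restores the exec mask, folds the break condition into
; COND_STATE, and keeps looping while s_andn2_b64 leaves any lane active.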
; SI: [[LABEL_FLOW]]:
; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
; SI-NEXT: s_or_b64 exec, exec, [[ORNEG3]]
; SI-NEXT: s_mov_b64 [[MOVED_TMP:s\[[0-9]+:[0-9]+\]]], [[TMP]]
; SI-NEXT: s_and_b64 [[MASKED_ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG3]]
; SI-NEXT: s_or_b64 [[COND_STATE]], [[MASKED_ORNEG3]], [[MOVED_TMP]]
; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]

; SI: [[LABEL_EXIT]]:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm

define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tmp4 = sext i32 %tmp to i64
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
  %tmp7 = icmp sgt i32 %tmp6, 0
  %tmp8 = sext i32 %tmp6 to i64
  br i1 %tmp7, label %bb10, label %bb26

bb10:                                             ; preds = %bb, %bb20
  %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
  %tmp12 = add nsw i64 %tmp11, %tmp4
  %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
  %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
  %tmp17 = icmp ne i32 %tmp14, -1
  %tmp18 = icmp ne i32 %tmp16, -1
  %tmp19 = and i1 %tmp17, %tmp18
  br i1 %tmp19, label %bb20, label %bb26

bb20:                                             ; preds = %bb10
  %tmp21 = add nsw i32 %tmp16, %tmp14
  %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
  store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
  %tmp23 = add nuw nsw i64 %tmp11, 1
  %tmp24 = icmp slt i64 %tmp23, %tmp8
  br i1 %tmp24, label %bb10, label %bb26

bb26:                                             ; preds = %bb10, %bb20, %bb
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }