; AMDGPU codegen tests: uniform (scalar) control flow should select SALU
; compares/branches (s_cmp + s_cbranch_scc*) instead of divergent exec-mask
; control flow; divergent branches must still use exec masking.
      1 ; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
      2 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
      3 
      4 ; SI-LABEL: {{^}}uniform_if_scc:
      5 ; SI-DAG: s_cmp_eq_i32 s{{[0-9]+}}, 0
      6 ; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
      7 ; SI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
      8 
      9 ; Fall-through to the else
     10 ; SI: v_mov_b32_e32 [[STORE_VAL]], 1
     11 
     12 ; SI: [[IF_LABEL]]:
     13 ; SI: buffer_store_dword [[STORE_VAL]]
; A uniform integer compare feeding a branch should select to a scalar
; s_cmp + scc-based branch, not a divergent vcc/exec-mask sequence.
     14 define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
     15 entry:
     16   %cmp0 = icmp eq i32 %cond, 0
     17   br i1 %cmp0, label %if, label %else
     18 
     19 if:
     20   br label %done
     21 
     22 else:
     23   br label %done
     24 
     25 done:
     26   %value = phi i32 [0, %if], [1, %else]    ; 0 on the if path, 1 on the else path
     27   store i32 %value, i32 addrspace(1)* %out
     28   ret void
     29 }
     30 
     31 ; SI-LABEL: {{^}}uniform_if_vcc:
     32 ; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
     33 ; also scheduled the write first.
     34 ; SI-DAG: v_cmp_eq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
     35 ; SI-DAG: s_and_b64 vcc, exec, [[COND]]
     36 ; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
     37 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
     38 
     39 ; Fall-through to the else
     40 ; SI: v_mov_b32_e32 [[STORE_VAL]], 1
     41 
     42 ; SI: [[IF_LABEL]]:
     43 ; SI: buffer_store_dword [[STORE_VAL]]
; A uniform float compare has no SALU form, so it uses a VALU compare into
; vcc, but the branch itself stays uniform (s_and_b64 with exec + vccnz
; branch) instead of exec-mask divergent control flow.
     44 define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
     45 entry:
     46   %cmp0 = fcmp oeq float %cond, 0.0
     47   br i1 %cmp0, label %if, label %else
     48 
     49 if:
     50   br label %done
     51 
     52 else:
     53   br label %done
     54 
     55 done:
     56   %value = phi i32 [0, %if], [1, %else]    ; 0 on the if path, 1 on the else path
     57   store i32 %value, i32 addrspace(1)* %out
     58   ret void
     59 }
     60 
     61 ; SI-LABEL: {{^}}uniform_if_swap_br_targets_scc:
     62 ; SI-DAG: s_cmp_lg_i32 s{{[0-9]+}}, 0
     63 ; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
     64 ; SI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
     65 
     66 ; Fall-through to the else
     67 ; SI: v_mov_b32_e32 [[STORE_VAL]], 1
     68 
     69 ; SI: [[IF_LABEL]]:
     70 ; SI: buffer_store_dword [[STORE_VAL]]
; Same as uniform_if_scc but with the branch targets swapped in the IR:
; the backend should invert the compare (eq -> lg) to keep the layout.
     71 define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
     72 entry:
     73   %cmp0 = icmp eq i32 %cond, 0
     74   br i1 %cmp0, label %else, label %if    ; note: taken/fallthrough targets swapped
     75 
     76 if:
     77   br label %done
     78 
     79 else:
     80   br label %done
     81 
     82 done:
     83   %value = phi i32 [0, %if], [1, %else]
     84   store i32 %value, i32 addrspace(1)* %out
     85   ret void
     86 }
     87 
     88 ; SI-LABEL: {{^}}uniform_if_swap_br_targets_vcc:
     89 ; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
     90 ; also scheduled the write first.
     91 ; SI-DAG: v_cmp_neq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
     92 ; SI-DAG: s_and_b64 vcc, exec, [[COND]]
     93 ; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
     94 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
     95 
     96 ; Fall-through to the else
     97 ; SI: v_mov_b32_e32 [[STORE_VAL]], 1
     98 
     99 ; SI: [[IF_LABEL]]:
    100 ; SI: buffer_store_dword [[STORE_VAL]]
; Same as uniform_if_vcc but with swapped branch targets: the float compare
; should be inverted (eq -> neq) while the branch remains uniform.
    101 define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
    102 entry:
    103   %cmp0 = fcmp oeq float %cond, 0.0
    104   br i1 %cmp0, label %else, label %if    ; note: taken/fallthrough targets swapped
    105 
    106 if:
    107   br label %done
    108 
    109 else:
    110   br label %done
    111 
    112 done:
    113   %value = phi i32 [0, %if], [1, %else]
    114   store i32 %value, i32 addrspace(1)* %out
    115   ret void
    116 }
    117 
    118 ; SI-LABEL: {{^}}uniform_if_move_valu:
    119 ; SI: v_add_f32_e32 [[CMP:v[0-9]+]]
    120 ; Using a floating-point value in an integer compare will cause the compare to
    121 ; be selected for the SALU and then later moved to the VALU.
    122 ; SI: v_cmp_ne_i32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]]
    123 ; SI: s_and_b64 vcc, exec, [[COND]]
    124 ; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
    125 ; SI: buffer_store_dword
    126 ; SI: [[ENDIF_LABEL]]:
    127 ; SI: s_endpgm
; The bitcast of a VALU-produced float into the integer compare forces the
; compare off the SALU; the branch must remain correct after that move.
    128 define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
    129 entry:
    130   %a.0 = fadd float %a, 10.0
    131   %cond = bitcast float %a.0 to i32    ; float result reinterpreted for an integer compare
    132   %cmp = icmp eq i32 %cond, 5
    133   br i1 %cmp, label %if, label %endif
    134 
    135 if:
    136   store i32 0, i32 addrspace(1)* %out
    137   br label %endif
    138 
    139 endif:
    140   ret void
    141 }
    142 
    143 ; SI-LABEL: {{^}}uniform_if_move_valu_commute:
    144 ; SI: v_add_f32_e32 [[CMP:v[0-9]+]]
    145 ; Using a floating-point value in an integer compare will cause the compare to
    146 ; be selected for the SALU and then later moved to the VALU.
    147 ; SI: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]]
    148 ; SI: s_and_b64 vcc, exec, [[COND]]
    149 ; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
    150 ; SI: buffer_store_dword
    151 ; SI: [[ENDIF_LABEL]]:
    152 ; SI: s_endpgm
; Variant of uniform_if_move_valu with an ordering compare (ugt): moving the
; compare to the VALU requires commuting its operands (ugt 5 becomes 6 > x
; inverted form in the checks above).
    153 define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
    154 entry:
    155   %a.0 = fadd float %a, 10.0
    156   %cond = bitcast float %a.0 to i32    ; float result reinterpreted for an integer compare
    157   %cmp = icmp ugt i32 %cond, 5
    158   br i1 %cmp, label %if, label %endif
    159 
    160 if:
    161   store i32 0, i32 addrspace(1)* %out
    162   br label %endif
    163 
    164 endif:
    165   ret void
    166 }
    167 
    168 
    169 ; SI-LABEL: {{^}}uniform_if_else_ret:
    170 ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
    171 ; SI-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
    172 
    173 ; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
    174 ; SI: buffer_store_dword [[TWO]]
    175 ; SI: s_endpgm
    176 
    177 ; SI: {{^}}[[IF_LABEL]]:
    178 ; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
    179 ; SI: buffer_store_dword [[ONE]]
    180 ; SI: s_endpgm
; Uniform if/else where both arms return: each arm should end in its own
; s_endpgm with no join block needed.
    181 define void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
    182 entry:
    183   %cmp = icmp eq i32 %a, 0
    184   br i1 %cmp, label %if.then, label %if.else
    185 
    186 if.then:                                          ; preds = %entry
    187   store i32 1, i32 addrspace(1)* %out
    188   br label %if.end
    189 
    190 if.else:                                          ; preds = %entry
    191   store i32 2, i32 addrspace(1)* %out
    192   br label %if.end
    193 
    194 if.end:                                           ; preds = %if.else, %if.then
    195   ret void
    196 }
    197 
    198 ; SI-LABEL: {{^}}uniform_if_else:
    199 ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
    200 ; SI-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
    201 
    202 ; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
    203 ; SI: buffer_store_dword [[TWO]]
    204 ; SI: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]
    205 
    206 ; SI: [[IF_LABEL]]:
    207 ; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
    208 ; SI: buffer_store_dword [[ONE]]
    209 
    210 ; SI: [[ENDIF_LABEL]]:
    211 ; SI: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
    212 ; SI: buffer_store_dword [[THREE]]
    213 ; SI: s_endpgm
; Uniform if/else with code after the join: expects scc branch into the arms
; and an unconditional s_branch from the fallthrough arm to the join block.
    214 define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
    215 entry:
    216   %cmp = icmp eq i32 %a, 0
    217   br i1 %cmp, label %if.then, label %if.else
    218 
    219 if.then:                                          ; preds = %entry
    220   store i32 1, i32 addrspace(1)* %out0
    221   br label %if.end
    222 
    223 if.else:                                          ; preds = %entry
    224   store i32 2, i32 addrspace(1)* %out0
    225   br label %if.end
    226 
    227 if.end:                                           ; preds = %if.else, %if.then
    228   store i32 3, i32 addrspace(1)* %out1    ; join-block store reached from both arms
    229   ret void
    230 }
    231 
    232 ; SI-LABEL: {{^}}icmp_2_users:
    233 ; SI: s_cmp_lt_i32 s{{[0-9]+}}, 1
    234 ; SI: s_cbranch_scc1 [[LABEL:[a-zA-Z0-9_]+]]
    235 ; SI: buffer_store_dword
    236 ; SI: [[LABEL]]:
    237 ; SI: s_endpgm
; The compare result has two users (the branch and a sext stored in the if
; block); the branch should still lower to a scalar scc branch.
    238 define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
    239 main_body:
    240   %0 = icmp sgt i32 %cond, 0
    241   %1 = sext i1 %0 to i32    ; second user of the compare result
    242   br i1 %0, label %IF, label %ENDIF
    243 
    244 IF:
    245   store i32 %1, i32 addrspace(1)* %out
    246   br label %ENDIF
    247 
    248 ENDIF:                                            ; preds = %IF, %main_body
    249   ret void
    250 }
    251 
    252 ; SI-LABEL: {{^}}icmp_users_different_blocks:
    253 ; SI: s_load_dword [[COND:s[0-9]+]]
    254 ; SI: s_cmp_lt_i32 [[COND]], 1
    255 ; SI: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
    256 ; SI: v_cmp_lt_i32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0, [[COND]]
    257 ; SI: s_and_b64 vcc, exec, [[MASK]]
    258 ; SI: s_cbranch_vccnz [[EXIT]]
    259 ; SI: buffer_store
    260 ; SI: {{^}}[[EXIT]]:
    261 ; SI: s_endpgm
; Two uniform compares used by branches in different blocks; %cmp1 also feeds
; a sext, so its boolean value must survive past scc into a register mask.
    262 define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
    263 bb:
    264   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
    265   %cmp0 = icmp sgt i32 %cond0, 0
    266   %cmp1 = icmp sgt i32 %cond1, 0
    267   br i1 %cmp0, label %bb2, label %bb9
    268 
    269 bb2:                                              ; preds = %bb
    270   %tmp2 = sext i1 %cmp1 to i32
    271   %tmp3 = add i32 %tmp2, %tmp
    272   br i1 %cmp1, label %bb9, label %bb7
    273 
    274 bb7:                                              ; preds = %bb2
    275   store i32 %tmp3, i32 addrspace(1)* %out
    276   br label %bb9
    277 
    278 bb9:                                              ; preds = %bb, %bb2, %bb7
    279   ret void
    280 }
    281 
    282 ; SI-LABEL: {{^}}uniform_loop:
    283 ; SI: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]:
    284 ; FIXME: We need to teach SIFixSGPRCopies about uniform branches so we
    285 ;        get s_add_i32 here.
    286 ; SI: v_add_i32_e32 [[I:v[0-9]+]], vcc, -1, v{{[0-9]+}}
    287 ; SI: v_cmp_ne_i32_e32 vcc, 0, [[I]]
    288 ; SI: s_and_b64 vcc, exec, vcc
    289 ; SI: s_cbranch_vccnz [[LOOP_LABEL]]
    290 ; SI: s_endpgm
; A counted loop with a uniform trip count: the backedge should be a simple
; uniform conditional branch, not exec-masked loop control flow.
    291 define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
    292 entry:
    293   br label %loop
    294 
    295 loop:
    296   %i = phi i32 [0, %entry], [%i.i, %loop]
    297   %i.i = add i32 %i, 1
    298   %cmp = icmp eq i32 %a, %i.i    ; exit once the counter reaches %a
    299   br i1 %cmp, label %done, label %loop
    300 
    301 done:
    302   ret void
    303 }
    304 
    305 ; Test uniform and divergent.
    306 
    307 ; SI-LABEL: {{^}}uniform_inside_divergent:
    308 ; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
    309 ; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
    310 ; SI: s_xor_b64  [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
    311 ; SI: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
    312 ; SI: s_cmp_lg_i32 {{s[0-9]+}}, 0
    313 ; SI: s_cbranch_scc1 [[ENDIF_LABEL]]
    314 ; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
    315 ; SI: buffer_store_dword [[ONE]]
; A uniform branch nested inside a divergent one: the outer branch uses
; exec masking (saveexec/xor), the inner one still uses a scalar scc branch.
    316 define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
    317 entry:
    318   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
    319   %d_cmp = icmp ult i32 %tid, 16    ; divergent: depends on the work-item id
    320   br i1 %d_cmp, label %if, label %endif
    321 
    322 if:
    323   store i32 0, i32 addrspace(1)* %out
    324   %u_cmp = icmp eq i32 %cond, 0    ; uniform: depends only on a kernel argument
    325   br i1 %u_cmp, label %if_uniform, label %endif
    326 
    327 if_uniform:
    328   store i32 1, i32 addrspace(1)* %out
    329   br label %endif
    330 
    331 endif:
    332   ret void
    333 }
    334 
    335 ; SI-LABEL: {{^}}divergent_inside_uniform:
    336 ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
    337 ; SI: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]]
    338 ; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
    339 ; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
    340 ; SI: s_xor_b64  [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
    341 ; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
    342 ; SI: buffer_store_dword [[ONE]]
    343 ; SI: [[ENDIF_LABEL]]:
    344 ; SI: s_endpgm
; The mirror of uniform_inside_divergent: the outer branch is uniform (scc),
; and only the nested divergent branch needs exec masking.
    345 define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
    346 entry:
    347   %u_cmp = icmp eq i32 %cond, 0    ; uniform: depends only on a kernel argument
    348   br i1 %u_cmp, label %if, label %endif
    349 
    350 if:
    351   store i32 0, i32 addrspace(1)* %out
    352   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
    353   %d_cmp = icmp ult i32 %tid, 16    ; divergent: depends on the work-item id
    354   br i1 %d_cmp, label %if_uniform, label %endif
    355 
    356 if_uniform:
    357   store i32 1, i32 addrspace(1)* %out
    358   br label %endif
    359 
    360 endif:
    361   ret void
    362 }
    363 
    364 ; SI-LABEL: {{^}}divergent_if_uniform_if:
    365 ; SI: v_cmp_eq_i32_e32 vcc, 0, v0
    366 ; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
    367 ; SI: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
    368 ; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
    369 ; SI: buffer_store_dword [[ONE]]
    370 ; SI: s_or_b64 exec, exec, [[MASK]]
    371 ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
    372 ; SI: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]]
    373 ; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
    374 ; SI: buffer_store_dword [[TWO]]
    375 ; SI: [[EXIT]]:
    376 ; SI: s_endpgm
; A divergent if followed (not nested) by a uniform if: exec must be fully
; restored (s_or_b64) before the scalar compare/branch of the second if.
    377 define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
    378 entry:
    379   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
    380   %d_cmp = icmp eq i32 %tid, 0    ; divergent: depends on the work-item id
    381   br i1 %d_cmp, label %if, label %endif
    382 
    383 if:
    384   store i32 1, i32 addrspace(1)* %out
    385   br label %endif
    386 
    387 endif:
    388   %u_cmp = icmp eq i32 %cond, 0    ; uniform: depends only on a kernel argument
    389   br i1 %u_cmp, label %if_uniform, label %exit
    390 
    391 if_uniform:
    392   store i32 2, i32 addrspace(1)* %out
    393   br label %exit
    394 
    395 exit:
    396   ret void
    397 }
    398 
    399 ; The condition of the branches in the two blocks are
    400 ; uniform. MachineCSE replaces the 2nd condition with the inverse of
    401 ; the first, leaving an scc use in a different block than it was
    402 ; defed.
    403 
    404 ; SI-LABEL: {{^}}cse_uniform_condition_different_blocks:
    405 ; SI: s_load_dword [[COND:s[0-9]+]]
    406 ; SI: s_cmp_lt_i32 [[COND]], 1
    407 ; SI: s_cbranch_scc1 BB[[FNNUM:[0-9]+]]_3
    408 
    409 ; SI: BB#1:
    410 ; SI-NOT: cmp
    411 ; SI: buffer_load_dword
    412 ; SI: buffer_store_dword
    413 ; SI: s_cbranch_scc1 BB[[FNNUM]]_3
    414 
    415 ; SI: BB[[FNNUM]]_3:
    416 ; SI: s_endpgm
; %tmp1 (sgt 0) and %tmp9 (sle 0) are inverses, so MachineCSE keeps one
; compare and the second branch reuses the cross-block scc value (no second
; "cmp" may appear between the branches; see SI-NOT above).
    417 define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
    418 bb:
    419   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
    420   %tmp1 = icmp sgt i32 %cond, 0
    421   br i1 %tmp1, label %bb2, label %bb9
    422 
    423 bb2:                                              ; preds = %bb
    424   %tmp3 = load volatile i32, i32 addrspace(1)* undef
    425   store volatile i32 0, i32 addrspace(1)* undef
    426   %tmp9 = icmp sle i32 %cond, 0    ; inverse of %tmp1; CSE'd into the first compare
    427   br i1 %tmp9, label %bb9, label %bb7
    428 
    429 bb7:                                              ; preds = %bb2
    430   store i32 %tmp3, i32 addrspace(1)* %out
    431   br label %bb9
    432 
    433 bb9:                                              ; preds = %bb, %bb2, %bb7
    434   ret void
    435 }
    436 
    437 declare i32 @llvm.amdgcn.workitem.id.x() #0
    438 
    439 attributes #0 = { readnone }
    440