; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; FIXME: This leaves behind a now-unnecessary 'and' with exec

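; The simple triangle should be if-converted: the fadd is speculated and v_cndmask_b32 selects the result.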
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

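; Diamond: both the fadd and the fmul should be speculated, with v_cndmask_b32 selecting between the two results.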
; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %else

if:
  %u0 = fadd float %v, %v
  br label %endif

else:
  %u1 = fmul float %v, %v
  br label %endif

endif:
  %r = phi float [ %u0, %if ], [ %u1, %else ]
  store float %r, float addrspace(1)* %out
  ret void
}

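; The inline asm and the carry-out of v_add_i32 clobber vcc, so the compare result is kept in an SGPR pair and moved back into vcc before the select.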
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
; GCN: ; clobber vcc
; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
; GCN: s_mov_b64 vcc, [[CMP]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
entry:
  %v = load i32, i32 addrspace(1)* %in
  %cc = fcmp oeq float %k, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  call void asm "; clobber $0", "~{VCC}"() #0
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Longest chain of cheap instructions to convert
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u.0 = fmul float %v, %v
  %u.1 = fmul float %v, %u.0
  %u.2 = fmul float %v, %u.1
  %u.3 = fmul float %v, %u.2
  %u.4 = fmul float %v, %u.3
  %u.5 = fmul float %v, %u.4
  %u.6 = fmul float %v, %u.5
  %u.7 = fmul float %v, %u.6
  %u.8 = fmul float %v, %u.7
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u.8, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Shortest chain of cheap instructions that should not be converted
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u.0 = fmul float %v, %v
  %u.1 = fmul float %v, %u.0
  %u.2 = fmul float %v, %u.1
  %u.3 = fmul float %v, %u.2
  %u.4 = fmul float %v, %u.3
  %u.5 = fmul float %v, %u.4
  %u.6 = fmul float %v, %u.5
  %u.7 = fmul float %v, %u.6
  %u.8 = fmul float %v, %u.7
  %u.9 = fmul float %v, %u.8
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u.9, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Should still branch over fdiv expansion
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
; GCN: v_cmp_neq_f32_e32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_div_scale_f32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fdiv float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; vcc branch with SGPR inputs
; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
; GCN: v_cmp_neq_f32_e64
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: s_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 {
entry:
  %v = load i32, i32 addrspace(4)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

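; Value loaded from the constant address space; should still be if-converted to a v_cndmask_b32.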
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 {
entry:
  %v = load float, float addrspace(4)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Due to a broken cost heuristic, this is not if-converted like
; test_vccnz_ifcvt_triangle_constant_load even though it should be.

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
entry:
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Scalar branch and scalar inputs
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load i32, i32 addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
  ret void
}

; FIXME: Should be able to use VALU compare and select
; Scalar branch but VGPR select operands
; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
; GCN: s_cmp_lg_u32
; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_f32_e32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
; GCN: s_add_u32
; GCN: s_addc_u32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load i64, i64 addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add i64 %v, %v
  br label %endif

endif:
  %r = phi i64 [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
  ret void
}

; TODO: Can do s_cselect_b64; s_cselect_b32
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load <3 x i32>, <3 x i32> addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add <3 x i32> %v, %v
  br label %endif

endif:
  %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
  %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
  ret void
}

; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load <4 x i32>, <4 x i32> addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add <4 x i32> %v, %v
  br label %endif

endif:
  %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
  ret void
}

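; Uniform branch with swapped targets and constant phi inputs; should fold to an s_cmp plus a constant s_cselect_b32.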
; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
entry:
  %cmp0 = icmp eq i32 %cond, 0
  br i1 %cmp0, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %value = phi i32 [0, %if], [1, %else]
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

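; Branch on an undef condition; should still fold to a constant s_cselect_b32 on scc.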
; GCN-LABEL: {{^}}ifcvt_undef_scc:
; GCN: {{^}}; %bb.0:
; GCN-NEXT: s_load_dwordx2
; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
entry:
  br i1 undef, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %value = phi i32 [0, %if], [1, %else]
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

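; 256-bit phi: too many VALU adds to speculate, so the branch should remain.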
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_i32
; GCN: v_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
entry:
  %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add <8 x i32> %v, %v
  br label %endif

endif:
  %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
  store <8 x i32> %r, <8 x i32> addrspace(1)* %out
  ret void
}

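; 512-bit phi: likewise too expensive to if-convert.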
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_i32
; GCN: v_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
entry:
  %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add <16 x i32> %v, %v
  br label %endif

endif:
  %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
  store <16 x i32> %r, <16 x i32> addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }