; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; NOTE: the XUN line above is a deliberately disabled RUN line (kept for
; reference; change XUN back to RUN to re-enable the tonga run).

; FIXME: This leaves behind a now unnecessary and with exec

; Simple triangle CFG: the if-block result feeds a phi with the incoming
; value; expected to be if-converted into a v_cndmask select on vcc.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Diamond CFG: both sides are speculated (add and mul may be emitted in
; either order, hence GCN-DAG) and the result selected with v_cndmask.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %else

if:
  %u0 = fadd float %v, %v
  br label %endif

else:
  %u1 = fmul float %v, %v
  br label %endif

endif:
  %r = phi float [ %u0, %if ], [ %u1, %else ]
  store float %r, float addrspace(1)* %out
  ret void
}

; The inline asm in the if-block clobbers vcc, so the compare result must be
; kept in an SGPR pair and copied back into vcc before the cndmask.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
; GCN: ; clobber vcc
; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
; GCN: s_mov_b64 vcc, [[CMP]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
entry:
  %v = load i32, i32 addrspace(1)* %in
  %cc = fcmp oeq float %k, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  call void asm "; clobber $0", "~{VCC}"() #0
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Longest chain of cheap instructions to convert
; (9 fmuls: still speculated and selected, no branch expected)
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u.0 = fmul float %v, %v
  %u.1 = fmul float %v, %u.0
  %u.2 = fmul float %v, %u.1
  %u.3 = fmul float %v, %u.2
  %u.4 = fmul float %v, %u.3
  %u.5 = fmul float %v, %u.4
  %u.6 = fmul float %v, %u.5
  %u.7 = fmul float %v, %u.6
  %u.8 = fmul float %v, %u.7
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u.8, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Short chain of cheap instructions to not convert
; (10 fmuls: one more than the case above, so the branch is kept)
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u.0 = fmul float %v, %v
  %u.1 = fmul float %v, %u.0
  %u.2 = fmul float %v, %u.1
  %u.3 = fmul float %v, %u.2
  %u.4 = fmul float %v, %u.3
  %u.5 = fmul float %v, %u.4
  %u.6 = fmul float %v, %u.5
  %u.7 = fmul float %v, %u.6
  %u.8 = fmul float %v, %u.7
  %u.9 = fmul float %v, %u.8
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u.9, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Should still branch over fdiv expansion
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
; GCN: v_cmp_neq_f32_e32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_div_scale_f32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fdiv float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; vcc branch with SGPR inputs
; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
; GCN: v_cmp_neq_f32_e64
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: s_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 {
entry:
  %v = load i32, i32 addrspace(4)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  store i32 %r, i32 addrspace(1)* %out
  ret void

}

; Same triangle but the value comes from a constant-address-space load;
; expected to be if-converted (see the heuristic note on the next test).
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 {
entry:
  %v = load float, float addrspace(4)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Due to broken cost heuristic, this is not if converted like
; test_vccnz_ifcvt_triangle_constant_load even though it should be.

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
entry:
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; Scalar branch and scalar inputs
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load i32, i32 addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
  ret void
}

; FIXME: Should be able to use VALU compare and select
; Scalar branch but VGPR select operands
; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
; GCN: s_cmp_lg_u32
; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_f32_e32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}

; 64-bit scalar select: a single s_cselect_b64 after the compare.
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
; GCN: s_add_u32
; GCN: s_addc_u32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load i64, i64 addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add i64 %v, %v
  br label %endif

endif:
  %r = phi i64 [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
  ret void
}

; TODO: Can do s_cselect_b64; s_cselect_b32
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load <3 x i32>, <3 x i32> addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add <3 x i32> %v, %v
  br label %endif

endif:
  %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
  %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
  ret void
}

; 128-bit scalar select: split into two s_cselect_b64.
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load <4 x i32>, <4 x i32> addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add <4 x i32> %v, %v
  br label %endif

endif:
  %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
  ret void
}

; Uniform branch with constant phi inputs and swapped branch targets;
; folds to an s_cselect of 1/0 after the compare.
; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
entry:
  %cmp0 = icmp eq i32 %cond, 0
  br i1 %cmp0, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %value = phi i32 [0, %if], [1, %else]
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; Branch on an undef condition must still produce well-formed code
; (a select straight after the load, with no compare).
; GCN-LABEL: {{^}}ifcvt_undef_scc:
; GCN: {{^}}; %bb.0:
; GCN-NEXT: s_load_dwordx2
; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
entry:
  br i1 undef, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %value = phi i32 [0, %if], [1, %else]
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; 256-bit (<8 x i32>) result: too expensive to if-convert, branch is kept.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_i32
; GCN: v_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
entry:
  %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add <8 x i32> %v, %v
  br label %endif

endif:
  %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
  store <8 x i32> %r, <8 x i32> addrspace(1)* %out
  ret void
}

; 512-bit (<16 x i32>) result: too expensive to if-convert, branch is kept.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_i32
; GCN: v_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
entry:
  %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add <16 x i32> %v, %v
  br label %endif

endif:
  %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
  store <16 x i32> %r, <16 x i32> addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }