/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "tgsi/tgsi_parse.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
#include "radeon/radeon_elf_util.h"

#include "amd_kernel_code_t.h"
#include "radeon/r600_cs.h"
#include "si_pipe.h"
#include "sid.h"

#define MAX_GLOBAL_BUFFERS 22

struct si_compute {
	unsigned ir_type;
	unsigned local_size;
	unsigned private_size;
	unsigned input_size;
	struct si_shader shader;

	struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
	unsigned use_code_object_v2 : 1;
	unsigned variable_group_size : 1;
};

struct dispatch_packet {
	uint16_t header;
	uint16_t setup;
	uint16_t workgroup_size_x;
	uint16_t workgroup_size_y;
	uint16_t workgroup_size_z;
	uint16_t reserved0;
	uint32_t grid_size_x;
	uint32_t grid_size_y;
	uint32_t grid_size_z;
	uint32_t private_segment_size;
	uint32_t group_segment_size;
	uint64_t kernel_object;
	uint64_t kernarg_address;
	uint64_t reserved2;
};

static const amd_kernel_code_t *si_compute_get_code_object(
	const struct si_compute *program,
	uint64_t symbol_offset)
{
	if (!program->use_code_object_v2) {
		return NULL;
	}
	return (const amd_kernel_code_t*)
		(program->shader.binary.code + symbol_offset);
}

static void code_object_to_config(const amd_kernel_code_t *code_object,
				  struct si_shader_config *out_config)
{
	uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
	uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
	out_config->num_sgprs = code_object->wavefront_sgpr_count;
	out_config->num_vgprs = code_object->workitem_vgpr_count;
	out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
	out_config->rsrc1 = rsrc1;
	out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2));
	out_config->rsrc2 = rsrc2;
	out_config->scratch_bytes_per_wave =
		align(code_object->workitem_private_segment_byte_size * 64, 1024);
}

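/* Create a compute state object. For PIPE_SHADER_IR_TGSI the shader is
 * compiled with si_shader_create(); for native (code object) binaries the
 * ELF is parsed and the config is taken from the amd_kernel_code_t header
 * or read from the binary itself.
 */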
static void *si_create_compute_state(
	struct pipe_context *ctx,
	const struct pipe_compute_state *cso)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_screen *sscreen = (struct si_screen *)ctx->screen;
	struct si_compute *program = CALLOC_STRUCT(si_compute);
	struct si_shader *shader;

	/* Protect against allocation failure. */
	if (!program)
		return NULL;

	shader = &program->shader;

	program->ir_type = cso->ir_type;
	program->local_size = cso->req_local_mem;
	program->private_size = cso->req_private_mem;
	program->input_size = cso->req_input_mem;
	program->use_code_object_v2 = HAVE_LLVM >= 0x0400 &&
					cso->ir_type == PIPE_SHADER_IR_NATIVE;

	if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
		struct si_shader_selector sel;
		bool scratch_enabled;

		memset(&sel, 0, sizeof(sel));

		sel.tokens = tgsi_dup_tokens(cso->prog);
		if (!sel.tokens) {
			FREE(program);
			return NULL;
		}

		tgsi_scan_shader(cso->prog, &sel.info);
		sel.type = PIPE_SHADER_COMPUTE;
		sel.local_size = cso->req_local_mem;

		p_atomic_inc(&sscreen->b.num_shaders_created);

		program->shader.selector = &sel;
		program->shader.is_monolithic = true;

		if (si_shader_create(sscreen, sctx->tm, &program->shader,
				     &sctx->b.debug)) {
			FREE(sel.tokens);
			FREE(program);
			return NULL;
		}

		scratch_enabled = shader->config.scratch_bytes_per_wave > 0;

		shader->config.rsrc1 =
			S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
			S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) |
			S_00B848_DX10_CLAMP(1) |
			S_00B848_FLOAT_MODE(shader->config.float_mode);

		shader->config.rsrc2 = S_00B84C_USER_SGPR(SI_CS_NUM_USER_SGPR) |
			S_00B84C_SCRATCH_EN(scratch_enabled) |
			S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) |
			S_00B84C_TGID_Z_EN(1) | S_00B84C_TIDIG_COMP_CNT(2) |
			S_00B84C_LDS_SIZE(shader->config.lds_size);

		program->variable_group_size =
			sel.info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;

		FREE(sel.tokens);
		program->shader.selector = NULL;
	} else {
		const struct pipe_llvm_program_header *header;
		const char *code;
		header = cso->prog;
		code = cso->prog + sizeof(struct pipe_llvm_program_header);

		radeon_elf_read(code, header->num_bytes, &program->shader.binary);
		if (program->use_code_object_v2) {
			const amd_kernel_code_t *code_object =
				si_compute_get_code_object(program, 0);
			code_object_to_config(code_object, &program->shader.config);
		} else {
			si_shader_binary_read_config(&program->shader.binary,
					&program->shader.config, 0);
		}
		si_shader_dump(sctx->screen, &program->shader, &sctx->b.debug,
			       PIPE_SHADER_COMPUTE, stderr, true);
		if (si_shader_binary_upload(sctx->screen, &program->shader) < 0) {
			fprintf(stderr, "LLVM failed to upload shader\n");
			FREE(program);
			return NULL;
		}
	}

	return program;
}

static void si_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct si_context *sctx = (struct si_context*)ctx;
	sctx->cs_shader_state.program = (struct si_compute*)state;
}

static void si_set_global_binding(
	struct pipe_context *ctx, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	unsigned i;
	struct si_context *sctx = (struct si_context*)ctx;
	struct si_compute *program = sctx->cs_shader_state.program;

	assert(first + n <= MAX_GLOBAL_BUFFERS);

	if (!resources) {
		for (i = 0; i < n; i++) {
			pipe_resource_reference(&program->global_buffers[first + i], NULL);
		}
		return;
	}

	for (i = 0; i < n; i++) {
		uint64_t va;
		uint32_t offset;
		pipe_resource_reference(&program->global_buffers[first + i], resources[i]);
		va = r600_resource(resources[i])->gpu_address;
		offset = util_le32_to_cpu(*handles[i]);
		va += offset;
		va = util_cpu_to_le64(va);
		memcpy(handles[i], &va, sizeof(va));
	}
}

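/* Emit the one-time context initialization for compute: workgroup start
 * offsets, CU enable masks, COMPUTE_MAX_WAVE_ID on SI, and the border
 * color buffer address.
 */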
static void si_initialize_compute(struct si_context *sctx)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	uint64_t bc_va;

	radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
	radeon_emit(cs, 0);
	radeon_emit(cs, 0);
	radeon_emit(cs, 0);

	radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
	/* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
	radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
	radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | S_00B85C_SH1_CU_EN(0xffff));

	if (sctx->b.chip_class >= CIK) {
		/* Also set R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
		radeon_set_sh_reg_seq(cs,
				      R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
		radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) |
				S_00B864_SH1_CU_EN(0xffff));
		radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) |
				S_00B868_SH1_CU_EN(0xffff));
	}

	/* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
	 * and is now per pipe, so it should be handled in the
	 * kernel if we want to use something other than the default value,
	 * which is now 0x22f.
	 */
	if (sctx->b.chip_class <= SI) {
		/* XXX: This should be:
		 * (number of compute units) * 4 * (waves per simd) - 1 */

		radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
				  0x190 /* Default value */);
	}

	/* Set the pointer to border colors. */
	bc_va = sctx->border_color_buffer->gpu_address;

	if (sctx->b.chip_class >= CIK) {
		radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2);
		radeon_emit(cs, bc_va >> 8);  /* R_030E00_TA_CS_BC_BASE_ADDR */
		radeon_emit(cs, bc_va >> 40); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
	} else {
		if (sctx->screen->b.info.drm_major == 3 ||
		    (sctx->screen->b.info.drm_major == 2 &&
		     sctx->screen->b.info.drm_minor >= 48)) {
			radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR,
					      bc_va >> 8);
		}
	}

	sctx->cs_shader_state.emitted_program = NULL;
	sctx->cs_shader_state.initialized = true;
}

static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
					    struct si_shader *shader,
					    struct si_shader_config *config)
{
	uint64_t scratch_bo_size, scratch_needed;
	scratch_bo_size = 0;
	scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
	if (sctx->compute_scratch_buffer)
		scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;

	if (scratch_bo_size < scratch_needed) {
		r600_resource_reference(&sctx->compute_scratch_buffer, NULL);

		sctx->compute_scratch_buffer = (struct r600_resource*)
			pipe_buffer_create(&sctx->screen->b.b, 0,
					   PIPE_USAGE_DEFAULT, scratch_needed);

		if (!sctx->compute_scratch_buffer)
			return false;
	}

	if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
		uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;

		si_shader_apply_scratch_relocs(sctx, shader, config, scratch_va);

		if (si_shader_binary_upload(sctx->screen, shader))
			return false;

		r600_resource_reference(&shader->scratch_bo,
					sctx->compute_scratch_buffer);
	}

	return true;
}

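/* Emit the shader state if it changed: resolve the shader config, fold in
 * LDS and scratch requirements, and program COMPUTE_PGM_LO/HI,
 * COMPUTE_PGM_RSRC1/2 and COMPUTE_TMPRING_SIZE.
 */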
static bool si_switch_compute_shader(struct si_context *sctx,
				     struct si_compute *program,
				     struct si_shader *shader,
				     const amd_kernel_code_t *code_object,
				     unsigned offset)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	struct si_shader_config inline_config = {0};
	struct si_shader_config *config;
	uint64_t shader_va;

	if (sctx->cs_shader_state.emitted_program == program &&
	    sctx->cs_shader_state.offset == offset)
		return true;

	if (program->ir_type == PIPE_SHADER_IR_TGSI) {
		config = &shader->config;
	} else {
		unsigned lds_blocks;

		config = &inline_config;
		if (code_object) {
			code_object_to_config(code_object, config);
		} else {
			si_shader_binary_read_config(&shader->binary, config, offset);
		}

		lds_blocks = config->lds_size;
		/* XXX: We are over allocating LDS. For SI, the shader reports
		 * LDS in blocks of 256 bytes, so if there are 4 bytes of LDS
		 * allocated in the shader and 4 bytes allocated by the state
		 * tracker, then we will set LDS_SIZE to 512 bytes rather
		 * than 256.
		 */
		if (sctx->b.chip_class <= SI) {
			lds_blocks += align(program->local_size, 256) >> 8;
		} else {
			lds_blocks += align(program->local_size, 512) >> 9;
		}

		/* TODO: use si_multiwave_lds_size_workaround */
		assert(lds_blocks <= 0xFF);

		config->rsrc2 &= C_00B84C_LDS_SIZE;
		config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
	}

	if (!si_setup_compute_scratch_buffer(sctx, shader, config))
		return false;

	if (shader->scratch_bo) {
		COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
			    "Total Scratch: %u bytes\n", sctx->scratch_waves,
			    config->scratch_bytes_per_wave,
			    config->scratch_bytes_per_wave *
			    sctx->scratch_waves);

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				shader->scratch_bo, RADEON_USAGE_READWRITE,
				RADEON_PRIO_SCRATCH_BUFFER);
	}

	shader_va = shader->bo->gpu_address + offset;
	if (program->use_code_object_v2) {
		/* Shader code is placed after the amd_kernel_code_t
		 * struct. */
		shader_va += sizeof(amd_kernel_code_t);
	}

	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
				  RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);

	radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
	radeon_emit(cs, shader_va >> 8);
	radeon_emit(cs, shader_va >> 40);

	radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
	radeon_emit(cs, config->rsrc1);
	radeon_emit(cs, config->rsrc2);

	COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x "
		    "COMPUTE_PGM_RSRC2: 0x%08x\n", config->rsrc1, config->rsrc2);

	radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
			  S_00B860_WAVES(sctx->scratch_waves) |
			  S_00B860_WAVESIZE(config->scratch_bytes_per_wave >> 10));

	sctx->cs_shader_state.emitted_program = program;
	sctx->cs_shader_state.offset = offset;
	sctx->cs_shader_state.uses_scratch =
		config->scratch_bytes_per_wave != 0;

	return true;
}

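/* Write the four-dword scratch buffer resource descriptor into the user
 * SGPRs starting at user_sgpr, with the element size taken from the code
 * object's private segment properties.
 */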
static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
					  const amd_kernel_code_t *code_object,
					  unsigned user_sgpr)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;

	unsigned max_private_element_size = AMD_HSA_BITS_GET(
			code_object->code_properties,
			AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);

	uint32_t scratch_dword0 = scratch_va & 0xffffffff;
	uint32_t scratch_dword1 =
		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
		S_008F04_SWIZZLE_ENABLE(1);

	/* Disable address clamping */
	uint32_t scratch_dword2 = 0xffffffff;
	uint32_t scratch_dword3 =
		S_008F0C_ELEMENT_SIZE(max_private_element_size) |
		S_008F0C_INDEX_STRIDE(3) |
		S_008F0C_ADD_TID_ENABLE(1);

	if (sctx->screen->b.chip_class < VI) {
		/* BUF_DATA_FORMAT is ignored, but it cannot be
		 * BUF_DATA_FORMAT_INVALID. */
		scratch_dword3 |=
			S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
	}

	radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
				  (user_sgpr * 4), 4);
	radeon_emit(cs, scratch_dword0);
	radeon_emit(cs, scratch_dword1);
	radeon_emit(cs, scratch_dword2);
	radeon_emit(cs, scratch_dword3);
}

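/* Set up the user SGPRs requested by a code object v2 binary: the scratch
 * rsrc for the private segment, a pointer to an uploaded dispatch packet,
 * the kernarg segment pointer, and the grid workgroup counts.
 */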
static void si_setup_user_sgprs_co_v2(struct si_context *sctx,
				      const amd_kernel_code_t *code_object,
				      const struct pipe_grid_info *info,
				      uint64_t kernel_args_va)
{
	struct si_compute *program = sctx->cs_shader_state.program;
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;

	static const enum amd_code_property_mask_t workgroup_count_masks[] = {
		AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
		AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
		AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z
	};

	unsigned i, user_sgpr = 0;
	if (AMD_HSA_BITS_GET(code_object->code_properties,
			AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
		if (code_object->workitem_private_segment_byte_size > 0) {
			setup_scratch_rsrc_user_sgprs(sctx, code_object,
						      user_sgpr);
		}
		user_sgpr += 4;
	}

	if (AMD_HSA_BITS_GET(code_object->code_properties,
			AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
		struct dispatch_packet dispatch;
		unsigned dispatch_offset;
		struct r600_resource *dispatch_buf = NULL;
		uint64_t dispatch_va;

		/* Upload dispatch ptr */
		memset(&dispatch, 0, sizeof(dispatch));

		dispatch.workgroup_size_x = info->block[0];
		dispatch.workgroup_size_y = info->block[1];
		dispatch.workgroup_size_z = info->block[2];

		dispatch.grid_size_x = info->grid[0] * info->block[0];
		dispatch.grid_size_y = info->grid[1] * info->block[1];
		dispatch.grid_size_z = info->grid[2] * info->block[2];

		dispatch.private_segment_size = program->private_size;
		dispatch.group_segment_size = program->local_size;

		dispatch.kernarg_address = kernel_args_va;

		u_upload_data(sctx->b.uploader, 0, sizeof(dispatch), 256,
			      &dispatch, &dispatch_offset,
			      (struct pipe_resource**)&dispatch_buf);

		if (!dispatch_buf) {
			fprintf(stderr, "Error: Failed to allocate dispatch "
					"packet.\n");
			/* Bail out; dereferencing a NULL dispatch_buf
			 * below would crash. */
			return;
		}
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, dispatch_buf,
				RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);

		dispatch_va = dispatch_buf->gpu_address + dispatch_offset;

		radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
					  (user_sgpr * 4), 2);
		radeon_emit(cs, dispatch_va);
		radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) |
				S_008F04_STRIDE(0));

		r600_resource_reference(&dispatch_buf, NULL);
		user_sgpr += 2;
	}

	if (AMD_HSA_BITS_GET(code_object->code_properties,
			AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
		radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
					  (user_sgpr * 4), 2);
		radeon_emit(cs, kernel_args_va);
		radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) |
				S_008F04_STRIDE(0));
		user_sgpr += 2;
	}

	for (i = 0; i < 3 && user_sgpr < 16; i++) {
		if (code_object->code_properties & workgroup_count_masks[i]) {
			radeon_set_sh_reg_seq(cs,
					      R_00B900_COMPUTE_USER_DATA_0 +
					      (user_sgpr * 4), 1);
			radeon_emit(cs, info->grid[i]);
			user_sgpr += 1;
		}
	}
}

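/* Upload the kernel arguments, prefixed with the grid/block sizes for
 * non-code-object binaries, and make them available to the shader through
 * user SGPRs.
 */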
static void si_upload_compute_input(struct si_context *sctx,
				    const amd_kernel_code_t *code_object,
				    const struct pipe_grid_info *info)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	struct si_compute *program = sctx->cs_shader_state.program;
	struct r600_resource *input_buffer = NULL;
	unsigned kernel_args_size;
	unsigned num_work_size_bytes = program->use_code_object_v2 ? 0 : 36;
	uint32_t kernel_args_offset = 0;
	uint32_t *kernel_args;
	void *kernel_args_ptr;
	uint64_t kernel_args_va;
	unsigned i;

	/* The extra num_work_size_bytes are for work group / work item size
	 * information. */
	kernel_args_size = program->input_size + num_work_size_bytes;

	u_upload_alloc(sctx->b.uploader, 0, kernel_args_size, 256,
		       &kernel_args_offset,
		       (struct pipe_resource**)&input_buffer, &kernel_args_ptr);

	/* Protect against upload allocation failure. */
	if (!kernel_args_ptr)
		return;

	kernel_args = (uint32_t*)kernel_args_ptr;
	kernel_args_va = input_buffer->gpu_address + kernel_args_offset;

	if (!code_object) {
		for (i = 0; i < 3; i++) {
			kernel_args[i] = info->grid[i];
			kernel_args[i + 3] = info->grid[i] * info->block[i];
			kernel_args[i + 6] = info->block[i];
		}
	}

	memcpy(kernel_args + (num_work_size_bytes / 4), info->input,
	       program->input_size);

	for (i = 0; i < (kernel_args_size / 4); i++) {
		COMPUTE_DBG(sctx->screen, "input %u : %u\n", i,
			    kernel_args[i]);
	}

	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer,
				  RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);

	if (code_object) {
		si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va);
	} else {
		radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
		radeon_emit(cs, kernel_args_va);
		radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) |
				S_008F04_STRIDE(0));
	}

	r600_resource_reference(&input_buffer, NULL);
}

static void si_setup_tgsi_grid(struct si_context *sctx,
			       const struct pipe_grid_info *info)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 +
				 4 * SI_SGPR_GRID_SIZE;

	if (info->indirect) {
		uint64_t base_va = r600_resource(info->indirect)->gpu_address;
		uint64_t va = base_va + info->indirect_offset;
		int i;

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				(struct r600_resource *)info->indirect,
				RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);

		/* Copy each grid dimension from the indirect buffer into
		 * the grid size user SGPRs. */
		for (i = 0; i < 3; ++i) {
			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
					COPY_DATA_DST_SEL(COPY_DATA_REG));
			radeon_emit(cs, (va + 4 * i));
			radeon_emit(cs, (va + 4 * i) >> 32);
			radeon_emit(cs, (grid_size_reg >> 2) + i);
			radeon_emit(cs, 0);
		}
	} else {
		struct si_compute *program = sctx->cs_shader_state.program;

		radeon_set_sh_reg_seq(cs, grid_size_reg,
				      program->variable_group_size ? 6 : 3);
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		if (program->variable_group_size) {
			radeon_emit(cs, info->block[0]);
			radeon_emit(cs, info->block[1]);
			radeon_emit(cs, info->block[2]);
		}
	}
}

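/* Program the thread group size and emit the DISPATCH_DIRECT or
 * DISPATCH_INDIRECT packet for this grid.
 */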
static void si_emit_dispatch_packets(struct si_context *sctx,
				     const struct pipe_grid_info *info)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
	unsigned waves_per_threadgroup =
		DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 64);

	radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
			  S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0));

	radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
	radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
	radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));

	if (info->indirect) {
		uint64_t base_va = r600_resource(info->indirect)->gpu_address;

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				(struct r600_resource *)info->indirect,
				RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);

		radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
				PKT3_SHADER_TYPE_S(1));
		radeon_emit(cs, 1);
		radeon_emit(cs, base_va);
		radeon_emit(cs, base_va >> 32);

		radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) |
				PKT3_SHADER_TYPE_S(1));
		radeon_emit(cs, info->indirect_offset);
		radeon_emit(cs, 1);
	} else {
		radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) |
				PKT3_SHADER_TYPE_S(1));
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		radeon_emit(cs, 1);
	}
}

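/* Top-level compute dispatch: flush and decompress as needed, switch to the
 * requested shader, upload inputs and descriptors, and emit the dispatch
 * packets.
 */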
static void si_launch_grid(
		struct pipe_context *ctx, const struct pipe_grid_info *info)
{
	struct si_context *sctx = (struct si_context*)ctx;
	struct si_compute *program = sctx->cs_shader_state.program;
	const amd_kernel_code_t *code_object =
		si_compute_get_code_object(program, info->pc);
	int i;
	/* HW bug workaround when CS threadgroups > 256 threads and async
	 * compute isn't used, i.e. only one compute job can run at a time.
	 * If async compute is possible, the threadgroup size must be limited
	 * to 256 threads on all queues to avoid the bug.
	 * Only SI and certain CIK chips are affected.
	 */
	bool cs_regalloc_hang =
		(sctx->b.chip_class == SI ||
		 sctx->b.family == CHIP_BONAIRE ||
		 sctx->b.family == CHIP_KABINI) &&
		info->block[0] * info->block[1] * info->block[2] > 256;

	if (cs_regalloc_hang)
		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
				 SI_CONTEXT_CS_PARTIAL_FLUSH;

	si_decompress_compute_textures(sctx);

	/* Add buffer sizes for memory checking in need_cs_space. */
	r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
	/* TODO: add the scratch buffer */

	if (info->indirect) {
		r600_context_add_resource_size(ctx, info->indirect);

		/* The hw doesn't read the indirect buffer via TC L2. */
		if (r600_resource(info->indirect)->TC_L2_dirty) {
			sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
			r600_resource(info->indirect)->TC_L2_dirty = false;
		}
	}

	si_need_cs_space(sctx);

	if (!sctx->cs_shader_state.initialized)
		si_initialize_compute(sctx);

	if (sctx->b.flags)
		si_emit_cache_flush(sctx);

	if (!si_switch_compute_shader(sctx, program, &program->shader,
				      code_object, info->pc))
		return;

	si_upload_compute_shader_descriptors(sctx);
	si_emit_compute_shader_userdata(sctx);

	if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) {
		sctx->atoms.s.render_cond->emit(&sctx->b,
						sctx->atoms.s.render_cond);
		si_set_atom_dirty(sctx, sctx->atoms.s.render_cond, false);
	}

	if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE)
		si_upload_compute_input(sctx, code_object, info);

	/* Global buffers */
	for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
		struct r600_resource *buffer =
			(struct r600_resource*)program->global_buffers[i];
		if (!buffer) {
			continue;
		}
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer,
					  RADEON_USAGE_READWRITE,
					  RADEON_PRIO_COMPUTE_GLOBAL);
	}

	if (program->ir_type == PIPE_SHADER_IR_TGSI)
		si_setup_tgsi_grid(sctx, info);

	si_ce_pre_draw_synchronization(sctx);

	si_emit_dispatch_packets(sctx, info);

	si_ce_post_draw_synchronization(sctx);

	sctx->compute_is_busy = true;
	sctx->b.num_compute_calls++;
	if (sctx->cs_shader_state.uses_scratch)
		sctx->b.num_spill_compute_calls++;

	if (cs_regalloc_hang)
		sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
}

static void si_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct si_compute *program = (struct si_compute *)state;
	struct si_context *sctx = (struct si_context*)ctx;

	if (!state) {
		return;
	}

	if (program == sctx->cs_shader_state.program)
		sctx->cs_shader_state.program = NULL;

	if (program == sctx->cs_shader_state.emitted_program)
		sctx->cs_shader_state.emitted_program = NULL;

	si_shader_destroy(&program->shader);
	FREE(program);
}

static void si_set_compute_resources(struct pipe_context *ctx_,
				     unsigned start, unsigned count,
				     struct pipe_surface **surfaces) { }

void si_init_compute_functions(struct si_context *sctx)
{
	sctx->b.b.create_compute_state = si_create_compute_state;
	sctx->b.b.delete_compute_state = si_delete_compute_state;
	sctx->b.b.bind_compute_state = si_bind_compute_state;
	/* ctx->context.create_sampler_view = evergreen_compute_create_sampler_view; */
	sctx->b.b.set_compute_resources = si_set_compute_resources;
	sctx->b.b.set_global_binding = si_set_global_binding;
	sctx->b.b.launch_grid = si_launch_grid;
}