/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "tgsi/tgsi_parse.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
#include "radeon/radeon_elf_util.h"

#include "amd_kernel_code_t.h"
#include "radeon/r600_cs.h"
#include "si_pipe.h"
#include "sid.h"

#define MAX_GLOBAL_BUFFERS 22

struct si_compute {
	unsigned ir_type;
	unsigned local_size;
	unsigned private_size;
	unsigned input_size;
	struct si_shader shader;

	struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
	unsigned use_code_object_v2 : 1;
	unsigned variable_group_size : 1;
};

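/* Kernel dispatch packet layout expected by code object v2 kernels (mirrors
 * the HSA dispatch packet); filled out and uploaded in
 * si_setup_user_sgprs_co_v2(). */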
struct dispatch_packet {
	uint16_t header;
	uint16_t setup;
	uint16_t workgroup_size_x;
	uint16_t workgroup_size_y;
	uint16_t workgroup_size_z;
	uint16_t reserved0;
	uint32_t grid_size_x;
	uint32_t grid_size_y;
	uint32_t grid_size_z;
	uint32_t private_segment_size;
	uint32_t group_segment_size;
	uint64_t kernel_object;
	uint64_t kernarg_address;
	uint64_t reserved2;
};

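/* Return the amd_kernel_code_t header located at symbol_offset within the
 * shader binary, or NULL if the program doesn't use code object v2. */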
static const amd_kernel_code_t *si_compute_get_code_object(
	const struct si_compute *program,
	uint64_t symbol_offset)
{
	if (!program->use_code_object_v2) {
		return NULL;
	}
	return (const amd_kernel_code_t*)
		(program->shader.binary.code + symbol_offset);
}

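/* Translate an amd_kernel_code_t header into the si_shader_config register
 * values used by the rest of the driver. */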
static void code_object_to_config(const amd_kernel_code_t *code_object,
				  struct si_shader_config *out_config) {

	uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
	uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
	out_config->num_sgprs = code_object->wavefront_sgpr_count;
	out_config->num_vgprs = code_object->workitem_vgpr_count;
	out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
	out_config->rsrc1 = rsrc1;
	out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2));
	out_config->rsrc2 = rsrc2;
	out_config->scratch_bytes_per_wave =
		align(code_object->workitem_private_segment_byte_size * 64, 1024);
}

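/* Create a compute state object. TGSI programs are compiled here; native
 * (PIPE_SHADER_IR_NATIVE) programs arrive as ELF binaries that are parsed
 * and uploaded directly. */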
static void *si_create_compute_state(
	struct pipe_context *ctx,
	const struct pipe_compute_state *cso)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_screen *sscreen = (struct si_screen *)ctx->screen;
	struct si_compute *program = CALLOC_STRUCT(si_compute);
	struct si_shader *shader = &program->shader;


	program->ir_type = cso->ir_type;
	program->local_size = cso->req_local_mem;
	program->private_size = cso->req_private_mem;
	program->input_size = cso->req_input_mem;
	program->use_code_object_v2 = HAVE_LLVM >= 0x0400 &&
					cso->ir_type == PIPE_SHADER_IR_NATIVE;


	if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
		struct si_shader_selector sel;
		bool scratch_enabled;

		memset(&sel, 0, sizeof(sel));

		sel.tokens = tgsi_dup_tokens(cso->prog);
		if (!sel.tokens) {
			FREE(program);
			return NULL;
		}

		tgsi_scan_shader(cso->prog, &sel.info);
		sel.type = PIPE_SHADER_COMPUTE;
		sel.local_size = cso->req_local_mem;

		p_atomic_inc(&sscreen->b.num_shaders_created);

		program->shader.selector = &sel;
		program->shader.is_monolithic = true;

		if (si_shader_create(sscreen, sctx->tm, &program->shader,
		                     &sctx->b.debug)) {
			FREE(sel.tokens);
			FREE(program);
			return NULL;
		}

		scratch_enabled = shader->config.scratch_bytes_per_wave > 0;

		shader->config.rsrc1 =
			   S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
			   S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) |
			   S_00B848_DX10_CLAMP(1) |
			   S_00B848_FLOAT_MODE(shader->config.float_mode);

		shader->config.rsrc2 = S_00B84C_USER_SGPR(SI_CS_NUM_USER_SGPR) |
			   S_00B84C_SCRATCH_EN(scratch_enabled) |
			   S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) |
			   S_00B84C_TGID_Z_EN(1) | S_00B84C_TIDIG_COMP_CNT(2) |
			   S_00B84C_LDS_SIZE(shader->config.lds_size);

		program->variable_group_size =
			sel.info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;

		FREE(sel.tokens);
		program->shader.selector = NULL;
	} else {
		const struct pipe_llvm_program_header *header;
		const char *code;
		header = cso->prog;
		code = cso->prog + sizeof(struct pipe_llvm_program_header);

		radeon_elf_read(code, header->num_bytes, &program->shader.binary);
		if (program->use_code_object_v2) {
			const amd_kernel_code_t *code_object =
				si_compute_get_code_object(program, 0);
			code_object_to_config(code_object, &program->shader.config);
		} else {
			si_shader_binary_read_config(&program->shader.binary,
				     &program->shader.config, 0);
		}
		si_shader_dump(sctx->screen, &program->shader, &sctx->b.debug,
			       PIPE_SHADER_COMPUTE, stderr, true);
		if (si_shader_binary_upload(sctx->screen, &program->shader) < 0) {
			fprintf(stderr, "LLVM failed to upload shader\n");
			FREE(program);
			return NULL;
		}
	}

	return program;
}

static void si_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct si_context *sctx = (struct si_context*)ctx;
	sctx->cs_shader_state.program = (struct si_compute*)state;
}

static void si_set_global_binding(
	struct pipe_context *ctx, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	unsigned i;
	struct si_context *sctx = (struct si_context*)ctx;
	struct si_compute *program = sctx->cs_shader_state.program;

	assert(first + n <= MAX_GLOBAL_BUFFERS);

	if (!resources) {
		for (i = 0; i < n; i++) {
			pipe_resource_reference(&program->global_buffers[first + i], NULL);
		}
		return;
	}

	for (i = 0; i < n; i++) {
		uint64_t va;
		uint32_t offset;
		pipe_resource_reference(&program->global_buffers[first + i], resources[i]);
		va = r600_resource(resources[i])->gpu_address;
		offset = util_le32_to_cpu(*handles[i]);
		va += offset;
		va = util_cpu_to_le64(va);
		memcpy(handles[i], &va, sizeof(va));
	}
}

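/* Emit the compute registers that only need to be set once per context:
 * thread-group start coordinates, CU enable masks, the SI max-wave-ID
 * default, and the border color buffer address. */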
static void si_initialize_compute(struct si_context *sctx)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	uint64_t bc_va;

	radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
	radeon_emit(cs, 0);
	radeon_emit(cs, 0);
	radeon_emit(cs, 0);

	radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
	/* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
	radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
	radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | S_00B85C_SH1_CU_EN(0xffff));

	if (sctx->b.chip_class >= CIK) {
		/* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
		radeon_set_sh_reg_seq(cs,
		                     R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
		radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) |
		                S_00B864_SH1_CU_EN(0xffff));
		radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) |
		                S_00B868_SH1_CU_EN(0xffff));
	}

	/* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
	 * and is now per pipe, so it should be handled in the
	 * kernel if we want to use something other than the default value,
	 * which is now 0x22f.
	 */
	if (sctx->b.chip_class <= SI) {
		/* XXX: This should be:
		 * (number of compute units) * 4 * (waves per simd) - 1 */

		radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
		                  0x190 /* Default value */);
	}

	/* Set the pointer to border colors. */
	bc_va = sctx->border_color_buffer->gpu_address;

	if (sctx->b.chip_class >= CIK) {
		radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2);
		radeon_emit(cs, bc_va >> 8);  /* R_030E00_TA_CS_BC_BASE_ADDR */
		radeon_emit(cs, bc_va >> 40); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
	} else {
		if (sctx->screen->b.info.drm_major == 3 ||
		    (sctx->screen->b.info.drm_major == 2 &&
		     sctx->screen->b.info.drm_minor >= 48)) {
			radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR,
					      bc_va >> 8);
		}
	}

	sctx->cs_shader_state.emitted_program = NULL;
	sctx->cs_shader_state.initialized = true;
}

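/* Grow the per-context compute scratch buffer if needed and, when the buffer
 * changed, patch and re-upload the shader with the new scratch address. */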
static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
                                            struct si_shader *shader,
                                            struct si_shader_config *config)
{
	uint64_t scratch_bo_size, scratch_needed;
	scratch_bo_size = 0;
	scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
	if (sctx->compute_scratch_buffer)
		scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;

	if (scratch_bo_size < scratch_needed) {
		r600_resource_reference(&sctx->compute_scratch_buffer, NULL);

		sctx->compute_scratch_buffer = (struct r600_resource*)
			pipe_buffer_create(&sctx->screen->b.b, 0,
					   PIPE_USAGE_DEFAULT, scratch_needed);

		if (!sctx->compute_scratch_buffer)
			return false;
	}

	if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
		uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;

		si_shader_apply_scratch_relocs(sctx, shader, config, scratch_va);

		if (si_shader_binary_upload(sctx->screen, shader))
			return false;

		r600_resource_reference(&shader->scratch_bo,
		                        sctx->compute_scratch_buffer);
	}

	return true;
}

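/* Emit COMPUTE_PGM_LO/HI, RSRC1/RSRC2 and TMPRING_SIZE for the given shader,
 * unless it is already the one that was last emitted at this offset. */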
static bool si_switch_compute_shader(struct si_context *sctx,
                                     struct si_compute *program,
				     struct si_shader *shader,
				     const amd_kernel_code_t *code_object,
				     unsigned offset)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	struct si_shader_config inline_config = {0};
	struct si_shader_config *config;
	uint64_t shader_va;

	if (sctx->cs_shader_state.emitted_program == program &&
	    sctx->cs_shader_state.offset == offset)
		return true;

	if (program->ir_type == PIPE_SHADER_IR_TGSI) {
		config = &shader->config;
	} else {
		unsigned lds_blocks;

		config = &inline_config;
		if (code_object) {
			code_object_to_config(code_object, config);
		} else {
			si_shader_binary_read_config(&shader->binary, config, offset);
		}

		lds_blocks = config->lds_size;
		/* XXX: We are over allocating LDS.  For SI, the shader reports
		 * LDS in blocks of 256 bytes, so if there are 4 bytes lds
		 * allocated in the shader and 4 bytes allocated by the state
		 * tracker, then we will set LDS_SIZE to 512 bytes rather than 256.
		 */
		if (sctx->b.chip_class <= SI) {
			lds_blocks += align(program->local_size, 256) >> 8;
		} else {
			lds_blocks += align(program->local_size, 512) >> 9;
		}

		/* TODO: use si_multiwave_lds_size_workaround */
		assert(lds_blocks <= 0xFF);

		config->rsrc2 &= C_00B84C_LDS_SIZE;
		config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
	}

	if (!si_setup_compute_scratch_buffer(sctx, shader, config))
		return false;

	if (shader->scratch_bo) {
		COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
		            "Total Scratch: %u bytes\n", sctx->scratch_waves,
			    config->scratch_bytes_per_wave,
			    config->scratch_bytes_per_wave *
			    sctx->scratch_waves);

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			      shader->scratch_bo, RADEON_USAGE_READWRITE,
			      RADEON_PRIO_SCRATCH_BUFFER);
	}

	shader_va = shader->bo->gpu_address + offset;
	if (program->use_code_object_v2) {
		/* Shader code is placed after the amd_kernel_code_t
		 * struct. */
		shader_va += sizeof(amd_kernel_code_t);
	}

	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
	                          RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);

	radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
	radeon_emit(cs, shader_va >> 8);
	radeon_emit(cs, shader_va >> 40);

	radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
	radeon_emit(cs, config->rsrc1);
	radeon_emit(cs, config->rsrc2);

	COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x "
		"COMPUTE_PGM_RSRC2: 0x%08x\n", config->rsrc1, config->rsrc2);

	radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
	          S_00B860_WAVES(sctx->scratch_waves)
	             | S_00B860_WAVESIZE(config->scratch_bytes_per_wave >> 10));

	sctx->cs_shader_state.emitted_program = program;
	sctx->cs_shader_state.offset = offset;
	sctx->cs_shader_state.uses_scratch =
		config->scratch_bytes_per_wave != 0;

	return true;
}

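/* Write the four dwords of the scratch buffer resource descriptor into the
 * user SGPRs starting at user_sgpr. */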
static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
					  const amd_kernel_code_t *code_object,
					  unsigned user_sgpr)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;

	unsigned max_private_element_size = AMD_HSA_BITS_GET(
			code_object->code_properties,
			AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);

	uint32_t scratch_dword0 = scratch_va & 0xffffffff;
	uint32_t scratch_dword1 =
		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
		S_008F04_SWIZZLE_ENABLE(1);

	/* Disable address clamping */
	uint32_t scratch_dword2 = 0xffffffff;
	uint32_t scratch_dword3 =
		S_008F0C_ELEMENT_SIZE(max_private_element_size) |
		S_008F0C_INDEX_STRIDE(3) |
		S_008F0C_ADD_TID_ENABLE(1);


	if (sctx->screen->b.chip_class < VI) {
		/* BUF_DATA_FORMAT is ignored, but it cannot be
		   BUF_DATA_FORMAT_INVALID. */
		scratch_dword3 |=
			S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
	}

	radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
							(user_sgpr * 4), 4);
	radeon_emit(cs, scratch_dword0);
	radeon_emit(cs, scratch_dword1);
	radeon_emit(cs, scratch_dword2);
	radeon_emit(cs, scratch_dword3);
}

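/* Fill the user SGPRs requested by a code object v2 kernel: the private
 * segment (scratch) buffer, a pointer to an uploaded dispatch packet, the
 * kernarg pointer and the grid dimensions. */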
static void si_setup_user_sgprs_co_v2(struct si_context *sctx,
                                      const amd_kernel_code_t *code_object,
				      const struct pipe_grid_info *info,
				      uint64_t kernel_args_va)
{
	struct si_compute *program = sctx->cs_shader_state.program;
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;

	static const enum amd_code_property_mask_t workgroup_count_masks [] = {
		AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
		AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
		AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z
	};

	unsigned i, user_sgpr = 0;
	if (AMD_HSA_BITS_GET(code_object->code_properties,
			AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
		if (code_object->workitem_private_segment_byte_size > 0) {
			setup_scratch_rsrc_user_sgprs(sctx, code_object,
								user_sgpr);
		}
		user_sgpr += 4;
	}

	if (AMD_HSA_BITS_GET(code_object->code_properties,
			AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
		struct dispatch_packet dispatch;
		unsigned dispatch_offset;
		struct r600_resource *dispatch_buf = NULL;
		uint64_t dispatch_va;

		/* Upload dispatch ptr */
		memset(&dispatch, 0, sizeof(dispatch));

		dispatch.workgroup_size_x = info->block[0];
		dispatch.workgroup_size_y = info->block[1];
		dispatch.workgroup_size_z = info->block[2];

		dispatch.grid_size_x = info->grid[0] * info->block[0];
		dispatch.grid_size_y = info->grid[1] * info->block[1];
		dispatch.grid_size_z = info->grid[2] * info->block[2];

		dispatch.private_segment_size = program->private_size;
		dispatch.group_segment_size = program->local_size;

		dispatch.kernarg_address = kernel_args_va;

		u_upload_data(sctx->b.uploader, 0, sizeof(dispatch), 256,
				&dispatch, &dispatch_offset,
				(struct pipe_resource**)&dispatch_buf);

		if (!dispatch_buf) {
			fprintf(stderr, "Error: Failed to allocate dispatch "
					"packet.");
		}
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, dispatch_buf,
				  RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);

		dispatch_va = dispatch_buf->gpu_address + dispatch_offset;

		radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
							(user_sgpr * 4), 2);
		radeon_emit(cs, dispatch_va);
		radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) |
		                S_008F04_STRIDE(0));

		r600_resource_reference(&dispatch_buf, NULL);
		user_sgpr += 2;
	}

	if (AMD_HSA_BITS_GET(code_object->code_properties,
			AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
		radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
							(user_sgpr * 4), 2);
		radeon_emit(cs, kernel_args_va);
		radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) |
		                S_008F04_STRIDE(0));
		user_sgpr += 2;
	}

	for (i = 0; i < 3 && user_sgpr < 16; i++) {
		if (code_object->code_properties & workgroup_count_masks[i]) {
			radeon_set_sh_reg_seq(cs,
				R_00B900_COMPUTE_USER_DATA_0 +
				(user_sgpr * 4), 1);
			radeon_emit(cs, info->grid[i]);
			user_sgpr += 1;
		}
	}
}

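/* Upload the kernel arguments, prefixed with 36 bytes of grid/block size data
 * for programs that don't use code object v2, and point the user SGPRs at the
 * resulting buffer. */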
static void si_upload_compute_input(struct si_context *sctx,
				    const amd_kernel_code_t *code_object,
				    const struct pipe_grid_info *info)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	struct si_compute *program = sctx->cs_shader_state.program;
	struct r600_resource *input_buffer = NULL;
	unsigned kernel_args_size;
	unsigned num_work_size_bytes = program->use_code_object_v2 ? 0 : 36;
	uint32_t kernel_args_offset = 0;
	uint32_t *kernel_args;
	void *kernel_args_ptr;
	uint64_t kernel_args_va;
	unsigned i;

	/* The extra num_work_size_bytes are for work group / work item size information */
	kernel_args_size = program->input_size + num_work_size_bytes;

	u_upload_alloc(sctx->b.uploader, 0, kernel_args_size, 256,
		       &kernel_args_offset,
		       (struct pipe_resource**)&input_buffer, &kernel_args_ptr);

	kernel_args = (uint32_t*)kernel_args_ptr;
	kernel_args_va = input_buffer->gpu_address + kernel_args_offset;

	if (!code_object) {
		for (i = 0; i < 3; i++) {
			kernel_args[i] = info->grid[i];
			kernel_args[i + 3] = info->grid[i] * info->block[i];
			kernel_args[i + 6] = info->block[i];
		}
	}

	memcpy(kernel_args + (num_work_size_bytes / 4), info->input,
	       program->input_size);


	for (i = 0; i < (kernel_args_size / 4); i++) {
		COMPUTE_DBG(sctx->screen, "input %u : %u\n", i,
			kernel_args[i]);
	}


	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer,
				  RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);

	if (code_object) {
		si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va);
	} else {
		radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
		radeon_emit(cs, kernel_args_va);
		radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) |
		                S_008F04_STRIDE(0));
	}

	r600_resource_reference(&input_buffer, NULL);
}

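/* Load the grid size (and the block size when it is variable) into user
 * SGPRs, either directly or via COPY_DATA from the indirect buffer. */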
static void si_setup_tgsi_grid(struct si_context *sctx,
                                const struct pipe_grid_info *info)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 +
	                          4 * SI_SGPR_GRID_SIZE;

	if (info->indirect) {
		uint64_t base_va = r600_resource(info->indirect)->gpu_address;
		uint64_t va = base_va + info->indirect_offset;
		int i;

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
		                 (struct r600_resource *)info->indirect,
		                 RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);

		for (i = 0; i < 3; ++i) {
			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
					COPY_DATA_DST_SEL(COPY_DATA_REG));
			radeon_emit(cs, (va + 4 * i));
			radeon_emit(cs, (va + 4 * i) >> 32);
			radeon_emit(cs, (grid_size_reg >> 2) + i);
			radeon_emit(cs, 0);
		}
	} else {
		struct si_compute *program = sctx->cs_shader_state.program;

		radeon_set_sh_reg_seq(cs, grid_size_reg, program->variable_group_size ? 6 : 3);
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		if (program->variable_group_size) {
			radeon_emit(cs, info->block[0]);
			radeon_emit(cs, info->block[1]);
			radeon_emit(cs, info->block[2]);
		}
	}
}

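/* Emit COMPUTE_RESOURCE_LIMITS, the NUM_THREAD_* registers and the
 * DISPATCH_DIRECT or DISPATCH_INDIRECT packet. */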
static void si_emit_dispatch_packets(struct si_context *sctx,
                                     const struct pipe_grid_info *info)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
	unsigned waves_per_threadgroup =
		DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 64);

	radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
			  S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0));

	radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
	radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
	radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));

	if (info->indirect) {
		uint64_t base_va = r600_resource(info->indirect)->gpu_address;

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
		                 (struct r600_resource *)info->indirect,
		                 RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);

		radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
		                PKT3_SHADER_TYPE_S(1));
		radeon_emit(cs, 1);
		radeon_emit(cs, base_va);
		radeon_emit(cs, base_va >> 32);

		radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) |
		                PKT3_SHADER_TYPE_S(1));
		radeon_emit(cs, info->indirect_offset);
		radeon_emit(cs, 1);
	} else {
		radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) |
		                PKT3_SHADER_TYPE_S(1));
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		radeon_emit(cs, 1);
	}
}


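/* pipe_context::launch_grid: decompress textures, flush caches, switch the
 * compute shader, upload inputs and descriptors, then emit the dispatch. */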
static void si_launch_grid(
		struct pipe_context *ctx, const struct pipe_grid_info *info)
{
	struct si_context *sctx = (struct si_context*)ctx;
	struct si_compute *program = sctx->cs_shader_state.program;
	const amd_kernel_code_t *code_object =
		si_compute_get_code_object(program, info->pc);
	int i;
	/* HW bug workaround when CS threadgroups > 256 threads and async
	 * compute isn't used, i.e. only one compute job can run at a time.
	 * If async compute is possible, the threadgroup size must be limited
	 * to 256 threads on all queues to avoid the bug.
	 * Only SI and certain CIK chips are affected.
	 */
	bool cs_regalloc_hang =
		(sctx->b.chip_class == SI ||
		 sctx->b.family == CHIP_BONAIRE ||
		 sctx->b.family == CHIP_KABINI) &&
		info->block[0] * info->block[1] * info->block[2] > 256;

	if (cs_regalloc_hang)
		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
				 SI_CONTEXT_CS_PARTIAL_FLUSH;

	si_decompress_compute_textures(sctx);

	/* Add buffer sizes for memory checking in need_cs_space. */
	r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
	/* TODO: add the scratch buffer */

	if (info->indirect) {
		r600_context_add_resource_size(ctx, info->indirect);

		/* The hw doesn't read the indirect buffer via TC L2. */
		if (r600_resource(info->indirect)->TC_L2_dirty) {
			sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
			r600_resource(info->indirect)->TC_L2_dirty = false;
		}
	}

	si_need_cs_space(sctx);

	if (!sctx->cs_shader_state.initialized)
		si_initialize_compute(sctx);

	if (sctx->b.flags)
		si_emit_cache_flush(sctx);

	if (!si_switch_compute_shader(sctx, program, &program->shader,
					code_object, info->pc))
		return;

	si_upload_compute_shader_descriptors(sctx);
	si_emit_compute_shader_userdata(sctx);

	if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) {
		sctx->atoms.s.render_cond->emit(&sctx->b,
		                                sctx->atoms.s.render_cond);
		si_set_atom_dirty(sctx, sctx->atoms.s.render_cond, false);
	}

	if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE)
		si_upload_compute_input(sctx, code_object, info);

	/* Global buffers */
	for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
		struct r600_resource *buffer =
				(struct r600_resource*)program->global_buffers[i];
		if (!buffer) {
			continue;
		}
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer,
					  RADEON_USAGE_READWRITE,
					  RADEON_PRIO_COMPUTE_GLOBAL);
	}

	if (program->ir_type == PIPE_SHADER_IR_TGSI)
		si_setup_tgsi_grid(sctx, info);

	si_ce_pre_draw_synchronization(sctx);

	si_emit_dispatch_packets(sctx, info);

	si_ce_post_draw_synchronization(sctx);

	sctx->compute_is_busy = true;
	sctx->b.num_compute_calls++;
	if (sctx->cs_shader_state.uses_scratch)
		sctx->b.num_spill_compute_calls++;

	if (cs_regalloc_hang)
		sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
}


static void si_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct si_compute *program = (struct si_compute *)state;
	struct si_context *sctx = (struct si_context*)ctx;

	if (!state) {
		return;
	}

	if (program == sctx->cs_shader_state.program)
		sctx->cs_shader_state.program = NULL;

	if (program == sctx->cs_shader_state.emitted_program)
		sctx->cs_shader_state.emitted_program = NULL;

	si_shader_destroy(&program->shader);
	FREE(program);
}

static void si_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces) { }

void si_init_compute_functions(struct si_context *sctx)
{
	sctx->b.b.create_compute_state = si_create_compute_state;
	sctx->b.b.delete_compute_state = si_delete_compute_state;
	sctx->b.b.bind_compute_state = si_bind_compute_state;
/*	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view; */
	sctx->b.b.set_compute_resources = si_set_compute_resources;
	sctx->b.b.set_global_binding = si_set_global_binding;
	sctx->b.b.launch_grid = si_launch_grid;
}