      1 /*
      2  * Copyright 2011 Adam Rak <adam.rak (at) streamnovation.com>
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * on the rights to use, copy, modify, merge, publish, distribute, sub
      8  * license, and/or sell copies of the Software, and to permit persons to whom
      9  * the Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     22  *
     23  * Authors:
     24  *      Adam Rak <adam.rak (at) streamnovation.com>
     25  */
     26 
     27 #include <stdio.h>
     28 #include <errno.h>
     29 #include "pipe/p_defines.h"
     30 #include "pipe/p_state.h"
     31 #include "pipe/p_context.h"
     32 #include "util/u_blitter.h"
     33 #include "util/u_double_list.h"
     34 #include "util/u_transfer.h"
     35 #include "util/u_surface.h"
     36 #include "util/u_pack_color.h"
     37 #include "util/u_memory.h"
     38 #include "util/u_inlines.h"
     39 #include "util/u_framebuffer.h"
     40 #include "pipebuffer/pb_buffer.h"
     41 #include "r600.h"
     42 #include "evergreend.h"
     43 #include "r600_resource.h"
     44 #include "r600_shader.h"
     45 #include "r600_pipe.h"
     46 #include "r600_formats.h"
     47 #include "evergreen_compute.h"
     48 #include "r600_hw_context_priv.h"
     49 #include "evergreen_compute_internal.h"
     50 #include "compute_memory_pool.h"
     51 #ifdef HAVE_OPENCL
     52 #include "llvm_wrapper.h"
     53 #endif
     54 
     55 /**
      56 RAT0 is for global binding writes
      57 VTX1 is for global binding reads
      58 
      59 for writing images: RAT1...
      60 for reading images: TEX2...
      61   TEX2 and RAT1 are paired, and so on for the higher slots
      62 
      63 TEX2... consumes the same fetch resources that VTX2... would consume
      64 
      65 CONST0 and VTX0 are for parameters
      66   CONST0 binds the smaller input parameter buffer and is used for constant
      67   indexing; it is also constant cached
      68   VTX0 is for indirect/non-constant indexing, or if the input is bigger than
      69   the constant cache can handle
      70 
      71 RATs are limited to 12, so we can bind at most 11 textures for writing
      72 because we reserve RAT0 for global bindings. With byte addressing enabled,
      73 we should reserve another one too => at most 10 image bindings for writing.
     74 
     75 from Nvidia OpenCL:
     76   CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
     77   CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
     78 
     79 so 10 for writing is enough. 176 is the max for reading according to the docs
     80 
      81 writable images should be listed first (ids < 10), so each image id corresponds to RAT(id+1)
      82 writable images also consume TEX slots, and VTX slots too because of linear indexing
     83 
     84 */
     85 
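/* Illustrative sketch (not used by the driver): the id arithmetic described
 * above, as it is applied later in this file.  A writable image i goes to
 * RAT(i+1) because RAT0 is reserved for the global buffer pool, and resource
 * i is fetched through slot i+2 because FETCH0/FETCH1 are reserved for the
 * parameter buffer and the global pool.
 */
static inline unsigned illustrative_rat_id_for_image(unsigned image_id)
{
	return image_id + 1; /* RAT0 is reserved for global binding writes */
}

static inline unsigned illustrative_fetch_id_for_resource(unsigned res_id)
{
	return res_id + 2; /* FETCH0 = VTX0 (params), FETCH1 = VTX1 (global pool) */
}
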
     86 static void evergreen_cs_set_vertex_buffer(
     87 	struct r600_context * rctx,
     88 	unsigned vb_index,
     89 	unsigned offset,
     90 	struct pipe_resource * buffer)
     91 {
     92 	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
     93 	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
     94 	vb->stride = 1;
     95 	vb->buffer_offset = offset;
     96 	vb->buffer = buffer;
     97 	vb->user_buffer = NULL;
     98 
     99 	r600_inval_vertex_cache(rctx);
    100 	state->enabled_mask |= 1 << vb_index;
    101 	state->dirty_mask |= 1 << vb_index;
    102 	r600_atom_dirty(rctx, &state->atom);
    103 }
    104 
    105 const struct u_resource_vtbl r600_global_buffer_vtbl =
    106 {
    107 	u_default_resource_get_handle, /* get_handle */
    108 	r600_compute_global_buffer_destroy, /* resource_destroy */
    109 	r600_compute_global_get_transfer, /* get_transfer */
    110 	r600_compute_global_transfer_destroy, /* transfer_destroy */
    111 	r600_compute_global_transfer_map, /* transfer_map */
    112 	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
    113 	r600_compute_global_transfer_unmap, /* transfer_unmap */
    114 	r600_compute_global_transfer_inline_write /* transfer_inline_write */
    115 };
    116 
    117 
    118 void *evergreen_create_compute_state(
    119 	struct pipe_context *ctx_,
     120 	const struct pipe_compute_state *cso)
    121 {
    122 	struct r600_context *ctx = (struct r600_context *)ctx_;
    123 	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
    124 	void *p;
    125 
    126 #ifdef HAVE_OPENCL
    127 	const struct pipe_llvm_program_header * header;
    128 	const unsigned char * code;
    129 
    130 	COMPUTE_DBG("*** evergreen_create_compute_state\n");
    131 
    132 	header = cso->prog;
    133 	code = cso->prog + sizeof(struct pipe_llvm_program_header);
    134 #endif
    135 
    136 	shader->ctx = (struct r600_context*)ctx;
    137 	shader->resources = (struct evergreen_compute_resource*)
    138 			CALLOC(sizeof(struct evergreen_compute_resource),
    139 			get_compute_resource_num());
    140 	shader->local_size = cso->req_local_mem; ///TODO: assert it
    141 	shader->private_size = cso->req_private_mem;
    142 	shader->input_size = cso->req_input_mem;
    143 
    144 #ifdef HAVE_OPENCL
    145 	shader->mod = llvm_parse_bitcode(code, header->num_bytes);
    146 
    147 	r600_compute_shader_create(ctx_, shader->mod, &shader->bc);
    148 #endif
    149 	shader->shader_code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
    150 							shader->bc.ndw * 4);
    151 
    152 	p = ctx->ws->buffer_map(shader->shader_code_bo->cs_buf, ctx->cs,
    153 							PIPE_TRANSFER_WRITE);
    154 
    155 	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
    156 	ctx->ws->buffer_unmap(shader->shader_code_bo->cs_buf);
    157 	return shader;
    158 }
    159 
    160 void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
    161 {
    162 	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
    163 
    164 	free(shader->resources);
    165 	free(shader);
    166 }
    167 
    168 static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
    169 {
    170 	struct r600_context *ctx = (struct r600_context *)ctx_;
    171 
    172 	COMPUTE_DBG("*** evergreen_bind_compute_state\n");
    173 
    174 	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
    175 }
    176 
     177 /* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
     178  * explicit kernel parameters, there are implicit parameters that need to be
     179  * stored in the vertex buffer as well.  Here is how these parameters are
     180  * organized in the buffer:
    181  *
    182  * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
    183  * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
    184  * DWORDS 6-8: Number of work items within each work group in each dimension
    185  *             (x,y,z)
    186  * DWORDS 9+ : Kernel parameters
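 *
 * As a concrete (illustrative) example, a launch with grid_layout = {4, 2, 1}
 * and block_layout = {16, 16, 1} would be uploaded as:
 *   DWORDS 0-2: 4, 2, 1     (work groups per dimension)
 *   DWORDS 3-5: 64, 32, 1   (global work items = grid * block per dimension)
 *   DWORDS 6-8: 16, 16, 1   (work items per work group)
 *   DWORDS 9+ : the kernel arguments passed in by the state tracker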
    187  */
    188 void evergreen_compute_upload_input(
    189 	struct pipe_context *ctx_,
    190 	const uint *block_layout,
    191 	const uint *grid_layout,
    192 	const void *input)
    193 {
    194 	struct r600_context *ctx = (struct r600_context *)ctx_;
    195 	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
    196 	int i;
    197 	unsigned kernel_parameters_offset_bytes = 36;
    198 	uint32_t * num_work_groups_start;
    199 	uint32_t * global_size_start;
    200 	uint32_t * local_size_start;
    201 	uint32_t * kernel_parameters_start;
    202 
    203 	if (shader->input_size == 0) {
    204 		return;
    205 	}
    206 
    207 	if (!shader->kernel_param) {
    208 		unsigned buffer_size = shader->input_size;
    209 
    210 		/* Add space for the grid dimensions */
     211 		buffer_size += kernel_parameters_offset_bytes;
    212 		shader->kernel_param = r600_compute_buffer_alloc_vram(
    213 						ctx->screen, buffer_size);
    214 	}
    215 
    216 	num_work_groups_start = ctx->ws->buffer_map(
    217 		shader->kernel_param->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
     218 	global_size_start = num_work_groups_start + 3; /* uint32_t pointers, so +3 skips 3 dwords */
     219 	local_size_start = global_size_start + 3;
     220 	kernel_parameters_start = local_size_start + 3;
    221 
     222 	/* Copy the number of work groups (the grid layout) */
    223 	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
    224 
    225 	/* Copy the global size */
    226 	for (i = 0; i < 3; i++) {
    227 		global_size_start[i] = grid_layout[i] * block_layout[i];
    228 	}
    229 
    230 	/* Copy the local dimensions */
    231 	memcpy(local_size_start, block_layout, 3 * sizeof(uint));
    232 
    233 	/* Copy the kernel inputs */
    234 	memcpy(kernel_parameters_start, input, shader->input_size);
    235 
    236 	for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
    237 					(shader->input_size / 4); i++) {
    238 		COMPUTE_DBG("input %i : %i\n", i,
    239 			((unsigned*)num_work_groups_start)[i]);
    240 	}
    241 
    242 	ctx->ws->buffer_unmap(shader->kernel_param->cs_buf);
    243 
    244 	///ID=0 is reserved for the parameters
    245 	evergreen_cs_set_vertex_buffer(ctx, 0, 0,
    246 			(struct pipe_resource*)shader->kernel_param);
    247 	///ID=0 is reserved for parameters
    248 	evergreen_set_const_cache(shader, 0, shader->kernel_param,
    249 						shader->input_size, 0);
    250 }
    251 
    252 static void evergreen_emit_direct_dispatch(
    253 		struct r600_context *rctx,
    254 		const uint *block_layout, const uint *grid_layout)
    255 {
    256 	int i;
    257 	struct radeon_winsys_cs *cs = rctx->cs;
    258 	unsigned num_waves;
    259 	unsigned num_pipes = rctx->screen->info.r600_max_pipes;
    260 	unsigned wave_divisor = (16 * num_pipes);
    261 	int group_size = 1;
    262 	int grid_size = 1;
    263 	/* XXX: Enable lds and get size from cs_shader_state */
    264 	unsigned lds_size = 0;
    265 
    266 	/* Calculate group_size/grid_size */
    267 	for (i = 0; i < 3; i++) {
    268 		group_size *= block_layout[i];
    269 	}
    270 
    271 	for (i = 0; i < 3; i++)	{
    272 		grid_size *= grid_layout[i];
    273 	}
    274 
     275 	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
    276 	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
    277 			wave_divisor - 1) / wave_divisor;
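	/* Worked example (illustrative): a 16x16x1 thread block on a part with
	 * 4 pipes gives num_waves = ceil(256 / 64) = 4 wavefronts per group. */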
    278 
    279 	COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
    280 							num_pipes, num_waves);
    281 
    282 	/* XXX: Partition the LDS between PS/CS.  By default half (4096 dwords
     283 	 * on Evergreen) goes to Pixel Shaders and half goes to Compute Shaders.
     284 	 * We may need to allocate the entire LDS space for Compute Shaders.
    285 	 *
    286 	 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords)
    287 	 * CM: CM_R_0286FC_SPI_LDS_MGMT :=  S_0286FC_NUM_LS_LDS(lds_dwords)
    288 	 */
    289 
    290 	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
    291 
    292 	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
    293 	r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
    294 	r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
    295 	r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
    296 
    297 	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
    298 								group_size);
    299 
    300 	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
    301 	r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
    302 	r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
    303 	r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
    304 
    305 	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
    306 					lds_size | (num_waves << 14));
    307 
    308 	/* Dispatch packet */
    309 	r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
    310 	r600_write_value(cs, grid_layout[0]);
    311 	r600_write_value(cs, grid_layout[1]);
    312 	r600_write_value(cs, grid_layout[2]);
    313 	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
    314 	r600_write_value(cs, 1);
    315 }
    316 
    317 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
    318 		const uint *grid_layout)
    319 {
    320 	struct radeon_winsys_cs *cs = ctx->cs;
    321 	int i;
    322 
    323 	struct r600_resource *onebo = NULL;
    324 	struct r600_pipe_state *cb_state;
    325 	struct evergreen_compute_resource *resources =
    326 					ctx->cs_shader_state.shader->resources;
    327 
    328 	/* Initialize all the compute-related registers.
    329 	 *
    330 	 * See evergreen_init_atom_start_compute_cs() in this file for the list
    331 	 * of registers initialized by the start_compute_cs_cmd atom.
    332 	 */
    333 	r600_emit_atom(ctx, &ctx->start_compute_cs_cmd.atom);
    334 
    335 	/* Emit cb_state */
     336 	cb_state = ctx->states[R600_PIPE_STATE_FRAMEBUFFER];
    337 	r600_context_pipe_state_emit(ctx, cb_state, RADEON_CP_PACKET3_COMPUTE_MODE);
    338 
    339 	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
    340 	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
    341 					ctx->compute_cb_target_mask);
    342 
    343 
    344 	/* Emit vertex buffer state */
    345 	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
    346 	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
    347 
    348 	/* Emit compute shader state */
    349 	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
    350 
    351 	for (i = 0; i < get_compute_resource_num(); i++) {
    352 		if (resources[i].enabled) {
    353 			int j;
    354 			COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);
    355 
    356 			for (j = 0; j < resources[i].cs_end; j++) {
    357 				if (resources[i].do_reloc[j]) {
    358 					assert(resources[i].bo);
    359 					evergreen_emit_ctx_reloc(ctx,
    360 						resources[i].bo,
    361 						resources[i].usage);
    362 				}
    363 
    364 				cs->buf[cs->cdw++] = resources[i].cs[j];
    365 			}
    366 
    367 			if (resources[i].bo) {
    368 				onebo = resources[i].bo;
    369 				evergreen_emit_ctx_reloc(ctx,
    370 					resources[i].bo,
    371 					resources[i].usage);
    372 
    373 				///special case for textures
    374 				if (resources[i].do_reloc
    375 					[resources[i].cs_end] == 2) {
    376 					evergreen_emit_ctx_reloc(ctx,
    377 						resources[i].bo,
    378 						resources[i].usage);
    379 				}
    380 			}
    381 		}
    382 	}
    383 
    384 	/* Emit dispatch state and dispatch packet */
    385 	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
    386 
    387 	/* r600_flush_framebuffer() updates the cb_flush_flags and then
    388 	 * calls r600_emit_atom() on the ctx->surface_sync_cmd.atom, which emits
    389 	 * a SURFACE_SYNC packet via r600_emit_surface_sync().
    390 	 *
    391 	 * XXX r600_emit_surface_sync() hardcodes the CP_COHER_SIZE to
    392 	 * 0xffffffff, so we will need to add a field to struct
    393 	 * r600_surface_sync_cmd if we want to manually set this value.
    394 	 */
    395 	r600_flush_framebuffer(ctx, true /* Flush now */);
    396 
    397 #if 0
    398 	COMPUTE_DBG("cdw: %i\n", cs->cdw);
    399 	for (i = 0; i < cs->cdw; i++) {
    400 		COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
    401 	}
    402 #endif
    403 
    404 	ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE);
    405 
    406 	ctx->pm4_dirty_cdwords = 0;
    407 	ctx->flags = 0;
    408 
    409 	COMPUTE_DBG("shader started\n");
    410 
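	/* Wait for the GPU to be done with one of the buffers bound to this
	 * dispatch; this effectively blocks until the compute shader finishes. */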
    411 	ctx->ws->buffer_wait(onebo->buf, 0);
    412 
    413 	COMPUTE_DBG("...\n");
    414 
    415 	ctx->streamout_start = TRUE;
    416 	ctx->streamout_append_bitmask = ~0;
    417 
    418 }
    419 
    420 
    421 /**
    422  * Emit function for r600_cs_shader_state atom
    423  */
    424 void evergreen_emit_cs_shader(
    425 		struct r600_context *rctx,
    426 		struct r600_atom *atom)
    427 {
    428 	struct r600_cs_shader_state *state =
    429 					(struct r600_cs_shader_state*)atom;
    430 	struct r600_pipe_compute *shader = state->shader;
    431 	struct radeon_winsys_cs *cs = rctx->cs;
    432 	uint64_t va;
    433 
    434 	va = r600_resource_va(&rctx->screen->screen, &shader->shader_code_bo->b.b);
    435 
    436 	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
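	/* The SQ_PGM_START_* registers hold the shader address in 256-byte
	 * units, hence the shift; the code BO is assumed to be suitably aligned. */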
    437 	r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
    438 	r600_write_value(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
    439 			S_0288D4_NUM_GPRS(shader->bc.ngpr)
    440 			| S_0288D4_STACK_SIZE(shader->bc.nstack));
    441 	r600_write_value(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
    442 
    443 	r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0));
    444 	r600_write_value(cs, r600_context_bo_reloc(rctx, shader->shader_code_bo,
    445 							RADEON_USAGE_READ));
    446 
    447 	r600_inval_shader_cache(rctx);
    448 }
    449 
    450 static void evergreen_launch_grid(
    451 		struct pipe_context *ctx_,
    452 		const uint *block_layout, const uint *grid_layout,
    453 		uint32_t pc, const void *input)
    454 {
    455 	struct r600_context *ctx = (struct r600_context *)ctx_;
    456 
    457 	COMPUTE_DBG("PC: %i\n", pc);
    458 
    459 	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
    460 	compute_emit_cs(ctx, block_layout, grid_layout);
    461 }
    462 
    463 static void evergreen_set_compute_resources(struct pipe_context * ctx_,
    464 		unsigned start, unsigned count,
    465 		struct pipe_surface ** surfaces)
    466 {
    467 	struct r600_context *ctx = (struct r600_context *)ctx_;
    468 	struct r600_surface **resources = (struct r600_surface **)surfaces;
    469 
    470 	COMPUTE_DBG("*** evergreen_set_compute_resources: start = %u count = %u\n",
    471 			start, count);
    472 
    473 	for (int i = 0; i < count; i++)	{
     474 		/* The first two vertex buffers are reserved for parameters and
    475 		 * global buffers. */
    476 		unsigned vtx_id = 2 + i;
    477 		if (resources[i]) {
    478 			struct r600_resource_global *buffer =
    479 				(struct r600_resource_global*)
    480 				resources[i]->base.texture;
    481 			if (resources[i]->base.writable) {
    482 				assert(i+1 < 12);
    483 
    484 				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
    485 				(struct r600_resource *)resources[i]->base.texture,
    486 				buffer->chunk->start_in_dw*4,
    487 				resources[i]->base.texture->width0);
    488 			}
    489 
    490 			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
    491 					buffer->chunk->start_in_dw * 4,
    492 					resources[i]->base.texture);
    493 		}
    494 	}
    495 }
    496 
    497 static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
    498 		unsigned start_slot, unsigned count,
    499 		struct pipe_sampler_view **views)
    500 {
    501 	struct r600_context *ctx = (struct r600_context *)ctx_;
    502 	struct r600_pipe_sampler_view **resource =
    503 		(struct r600_pipe_sampler_view **)views;
    504 
    505 	for (int i = 0; i < count; i++)	{
    506 		if (resource[i]) {
    507 			assert(i+1 < 12);
     508 			/* FETCH0 = VTX0 (param buffer),
     509 			 * FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX */
    510 			evergreen_set_tex_resource(ctx->cs_shader_state.shader, resource[i], i+2);
    511 		}
    512 	}
    513 }
    514 
    515 static void evergreen_bind_compute_sampler_states(
    516 	struct pipe_context *ctx_,
    517 	unsigned start_slot,
    518 	unsigned num_samplers,
    519 	void **samplers_)
    520 {
    521 	struct r600_context *ctx = (struct r600_context *)ctx_;
    522 	struct compute_sampler_state ** samplers =
    523 		(struct compute_sampler_state **)samplers_;
    524 
    525 	for (int i = 0; i < num_samplers; i++) {
    526 		if (samplers[i]) {
    527 			evergreen_set_sampler_resource(
    528 				ctx->cs_shader_state.shader, samplers[i], i);
    529 		}
    530 	}
    531 }
    532 
    533 static void evergreen_set_global_binding(
    534 	struct pipe_context *ctx_, unsigned first, unsigned n,
    535 	struct pipe_resource **resources,
    536 	uint32_t **handles)
    537 {
    538 	struct r600_context *ctx = (struct r600_context *)ctx_;
    539 	struct compute_memory_pool *pool = ctx->screen->global_pool;
    540 	struct r600_resource_global **buffers =
    541 		(struct r600_resource_global **)resources;
    542 
    543 	COMPUTE_DBG("*** evergreen_set_global_binding first = %u n = %u\n",
    544 			first, n);
    545 
    546 	if (!resources) {
    547 		/* XXX: Unset */
    548 		return;
    549 	}
    550 
    551 	compute_memory_finalize_pending(pool, ctx_);
    552 
    553 	for (int i = 0; i < n; i++)
    554 	{
    555 		assert(resources[i]->target == PIPE_BUFFER);
    556 		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
    557 
    558 		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
    559 	}
    560 
    561 	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
    562 	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
    563 				(struct pipe_resource*)pool->bo);
    564 }
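
/* Usage note (illustrative): for each bound global buffer, the state tracker
 * receives back in handles[i] the byte offset of that buffer within the
 * shared memory pool (chunk->start_in_dw * 4).  Since the whole pool is bound
 * as RAT0 (writes) and VTX1 (reads), that offset is how the kernel addresses
 * the individual buffer. */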
    565 
    566 /**
    567  * This function initializes all the compute specific registers that need to
    568  * be initialized for each compute command stream.  Registers that are common
    569  * to both compute and 3D will be initialized at the beginning of each compute
    570  * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
    571  * packet requires that the shader type bit be set, we must initialize all
    572  * context registers needed for compute in this function.  The registers
     573  * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
    574  * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
    575  * on the GPU family.
    576  */
    577 void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
    578 {
    579 	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
    580 	int num_threads;
    581 	int num_stack_entries;
    582 
     583 	/* since all required registers are initialized in the
    584 	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
    585 	 */
    586 	r600_init_command_buffer(cb, 256, EMIT_EARLY);
    587 	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
    588 
    589 	switch (ctx->family) {
    590 	case CHIP_CEDAR:
    591 	default:
    592 		num_threads = 128;
    593 		num_stack_entries = 256;
    594 		break;
    595 	case CHIP_REDWOOD:
    596 		num_threads = 128;
    597 		num_stack_entries = 256;
    598 		break;
    599 	case CHIP_JUNIPER:
    600 		num_threads = 128;
    601 		num_stack_entries = 512;
    602 		break;
    603 	case CHIP_CYPRESS:
    604 	case CHIP_HEMLOCK:
    605 		num_threads = 128;
    606 		num_stack_entries = 512;
    607 		break;
    608 	case CHIP_PALM:
    609 		num_threads = 128;
    610 		num_stack_entries = 256;
    611 		break;
    612 	case CHIP_SUMO:
    613 		num_threads = 128;
    614 		num_stack_entries = 256;
    615 		break;
    616 	case CHIP_SUMO2:
    617 		num_threads = 128;
    618 		num_stack_entries = 512;
    619 		break;
    620 	case CHIP_BARTS:
    621 		num_threads = 128;
    622 		num_stack_entries = 512;
    623 		break;
    624 	case CHIP_TURKS:
    625 		num_threads = 128;
    626 		num_stack_entries = 256;
    627 		break;
    628 	case CHIP_CAICOS:
    629 		num_threads = 128;
    630 		num_stack_entries = 256;
    631 		break;
    632 	}
    633 
    634 	/* Config Registers */
     635 	evergreen_init_common_regs(cb, ctx->chip_class,
     636 			ctx->family, ctx->screen->info.drm_minor);
    637 
    638 	/* The primitive type always needs to be POINTLIST for compute. */
    639 	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
    640 						V_008958_DI_PT_POINTLIST);
    641 
    642 	if (ctx->chip_class < CAYMAN) {
    643 
    644 		/* These registers control which simds can be used by each stage.
    645 		 * The default for these registers is 0xffffffff, which means
    646 		 * all simds are available for each stage.  It's possible we may
    647 		 * want to play around with these in the future, but for now
    648 		 * the default value is fine.
    649 		 *
    650 		 * R_008E20_SQ_STATIC_THREAD_MGMT1
    651 		 * R_008E24_SQ_STATIC_THREAD_MGMT2
    652 		 * R_008E28_SQ_STATIC_THREAD_MGMT3
    653 		 */
    654 
     655 		/* XXX: We may need to adjust the thread and stack resource
    656 		 * values for 3D/compute interop */
    657 
    658 		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
    659 
    660 		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
    661 		 * Set the number of threads used by the PS/VS/GS/ES stage to
    662 		 * 0.
    663 		 */
    664 		r600_store_value(cb, 0);
    665 
    666 		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
    667 		 * Set the number of threads used by the CS (aka LS) stage to
    668 		 * the maximum number of threads and set the number of threads
    669 		 * for the HS stage to 0. */
    670 		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
    671 
    672 		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
    673 		 * Set the Control Flow stack entries to 0 for PS/VS stages */
    674 		r600_store_value(cb, 0);
    675 
    676 		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
    677 		 * Set the Control Flow stack entries to 0 for GS/ES stages */
    678 		r600_store_value(cb, 0);
    679 
    680 		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
     681 		 * Set the Control Flow stack entries to 0 for the HS stage, and
    682 		 * set it to the maximum value for the CS (aka LS) stage. */
    683 		r600_store_value(cb,
    684 			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
    685 	}
    686 
    687 	/* Context Registers */
    688 
    689 	if (ctx->chip_class < CAYMAN) {
    690 		/* workaround for hw issues with dyn gpr - must set all limits
    691 		 * to 240 instead of 0, 0x1e == 240 / 8
    692 		 */
    693 		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
    694 				S_028838_PS_GPRS(0x1e) |
    695 				S_028838_VS_GPRS(0x1e) |
    696 				S_028838_GS_GPRS(0x1e) |
    697 				S_028838_ES_GPRS(0x1e) |
    698 				S_028838_HS_GPRS(0x1e) |
    699 				S_028838_LS_GPRS(0x1e));
    700 	}
    701 
    702 	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
    703 	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
    704 		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
    705 
    706 	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
    707 
    708 	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
    709 						S_0286E8_TID_IN_GROUP_ENA
    710 						| S_0286E8_TGID_ENA
     711 						| S_0286E8_DISABLE_INDEX_PACK);
    713 
     714 	/* The LOOP_CONST registers are an optimization for loops that allows
    715 	 * you to store the initial counter, increment value, and maximum
    716 	 * counter value in a register so that hardware can calculate the
    717 	 * correct number of iterations for the loop, so that you don't need
    718 	 * to have the loop counter in your shader code.  We don't currently use
    719 	 * this optimization, so we must keep track of the counter in the
    720 	 * shader and use a break instruction to exit loops.  However, the
     721 	 * hardware will still use this register to determine when to exit a
     722 	 * loop, so we need to initialize the counter to 0, set the increment
     723 	 * value to 1 and the maximum counter value to 4095 (0xfff), which
    724 	 * is the maximum value allowed.  This gives us a maximum of 4096
    725 	 * iterations for our loops, but hopefully our break instruction will
     726 	 * execute some time before the 4096th iteration.
    727 	 */
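	/* Reading the value below against the comment above (illustrative
	 * decode, not taken from the register spec): 0x01000FFF is consistent
	 * with a max counter of 0xFFF in the low bits, an initial counter of 0,
	 * and an increment of 1 in the high bits. */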
    728 	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
    729 }
    730 
    731 void evergreen_init_compute_state_functions(struct r600_context *ctx)
    732 {
    733 	ctx->context.create_compute_state = evergreen_create_compute_state;
    734 	ctx->context.delete_compute_state = evergreen_delete_compute_state;
    735 	ctx->context.bind_compute_state = evergreen_bind_compute_state;
    736 //	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
    737 	ctx->context.set_compute_resources = evergreen_set_compute_resources;
    738 	ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
    739 	ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
    740 	ctx->context.set_global_binding = evergreen_set_global_binding;
    741 	ctx->context.launch_grid = evergreen_launch_grid;
    742 
    743 	/* We always use at least two vertex buffers for compute, one for
     744 	 * parameters and one for global memory */
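	/* Bit 0 = VB0 (kernel parameters), bit 1 = VB1 (the global memory pool),
	 * matching the ids used with evergreen_cs_set_vertex_buffer() above. */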
    745 	ctx->cs_vertex_buffer_state.enabled_mask =
    746 	ctx->cs_vertex_buffer_state.dirty_mask = 1 | 2;
    747 }
    748 
    749 
    750 struct pipe_resource *r600_compute_global_buffer_create(
    751 	struct pipe_screen *screen,
    752 	const struct pipe_resource *templ)
    753 {
    754 	assert(templ->target == PIPE_BUFFER);
    755 	assert(templ->bind & PIPE_BIND_GLOBAL);
    756 	assert(templ->array_size == 1 || templ->array_size == 0);
    757 	assert(templ->depth0 == 1 || templ->depth0 == 0);
    758 	assert(templ->height0 == 1 || templ->height0 == 0);
    759 
    760 	struct r600_resource_global* result = (struct r600_resource_global*)
    761 		CALLOC(sizeof(struct r600_resource_global), 1);
    762 	struct r600_screen* rscreen = (struct r600_screen*)screen;
    763 
    764 	COMPUTE_DBG("*** r600_compute_global_buffer_create\n");
    765 	COMPUTE_DBG("width = %u array_size = %u\n", templ->width0,
    766 			templ->array_size);
    767 
    768 	result->base.b.vtbl = &r600_global_buffer_vtbl;
    769 	result->base.b.b.screen = screen;
    770 	result->base.b.b = *templ;
    771 	pipe_reference_init(&result->base.b.b.reference, 1);
    772 
    773 	int size_in_dw = (templ->width0+3) / 4;
    774 
    775 	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
    776 
    777 	if (result->chunk == NULL)
    778 	{
    779 		free(result);
    780 		return NULL;
    781 	}
    782 
    783 	return &result->base.b.b;
    784 }
    785 
    786 void r600_compute_global_buffer_destroy(
    787 	struct pipe_screen *screen,
    788 	struct pipe_resource *res)
    789 {
    790 	assert(res->target == PIPE_BUFFER);
    791 	assert(res->bind & PIPE_BIND_GLOBAL);
    792 
    793 	struct r600_resource_global* buffer = (struct r600_resource_global*)res;
    794 	struct r600_screen* rscreen = (struct r600_screen*)screen;
    795 
    796 	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
    797 
    798 	buffer->chunk = NULL;
    799 	free(res);
    800 }
    801 
    802 void* r600_compute_global_transfer_map(
    803 	struct pipe_context *ctx_,
    804 	struct pipe_transfer* transfer)
    805 {
    806 	assert(transfer->resource->target == PIPE_BUFFER);
    807 	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
    808 	assert(transfer->box.x >= 0);
    809 	assert(transfer->box.y == 0);
    810 	assert(transfer->box.z == 0);
    811 
    812 	struct r600_context *ctx = (struct r600_context *)ctx_;
    813 	struct r600_resource_global* buffer =
    814 		(struct r600_resource_global*)transfer->resource;
    815 
    816 	uint32_t* map;
    817 	///TODO: do it better, mapping is not possible if the pool is too big
    818 
    819 	if (!(map = ctx->ws->buffer_map(buffer->chunk->pool->bo->cs_buf,
    820 						ctx->cs, transfer->usage))) {
    821 		return NULL;
    822 	}
    823 
    824 	COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw);
    825 	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
    826 }
    827 
    828 void r600_compute_global_transfer_unmap(
    829 	struct pipe_context *ctx_,
    830 	struct pipe_transfer* transfer)
    831 {
    832 	assert(transfer->resource->target == PIPE_BUFFER);
    833 	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
    834 
    835 	struct r600_context *ctx = (struct r600_context *)ctx_;
    836 	struct r600_resource_global* buffer =
    837 		(struct r600_resource_global*)transfer->resource;
    838 
    839 	ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
    840 }
    841 
    842 struct pipe_transfer * r600_compute_global_get_transfer(
    843 	struct pipe_context *ctx_,
    844 	struct pipe_resource *resource,
    845 	unsigned level,
    846 	unsigned usage,
    847 	const struct pipe_box *box)
    848 {
    849 	struct r600_context *ctx = (struct r600_context *)ctx_;
    850 	struct compute_memory_pool *pool = ctx->screen->global_pool;
    851 
    852 	compute_memory_finalize_pending(pool, ctx_);
    853 
    854 	assert(resource->target == PIPE_BUFFER);
     855 	struct pipe_transfer *transfer = util_slab_alloc(&ctx->pool_transfers);
    857 
    858 	transfer->resource = resource;
    859 	transfer->level = level;
    860 	transfer->usage = usage;
    861 	transfer->box = *box;
    862 	transfer->stride = 0;
    863 	transfer->layer_stride = 0;
    864 	transfer->data = NULL;
    865 
     866 	/* Note strides are zero; this is ok for buffers, but not for
     867 	 * textures 2d & higher, at least.
     868 	 */
    869 	return transfer;
    870 }
    871 
    872 void r600_compute_global_transfer_destroy(
    873 	struct pipe_context *ctx_,
    874 	struct pipe_transfer *transfer)
    875 {
    876 	struct r600_context *rctx = (struct r600_context*)ctx_;
    877 	util_slab_free(&rctx->pool_transfers, transfer);
    878 }
    879 
    880 void r600_compute_global_transfer_flush_region(
    881 	struct pipe_context *ctx_,
    882 	struct pipe_transfer *transfer,
    883 	const struct pipe_box *box)
    884 {
    885 	assert(0 && "TODO");
    886 }
    887 
    888 void r600_compute_global_transfer_inline_write(
    889 	struct pipe_context *pipe,
    890 	struct pipe_resource *resource,
    891 	unsigned level,
    892 	unsigned usage,
    893 	const struct pipe_box *box,
    894 	const void *data,
    895 	unsigned stride,
    896 	unsigned layer_stride)
    897 {
    898 	assert(0 && "TODO");
    899 }
    900