      1 /*
      2  * Copyright 2011 Adam Rak <adam.rak (at) streamnovation.com>
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * on the rights to use, copy, modify, merge, publish, distribute, sub
      8  * license, and/or sell copies of the Software, and to permit persons to whom
      9  * the Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     22  *
     23  * Authors:
     24  *      Adam Rak <adam.rak (at) streamnovation.com>
     25  */
     26 
     27 #include <stdio.h>
     28 #include <errno.h>
     29 #include "pipe/p_defines.h"
     30 #include "pipe/p_state.h"
     31 #include "pipe/p_context.h"
     32 #include "util/u_blitter.h"
     33 #include "util/list.h"
     34 #include "util/u_transfer.h"
     35 #include "util/u_surface.h"
     36 #include "util/u_pack_color.h"
     37 #include "util/u_memory.h"
     38 #include "util/u_inlines.h"
     39 #include "util/u_framebuffer.h"
     40 #include "pipebuffer/pb_buffer.h"
     41 #include "evergreend.h"
     42 #include "r600_shader.h"
     43 #include "r600_pipe.h"
     44 #include "r600_formats.h"
     45 #include "evergreen_compute.h"
     46 #include "evergreen_compute_internal.h"
     47 #include "compute_memory_pool.h"
     48 #include "sb/sb_public.h"
     49 #include "radeon/radeon_elf_util.h"
     50 #include <inttypes.h>
     51 
     52 /**
     53 RAT0 is for global binding write
     54 VTX1 is for global binding read
     55 
      56 for writing images RAT1...
     57 for reading images TEX2...
     58   TEX2-RAT1 is paired
     59 
      60 TEX2... consumes the same fetch resources that VTX2... would consume
     61 
      62 CONST0 and VTX0 are for parameters
      63   CONST0 binds the smaller input parameter buffer; it is used for constant
      64   indexing and is also constant cached
      65   VTX0 is for indirect/non-constant indexing, or if the input is bigger than
      66   the constant cache can handle
     67 
      68 RATs are limited to 12, so we can bind at most 11 textures for writing
      69 because we reserve RAT0 for global bindings. With byte addressing enabled,
      70 we should reserve another one too => 10 image bindings for writing max.
     71 
     72 from Nvidia OpenCL:
     73   CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
     74   CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
     75 
     76 so 10 for writing is enough. 176 is the max for reading according to the docs
     77 
     78 writable images should be listed first < 10, so their id corresponds to RAT(id+1)
     79 writable images will consume TEX slots, VTX slots too because of linear indexing
     80 
     81 */
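
         /* Quick reference for the fixed binding slots that the code below sets up
          * (see evergreen_compute_upload_input(), evergreen_set_compute_resources()
          * and evergreen_set_global_binding()):
          *
          *   RAT0            global memory pool, for writes
          *   VTX1            global memory pool, for reads
          *   VTX2            constants that LLVM places in the shader code BO
          *   CONST0 / VTX3   kernel parameter buffer (VTX0 is reserved for it as well)
          *   VTX4 + i        compute resource i (global buffers)
          *   RAT(i + 1)      compute resource i when it is writable
          */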
     82 
     83 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
     84 						     unsigned size)
     85 {
     86 	struct pipe_resource *buffer = NULL;
     87 	assert(size);
     88 
     89 	buffer = pipe_buffer_create((struct pipe_screen*) screen,
     90 				    0, PIPE_USAGE_IMMUTABLE, size);
     91 
     92 	return (struct r600_resource *)buffer;
     93 }
     94 
     95 
     96 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
     97 			      unsigned id,
     98 			      struct r600_resource *bo,
     99 			      int start,
    100 			      int size)
    101 {
    102 	struct pipe_surface rat_templ;
    103 	struct r600_surface *surf = NULL;
    104 	struct r600_context *rctx = NULL;
    105 
    106 	assert(id < 12);
    107 	assert((size & 3) == 0);
    108 	assert((start & 0xFF) == 0);
    109 
    110 	rctx = pipe->ctx;
    111 
    112 	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
    113 
    114 	/* Create the RAT surface */
    115 	memset(&rat_templ, 0, sizeof(rat_templ));
    116 	rat_templ.format = PIPE_FORMAT_R32_UINT;
    117 	rat_templ.u.tex.level = 0;
    118 	rat_templ.u.tex.first_layer = 0;
    119 	rat_templ.u.tex.last_layer = 0;
    120 
     121 	/* Add the RAT to the list of color buffers */
    122 	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
    123 		(struct pipe_context *)pipe->ctx,
    124 		(struct pipe_resource *)bo, &rat_templ);
    125 
    126 	/* Update the number of color buffers */
    127 	pipe->ctx->framebuffer.state.nr_cbufs =
    128 		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
    129 
    130 	/* Update the cb_target_mask
    131 	 * XXX: I think this is a potential spot for bugs once we start doing
    132 	 * GL interop.  cb_target_mask may be modified in the 3D sections
    133 	 * of this driver. */
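         	/* CB_TARGET_MASK holds a 4-bit channel write mask per color buffer;
         	 * enable all four channels for this RAT. */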
    134 	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
    135 
    136 	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
    137 	evergreen_init_color_surface_rat(rctx, surf);
    138 }
    139 
    140 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
    141 					   unsigned vb_index,
    142 					   unsigned offset,
    143 					   struct pipe_resource *buffer)
    144 {
    145 	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
    146 	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
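         	/* A stride of 1 makes the fetch index a plain byte offset into the
         	 * buffer. */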
    147 	vb->stride = 1;
    148 	vb->buffer_offset = offset;
    149 	vb->buffer = buffer;
    150 	vb->user_buffer = NULL;
    151 
    152 	/* The vertex instructions in the compute shaders use the texture cache,
    153 	 * so we need to invalidate it. */
    154 	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
    155 	state->enabled_mask |= 1 << vb_index;
    156 	state->dirty_mask |= 1 << vb_index;
    157 	r600_mark_atom_dirty(rctx, &state->atom);
    158 }
    159 
    160 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
    161 					     unsigned cb_index,
    162 					     unsigned offset,
    163 					     unsigned size,
    164 					     struct pipe_resource *buffer)
    165 {
    166 	struct pipe_constant_buffer cb;
    167 	cb.buffer_size = size;
    168 	cb.buffer_offset = offset;
    169 	cb.buffer = buffer;
    170 	cb.user_buffer = NULL;
    171 
    172 	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
    173 }
    174 
    175 /* We need to define these R600 registers here, because we can't include
     176  * both evergreend.h and r600d.h at the same time.
    177  */
    178 #define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
    179 #define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
    180 
    181 #ifdef HAVE_OPENCL
    182 
    183 static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
    184 					   struct r600_bytecode *bc,
    185 					   uint64_t symbol_offset,
    186 					   boolean *use_kill)
    187 {
    188        unsigned i;
    189        const unsigned char *config =
    190                radeon_shader_binary_config_start(binary, symbol_offset);
    191 
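                /* The config section of the binary is a list of (register, value)
                 * dword pairs emitted by the compiler; scan it for the resource
                 * requirements we care about. */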
    192        for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
    193                unsigned reg =
    194                        util_le32_to_cpu(*(uint32_t*)(config + i));
    195                unsigned value =
    196                        util_le32_to_cpu(*(uint32_t*)(config + i + 4));
    197                switch (reg) {
    198                /* R600 / R700 */
    199                case R_028850_SQ_PGM_RESOURCES_PS:
    200                case R_028868_SQ_PGM_RESOURCES_VS:
    201                /* Evergreen / Northern Islands */
    202                case R_028844_SQ_PGM_RESOURCES_PS:
    203                case R_028860_SQ_PGM_RESOURCES_VS:
    204                case R_0288D4_SQ_PGM_RESOURCES_LS:
    205                        bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
    206                        bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
    207                        break;
    208                case R_02880C_DB_SHADER_CONTROL:
    209                        *use_kill = G_02880C_KILL_ENABLE(value);
    210                        break;
    211                case R_0288E8_SQ_LDS_ALLOC:
    212                        bc->nlds_dw = value;
    213                        break;
    214                }
    215        }
    216 }
    217 
    218 static unsigned r600_create_shader(struct r600_bytecode *bc,
    219 				   const struct radeon_shader_binary *binary,
    220 				   boolean *use_kill)
    221 
    222 {
    223 	assert(binary->code_size % 4 == 0);
    224 	bc->bytecode = CALLOC(1, binary->code_size);
    225 	memcpy(bc->bytecode, binary->code, binary->code_size);
    226 	bc->ndw = binary->code_size / 4;
    227 
    228 	r600_shader_binary_read_config(binary, bc, 0, use_kill);
    229 	return 0;
    230 }
    231 
    232 #endif
    233 
    234 static void r600_destroy_shader(struct r600_bytecode *bc)
    235 {
    236 	FREE(bc->bytecode);
    237 }
    238 
    239 static void *evergreen_create_compute_state(struct pipe_context *ctx,
    240 					    const struct pipe_compute_state *cso)
    241 {
    242 	struct r600_context *rctx = (struct r600_context *)ctx;
    243 	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
    244 #ifdef HAVE_OPENCL
    245 	const struct pipe_llvm_program_header *header;
    246 	const char *code;
    247 	void *p;
    248 	boolean use_kill;
    249 
    250 	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
    251 	header = cso->prog;
    252 	code = cso->prog + sizeof(struct pipe_llvm_program_header);
    253 	radeon_shader_binary_init(&shader->binary);
    254 	radeon_elf_read(code, header->num_bytes, &shader->binary);
    255 	r600_create_shader(&shader->bc, &shader->binary, &use_kill);
    256 
    257 	/* Upload code + ROdata */
    258 	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
    259 							shader->bc.ndw * 4);
    260 	p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
    261 	//TODO: use util_memcpy_cpu_to_le32 ?
    262 	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
    263 	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
    264 #endif
    265 
    266 	shader->ctx = rctx;
    267 	shader->local_size = cso->req_local_mem;
    268 	shader->private_size = cso->req_private_mem;
    269 	shader->input_size = cso->req_input_mem;
    270 
    271 	return shader;
    272 }
    273 
    274 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
    275 {
    276 	struct r600_context *rctx = (struct r600_context *)ctx;
    277 	struct r600_pipe_compute *shader = state;
    278 
    279 	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
    280 
    281 	if (!shader)
    282 		return;
    283 
    284 	radeon_shader_binary_clean(&shader->binary);
    285 	r600_destroy_shader(&shader->bc);
    286 
    287 	/* TODO destroy shader->code_bo, shader->const_bo
    288 	 * we'll need something like r600_buffer_free */
    289 	FREE(shader);
    290 }
    291 
    292 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
    293 {
    294 	struct r600_context *rctx = (struct r600_context *)ctx;
    295 
    296 	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
    297 
    298 	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
    299 }
    300 
     301 /* The kernel parameters are stored in a vtx buffer (ID=0); besides the explicit
    302  * kernel parameters there are implicit parameters that need to be stored
    303  * in the vertex buffer as well.  Here is how these parameters are organized in
    304  * the buffer:
    305  *
    306  * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
    307  * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
    308  * DWORDS 6-8: Number of work items within each work group in each dimension
    309  *             (x,y,z)
    310  * DWORDS 9+ : Kernel parameters
    311  */
    312 static void evergreen_compute_upload_input(struct pipe_context *ctx,
    313 					   const struct pipe_grid_info *info)
    314 {
    315 	struct r600_context *rctx = (struct r600_context *)ctx;
    316 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
    317 	unsigned i;
    318 	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
    319 	 * parameters.
    320 	 */
    321 	unsigned input_size = shader->input_size + 36;
    322 	uint32_t *num_work_groups_start;
    323 	uint32_t *global_size_start;
    324 	uint32_t *local_size_start;
    325 	uint32_t *kernel_parameters_start;
    326 	struct pipe_box box;
    327 	struct pipe_transfer *transfer = NULL;
    328 
    329 	if (shader->input_size == 0) {
    330 		return;
    331 	}
    332 
    333 	if (!shader->kernel_param) {
    334 		/* Add space for the grid dimensions */
    335 		shader->kernel_param = (struct r600_resource *)
    336 			pipe_buffer_create(ctx->screen, 0,
    337 					PIPE_USAGE_IMMUTABLE, input_size);
    338 	}
    339 
    340 	u_box_1d(0, input_size, &box);
    341 	num_work_groups_start = ctx->transfer_map(ctx,
    342 			(struct pipe_resource*)shader->kernel_param,
    343 			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
    344 			&box, &transfer);
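         	/* Lay out the implicit parameters as described in the comment above
         	 * this function: 3 dwords of grid size, 3 of global size and 3 of
         	 * block size, followed by the kernel parameters at dword 9. */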
    345 	global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
    346 	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
    347 	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
    348 
    349 	/* Copy the work group size */
     350 	/* Copy the grid size (the number of work groups in each dimension) */
    351 
    352 	/* Copy the global size */
    353 	for (i = 0; i < 3; i++) {
    354 		global_size_start[i] = info->grid[i] * info->block[i];
    355 	}
    356 
    357 	/* Copy the local dimensions */
    358 	memcpy(local_size_start, info->block, 3 * sizeof(uint));
    359 
    360 	/* Copy the kernel inputs */
    361 	memcpy(kernel_parameters_start, info->input, shader->input_size);
    362 
    363 	for (i = 0; i < (input_size / 4); i++) {
    364 		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
    365 			((unsigned*)num_work_groups_start)[i]);
    366 	}
    367 
    368 	ctx->transfer_unmap(ctx, transfer);
    369 
    370 	/* ID=0 and ID=3 are reserved for the parameters.
     371 	 * LLVM prefers to use ID=0, but that does not work for dynamic
    372 	 * indices. */
    373 	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
    374 			(struct pipe_resource*)shader->kernel_param);
    375 	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
    376 			(struct pipe_resource*)shader->kernel_param);
    377 }
    378 
    379 static void evergreen_emit_dispatch(struct r600_context *rctx,
    380 				    const struct pipe_grid_info *info)
    381 {
    382 	int i;
    383 	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
    384 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
    385 	unsigned num_waves;
    386 	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
    387 	unsigned wave_divisor = (16 * num_pipes);
    388 	int group_size = 1;
    389 	int grid_size = 1;
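         	/* LDS is allocated in dwords: local_size is in bytes, and nlds_dw is
         	 * what the shader's config section already requested. */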
    390 	unsigned lds_size = shader->local_size / 4 +
    391 		shader->bc.nlds_dw;
    392 
    393 
    394 	/* Calculate group_size/grid_size */
    395 	for (i = 0; i < 3; i++) {
    396 		group_size *= info->block[i];
    397 	}
    398 
    399 	for (i = 0; i < 3; i++)	{
    400 		grid_size *= info->grid[i];
    401 	}
    402 
     403 	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
    404 	num_waves = (info->block[0] * info->block[1] * info->block[2] +
    405 			wave_divisor - 1) / wave_divisor;
    406 
    407 	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
    408 				"%u wavefronts per thread block, "
    409 				"allocating %u dwords lds.\n",
    410 				num_pipes, num_waves, lds_size);
    411 
    412 	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
    413 
    414 	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
    415 	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
    416 	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
    417 	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
    418 
    419 	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
    420 								group_size);
    421 
    422 	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
    423 	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
    424 	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
    425 	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
    426 
    427 	if (rctx->b.chip_class < CAYMAN) {
    428 		assert(lds_size <= 8192);
    429 	} else {
    430 		/* Cayman appears to have a slightly smaller limit, see the
    431 		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
    432 		assert(lds_size <= 8160);
    433 	}
    434 
    435 	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
    436 					lds_size | (num_waves << 14));
    437 
    438 	/* Dispatch packet */
    439 	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
    440 	radeon_emit(cs, info->grid[0]);
    441 	radeon_emit(cs, info->grid[1]);
    442 	radeon_emit(cs, info->grid[2]);
    443 	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
    444 	radeon_emit(cs, 1);
    445 }
    446 
    447 static void compute_emit_cs(struct r600_context *rctx,
    448 			    const struct pipe_grid_info *info)
    449 {
    450 	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
    451 	unsigned i;
    452 
     453 	/* make sure that the gfx ring is the only one active */
    454 	if (radeon_emitted(rctx->b.dma.cs, 0)) {
    455 		rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
    456 	}
    457 
    458 	/* Initialize all the compute-related registers.
    459 	 *
    460 	 * See evergreen_init_atom_start_compute_cs() in this file for the list
    461 	 * of registers initialized by the start_compute_cs_cmd atom.
    462 	 */
    463 	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
    464 
    465 	/* emit config state */
    466 	if (rctx->b.chip_class == EVERGREEN)
    467 		r600_emit_atom(rctx, &rctx->config_state.atom);
    468 
    469 	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
    470 	r600_flush_emit(rctx);
    471 
    472 	/* Emit colorbuffers. */
    473 	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
    474 	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
    475 		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
    476 		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
    477 						       (struct r600_resource*)cb->base.texture,
    478 						       RADEON_USAGE_READWRITE,
    479 						       RADEON_PRIO_SHADER_RW_BUFFER);
    480 
    481 		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
    482 		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
    483 		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
    484 		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
    485 		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
    486 		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
    487 		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
    488 		radeon_emit(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */
    489 
    490 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
    491 		radeon_emit(cs, reloc);
    492 
    493 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
    494 		radeon_emit(cs, reloc);
    495 	}
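         	/* Mark the remaining color buffers as invalid.  CB8-11 use a different
         	 * register stride (0x1C instead of 0x3C), hence the second loop. */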
    496 	for (; i < 8 ; i++)
    497 		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
    498 					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
    499 	for (; i < 12; i++)
    500 		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
    501 					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
    502 
    503 	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
    504 	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
    505 					rctx->compute_cb_target_mask);
    506 
    507 
    508 	/* Emit vertex buffer state */
    509 	rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
    510 	r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
    511 
    512 	/* Emit constant buffer state */
    513 	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
    514 
    515 	/* Emit sampler state */
    516 	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
    517 
    518 	/* Emit sampler view (texture resource) state */
    519 	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
    520 
    521 	/* Emit compute shader state */
    522 	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
    523 
    524 	/* Emit dispatch state and dispatch packet */
    525 	evergreen_emit_dispatch(rctx, info);
    526 
     527 	/* XXX r600_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
    528 	 */
    529 	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
    530 		      R600_CONTEXT_INV_VERTEX_CACHE |
    531 	              R600_CONTEXT_INV_TEX_CACHE;
    532 	r600_flush_emit(rctx);
    533 	rctx->b.flags = 0;
    534 
    535 	if (rctx->b.chip_class >= CAYMAN) {
    536 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
    537 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
    538 		/* DEALLOC_STATE prevents the GPU from hanging when a
    539 		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
    540 		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
    541 		 */
    542 		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
    543 		radeon_emit(cs, 0);
    544 	}
    545 
    546 #if 0
    547 	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
    548 	for (i = 0; i < cs->cdw; i++) {
    549 		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
    550 	}
    551 #endif
    552 
    553 }
    554 
    555 
    556 /**
    557  * Emit function for r600_cs_shader_state atom
    558  */
    559 void evergreen_emit_cs_shader(struct r600_context *rctx,
    560 			      struct r600_atom *atom)
    561 {
    562 	struct r600_cs_shader_state *state =
    563 					(struct r600_cs_shader_state*)atom;
    564 	struct r600_pipe_compute *shader = state->shader;
    565 	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
    566 	uint64_t va;
    567 	struct r600_resource *code_bo;
    568 	unsigned ngpr, nstack;
    569 
    570 	code_bo = shader->code_bo;
    571 	va = shader->code_bo->gpu_address + state->pc;
    572 	ngpr = shader->bc.ngpr;
    573 	nstack = shader->bc.nstack;
    574 
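         	/* The SQ_PGM_START_* registers hold the shader address in units of
         	 * 256 bytes, hence the shift by 8 below. */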
    575 	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
    576 	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
    577 	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
    578 			S_0288D4_NUM_GPRS(ngpr)
    579 			| S_0288D4_STACK_SIZE(nstack));
    580 	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
    581 
    582 	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
    583 	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
    584 					      code_bo, RADEON_USAGE_READ,
    585 					      RADEON_PRIO_SHADER_BINARY));
    586 }
    587 
    588 static void evergreen_launch_grid(struct pipe_context *ctx,
    589 				  const struct pipe_grid_info *info)
    590 {
    591 	struct r600_context *rctx = (struct r600_context *)ctx;
    592 #ifdef HAVE_OPENCL
    593 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
    594 	boolean use_kill;
    595 
    596 	rctx->cs_shader_state.pc = info->pc;
    597 	/* Get the config information for this kernel. */
    598 	r600_shader_binary_read_config(&shader->binary, &shader->bc,
    599                                   info->pc, &use_kill);
    600 #endif
    601 
    602 	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
    603 
    604 
    605 	evergreen_compute_upload_input(ctx, info);
    606 	compute_emit_cs(rctx, info);
    607 }
    608 
    609 static void evergreen_set_compute_resources(struct pipe_context *ctx,
    610 					    unsigned start, unsigned count,
    611 					    struct pipe_surface **surfaces)
    612 {
    613 	struct r600_context *rctx = (struct r600_context *)ctx;
    614 	struct r600_surface **resources = (struct r600_surface **)surfaces;
    615 
    616 	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
    617 			start, count);
    618 
    619 	for (unsigned i = 0; i < count; i++) {
     620 		/* The first four vertex buffers are reserved for parameters and
    621 		 * global buffers. */
    622 		unsigned vtx_id = 4 + i;
    623 		if (resources[i]) {
    624 			struct r600_resource_global *buffer =
    625 				(struct r600_resource_global*)
    626 				resources[i]->base.texture;
    627 			if (resources[i]->base.writable) {
    628 				assert(i+1 < 12);
    629 
    630 				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
    631 				(struct r600_resource *)resources[i]->base.texture,
    632 				buffer->chunk->start_in_dw*4,
    633 				resources[i]->base.texture->width0);
    634 			}
    635 
    636 			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
    637 					buffer->chunk->start_in_dw * 4,
    638 					resources[i]->base.texture);
    639 		}
    640 	}
    641 }
    642 
    643 static void evergreen_set_global_binding(struct pipe_context *ctx,
    644 					 unsigned first, unsigned n,
    645 					 struct pipe_resource **resources,
    646 					 uint32_t **handles)
    647 {
    648 	struct r600_context *rctx = (struct r600_context *)ctx;
    649 	struct compute_memory_pool *pool = rctx->screen->global_pool;
    650 	struct r600_resource_global **buffers =
    651 		(struct r600_resource_global **)resources;
    652 	unsigned i;
    653 
    654 	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
    655 			first, n);
    656 
    657 	if (!resources) {
    658 		/* XXX: Unset */
    659 		return;
    660 	}
    661 
    662 	/* We mark these items for promotion to the pool if they
    663 	 * aren't already there */
    664 	for (i = first; i < first + n; i++) {
    665 		struct compute_memory_item *item = buffers[i]->chunk;
    666 
    667 		if (!is_item_in_pool(item))
    668 			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
    669 	}
    670 
    671 	if (compute_memory_finalize_pending(pool, ctx) == -1) {
    672 		/* XXX: Unset */
    673 		return;
    674 	}
    675 
    676 	for (i = first; i < first + n; i++)
    677 	{
    678 		uint32_t buffer_offset;
    679 		uint32_t handle;
    680 		assert(resources[i]->target == PIPE_BUFFER);
    681 		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
    682 
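         		/* The handle handed back to the state tracker is the item's byte
         		 * offset within the global memory pool, plus whatever offset the
         		 * caller already stored in it. */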
    683 		buffer_offset = util_le32_to_cpu(*(handles[i]));
    684 		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
    685 
    686 		*(handles[i]) = util_cpu_to_le32(handle);
    687 	}
    688 
    689 	/* globals for writing */
    690 	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
    691 	/* globals for reading */
    692 	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
    693 				(struct pipe_resource*)pool->bo);
    694 
    695 	/* constants for reading, LLVM puts them in text segment */
    696 	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
    697 				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
    698 }
    699 
    700 /**
    701  * This function initializes all the compute specific registers that need to
    702  * be initialized for each compute command stream.  Registers that are common
    703  * to both compute and 3D will be initialized at the beginning of each compute
    704  * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
    705  * packet requires that the shader type bit be set, we must initialize all
    706  * context registers needed for compute in this function.  The registers
    707  * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
    708  * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
    709  * on the GPU family.
    710  */
    711 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
    712 {
    713 	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
    714 	int num_threads;
    715 	int num_stack_entries;
    716 
    717 	/* since all required registers are initialized in the
    718 	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
    719 	 */
    720 	r600_init_command_buffer(cb, 256);
    721 	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
    722 
    723 	/* This must be first. */
    724 	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
    725 	r600_store_value(cb, 0x80000000);
    726 	r600_store_value(cb, 0x80000000);
    727 
    728 	/* We're setting config registers here. */
    729 	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
    730 	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
    731 
    732 	switch (rctx->b.family) {
    733 	case CHIP_CEDAR:
    734 	default:
    735 		num_threads = 128;
    736 		num_stack_entries = 256;
    737 		break;
    738 	case CHIP_REDWOOD:
    739 		num_threads = 128;
    740 		num_stack_entries = 256;
    741 		break;
    742 	case CHIP_JUNIPER:
    743 		num_threads = 128;
    744 		num_stack_entries = 512;
    745 		break;
    746 	case CHIP_CYPRESS:
    747 	case CHIP_HEMLOCK:
    748 		num_threads = 128;
    749 		num_stack_entries = 512;
    750 		break;
    751 	case CHIP_PALM:
    752 		num_threads = 128;
    753 		num_stack_entries = 256;
    754 		break;
    755 	case CHIP_SUMO:
    756 		num_threads = 128;
    757 		num_stack_entries = 256;
    758 		break;
    759 	case CHIP_SUMO2:
    760 		num_threads = 128;
    761 		num_stack_entries = 512;
    762 		break;
    763 	case CHIP_BARTS:
    764 		num_threads = 128;
    765 		num_stack_entries = 512;
    766 		break;
    767 	case CHIP_TURKS:
    768 		num_threads = 128;
    769 		num_stack_entries = 256;
    770 		break;
    771 	case CHIP_CAICOS:
    772 		num_threads = 128;
    773 		num_stack_entries = 256;
    774 		break;
    775 	}
    776 
    777 	/* Config Registers */
    778 	if (rctx->b.chip_class < CAYMAN)
    779 		evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
    780 					   rctx->screen->b.info.drm_minor);
    781 	else
    782 		cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
    783 					rctx->screen->b.info.drm_minor);
    784 
    785 	/* The primitive type always needs to be POINTLIST for compute. */
    786 	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
    787 						V_008958_DI_PT_POINTLIST);
    788 
    789 	if (rctx->b.chip_class < CAYMAN) {
    790 
    791 		/* These registers control which simds can be used by each stage.
    792 		 * The default for these registers is 0xffffffff, which means
    793 		 * all simds are available for each stage.  It's possible we may
    794 		 * want to play around with these in the future, but for now
    795 		 * the default value is fine.
    796 		 *
    797 		 * R_008E20_SQ_STATIC_THREAD_MGMT1
    798 		 * R_008E24_SQ_STATIC_THREAD_MGMT2
    799 		 * R_008E28_SQ_STATIC_THREAD_MGMT3
    800 		 */
    801 
    802 		/* XXX: We may need to adjust the thread and stack resource
    803 		 * values for 3D/compute interop */
    804 
    805 		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
    806 
    807 		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
    808 		 * Set the number of threads used by the PS/VS/GS/ES stage to
    809 		 * 0.
    810 		 */
    811 		r600_store_value(cb, 0);
    812 
    813 		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
    814 		 * Set the number of threads used by the CS (aka LS) stage to
    815 		 * the maximum number of threads and set the number of threads
    816 		 * for the HS stage to 0. */
    817 		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
    818 
    819 		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
    820 		 * Set the Control Flow stack entries to 0 for PS/VS stages */
    821 		r600_store_value(cb, 0);
    822 
    823 		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
    824 		 * Set the Control Flow stack entries to 0 for GS/ES stages */
    825 		r600_store_value(cb, 0);
    826 
    827 		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
     828 		 * Set the Control Flow stack entries to 0 for the HS stage, and
    829 		 * set it to the maximum value for the CS (aka LS) stage. */
    830 		r600_store_value(cb,
    831 			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
    832 	}
    833 	/* Give the compute shader all the available LDS space.
    834 	 * NOTE: This only sets the maximum number of dwords that a compute
    835 	 * shader can allocate.  When a shader is executed, we still need to
    836 	 * allocate the appropriate amount of LDS dwords using the
    837 	 * CM_R_0288E8_SQ_LDS_ALLOC register.
    838 	 */
    839 	if (rctx->b.chip_class < CAYMAN) {
    840 		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
    841 			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
    842 	} else {
    843 		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
    844 			S_0286FC_NUM_PS_LDS(0) |
    845 			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
    846 	}
    847 
    848 	/* Context Registers */
    849 
    850 	if (rctx->b.chip_class < CAYMAN) {
    851 		/* workaround for hw issues with dyn gpr - must set all limits
    852 		 * to 240 instead of 0, 0x1e == 240 / 8
    853 		 */
    854 		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
    855 				S_028838_PS_GPRS(0x1e) |
    856 				S_028838_VS_GPRS(0x1e) |
    857 				S_028838_GS_GPRS(0x1e) |
    858 				S_028838_ES_GPRS(0x1e) |
    859 				S_028838_HS_GPRS(0x1e) |
    860 				S_028838_LS_GPRS(0x1e));
    861 	}
    862 
    863 	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
    864 	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
    865 		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
    866 
    867 	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
    868 
    869 	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
    870 						S_0286E8_TID_IN_GROUP_ENA
    871 						| S_0286E8_TGID_ENA
    872 						| S_0286E8_DISABLE_INDEX_PACK)
    873 						;
    874 
     875 	/* The LOOP_CONST registers are an optimization for loops that allows
     876 	 * you to store the initial counter, increment value, and maximum
     877 	 * counter value in a register so that the hardware can calculate the
     878 	 * correct number of iterations for the loop, so that you don't need
     879 	 * to have the loop counter in your shader code.  We don't currently use
     880 	 * this optimization, so we must keep track of the counter in the
     881 	 * shader and use a break instruction to exit loops.  However, the
     882 	 * hardware still uses this register to determine when to exit a
     883 	 * loop, so we need to initialize the counter to 0, set the increment
     884 	 * value to 1, and set the maximum counter value to 4095 (0xfff), which
     885 	 * is the maximum value allowed.  This gives us a maximum of 4096
     886 	 * iterations for our loops, but hopefully our break instruction will
     887 	 * execute some time before the 4096th iteration.
     888 	 */
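         	/* The value 0x1000FFF is consistent with that: 0xfff in the low 12 bits
         	 * for the maximum count, zero for the initial value, and 1 at bit 24
         	 * for the increment. */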
    889 	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
    890 }
    891 
    892 void evergreen_init_compute_state_functions(struct r600_context *rctx)
    893 {
    894 	rctx->b.b.create_compute_state = evergreen_create_compute_state;
    895 	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
    896 	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
    897 //	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
    898 	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
    899 	rctx->b.b.set_global_binding = evergreen_set_global_binding;
    900 	rctx->b.b.launch_grid = evergreen_launch_grid;
    901 
    902 }
    903 
    904 static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
    905 					      struct pipe_resource *resource,
    906 					      unsigned level,
    907 					      unsigned usage,
    908 					      const struct pipe_box *box,
    909 					      struct pipe_transfer **ptransfer)
    910 {
    911 	struct r600_context *rctx = (struct r600_context*)ctx;
    912 	struct compute_memory_pool *pool = rctx->screen->global_pool;
    913 	struct r600_resource_global* buffer =
    914 		(struct r600_resource_global*)resource;
    915 
    916 	struct compute_memory_item *item = buffer->chunk;
    917 	struct pipe_resource *dst = NULL;
    918 	unsigned offset = box->x;
    919 
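         	/* Mapping works on a real buffer, so items currently living in the pool
         	 * are demoted to their own buffer; items outside the pool get a real
         	 * buffer allocated on demand. */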
    920 	if (is_item_in_pool(item)) {
    921 		compute_memory_demote_item(pool, item, ctx);
    922 	}
    923 	else {
    924 		if (item->real_buffer == NULL) {
    925 			item->real_buffer =
    926 					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
    927 		}
    928 	}
    929 
    930 	dst = (struct pipe_resource*)item->real_buffer;
    931 
    932 	if (usage & PIPE_TRANSFER_READ)
    933 		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
    934 
    935 	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
    936 			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
    937 			"width = %u, height = %u, depth = %u)\n", level, usage,
    938 			box->x, box->y, box->z, box->width, box->height,
    939 			box->depth);
    940 	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
    941 		"%u (box.x)\n", item->id, box->x);
    942 
    943 
    944 	assert(resource->target == PIPE_BUFFER);
    945 	assert(resource->bind & PIPE_BIND_GLOBAL);
    946 	assert(box->x >= 0);
    947 	assert(box->y == 0);
    948 	assert(box->z == 0);
    949 
    950 	///TODO: do it better, mapping is not possible if the pool is too big
    951 	return pipe_buffer_map_range(ctx, dst,
    952 			offset, box->width, usage, ptransfer);
    953 }
    954 
    955 static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
    956 					       struct pipe_transfer *transfer)
    957 {
    958 	/* struct r600_resource_global are not real resources, they just map
    959 	 * to an offset within the compute memory pool.  The function
    960 	 * r600_compute_global_transfer_map() maps the memory pool
    961 	 * resource rather than the struct r600_resource_global passed to
     962 	 * it as an argument and then initializes ptransfer->resource with
    963 	 * the memory pool resource (via pipe_buffer_map_range).
    964 	 * When transfer_unmap is called it uses the memory pool's
     965 	 * vtable which calls r600_buffer_transfer_unmap() rather than
    966 	 * this function.
    967 	 */
    968 	assert (!"This function should not be called");
    969 }
    970 
    971 static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
    972 						      struct pipe_transfer *transfer,
    973 						      const struct pipe_box *box)
    974 {
    975 	assert(0 && "TODO");
    976 }
    977 
    978 static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
    979 					       struct pipe_resource *res)
    980 {
    981 	struct r600_resource_global* buffer = NULL;
    982 	struct r600_screen* rscreen = NULL;
    983 
    984 	assert(res->target == PIPE_BUFFER);
    985 	assert(res->bind & PIPE_BIND_GLOBAL);
    986 
    987 	buffer = (struct r600_resource_global*)res;
    988 	rscreen = (struct r600_screen*)screen;
    989 
    990 	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
    991 
    992 	buffer->chunk = NULL;
    993 	free(res);
    994 }
    995 
    996 static const struct u_resource_vtbl r600_global_buffer_vtbl =
    997 {
    998 	u_default_resource_get_handle, /* get_handle */
    999 	r600_compute_global_buffer_destroy, /* resource_destroy */
   1000 	r600_compute_global_transfer_map, /* transfer_map */
   1001 	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
   1002 	r600_compute_global_transfer_unmap, /* transfer_unmap */
   1003 };
   1004 
   1005 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
   1006 							const struct pipe_resource *templ)
   1007 {
   1008 	struct r600_resource_global* result = NULL;
   1009 	struct r600_screen* rscreen = NULL;
   1010 	int size_in_dw = 0;
   1011 
   1012 	assert(templ->target == PIPE_BUFFER);
   1013 	assert(templ->bind & PIPE_BIND_GLOBAL);
   1014 	assert(templ->array_size == 1 || templ->array_size == 0);
   1015 	assert(templ->depth0 == 1 || templ->depth0 == 0);
   1016 	assert(templ->height0 == 1 || templ->height0 == 0);
   1017 
   1018 	result = (struct r600_resource_global*)
   1019 	CALLOC(sizeof(struct r600_resource_global), 1);
   1020 	rscreen = (struct r600_screen*)screen;
   1021 
   1022 	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
   1023 	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
   1024 			templ->array_size);
   1025 
   1026 	result->base.b.vtbl = &r600_global_buffer_vtbl;
   1027 	result->base.b.b = *templ;
   1028 	result->base.b.b.screen = screen;
   1029 	pipe_reference_init(&result->base.b.b.reference, 1);
   1030 
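         	/* The pool allocates in dword units, so round the size in bytes up. */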
   1031 	size_in_dw = (templ->width0+3) / 4;
   1032 
   1033 	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
   1034 
   1035 	if (result->chunk == NULL)
   1036 	{
   1037 		free(result);
   1038 		return NULL;
   1039 	}
   1040 
   1041 	return &result->base.b.b;
   1042 }
   1043