/* radv amdgpu winsys: command stream creation and submission (radv_amdgpu_cs). */
      1 /*
      2  * Copyright  2016 Red Hat.
      3  * Copyright  2016 Bas Nieuwenhuizen
      4  *
      5  * Permission is hereby granted, free of charge, to any person obtaining a
      6  * copy of this software and associated documentation files (the "Software"),
      7  * to deal in the Software without restriction, including without limitation
      8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      9  * and/or sell copies of the Software, and to permit persons to whom the
     10  * Software is furnished to do so, subject to the following conditions:
     11  *
     12  * The above copyright notice and this permission notice (including the next
     13  * paragraph) shall be included in all copies or substantial portions of the
     14  * Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     22  * IN THE SOFTWARE.
     23  */
     24 
     25 #include <stdlib.h>
     26 #include <amdgpu.h>
     27 #include <amdgpu_drm.h>
     28 #include <assert.h>
     29 
     30 #include "ac_debug.h"
     31 #include "amdgpu_id.h"
     32 #include "radv_radeon_winsys.h"
     33 #include "radv_amdgpu_cs.h"
     34 #include "radv_amdgpu_bo.h"
     35 #include "sid.h"
     36 
/* Per-CS state for the amdgpu winsys: wraps the generic radeon_winsys_cs
 * with the IB bookkeeping and buffer list needed at submit time. */
struct radv_amdgpu_cs {
	struct radeon_winsys_cs base;
	struct radv_amdgpu_winsys *ws;

	/* Description of the first IB, handed to the kernel on submit. */
	struct amdgpu_cs_ib_info    ib;

	struct radeon_winsys_bo     *ib_buffer;  /* BO backing the current IB (NULL in sysmem mode) */
	uint8_t                 *ib_mapped;      /* CPU mapping of ib_buffer */
	unsigned                    max_num_buffers;  /* allocated capacity of handles[]/priorities[] */
	unsigned                    num_buffers;      /* BOs currently referenced by this CS */
	amdgpu_bo_handle            *handles;
	uint8_t                     *priorities;

	/* Earlier IB BOs kept alive while this CS chains across buffers. */
	struct radeon_winsys_bo     **old_ib_buffers;
	unsigned                    num_old_ib_buffers;
	unsigned                    max_num_old_ib_buffers;
	unsigned                    *ib_size_ptr;  /* dword holding the size of the IB being recorded */
	bool                        failed;        /* set on allocation failure; finalize reports it */
	bool                        is_chained;    /* last 4 dwords are a chain packet to another CS */

	/* One-entry-per-bucket cache mapping (handle >> 6) to an index in
	 * handles[]; -1 means the bucket is empty. */
	int                         buffer_hash_table[1024];
	unsigned                    hw_ip;         /* AMDGPU_HW_IP_* block this CS targets */
};
     60 
     61 static inline struct radv_amdgpu_cs *
     62 radv_amdgpu_cs(struct radeon_winsys_cs *base)
     63 {
     64 	return (struct radv_amdgpu_cs*)base;
     65 }
     66 
     67 static int ring_to_hw_ip(enum ring_type ring)
     68 {
     69 	switch (ring) {
     70 	case RING_GFX:
     71 		return AMDGPU_HW_IP_GFX;
     72 	case RING_DMA:
     73 		return AMDGPU_HW_IP_DMA;
     74 	case RING_COMPUTE:
     75 		return AMDGPU_HW_IP_COMPUTE;
     76 	default:
     77 		unreachable("unsupported ring");
     78 	}
     79 }
     80 
     81 static void radv_amdgpu_request_to_fence(struct radv_amdgpu_ctx *ctx,
     82 					 struct amdgpu_cs_fence *fence,
     83 					 struct amdgpu_cs_request *req)
     84 {
     85 	fence->context = ctx->ctx;
     86 	fence->ip_type = req->ip_type;
     87 	fence->ip_instance = req->ip_instance;
     88 	fence->ring = req->ring;
     89 	fence->fence = req->seq_no;
     90 }
     91 
     92 static struct radeon_winsys_fence *radv_amdgpu_create_fence()
     93 {
     94 	struct radv_amdgpu_cs_fence *fence = calloc(1, sizeof(struct amdgpu_cs_fence));
     95 	return (struct radeon_winsys_fence*)fence;
     96 }
     97 
     98 static void radv_amdgpu_destroy_fence(struct radeon_winsys_fence *_fence)
     99 {
    100 	struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
    101 	free(fence);
    102 }
    103 
    104 static bool radv_amdgpu_fence_wait(struct radeon_winsys *_ws,
    105 			      struct radeon_winsys_fence *_fence,
    106 			      bool absolute,
    107 			      uint64_t timeout)
    108 {
    109 	struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
    110 	unsigned flags = absolute ? AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE : 0;
    111 	int r;
    112 	uint32_t expired = 0;
    113 
    114 	/* Now use the libdrm query. */
    115 	r = amdgpu_cs_query_fence_status(fence,
    116 					 timeout,
    117 					 flags,
    118 					 &expired);
    119 
    120 	if (r) {
    121 		fprintf(stderr, "amdgpu: radv_amdgpu_cs_query_fence_status failed.\n");
    122 		return false;
    123 	}
    124 
    125 	if (expired)
    126 		return true;
    127 
    128 	return false;
    129 }
    130 
    131 static void radv_amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
    132 {
    133 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs);
    134 
    135 	if (cs->ib_buffer)
    136 		cs->ws->base.buffer_destroy(cs->ib_buffer);
    137 	else
    138 		free(cs->base.buf);
    139 
    140 	for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
    141 		cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]);
    142 
    143 	free(cs->old_ib_buffers);
    144 	free(cs->handles);
    145 	free(cs->priorities);
    146 	free(cs);
    147 }
    148 
    149 static boolean radv_amdgpu_init_cs(struct radv_amdgpu_cs *cs,
    150 				   enum ring_type ring_type)
    151 {
    152 	for (int i = 0; i < ARRAY_SIZE(cs->buffer_hash_table); ++i)
    153 		cs->buffer_hash_table[i] = -1;
    154 
    155 	cs->hw_ip = ring_to_hw_ip(ring_type);
    156 	return true;
    157 }
    158 
/* Create a new command stream for the given ring.
 *
 * With use_ib_bos, commands are recorded directly into a CPU-mapped GPU
 * buffer (GTT) that can be chained and submitted as-is; otherwise commands
 * go into a malloc'ed shadow buffer that is copied into a fresh BO at
 * submit time (see radv_amdgpu_winsys_cs_submit_sysmem).
 * Returns NULL on allocation failure. */
static struct radeon_winsys_cs *
radv_amdgpu_cs_create(struct radeon_winsys *ws,
		      enum ring_type ring_type)
{
	struct radv_amdgpu_cs *cs;
	uint32_t ib_size = 20 * 1024 * 4; /* bytes */
	cs = calloc(1, sizeof(struct radv_amdgpu_cs));
	if (!cs)
		return NULL;

	cs->ws = radv_amdgpu_winsys(ws);
	radv_amdgpu_init_cs(cs, ring_type);

	if (cs->ws->use_ib_bos) {
		cs->ib_buffer = ws->buffer_create(ws, ib_size, 0,
						RADEON_DOMAIN_GTT,
						RADEON_FLAG_CPU_ACCESS);
		if (!cs->ib_buffer) {
			free(cs);
			return NULL;
		}

		cs->ib_mapped = ws->buffer_map(cs->ib_buffer);
		if (!cs->ib_mapped) {
			ws->buffer_destroy(cs->ib_buffer);
			free(cs);
			return NULL;
		}

		cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->va;
		cs->base.buf = (uint32_t *)cs->ib_mapped;
		/* Reserve 4 dwords at the end for a possible chain packet. */
		cs->base.max_dw = ib_size / 4 - 4;
		cs->ib_size_ptr = &cs->ib.size;
		cs->ib.size = 0;

		/* The CS must reference its own IB buffer. */
		ws->cs_add_buffer(&cs->base, cs->ib_buffer, 8);
	} else {
		/* Sysmem path: 16 KiB (4096 dwords) CPU-side buffer. */
		cs->base.buf = malloc(16384);
		cs->base.max_dw = 4096;
		if (!cs->base.buf) {
			free(cs);
			return NULL;
		}
	}

	return &cs->base;
}
    206 
    207 static void radv_amdgpu_cs_grow(struct radeon_winsys_cs *_cs, size_t min_size)
    208 {
    209 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
    210 
    211 	if (cs->failed) {
    212 		cs->base.cdw = 0;
    213 		return;
    214 	}
    215 
    216 	if (!cs->ws->use_ib_bos) {
    217 		const uint64_t limit_dws = 0xffff8;
    218 		uint64_t ib_dws = MAX2(cs->base.cdw + min_size,
    219 				       MIN2(cs->base.max_dw * 2, limit_dws));
    220 
    221 		/* The total ib size cannot exceed limit_dws dwords. */
    222 		if (ib_dws > limit_dws)
    223 		{
    224 			cs->failed = true;
    225 			cs->base.cdw = 0;
    226 			return;
    227 		}
    228 
    229 		uint32_t *new_buf = realloc(cs->base.buf, ib_dws * 4);
    230 		if (new_buf) {
    231 			cs->base.buf = new_buf;
    232 			cs->base.max_dw = ib_dws;
    233 		} else {
    234 			cs->failed = true;
    235 			cs->base.cdw = 0;
    236 		}
    237 		return;
    238 	}
    239 
    240 	uint64_t ib_size = MAX2(min_size * 4 + 16, cs->base.max_dw * 4 * 2);
    241 
    242 	/* max that fits in the chain size field. */
    243 	ib_size = MIN2(ib_size, 0xfffff);
    244 
    245 	while (!cs->base.cdw || (cs->base.cdw & 7) != 4)
    246 		cs->base.buf[cs->base.cdw++] = 0xffff1000;
    247 
    248 	*cs->ib_size_ptr |= cs->base.cdw + 4;
    249 
    250 	if (cs->num_old_ib_buffers == cs->max_num_old_ib_buffers) {
    251 		cs->max_num_old_ib_buffers = MAX2(1, cs->max_num_old_ib_buffers * 2);
    252 		cs->old_ib_buffers = realloc(cs->old_ib_buffers,
    253 					     cs->max_num_old_ib_buffers * sizeof(void*));
    254 	}
    255 
    256 	cs->old_ib_buffers[cs->num_old_ib_buffers++] = cs->ib_buffer;
    257 
    258 	cs->ib_buffer = cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0,
    259 						   RADEON_DOMAIN_GTT,
    260 						   RADEON_FLAG_CPU_ACCESS);
    261 
    262 	if (!cs->ib_buffer) {
    263 		cs->base.cdw = 0;
    264 		cs->failed = true;
    265 		cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers];
    266 	}
    267 
    268 	cs->ib_mapped = cs->ws->base.buffer_map(cs->ib_buffer);
    269 	if (!cs->ib_mapped) {
    270 		cs->ws->base.buffer_destroy(cs->ib_buffer);
    271 		cs->base.cdw = 0;
    272 		cs->failed = true;
    273 		cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers];
    274 	}
    275 
    276 	cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer, 8);
    277 
    278 	cs->base.buf[cs->base.cdw++] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
    279 	cs->base.buf[cs->base.cdw++] = radv_amdgpu_winsys_bo(cs->ib_buffer)->va;
    280 	cs->base.buf[cs->base.cdw++] = radv_amdgpu_winsys_bo(cs->ib_buffer)->va >> 32;
    281 	cs->ib_size_ptr = cs->base.buf + cs->base.cdw;
    282 	cs->base.buf[cs->base.cdw++] = S_3F2_CHAIN(1) | S_3F2_VALID(1);
    283 
    284 	cs->base.buf = (uint32_t *)cs->ib_mapped;
    285 	cs->base.cdw = 0;
    286 	cs->base.max_dw = ib_size / 4 - 4;
    287 
    288 }
    289 
    290 static bool radv_amdgpu_cs_finalize(struct radeon_winsys_cs *_cs)
    291 {
    292 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
    293 
    294 	if (cs->ws->use_ib_bos) {
    295 		while (!cs->base.cdw || (cs->base.cdw & 7) != 0)
    296 			cs->base.buf[cs->base.cdw++] = 0xffff1000;
    297 
    298 		*cs->ib_size_ptr |= cs->base.cdw;
    299 
    300 		cs->is_chained = false;
    301 	}
    302 
    303 	return !cs->failed;
    304 }
    305 
    306 static void radv_amdgpu_cs_reset(struct radeon_winsys_cs *_cs)
    307 {
    308 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
    309 	cs->base.cdw = 0;
    310 	cs->failed = false;
    311 
    312 	for (unsigned i = 0; i < cs->num_buffers; ++i) {
    313 		unsigned hash = ((uintptr_t)cs->handles[i] >> 6) &
    314 		                 (ARRAY_SIZE(cs->buffer_hash_table) - 1);
    315 		cs->buffer_hash_table[hash] = -1;
    316 	}
    317 
    318 	cs->num_buffers = 0;
    319 
    320 	if (cs->ws->use_ib_bos) {
    321 		cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer, 8);
    322 
    323 		for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
    324 			cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]);
    325 
    326 		cs->num_old_ib_buffers = 0;
    327 		cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->va;
    328 		cs->ib_size_ptr = &cs->ib.size;
    329 		cs->ib.size = 0;
    330 	}
    331 }
    332 
    333 static int radv_amdgpu_cs_find_buffer(struct radv_amdgpu_cs *cs,
    334 				      amdgpu_bo_handle bo)
    335 {
    336 	unsigned hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
    337 	int index = cs->buffer_hash_table[hash];
    338 
    339 	if (index == -1)
    340 		return -1;
    341 
    342 	if (cs->handles[index] == bo)
    343 		return index;
    344 
    345 	for (unsigned i = 0; i < cs->num_buffers; ++i) {
    346 		if (cs->handles[i] == bo) {
    347 			cs->buffer_hash_table[hash] = i;
    348 			return i;
    349 		}
    350 	}
    351 
    352 	return -1;
    353 }
    354 
    355 static void radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs,
    356 					       amdgpu_bo_handle bo,
    357 					       uint8_t priority)
    358 {
    359 	unsigned hash;
    360 	int index = radv_amdgpu_cs_find_buffer(cs, bo);
    361 
    362 	if (index != -1) {
    363 		cs->priorities[index] = MAX2(cs->priorities[index], priority);
    364 		return;
    365 	}
    366 
    367 	if (cs->num_buffers == cs->max_num_buffers) {
    368 		unsigned new_count = MAX2(1, cs->max_num_buffers * 2);
    369 		cs->handles = realloc(cs->handles, new_count * sizeof(amdgpu_bo_handle));
    370 		cs->priorities = realloc(cs->priorities, new_count * sizeof(uint8_t));
    371 		cs->max_num_buffers = new_count;
    372 	}
    373 
    374 	cs->handles[cs->num_buffers] = bo;
    375 	cs->priorities[cs->num_buffers] = priority;
    376 
    377 	hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
    378 	cs->buffer_hash_table[hash] = cs->num_buffers;
    379 
    380 	++cs->num_buffers;
    381 }
    382 
    383 static void radv_amdgpu_cs_add_buffer(struct radeon_winsys_cs *_cs,
    384 				 struct radeon_winsys_bo *_bo,
    385 				 uint8_t priority)
    386 {
    387 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
    388 	struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
    389 
    390 	radv_amdgpu_cs_add_buffer_internal(cs, bo->bo, priority);
    391 }
    392 
    393 static void radv_amdgpu_cs_execute_secondary(struct radeon_winsys_cs *_parent,
    394 					     struct radeon_winsys_cs *_child)
    395 {
    396 	struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent);
    397 	struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child);
    398 
    399 	for (unsigned i = 0; i < child->num_buffers; ++i) {
    400 		radv_amdgpu_cs_add_buffer_internal(parent, child->handles[i],
    401 						   child->priorities[i]);
    402 	}
    403 
    404 	if (parent->ws->use_ib_bos) {
    405 		if (parent->base.cdw + 4 > parent->base.max_dw)
    406 			radv_amdgpu_cs_grow(&parent->base, 4);
    407 
    408 		parent->base.buf[parent->base.cdw++] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
    409 		parent->base.buf[parent->base.cdw++] = child->ib.ib_mc_address;
    410 		parent->base.buf[parent->base.cdw++] = child->ib.ib_mc_address >> 32;
    411 		parent->base.buf[parent->base.cdw++] = child->ib.size;
    412 	} else {
    413 		if (parent->base.cdw + child->base.cdw > parent->base.max_dw)
    414 			radv_amdgpu_cs_grow(&parent->base, child->base.cdw);
    415 
    416 		memcpy(parent->base.buf + parent->base.cdw, child->base.buf, 4 * child->base.cdw);
    417 		parent->base.cdw += child->base.cdw;
    418 	}
    419 }
    420 
/* Build the kernel BO list for a submission covering cs_array[0..count).
 *
 * Three strategies:
 *  - debug_all_bos: reference every BO the winsys knows about (useful when
 *    debugging GPU hangs); takes the global BO list lock.
 *  - single CS and no extra BO: reuse the CS's own handle/priority arrays.
 *  - otherwise: merge all CS buffer lists (plus extra_bo, if any) with
 *    linear deduplication, keeping the max priority per BO.
 *
 * Returns 0 on success and stores the new list in *bo_list; negative errno
 * on failure. The caller owns the list (amdgpu_bo_list_destroy). */
static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
				      struct radeon_winsys_cs **cs_array,
				      unsigned count,
				      struct radv_amdgpu_winsys_bo *extra_bo,
				      amdgpu_bo_list_handle *bo_list)
{
	int r;
	if (ws->debug_all_bos) {
		struct radv_amdgpu_winsys_bo *bo;
		amdgpu_bo_handle *handles;
		unsigned num = 0;

		pthread_mutex_lock(&ws->global_bo_list_lock);

		handles = malloc(sizeof(handles[0]) * ws->num_buffers);
		if (!handles) {
			pthread_mutex_unlock(&ws->global_bo_list_lock);
			return -ENOMEM;
		}

		LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) {
			assert(num < ws->num_buffers);
			handles[num++] = bo->bo;
		}

		r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
					  handles, NULL,
					  bo_list);
		free(handles);
		pthread_mutex_unlock(&ws->global_bo_list_lock);
	} else if (count == 1 && !extra_bo) {
		/* Fast path: hand the CS's arrays to the kernel directly. */
		struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0];
		r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, cs->handles,
					  cs->priorities, bo_list);
	} else {
		/* Upper bound for the merged list: the sum of all lists. */
		unsigned total_buffer_count = !!extra_bo;
		unsigned unique_bo_count = !!extra_bo;
		for (unsigned i = 0; i < count; ++i) {
			struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[i];
			total_buffer_count += cs->num_buffers;
		}

		amdgpu_bo_handle *handles = malloc(sizeof(amdgpu_bo_handle) * total_buffer_count);
		uint8_t *priorities = malloc(sizeof(uint8_t) * total_buffer_count);
		if (!handles || !priorities) {
			free(handles);
			free(priorities);
			return -ENOMEM;
		}

		if (extra_bo) {
			handles[0] = extra_bo->bo;
			priorities[0] = 8;
		}

		/* O(total * unique) dedup; buffer lists are expected small. */
		for (unsigned i = 0; i < count; ++i) {
			struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[i];
			for (unsigned j = 0; j < cs->num_buffers; ++j) {
				bool found = false;
				for (unsigned k = 0; k < unique_bo_count; ++k) {
					if (handles[k] == cs->handles[j]) {
						found = true;
						priorities[k] = MAX2(priorities[k],
								     cs->priorities[j]);
						break;
					}
				}
				if (!found) {
					handles[unique_bo_count] = cs->handles[j];
					priorities[unique_bo_count] = cs->priorities[j];
					++unique_bo_count;
				}
			}
		}
		r = amdgpu_bo_list_create(ws->dev, unique_bo_count, handles,
					  priorities, bo_list);

		free(handles);
		free(priorities);
	}

	return r;
}
    504 
    505 static void radv_assign_last_submit(struct radv_amdgpu_ctx *ctx,
    506 				    struct amdgpu_cs_request *request)
    507 {
    508 	radv_amdgpu_request_to_fence(ctx,
    509 	                             &ctx->last_submission[request->ip_type][request->ring],
    510 	                             request);
    511 }
    512 
/* Submit cs_count CSs as ONE kernel submission by chaining each IB into the
 * next with an INDIRECT_BUFFER packet; only cs_array[0] is handed to the
 * kernel. Requires 4 spare dwords past cdw in every CS (max_dw reserves
 * them) and exclusive ownership of the CSs (they are patched in place). */
static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
						int queue_idx,
						struct radeon_winsys_cs **cs_array,
						unsigned cs_count,
						struct radeon_winsys_fence *_fence)
{
	int r;
	struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
	struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
	struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]);
	amdgpu_bo_list_handle bo_list;
	struct amdgpu_cs_request request = {0};

	/* Walk backwards so each CS chains to an already-patched successor. */
	for (unsigned i = cs_count; i--;) {
		struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);

		/* Undo the chain packet left over from a previous submission
		 * of this CS (the 4 dwords were counted in the size). */
		if (cs->is_chained) {
			*cs->ib_size_ptr -= 4;
			cs->is_chained = false;
		}

		if (i + 1 < cs_count) {
			struct radv_amdgpu_cs *next = radv_amdgpu_cs(cs_array[i + 1]);
			assert(cs->base.cdw + 4 <= cs->base.max_dw);

			/* Append a chain packet to the next CS's first IB and
			 * account for it in this IB's size. */
			cs->is_chained = true;
			*cs->ib_size_ptr += 4;

			cs->base.buf[cs->base.cdw + 0] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
			cs->base.buf[cs->base.cdw + 1] = next->ib.ib_mc_address;
			cs->base.buf[cs->base.cdw + 2] = next->ib.ib_mc_address >> 32;
			cs->base.buf[cs->base.cdw + 3] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | next->ib.size;
		}
	}

	r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, &bo_list);
	if (r) {
		fprintf(stderr, "amdgpu: Failed to created the BO list for submission\n");
		return r;
	}

	/* The kernel only sees the first IB; the rest are reached by chain. */
	request.ip_type = cs0->hw_ip;
	request.ring = queue_idx;
	request.number_of_ibs = 1;
	request.ibs = &cs0->ib;
	request.resources = bo_list;

	r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1);
	if (r) {
		if (r == -ENOMEM)
			fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
		else
			fprintf(stderr, "amdgpu: The CS has been rejected, "
					"see dmesg for more information.\n");
	}

	amdgpu_bo_list_destroy(bo_list);

	if (fence)
		radv_amdgpu_request_to_fence(ctx, fence, &request);

	radv_assign_last_submit(ctx, &request);

	return r;
}
    578 
/* Submit CSs in batches of up to AMDGPU_CS_MAX_IBS_PER_SUBMIT IBs per
 * kernel submission; used when chaining cannot be applied. Each CS keeps
 * its own IB. Returns 0 on success or the first submission error. */
static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
						 int queue_idx,
						 struct radeon_winsys_cs **cs_array,
						 unsigned cs_count,
						 struct radeon_winsys_fence *_fence)
{
	int r;
	struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
	struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
	amdgpu_bo_list_handle bo_list;
	struct amdgpu_cs_request request;

	assert(cs_count);

	for (unsigned i = 0; i < cs_count;) {
		struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[i]);
		struct amdgpu_cs_ib_info ibs[AMDGPU_CS_MAX_IBS_PER_SUBMIT];
		unsigned cnt = MIN2(AMDGPU_CS_MAX_IBS_PER_SUBMIT, cs_count - i);

		memset(&request, 0, sizeof(request));

		/* BO list for this batch only. */
		r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL, &bo_list);
		if (r) {
			fprintf(stderr, "amdgpu: Failed to created the BO list for submission\n");
			return r;
		}

		request.ip_type = cs0->hw_ip;
		request.ring = queue_idx;
		request.resources = bo_list;
		request.number_of_ibs = cnt;
		request.ibs = ibs;

		for (unsigned j = 0; j < cnt; ++j) {
			struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]);
			ibs[j] = cs->ib;

			/* Strip a stale chain packet from a previous chained
			 * submission; these IBs are submitted standalone. */
			if (cs->is_chained) {
				*cs->ib_size_ptr -= 4;
				cs->is_chained = false;
			}
		}

		r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1);
		if (r) {
			if (r == -ENOMEM)
				fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
			else
				fprintf(stderr, "amdgpu: The CS has been rejected, "
						"see dmesg for more information.\n");
		}

		amdgpu_bo_list_destroy(bo_list);

		if (r)
			return r;

		i += cnt;
	}
	/* The fence tracks the last batch submitted. */
	if (fence)
		radv_amdgpu_request_to_fence(ctx, fence, &request);

	radv_assign_last_submit(ctx, &request);

	return 0;
}
    645 
    646 static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
    647 					       int queue_idx,
    648 					       struct radeon_winsys_cs **cs_array,
    649 					       unsigned cs_count,
    650 					       struct radeon_winsys_fence *_fence)
    651 {
    652 	int r;
    653 	struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
    654 	struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
    655 	struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]);
    656 	struct radeon_winsys *ws = (struct radeon_winsys*)cs0->ws;
    657 	amdgpu_bo_list_handle bo_list;
    658 	struct amdgpu_cs_request request;
    659 	uint32_t pad_word = 0xffff1000U;
    660 
    661 	if (radv_amdgpu_winsys(ws)->family == FAMILY_SI)
    662 		pad_word = 0x80000000;
    663 
    664 	assert(cs_count);
    665 
    666 	for (unsigned i = 0; i < cs_count;) {
    667 		struct amdgpu_cs_ib_info ib = {0};
    668 		struct radeon_winsys_bo *bo = NULL;
    669 		uint32_t *ptr;
    670 		unsigned cnt = 0;
    671 		unsigned size = 0;
    672 
    673 		while (i + cnt < cs_count && 0xffff8 - size >= radv_amdgpu_cs(cs_array[i + cnt])->base.cdw) {
    674 			size += radv_amdgpu_cs(cs_array[i + cnt])->base.cdw;
    675 			++cnt;
    676 		}
    677 
    678 		assert(cnt);
    679 
    680 		bo = ws->buffer_create(ws, 4 * size, 4096, RADEON_DOMAIN_GTT, RADEON_FLAG_CPU_ACCESS);
    681 		ptr = ws->buffer_map(bo);
    682 
    683 		for (unsigned j = 0; j < cnt; ++j) {
    684 			struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]);
    685 			memcpy(ptr, cs->base.buf, 4 * cs->base.cdw);
    686 			ptr += cs->base.cdw;
    687 
    688 		}
    689 
    690 		while(!size || (size & 7)) {
    691 			*ptr++ = pad_word;
    692 			++size;
    693 		}
    694 
    695 		memset(&request, 0, sizeof(request));
    696 
    697 
    698 		r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt,
    699 		                               (struct radv_amdgpu_winsys_bo*)bo, &bo_list);
    700 		if (r) {
    701 			fprintf(stderr, "amdgpu: Failed to created the BO list for submission\n");
    702 			return r;
    703 		}
    704 
    705 		ib.size = size;
    706 		ib.ib_mc_address = ws->buffer_get_va(bo);
    707 
    708 		request.ip_type = cs0->hw_ip;
    709 		request.ring = queue_idx;
    710 		request.resources = bo_list;
    711 		request.number_of_ibs = 1;
    712 		request.ibs = &ib;
    713 
    714 		r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1);
    715 		if (r) {
    716 			if (r == -ENOMEM)
    717 				fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
    718 			else
    719 				fprintf(stderr, "amdgpu: The CS has been rejected, "
    720 						"see dmesg for more information.\n");
    721 		}
    722 
    723 		amdgpu_bo_list_destroy(bo_list);
    724 
    725 		ws->buffer_destroy(bo);
    726 		if (r)
    727 			return r;
    728 
    729 		i += cnt;
    730 	}
    731 	if (fence)
    732 		radv_amdgpu_request_to_fence(ctx, fence, &request);
    733 
    734 	radv_assign_last_submit(ctx, &request);
    735 
    736 	return 0;
    737 }
    738 
/* Top-level submission entry point: wait on semaphores, pick a submission
 * strategy (sysmem copy / chained / per-IB fallback), then signal
 * semaphores. Returns the strategy's result. */
static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
					int queue_idx,
					struct radeon_winsys_cs **cs_array,
					unsigned cs_count,
					struct radeon_winsys_sem **wait_sem,
					unsigned wait_sem_count,
					struct radeon_winsys_sem **signal_sem,
					unsigned signal_sem_count,
					bool can_patch,
					struct radeon_winsys_fence *_fence)
{
	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[0]);
	struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
	int ret;
	int i;

	for (i = 0; i < wait_sem_count; i++) {
		amdgpu_semaphore_handle sem = (amdgpu_semaphore_handle)wait_sem[i];
		amdgpu_cs_wait_semaphore(ctx->ctx, cs->hw_ip, 0, queue_idx,
					 sem);
	}
	/* NOTE: the '&& false' deliberately disables the chained path; the
	 * fallback is always used for IB-BO CSs here. Keep it until chaining
	 * is re-enabled on purpose. */
	if (!cs->ws->use_ib_bos) {
		ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, cs_array,
							   cs_count, _fence);
	} else if (can_patch && cs_count > AMDGPU_CS_MAX_IBS_PER_SUBMIT && false) {
		ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, cs_array,
							    cs_count, _fence);
	} else {
		ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, cs_array,
							     cs_count, _fence);
	}

	for (i = 0; i < signal_sem_count; i++) {
		amdgpu_semaphore_handle sem = (amdgpu_semaphore_handle)signal_sem[i];
		amdgpu_cs_signal_semaphore(ctx->ctx, cs->hw_ip, 0, queue_idx,
					   sem);
	}
	return ret;
}
    778 
    779 
    780 static void *radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr)
    781 {
    782 	struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
    783 	void *ret = NULL;
    784 	for (unsigned i = 0; i <= cs->num_old_ib_buffers; ++i) {
    785 		struct radv_amdgpu_winsys_bo *bo;
    786 
    787 		bo = (struct radv_amdgpu_winsys_bo*)
    788 		       (i == cs->num_old_ib_buffers ? cs->ib_buffer : cs->old_ib_buffers[i]);
    789 		if (addr >= bo->va && addr - bo->va < bo->size) {
    790 			if (amdgpu_bo_cpu_map(bo->bo, &ret) == 0)
    791 				return (char *)ret + (addr - bo->va);
    792 		}
    793 	}
    794 	return ret;
    795 }
    796 
    797 static void radv_amdgpu_winsys_cs_dump(struct radeon_winsys_cs *_cs,
    798                                        FILE* file,
    799                                        uint32_t trace_id)
    800 {
    801 	struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
    802 
    803 	ac_parse_ib(file,
    804 		    radv_amdgpu_winsys_get_cpu_addr(cs, cs->ib.ib_mc_address),
    805 		    cs->ib.size, trace_id,  "main IB", cs->ws->info.chip_class,
    806 		    radv_amdgpu_winsys_get_cpu_addr, cs);
    807 }
    808 
    809 static struct radeon_winsys_ctx *radv_amdgpu_ctx_create(struct radeon_winsys *_ws)
    810 {
    811 	struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
    812 	struct radv_amdgpu_ctx *ctx = CALLOC_STRUCT(radv_amdgpu_ctx);
    813 	int r;
    814 
    815 	if (!ctx)
    816 		return NULL;
    817 	r = amdgpu_cs_ctx_create(ws->dev, &ctx->ctx);
    818 	if (r) {
    819 		fprintf(stderr, "amdgpu: radv_amdgpu_cs_ctx_create failed. (%i)\n", r);
    820 		goto error_create;
    821 	}
    822 	ctx->ws = ws;
    823 	return (struct radeon_winsys_ctx *)ctx;
    824 error_create:
    825 	FREE(ctx);
    826 	return NULL;
    827 }
    828 
    829 static void radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
    830 {
    831 	struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
    832 	amdgpu_cs_ctx_free(ctx->ctx);
    833 	FREE(ctx);
    834 }
    835 
    836 static bool radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx,
    837                                       enum ring_type ring_type, int ring_index)
    838 {
    839 	struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
    840 	int ip_type = ring_to_hw_ip(ring_type);
    841 
    842 	if (ctx->last_submission[ip_type][ring_index].fence) {
    843 		uint32_t expired;
    844 		int ret = amdgpu_cs_query_fence_status(&ctx->last_submission[ip_type][ring_index],
    845 		                                       1000000000ull, 0, &expired);
    846 
    847 		if (ret || !expired)
    848 			return false;
    849 	}
    850 
    851 	return true;
    852 }
    853 
    854 static struct radeon_winsys_sem *radv_amdgpu_create_sem(struct radeon_winsys *_ws)
    855 {
    856 	int ret;
    857 	amdgpu_semaphore_handle sem;
    858 
    859 	ret = amdgpu_cs_create_semaphore(&sem);
    860 	if (ret)
    861 		return NULL;
    862 	return (struct radeon_winsys_sem *)sem;
    863 }
    864 
    865 static void radv_amdgpu_destroy_sem(struct radeon_winsys_sem *_sem)
    866 {
    867 	amdgpu_semaphore_handle sem = (amdgpu_semaphore_handle)_sem;
    868 	amdgpu_cs_destroy_semaphore(sem);
    869 }
    870 
/* Wire this file's CS/context/fence/semaphore implementations into the
 * winsys vtable. Called once during winsys creation. */
void radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
{
	ws->base.ctx_create = radv_amdgpu_ctx_create;
	ws->base.ctx_destroy = radv_amdgpu_ctx_destroy;
	ws->base.ctx_wait_idle = radv_amdgpu_ctx_wait_idle;
	ws->base.cs_create = radv_amdgpu_cs_create;
	ws->base.cs_destroy = radv_amdgpu_cs_destroy;
	ws->base.cs_grow = radv_amdgpu_cs_grow;
	ws->base.cs_finalize = radv_amdgpu_cs_finalize;
	ws->base.cs_reset = radv_amdgpu_cs_reset;
	ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer;
	ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary;
	ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
	ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
	ws->base.create_fence = radv_amdgpu_create_fence;
	ws->base.destroy_fence = radv_amdgpu_destroy_fence;
	ws->base.create_sem = radv_amdgpu_create_sem;
	ws->base.destroy_sem = radv_amdgpu_destroy_sem;
	ws->base.fence_wait = radv_amdgpu_fence_wait;
}
    891