/*
 * Copyright 2008 Jérôme Glisse
 * Copyright 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 *
 * Based on work from libdrm_radeon by:
 *      Aapo Tahkola <aet@rasterburn.org>
 *      Nicolai Haehnle <prefect_@gmx.net>
 *      Jérôme Glisse <glisse@freedesktop.org>
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for the Radeon DRM.
    Reloc writes and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_reloc(cs, buf, read_domain, write_domain) adds a new relocation
    and also adds the size of 'buf' to the used_gart and used_vram winsys
    variables based on the domains, which are simply OR'd for the accounting
    purposes. The adding is skipped if the reloc is already present in the
    list, but it still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints an error message to stderr.
    (this is done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has already been added
    using cs_add_reloc. The read_domain and write_domain parameters have
    been removed, because they are already specified in cs_add_reloc.
*/
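/*
    A minimal sketch of the call flow described above, as a pipe driver
    might use it. This is illustrative only: 'ws', 'buf', 'my_flush' and
    'my_ctx' are hypothetical, and error handling is omitted.

        struct radeon_winsys_cs *cs = ws->cs_create(ws);
        ws->cs_set_flush_callback(cs, my_flush, my_ctx);

        // Declare the buffer and account for its size in GART/VRAM.
        ws->cs_add_reloc(cs, buf, RADEON_USAGE_READ | RADEON_USAGE_WRITE,
                         RADEON_DOMAIN_VRAM);

        // Check the 0.8 * VRAM/GART limits. On failure, the CS is
        // flushed and the driver re-adds the relocs for this one
        // operation and validates again.
        if (!ws->cs_validate(cs)) {
            ...re-add relocs for this operation and revalidate...
        }

        ...emit packet dwords into the CS...

        // Reference the buffer from the command stream.
        ws->cs_write_reloc(cs, buf);

        ws->cs_flush(cs, 0);
*/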
#include "radeon_drm_cs.h"

#include "util/u_memory.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>

/*
 * These are copied from radeon_drm.h. Once an updated libdrm is released,
 * we should bump the configure.ac requirement for it and remove the
 * following defines.
 */
#ifndef RADEON_CHUNK_ID_FLAGS
#define RADEON_CHUNK_ID_FLAGS       0x03

/* The first dword of RADEON_CHUNK_ID_FLAGS is a uint32 of these flags: */
#define RADEON_CS_KEEP_TILING_FLAGS 0x01
#endif

#ifndef RADEON_CS_USE_VM
#define RADEON_CS_USE_VM            0x02
/* The second dword of RADEON_CHUNK_ID_FLAGS is a uint32 that sets the ring type: */
#define RADEON_CS_RING_GFX          0
#define RADEON_CS_RING_COMPUTE      1
#endif

#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
                                      struct radeon_drm_winsys *ws)
{
    csc->fd = ws->fd;
    csc->nrelocs = 512;
    csc->relocs_bo = (struct radeon_bo**)
                     CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
    if (!csc->relocs_bo) {
        return FALSE;
    }

    csc->relocs = (struct drm_radeon_cs_reloc*)
                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
    if (!csc->relocs) {
        FREE(csc->relocs_bo);
        return FALSE;
    }

    /* Set up the three CS chunks the kernel expects: the command buffer
     * itself (IB), the relocation list, and the flags. */
    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
    return TRUE;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->crelocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i], NULL);
    }

    csc->crelocs = 0;
    csc->validated_crelocs = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;
    csc->used_gart = 0;
    csc->used_vram = 0;
    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}

DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param);

static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
{
    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    pipe_semaphore_init(&cs->flush_queued, 0);
    pipe_semaphore_init(&cs->flush_completed, 0);

    cs->ws = ws;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }
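    /* The two contexts ping-pong: the driver fills cs->csc while the
     * emit thread may still be submitting cs->cst to the kernel;
     * radeon_drm_cs_flush() swaps them (see below). */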
    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.buf = cs->csc->buf;

    p_atomic_inc(&ws->num_cs);
    if (cs->ws->num_cpus > 1 && debug_get_option_thread())
        cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs);
    return &cs->base;
}

#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)

static INLINE void update_reloc_domains(struct drm_radeon_cs_reloc *reloc,
                                        enum radeon_bo_domain rd,
                                        enum radeon_bo_domain wd,
                                        enum radeon_bo_domain *added_domains)
{
    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);

    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
}

int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    /* This mask only works because sizeof(is_handle_added) is
     * a power of two. */
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);

    if (csc->is_handle_added[hash]) {
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle == bo->handle) {
            return i;
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                /* Put this reloc in the hash list.
                 * This will prevent additional hash collisions if there are
                 * several consecutive get_reloc calls for the same buffer.
                 *
                 * Example: Assuming buffers A,B,C collide in the hash list,
                 * the following sequence of relocs:
                 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
                 * will collide here: ^ and here:   ^,
                 * meaning that we should get very few collisions in the end. */
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    return -1;
}

static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
                                 struct radeon_bo *bo,
                                 enum radeon_bo_usage usage,
                                 enum radeon_bo_domain domains,
                                 enum radeon_bo_domain *added_domains)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;

    if (csc->is_handle_added[hash]) {
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle == bo->handle) {
            update_reloc_domains(reloc, rd, wd, added_domains);
            return i;
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                update_reloc_domains(reloc, rd, wd, added_domains);

                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    /* New relocation, check if the backing array is large enough. */
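    /* The reloc arrays grow by a fixed 10 entries at a time. Since realloc
     * may move csc->relocs, chunks[1].chunk_data (the pointer the kernel
     * reads the reloc chunk from) must be refreshed afterwards. */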
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = 0;

    csc->is_handle_added[hash] = TRUE;
    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}

static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
                                        struct radeon_winsys_cs_handle *buf,
                                        enum radeon_bo_usage usage,
                                        enum radeon_bo_domain domains)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    unsigned index = radeon_add_reloc(cs->csc, bo, usage, domains, &added_domains);

    /* Account the buffer size against the domains it newly references. */
    if (added_domains & RADEON_DOMAIN_GTT)
        cs->csc->used_gart += bo->base.size;
    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->csc->used_vram += bo->base.size;

    return index;
}

static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove lately-added relocations. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated relocations. */
        unsigned i;

        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
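        /* The asynchronous flush submits the already-validated relocs, so
         * the pipe driver can retry the failed operation in an empty CS
         * (see the comment at the top of this file). */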
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
        } else {
            radeon_cs_context_cleanup(cs->csc);

            assert(cs->base.cdw == 0);
            if (cs->base.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static boolean radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    /* Note: stricter than the 0.8 threshold used by cs_validate. */
    boolean status =
        (cs->csc->used_gart + gtt) < cs->ws->info.gart_size * 0.7 &&
        (cs->csc->used_vram + vram) < cs->ws->info.vram_size * 0.7;

    return status;
}

static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
                                      struct radeon_winsys_cs_handle *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;

    int index = radeon_get_reloc(cs->csc, bo);

    if (index == -1) {
        fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
        return;
    }

    /* Emit a PM4 type-3 NOP packet whose payload is the dword offset of
     * the reloc in the relocation chunk; the kernel CS checker uses it to
     * patch buffer addresses. */
    OUT_CS(&cs->base, 0xc0001000);
    OUT_CS(&cs->base, index * RELOC_DWORDS);
}

static void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc)
{
    unsigned i;

    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs))) {
        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                            "see dmesg for more information.\n");
        }
    }

    for (i = 0; i < csc->crelocs; i++)
        p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
{
    struct radeon_drm_cs *cs = (struct radeon_drm_cs*)param;

    while (1) {
        pipe_semaphore_wait(&cs->flush_queued);
        if (cs->kill_thread)
            break;
        radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        pipe_semaphore_signal(&cs->flush_completed);
    }
    pipe_semaphore_signal(&cs->flush_completed);
    return NULL;
}

void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
{
    /* Wait for any pending ioctl to complete. */
    if (cs->thread && cs->flush_started) {
        pipe_semaphore_wait(&cs->flush_completed);
        cs->flush_started = 0;
    }
}

static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    radeon_drm_cs_sync_flush(cs);

    /* Flip command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty and didn't overflow, emit it, possibly in
     * a separate thread. */
    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS) {
        unsigned i, crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
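            /* While num_active_ioctls is non-zero, the buffer is owned by an
             * in-flight CS ioctl; the buffer code is expected to sync on it
             * (e.g. before mapping) rather than touch the buffer directly. */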
            p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
        }

        /* Set up the flags chunk. It is only submitted (num_chunks == 3)
         * when at least one flag or a non-GFX ring must be set. */
        cs->cst->flags[0] = 0;
        cs->cst->flags[1] = RADEON_CS_RING_GFX;
        cs->cst->cs.num_chunks = 2;
        if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
            cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->cs.num_chunks = 3;
        }
        if (cs->ws->info.r600_virtual_address) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
        }
        if (flags & RADEON_FLUSH_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
        }

        if (cs->thread &&
            (flags & RADEON_FLUSH_ASYNC)) {
            cs->flush_started = 1;
            pipe_semaphore_signal(&cs->flush_queued);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(cs);
    if (cs->thread) {
        cs->kill_thread = 1;
        pipe_semaphore_signal(&cs->flush_queued);
        pipe_semaphore_wait(&cs->flush_completed);
        pipe_thread_wait(cs->thread);
    }
    pipe_semaphore_destroy(&cs->flush_queued);
    pipe_semaphore_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    FREE(cs);
}

static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
                                    void (*flush)(void *ctx, unsigned flags),
                                    void *user)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    cs->flush_cs = flush;
    cs->flush_data = user;
}

static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                       struct radeon_winsys_cs_handle *_buf,
                                       enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return FALSE;

    index = radeon_get_reloc(cs->csc, bo);
    if (index == -1)
        return FALSE;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return TRUE;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return TRUE;

    return FALSE;
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_memory_below_limit = radeon_drm_cs_memory_below_limit;
    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_set_flush_callback = radeon_drm_cs_set_flush;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
}