/*
 * Copyright 2011 Marek Olšák <maraeo@gmail.com>
 * Copyright 2015 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

#include "amdgpu_cs.h"

#include "util/os_time.h"
#include "state_tracker/drm_driver.h"
#include <amdgpu_drm.h>
#include <xf86drm.h>
#include <stdio.h>
#include <inttypes.h>

#ifndef AMDGPU_GEM_CREATE_VM_ALWAYS_VALID
#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6)
#endif

/* Set to 1 for verbose output showing committed sparse buffer ranges. */
#define DEBUG_SPARSE_COMMITS 0

struct amdgpu_sparse_backing_chunk {
   uint32_t begin, end;
};

static struct pb_buffer *
amdgpu_bo_create(struct radeon_winsys *rws,
                 uint64_t size,
                 unsigned alignment,
                 enum radeon_bo_domain domain,
                 enum radeon_bo_flag flags);

static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
                           enum radeon_bo_usage usage)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   struct amdgpu_winsys *ws = bo->ws;
   int64_t abs_timeout;

   if (timeout == 0) {
      if (p_atomic_read(&bo->num_active_ioctls))
         return false;

   } else {
      abs_timeout = os_time_get_absolute_timeout(timeout);

      /* Wait if any ioctl is being submitted with this buffer. */
      if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
         return false;
   }

   if (bo->is_shared) {
      /* We can't use user fences for shared buffers, because user fences
       * are local to this process only. If we want to wait for all buffer
       * uses in all processes, we have to use amdgpu_bo_wait_for_idle.
       */
      bool buffer_busy = true;
      int r;

      r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
      if (r)
         fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
                 r);
      return !buffer_busy;
   }

   if (timeout == 0) {
      unsigned idle_fences;
      bool buffer_idle;

      simple_mtx_lock(&ws->bo_fence_lock);

      for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
         if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
            break;
      }

      /* Release the idle fences to avoid checking them again later. */
      for (unsigned i = 0; i < idle_fences; ++i)
         amdgpu_fence_reference(&bo->fences[i], NULL);

      memmove(&bo->fences[0], &bo->fences[idle_fences],
              (bo->num_fences - idle_fences) * sizeof(*bo->fences));
      bo->num_fences -= idle_fences;

      buffer_idle = !bo->num_fences;
      simple_mtx_unlock(&ws->bo_fence_lock);

      return buffer_idle;
   } else {
      bool buffer_idle = true;

      simple_mtx_lock(&ws->bo_fence_lock);
      while (bo->num_fences && buffer_idle) {
         struct pipe_fence_handle *fence = NULL;
         bool fence_idle = false;

         amdgpu_fence_reference(&fence, bo->fences[0]);

         /* Wait for the fence. */
         simple_mtx_unlock(&ws->bo_fence_lock);
         if (amdgpu_fence_wait(fence, abs_timeout, true))
            fence_idle = true;
         else
            buffer_idle = false;
         simple_mtx_lock(&ws->bo_fence_lock);

         /* Release an idle fence to avoid checking it again later, keeping in
          * mind that the fence array may have been modified by other threads.
          */
         if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
            amdgpu_fence_reference(&bo->fences[0], NULL);
            memmove(&bo->fences[0], &bo->fences[1],
                    (bo->num_fences - 1) * sizeof(*bo->fences));
            bo->num_fences--;
         }

         amdgpu_fence_reference(&fence, NULL);
      }
      simple_mtx_unlock(&ws->bo_fence_lock);

      return buffer_idle;
   }
}
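
/*
 * Usage sketch (illustrative, not a new interface): a zero timeout makes
 * amdgpu_bo_wait() a non-blocking busy query, while a non-zero timeout (in
 * nanoseconds, or PIPE_TIMEOUT_INFINITE) blocks until the buffer is idle:
 *
 *    if (amdgpu_bo_wait(buf, 0, RADEON_USAGE_READWRITE)) {
 *       ... buffer is idle: safe to reuse or map without stalling ...
 *    }
 *
 * amdgpu_bo_can_reclaim() below uses exactly this non-blocking form.
 */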

static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
      struct pb_buffer *buf)
{
   return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
}

static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
{
   for (unsigned i = 0; i < bo->num_fences; ++i)
      amdgpu_fence_reference(&bo->fences[i], NULL);

   FREE(bo->fences);
   bo->num_fences = 0;
   bo->max_fences = 0;
}

void amdgpu_bo_destroy(struct pb_buffer *_buf)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);

   assert(bo->bo && "must not be called for slab entries");

   if (bo->ws->debug_all_bos) {
      simple_mtx_lock(&bo->ws->global_bo_list_lock);
      LIST_DEL(&bo->u.real.global_list_item);
      bo->ws->num_buffers--;
      simple_mtx_unlock(&bo->ws->global_bo_list_lock);
   }

   amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
   amdgpu_va_range_free(bo->u.real.va_handle);
   amdgpu_bo_free(bo->bo);

   amdgpu_bo_remove_fences(bo);

   if (bo->initial_domain & RADEON_DOMAIN_VRAM)
      bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->info.gart_page_size);
   else if (bo->initial_domain & RADEON_DOMAIN_GTT)
      bo->ws->allocated_gtt -= align64(bo->base.size, bo->ws->info.gart_page_size);

   if (bo->u.real.map_count >= 1) {
      if (bo->initial_domain & RADEON_DOMAIN_VRAM)
         bo->ws->mapped_vram -= bo->base.size;
      else if (bo->initial_domain & RADEON_DOMAIN_GTT)
         bo->ws->mapped_gtt -= bo->base.size;
      bo->ws->num_mapped_buffers--;
   }

   FREE(bo);
}

static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);

   assert(bo->bo); /* slab buffers have a separate vtbl */

   if (bo->u.real.use_reusable_pool)
      pb_cache_add_buffer(&bo->u.real.cache_entry);
   else
      amdgpu_bo_destroy(_buf);
}

static void *amdgpu_bo_map(struct pb_buffer *buf,
                           struct radeon_winsys_cs *rcs,
                           enum pipe_transfer_usage usage)
{
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
   struct amdgpu_winsys_bo *real;
   struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
   int r;
   void *cpu = NULL;
   uint64_t offset = 0;

   assert(!bo->sparse);

   /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
   if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
      /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
      if (usage & PIPE_TRANSFER_DONTBLOCK) {
         if (!(usage & PIPE_TRANSFER_WRITE)) {
            /* Mapping for read.
             *
             * Since we are mapping for read, we don't need to wait
             * if the GPU is using the buffer for read too
             * (neither one is changing it).
             *
             * Only check whether the buffer is being used for write. */
            if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                               RADEON_USAGE_WRITE)) {
               cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
               return NULL;
            }

            if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
                                RADEON_USAGE_WRITE)) {
               return NULL;
            }
         } else {
            if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
               cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
               return NULL;
            }

            if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
                                RADEON_USAGE_READWRITE)) {
               return NULL;
            }
         }
      } else {
         uint64_t time = os_time_get_nano();

         if (!(usage & PIPE_TRANSFER_WRITE)) {
            /* Mapping for read.
             *
             * Since we are mapping for read, we don't need to wait
             * if the GPU is using the buffer for read too
             * (neither one is changing it).
             *
             * Only check whether the buffer is being used for write. */
            if (cs) {
               if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                            RADEON_USAGE_WRITE)) {
                  cs->flush_cs(cs->flush_data, 0, NULL);
               } else {
                  /* Try to avoid busy-waiting in amdgpu_bo_wait. */
                  if (p_atomic_read(&bo->num_active_ioctls))
                     amdgpu_cs_sync_flush(rcs);
               }
            }

            amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                           RADEON_USAGE_WRITE);
         } else {
            /* Mapping for write. */
            if (cs) {
               if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
                  cs->flush_cs(cs->flush_data, 0, NULL);
               } else {
                  /* Try to avoid busy-waiting in amdgpu_bo_wait. */
                  if (p_atomic_read(&bo->num_active_ioctls))
                     amdgpu_cs_sync_flush(rcs);
               }
            }

            amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                           RADEON_USAGE_READWRITE);
         }

         bo->ws->buffer_wait_time += os_time_get_nano() - time;
      }
   }

   /* If the buffer is created from user memory, return the user pointer. */
   if (bo->user_ptr)
      return bo->user_ptr;

   if (bo->bo) {
      real = bo;
   } else {
      real = bo->u.slab.real;
      offset = bo->va - real->va;
   }

   r = amdgpu_bo_cpu_map(real->bo, &cpu);
   if (r) {
      /* Clear the cache and try again. */
      pb_cache_release_all_buffers(&real->ws->bo_cache);
      r = amdgpu_bo_cpu_map(real->bo, &cpu);
      if (r)
         return NULL;
   }

   if (p_atomic_inc_return(&real->u.real.map_count) == 1) {
      if (real->initial_domain & RADEON_DOMAIN_VRAM)
         real->ws->mapped_vram += real->base.size;
      else if (real->initial_domain & RADEON_DOMAIN_GTT)
         real->ws->mapped_gtt += real->base.size;
      real->ws->num_mapped_buffers++;
   }
   return (uint8_t*)cpu + offset;
}

static void amdgpu_bo_unmap(struct pb_buffer *buf)
{
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
   struct amdgpu_winsys_bo *real;

   assert(!bo->sparse);

   if (bo->user_ptr)
      return;

   real = bo->bo ? bo : bo->u.slab.real;

   if (p_atomic_dec_zero(&real->u.real.map_count)) {
      if (real->initial_domain & RADEON_DOMAIN_VRAM)
         real->ws->mapped_vram -= real->base.size;
      else if (real->initial_domain & RADEON_DOMAIN_GTT)
         real->ws->mapped_gtt -= real->base.size;
      real->ws->num_mapped_buffers--;
   }

   amdgpu_bo_cpu_unmap(real->bo);
}

static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
   amdgpu_bo_destroy_or_cache
   /* other functions are never called */
};

static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo)
{
   struct amdgpu_winsys *ws = bo->ws;

   assert(bo->bo);

   if (ws->debug_all_bos) {
      simple_mtx_lock(&ws->global_bo_list_lock);
      LIST_ADDTAIL(&bo->u.real.global_list_item, &ws->global_bo_list);
      ws->num_buffers++;
      simple_mtx_unlock(&ws->global_bo_list_lock);
   }
}

static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
                                                 uint64_t size,
                                                 unsigned alignment,
                                                 unsigned usage,
                                                 enum radeon_bo_domain initial_domain,
                                                 unsigned flags,
                                                 unsigned pb_cache_bucket)
{
   struct amdgpu_bo_alloc_request request = {0};
   amdgpu_bo_handle buf_handle;
   uint64_t va = 0;
   struct amdgpu_winsys_bo *bo;
   amdgpu_va_handle va_handle;
   unsigned va_gap_size;
   int r;

   /* VRAM or GTT must be specified, but not both at the same time. */
   assert(util_bitcount(initial_domain & RADEON_DOMAIN_VRAM_GTT) == 1);

   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   if (!bo) {
      return NULL;
   }

   pb_cache_init_entry(&ws->bo_cache, &bo->u.real.cache_entry, &bo->base,
                       pb_cache_bucket);
   request.alloc_size = size;
   request.phys_alignment = alignment;

   if (initial_domain & RADEON_DOMAIN_VRAM)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
   if (initial_domain & RADEON_DOMAIN_GTT)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;

   /* If VRAM is just stolen system memory, allow both VRAM and
    * GTT, whichever has free space. If a buffer is evicted from
    * VRAM to GTT, it will stay there.
    *
    * DRM 3.6.0 has good BO move throttling, so we can allow VRAM-only
    * placements even with a low amount of stolen VRAM.
    */
   if (!ws->info.has_dedicated_vram && ws->info.drm_minor < 6)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;

   if (flags & RADEON_FLAG_NO_CPU_ACCESS)
      request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
   if (flags & RADEON_FLAG_GTT_WC)
      request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
   /* TODO: Enable this once the kernel handles it efficiently. */
   /*if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
       ws->info.drm_minor >= 20)
      request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;*/

   r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
   if (r) {
      fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
      fprintf(stderr, "amdgpu:    size      : %"PRIu64" bytes\n", size);
      fprintf(stderr, "amdgpu:    alignment : %u bytes\n", alignment);
      fprintf(stderr, "amdgpu:    domains   : %u\n", initial_domain);
      goto error_bo_alloc;
   }

   va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
   if (size > ws->info.pte_fragment_size)
      alignment = MAX2(alignment, ws->info.pte_fragment_size);
   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                             size + va_gap_size, alignment, 0, &va, &va_handle, 0);
   if (r)
      goto error_va_alloc;

   unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
                       AMDGPU_VM_PAGE_EXECUTABLE;

   if (!(flags & RADEON_FLAG_READ_ONLY))
      vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;

   r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,
                           AMDGPU_VA_OP_MAP);
   if (r)
      goto error_va_map;

   pipe_reference_init(&bo->base.reference, 1);
   bo->base.alignment = alignment;
   bo->base.usage = usage;
   bo->base.size = size;
   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
   bo->ws = ws;
   bo->bo = buf_handle;
   bo->va = va;
   bo->u.real.va_handle = va_handle;
   bo->initial_domain = initial_domain;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
   bo->is_local = !!(request.flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID);

   if (initial_domain & RADEON_DOMAIN_VRAM)
      ws->allocated_vram += align64(size, ws->info.gart_page_size);
   else if (initial_domain & RADEON_DOMAIN_GTT)
      ws->allocated_gtt += align64(size, ws->info.gart_page_size);

   amdgpu_add_buffer_to_global_list(bo);

   return bo;

error_va_map:
   amdgpu_va_range_free(va_handle);

error_va_alloc:
   amdgpu_bo_free(buf_handle);

error_bo_alloc:
   FREE(bo);
   return NULL;
}
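
/*
 * Note on the check_vm gap above: with VM checking enabled, every buffer is
 * followed by at least MAX2(4 * alignment, 64KB) of unallocated VA, so an
 * out-of-bounds GPU access presumably faults in the gap instead of silently
 * landing in whichever buffer happens to be mapped next.
 */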

bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);

   if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
      return false;
   }

   return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
}

bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
{
   struct amdgpu_winsys_bo *bo = NULL; /* fix container_of */
   bo = container_of(entry, bo, u.slab.entry);

   return amdgpu_bo_can_reclaim(&bo->base);
}

static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);

   assert(!bo->bo);

   pb_slab_free(&bo->ws->bo_slabs, &bo->u.slab.entry);
}

static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
   amdgpu_bo_slab_destroy
   /* other functions are never called */
};

struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
                                     unsigned entry_size,
                                     unsigned group_index)
{
   struct amdgpu_winsys *ws = priv;
   struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
   enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
   enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
   uint32_t base_id;

   if (!slab)
      return NULL;

   unsigned slab_size = 1 << AMDGPU_SLAB_BO_SIZE_LOG2;
   slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base,
                                                    slab_size, slab_size,
                                                    domains, flags));
   if (!slab->buffer)
      goto fail;

   assert(slab->buffer->bo);

   slab->base.num_entries = slab->buffer->base.size / entry_size;
   slab->base.num_free = slab->base.num_entries;
   slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
   if (!slab->entries)
      goto fail_buffer;

   LIST_INITHEAD(&slab->base.free);

   base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);

   for (unsigned i = 0; i < slab->base.num_entries; ++i) {
      struct amdgpu_winsys_bo *bo = &slab->entries[i];

      bo->base.alignment = entry_size;
      bo->base.usage = slab->buffer->base.usage;
      bo->base.size = entry_size;
      bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
      bo->ws = ws;
      bo->va = slab->buffer->va + i * entry_size;
      bo->initial_domain = domains;
      bo->unique_id = base_id + i;
      bo->u.slab.entry.slab = &slab->base;
      bo->u.slab.entry.group_index = group_index;
      bo->u.slab.real = slab->buffer;

      LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free);
   }

   return &slab->base;

fail_buffer:
   amdgpu_winsys_bo_reference(&slab->buffer, NULL);
fail:
   FREE(slab);
   return NULL;
}

void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
{
   struct amdgpu_slab *slab = amdgpu_slab(pslab);

   for (unsigned i = 0; i < slab->base.num_entries; ++i)
      amdgpu_bo_remove_fences(&slab->entries[i]);

   FREE(slab->entries);
   amdgpu_winsys_bo_reference(&slab->buffer, NULL);
   FREE(slab);
}
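
/*
 * Worked example with hypothetical numbers: if AMDGPU_SLAB_BO_SIZE_LOG2 were
 * 17, each slab would be backed by one 128 KB buffer, so a heap with 4 KB
 * entries would carve it into 128 KB / 4 KB = 32 fake amdgpu_winsys_bo
 * entries, each pointing at VA slab->buffer->va + i * entry_size and sharing
 * the parent buffer's storage.
 */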

#if DEBUG_SPARSE_COMMITS
static void
sparse_dump(struct amdgpu_winsys_bo *bo, const char *func)
{
   fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n"
                   "Commitments:\n",
           __func__, bo, bo->base.size, bo->u.sparse.num_va_pages, func);

   struct amdgpu_sparse_backing *span_backing = NULL;
   uint32_t span_first_backing_page = 0;
   uint32_t span_first_va_page = 0;
   uint32_t va_page = 0;

   for (;;) {
      struct amdgpu_sparse_backing *backing = 0;
      uint32_t backing_page = 0;

      if (va_page < bo->u.sparse.num_va_pages) {
         backing = bo->u.sparse.commitments[va_page].backing;
         backing_page = bo->u.sparse.commitments[va_page].page;
      }

      if (span_backing &&
          (backing != span_backing ||
           backing_page != span_first_backing_page + (va_page - span_first_va_page))) {
         fprintf(stderr, "   %u..%u: backing=%p:%u..%u\n",
                 span_first_va_page, va_page - 1, span_backing,
                 span_first_backing_page,
                 span_first_backing_page + (va_page - span_first_va_page) - 1);

         span_backing = NULL;
      }

      if (va_page >= bo->u.sparse.num_va_pages)
         break;

      if (backing && !span_backing) {
         span_backing = backing;
         span_first_backing_page = backing_page;
         span_first_va_page = va_page;
      }

      va_page++;
   }

   fprintf(stderr, "Backing:\n");

   list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
      fprintf(stderr, "   %p (size=%"PRIu64")\n", backing, backing->bo->base.size);
      for (unsigned i = 0; i < backing->num_chunks; ++i)
         fprintf(stderr, "      %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end);
   }
}
#endif

/*
 * Attempt to allocate the given number of backing pages. Fewer pages may be
 * allocated (depending on the fragmentation of existing backing buffers),
 * which will be reflected by a change to *pnum_pages.
 */
static struct amdgpu_sparse_backing *
sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages)
{
   struct amdgpu_sparse_backing *best_backing;
   unsigned best_idx;
   uint32_t best_num_pages;

   best_backing = NULL;
   best_idx = 0;
   best_num_pages = 0;

   /* This is a very simple and inefficient best-fit algorithm. */
   list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
      for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
         uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
         if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
             (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
            best_backing = backing;
            best_idx = idx;
            best_num_pages = cur_num_pages;
         }
      }
   }

   /* Allocate a new backing buffer if necessary. */
   if (!best_backing) {
      struct pb_buffer *buf;
      uint64_t size;
      uint32_t pages;

      best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
      if (!best_backing)
         return NULL;

      best_backing->max_chunks = 4;
      best_backing->chunks = CALLOC(best_backing->max_chunks,
                                    sizeof(*best_backing->chunks));
      if (!best_backing->chunks) {
         FREE(best_backing);
         return NULL;
      }

      assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE));

      size = MIN3(bo->base.size / 16,
                  8 * 1024 * 1024,
                  bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
      size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);

      buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE,
                             bo->initial_domain,
                             bo->u.sparse.flags | RADEON_FLAG_NO_SUBALLOC);
      if (!buf) {
         FREE(best_backing->chunks);
         FREE(best_backing);
         return NULL;
      }

      /* We might have gotten a bigger buffer than requested via caching. */
      pages = buf->size / RADEON_SPARSE_PAGE_SIZE;

      best_backing->bo = amdgpu_winsys_bo(buf);
      best_backing->num_chunks = 1;
      best_backing->chunks[0].begin = 0;
      best_backing->chunks[0].end = pages;

      list_add(&best_backing->list, &bo->u.sparse.backing);
      bo->u.sparse.num_backing_pages += pages;

      best_idx = 0;
      best_num_pages = pages;
   }

   *pnum_pages = MIN2(*pnum_pages, best_num_pages);
   *pstart_page = best_backing->chunks[best_idx].begin;
   best_backing->chunks[best_idx].begin += *pnum_pages;

   if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
      memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
              sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
      best_backing->num_chunks--;
   }

   return best_backing;
}
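
/*
 * Example of the best-fit scan above (hypothetical free lists): for a
 * request of *pnum_pages = 8, visiting free chunks of 4, 16 and 9 pages in
 * that order settles on the 9-page chunk (covers the request with the least
 * waste seen so far). The scan is order-dependent and may also settle on a
 * chunk smaller than the request; in that case *pnum_pages shrinks and the
 * caller (amdgpu_bo_sparse_commit) simply loops for the remainder.
 */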

static void
sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo,
                           struct amdgpu_sparse_backing *backing)
{
   struct amdgpu_winsys *ws = backing->bo->ws;

   bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE;

   simple_mtx_lock(&ws->bo_fence_lock);
   amdgpu_add_fences(backing->bo, bo->num_fences, bo->fences);
   simple_mtx_unlock(&ws->bo_fence_lock);

   list_del(&backing->list);
   amdgpu_winsys_bo_reference(&backing->bo, NULL);
   FREE(backing->chunks);
   FREE(backing);
}

/*
 * Return a range of pages from the given backing buffer back into the
 * free structure.
 */
static bool
sparse_backing_free(struct amdgpu_winsys_bo *bo,
                    struct amdgpu_sparse_backing *backing,
                    uint32_t start_page, uint32_t num_pages)
{
   uint32_t end_page = start_page + num_pages;
   unsigned low = 0;
   unsigned high = backing->num_chunks;

   /* Find the first chunk with begin >= start_page. */
   while (low < high) {
      unsigned mid = low + (high - low) / 2;

      if (backing->chunks[mid].begin >= start_page)
         high = mid;
      else
         low = mid + 1;
   }

   assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
   assert(low == 0 || backing->chunks[low - 1].end <= start_page);

   if (low > 0 && backing->chunks[low - 1].end == start_page) {
      backing->chunks[low - 1].end = end_page;

      if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
         backing->chunks[low - 1].end = backing->chunks[low].end;
         memmove(&backing->chunks[low], &backing->chunks[low + 1],
                 sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
         backing->num_chunks--;
      }
   } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
      backing->chunks[low].begin = start_page;
   } else {
      if (backing->num_chunks >= backing->max_chunks) {
         unsigned new_max_chunks = 2 * backing->max_chunks;
         struct amdgpu_sparse_backing_chunk *new_chunks =
            REALLOC(backing->chunks,
                    sizeof(*backing->chunks) * backing->max_chunks,
                    sizeof(*backing->chunks) * new_max_chunks);
         if (!new_chunks)
            return false;

         backing->max_chunks = new_max_chunks;
         backing->chunks = new_chunks;
      }

      memmove(&backing->chunks[low + 1], &backing->chunks[low],
              sizeof(*backing->chunks) * (backing->num_chunks - low));
      backing->chunks[low].begin = start_page;
      backing->chunks[low].end = end_page;
      backing->num_chunks++;
   }

   if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
       backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE)
      sparse_free_backing_buffer(bo, backing);

   return true;
}
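
/*
 * The three branches above are the standard interval-coalescing cases for
 * the sorted chunk array: the freed range [start_page, end_page) either
 * extends the previous chunk (and may bridge it with the following one),
 * extends the following chunk downwards, or is inserted at position 'low'
 * as a new chunk, growing the array when needed. Once the whole backing
 * buffer is free again, it is released entirely.
 */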

static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   int r;

   assert(!bo->bo && bo->sparse);

   r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
                           (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
                           bo->va, 0, AMDGPU_VA_OP_CLEAR);
   if (r) {
      fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
   }

   while (!list_empty(&bo->u.sparse.backing)) {
      struct amdgpu_sparse_backing *dummy = NULL;
      sparse_free_backing_buffer(bo,
                                 container_of(bo->u.sparse.backing.next,
                                              dummy, list));
   }

   amdgpu_va_range_free(bo->u.sparse.va_handle);
   simple_mtx_destroy(&bo->u.sparse.commit_lock);
   FREE(bo->u.sparse.commitments);
   FREE(bo);
}

static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = {
   amdgpu_bo_sparse_destroy
   /* other functions are never called */
};

static struct pb_buffer *
amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
                        enum radeon_bo_domain domain,
                        enum radeon_bo_flag flags)
{
   struct amdgpu_winsys_bo *bo;
   uint64_t map_size;
   uint64_t va_gap_size;
   int r;

   /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
    * that exceed this limit. This is not really a restriction: we don't have
    * that much virtual address space anyway.
    */
   if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
      return NULL;

   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   if (!bo)
      return NULL;

   pipe_reference_init(&bo->base.reference, 1);
   bo->base.alignment = RADEON_SPARSE_PAGE_SIZE;
   bo->base.size = size;
   bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl;
   bo->ws = ws;
   bo->initial_domain = domain;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
   bo->sparse = true;
   bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE;

   bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
   bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages,
                                     sizeof(*bo->u.sparse.commitments));
   if (!bo->u.sparse.commitments)
      goto error_alloc_commitments;

   simple_mtx_init(&bo->u.sparse.commit_lock, mtx_plain);
   LIST_INITHEAD(&bo->u.sparse.backing);

   /* For simplicity, we always map a multiple of the page size. */
   map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
   va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                             map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
                             0, &bo->va, &bo->u.sparse.va_handle, 0);
   if (r)
      goto error_va_alloc;

   r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va,
                           AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
   if (r)
      goto error_va_map;

   return &bo->base;

error_va_map:
   amdgpu_va_range_free(bo->u.sparse.va_handle);
error_va_alloc:
   simple_mtx_destroy(&bo->u.sparse.commit_lock);
   FREE(bo->u.sparse.commitments);
error_alloc_commitments:
   FREE(bo);
   return NULL;
}
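
/*
 * After amdgpu_bo_sparse_create() the entire VA range is mapped with
 * AMDGPU_VM_PAGE_PRT and no backing memory, i.e. as a partially resident
 * region. On kernels and hardware with PRT support, accesses to uncommitted
 * pages are expected not to trigger VM faults (reads presumably return
 * zero); amdgpu_bo_sparse_commit() below swaps sub-ranges of this mapping
 * for real backing pages.
 */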

static bool
amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
                        bool commit)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
   struct amdgpu_sparse_commitment *comm;
   uint32_t va_page, end_va_page;
   bool ok = true;
   int r;

   assert(bo->sparse);
   assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
   assert(offset <= bo->base.size);
   assert(size <= bo->base.size - offset);
   assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size);

   comm = bo->u.sparse.commitments;
   va_page = offset / RADEON_SPARSE_PAGE_SIZE;
   end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);

   simple_mtx_lock(&bo->u.sparse.commit_lock);

#if DEBUG_SPARSE_COMMITS
   sparse_dump(bo, __func__);
#endif

   if (commit) {
      while (va_page < end_va_page) {
         uint32_t span_va_page;

         /* Skip pages that are already committed. */
         if (comm[va_page].backing) {
            va_page++;
            continue;
         }

         /* Determine length of uncommitted span. */
         span_va_page = va_page;
         while (va_page < end_va_page && !comm[va_page].backing)
            va_page++;

         /* Fill the uncommitted span with chunks of backing memory. */
         while (span_va_page < va_page) {
            struct amdgpu_sparse_backing *backing;
            uint32_t backing_start, backing_size;

            backing_size = va_page - span_va_page;
            backing = sparse_backing_alloc(bo, &backing_start, &backing_size);
            if (!backing) {
               ok = false;
               goto out;
            }

            r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo,
                                    (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
                                    (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
                                    bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
                                    AMDGPU_VM_PAGE_READABLE |
                                    AMDGPU_VM_PAGE_WRITEABLE |
                                    AMDGPU_VM_PAGE_EXECUTABLE,
                                    AMDGPU_VA_OP_REPLACE);
            if (r) {
               ok = sparse_backing_free(bo, backing, backing_start, backing_size);
               assert(ok && "sufficient memory should already be allocated");

               ok = false;
               goto out;
            }

            while (backing_size) {
               comm[span_va_page].backing = backing;
               comm[span_va_page].page = backing_start;
               span_va_page++;
               backing_start++;
               backing_size--;
            }
         }
      }
   } else {
      r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
                              (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
                              bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
                              AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
      if (r) {
         ok = false;
         goto out;
      }

      while (va_page < end_va_page) {
         struct amdgpu_sparse_backing *backing;
         uint32_t backing_start;
         uint32_t span_pages;

         /* Skip pages that are already uncommitted. */
         if (!comm[va_page].backing) {
            va_page++;
            continue;
         }

         /* Group contiguous spans of pages. */
         backing = comm[va_page].backing;
         backing_start = comm[va_page].page;
         comm[va_page].backing = NULL;

         span_pages = 1;
         va_page++;

         while (va_page < end_va_page &&
                comm[va_page].backing == backing &&
                comm[va_page].page == backing_start + span_pages) {
            comm[va_page].backing = NULL;
            va_page++;
            span_pages++;
         }

         if (!sparse_backing_free(bo, backing, backing_start, span_pages)) {
            /* Couldn't allocate tracking data structures, so we have to leak */
            fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
            ok = false;
         }
      }
   }
out:

   simple_mtx_unlock(&bo->u.sparse.commit_lock);

   return ok;
}
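
/*
 * Commit sketch (hypothetical values): committing pages [0, 3) of a fresh
 * sparse buffer finds one 3-page uncommitted span, takes up to 3 pages from
 * a backing buffer (allocating one on demand), points the span at them with
 * AMDGPU_VA_OP_REPLACE and records backing/page in
 * bo->u.sparse.commitments[]. Decommitting first flips the whole range back
 * to a PRT-only mapping, then walks the commitments and returns contiguous
 * page runs via sparse_backing_free().
 */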

static unsigned eg_tile_split(unsigned tile_split)
{
   switch (tile_split) {
   case 0:  tile_split = 64;    break;
   case 1:  tile_split = 128;   break;
   case 2:  tile_split = 256;   break;
   case 3:  tile_split = 512;   break;
   default:
   case 4:  tile_split = 1024;  break;
   case 5:  tile_split = 2048;  break;
   case 6:  tile_split = 4096;  break;
   }
   return tile_split;
}

static unsigned eg_tile_split_rev(unsigned eg_tile_split)
{
   switch (eg_tile_split) {
   case 64:    return 0;
   case 128:   return 1;
   case 256:   return 2;
   case 512:   return 3;
   default:
   case 1024:  return 4;
   case 2048:  return 5;
   case 4096:  return 6;
   }
}

static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf,
                                       struct radeon_bo_metadata *md)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   struct amdgpu_bo_info info = {0};
   uint64_t tiling_flags;
   int r;

   assert(bo->bo && "must not be called for slab entries");

   r = amdgpu_bo_query_info(bo->bo, &info);
   if (r)
      return;

   tiling_flags = info.metadata.tiling_info;

   if (bo->ws->info.chip_class >= GFX9) {
      md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE);
   } else {
      md->u.legacy.microtile = RADEON_LAYOUT_LINEAR;
      md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR;

      if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4)  /* 2D_TILED_THIN1 */
         md->u.legacy.macrotile = RADEON_LAYOUT_TILED;
      else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
         md->u.legacy.microtile = RADEON_LAYOUT_TILED;

      md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG);
      md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
      md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
      md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
      md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
      md->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS);
      md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
   }

   md->size_metadata = info.metadata.size_metadata;
   memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
}
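
/*
 * eg_tile_split() and eg_tile_split_rev() above translate the TILE_SPLIT
 * tiling field to a byte count and back; they are inverses on the defined
 * values, e.g. eg_tile_split_rev(eg_tile_split(3)) == 3 (512 bytes).
 */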

static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
                                       struct radeon_bo_metadata *md)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   struct amdgpu_bo_metadata metadata = {0};
   uint64_t tiling_flags = 0;

   assert(bo->bo && "must not be called for slab entries");

   if (bo->ws->info.chip_class >= GFX9) {
      tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode);
   } else {
      if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED)
         tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
      else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED)
         tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
      else
         tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */

      tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config);
      tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw));
      tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh));
      if (md->u.legacy.tile_split)
         tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->u.legacy.tile_split));
      tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea));
      tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks)-1);

      if (md->u.legacy.scanout)
         tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
      else
         tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
   }

   metadata.tiling_info = tiling_flags;
   metadata.size_metadata = md->size_metadata;
   memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));

   amdgpu_bo_set_metadata(bo->bo, &metadata);
}

static struct pb_buffer *
amdgpu_bo_create(struct radeon_winsys *rws,
                 uint64_t size,
                 unsigned alignment,
                 enum radeon_bo_domain domain,
                 enum radeon_bo_flag flags)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo;
   unsigned usage = 0, pb_cache_bucket = 0;

   /* VRAM implies WC. This is not optional. */
   assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);

   /* NO_CPU_ACCESS is valid with VRAM only. */
   assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));

   /* Sparse buffers must have NO_CPU_ACCESS set. */
   assert(!(flags & RADEON_FLAG_SPARSE) || flags & RADEON_FLAG_NO_CPU_ACCESS);

   /* Sub-allocate small buffers from slabs. */
   if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
       size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
       alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
      struct pb_slab_entry *entry;
      int heap = radeon_get_heap_index(domain, flags);

      if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS)
         goto no_slab;

      entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
      if (!entry) {
         /* Clear the cache and try again. */
         pb_cache_release_all_buffers(&ws->bo_cache);

         entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
      }
      if (!entry)
         return NULL;

      bo = NULL;
      bo = container_of(entry, bo, u.slab.entry);

      pipe_reference_init(&bo->base.reference, 1);

      return &bo->base;
   }
no_slab:

   if (flags & RADEON_FLAG_SPARSE) {
      assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);

      return amdgpu_bo_sparse_create(ws, size, domain, flags);
   }

   /* This flag is irrelevant for the cache. */
   flags &= ~RADEON_FLAG_NO_SUBALLOC;

   /* Align size to page size. This is the minimum alignment for normal
    * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
    * like constant/uniform buffers, can benefit from better and more reuse.
    */
   size = align64(size, ws->info.gart_page_size);
   alignment = align(alignment, ws->info.gart_page_size);

   bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;

   if (use_reusable_pool) {
      int heap = radeon_get_heap_index(domain, flags);
      assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
      usage = 1 << heap; /* Only set one usage bit for each heap. */

      pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
      assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));

      /* Get a buffer from the cache. */
      bo = (struct amdgpu_winsys_bo*)
           pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
                                   pb_cache_bucket);
      if (bo)
         return &bo->base;
   }

   /* Create a new one. */
   bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
                         pb_cache_bucket);
   if (!bo) {
      /* Clear the cache and try again. */
      pb_slabs_reclaim(&ws->bo_slabs);
      pb_cache_release_all_buffers(&ws->bo_cache);
      bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
                            pb_cache_bucket);
      if (!bo)
         return NULL;
   }

   bo->u.real.use_reusable_pool = use_reusable_pool;
   return &bo->base;
}
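
/*
 * Allocation strategy of amdgpu_bo_create() in short: small non-sparse
 * requests are sub-allocated from slabs, sparse requests get a PRT VA range,
 * and everything else goes through the reusable pb_cache when interprocess
 * sharing is ruled out; on failure the slabs and cache are emptied once and
 * the allocation retried before giving up.
 */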

static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
                                               struct winsys_handle *whandle,
                                               unsigned *stride,
                                               unsigned *offset)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo;
   enum amdgpu_bo_handle_type type;
   struct amdgpu_bo_import_result result = {0};
   uint64_t va;
   amdgpu_va_handle va_handle;
   struct amdgpu_bo_info info = {0};
   enum radeon_bo_domain initial = 0;
   int r;

   /* Initialize the structure. */
   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   if (!bo) {
      return NULL;
   }

   switch (whandle->type) {
   case DRM_API_HANDLE_TYPE_SHARED:
      type = amdgpu_bo_handle_type_gem_flink_name;
      break;
   case DRM_API_HANDLE_TYPE_FD:
      type = amdgpu_bo_handle_type_dma_buf_fd;
      break;
   default:
      /* Don't leak the freshly allocated structure on an unknown type. */
      goto error;
   }

   r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
   if (r)
      goto error;

   /* Get initial domains. */
   r = amdgpu_bo_query_info(result.buf_handle, &info);
   if (r)
      goto error_query;

   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                             result.alloc_size, 1 << 20, 0, &va, &va_handle, 0);
   if (r)
      goto error_query;

   r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
   if (r)
      goto error_va_map;

   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
      initial |= RADEON_DOMAIN_VRAM;
   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
      initial |= RADEON_DOMAIN_GTT;

   pipe_reference_init(&bo->base.reference, 1);
   bo->base.alignment = info.phys_alignment;
   bo->bo = result.buf_handle;
   bo->base.size = result.alloc_size;
   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
   bo->ws = ws;
   bo->va = va;
   bo->u.real.va_handle = va_handle;
   bo->initial_domain = initial;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
   bo->is_shared = true;

   if (stride)
      *stride = whandle->stride;
   if (offset)
      *offset = whandle->offset;

   if (bo->initial_domain & RADEON_DOMAIN_VRAM)
      ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size);
   else if (bo->initial_domain & RADEON_DOMAIN_GTT)
      ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);

   amdgpu_add_buffer_to_global_list(bo);

   return &bo->base;

error_va_map:
   amdgpu_va_range_free(va_handle);

error_query:
   amdgpu_bo_free(result.buf_handle);

error:
   FREE(bo);
   return NULL;
}

static bool amdgpu_bo_get_handle(struct pb_buffer *buffer,
                                 unsigned stride, unsigned offset,
                                 unsigned slice_size,
                                 struct winsys_handle *whandle)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
   enum amdgpu_bo_handle_type type;
   int r;

   /* Don't allow exports of slab entries and sparse buffers. */
   if (!bo->bo)
      return false;

   bo->u.real.use_reusable_pool = false;

   switch (whandle->type) {
   case DRM_API_HANDLE_TYPE_SHARED:
      type = amdgpu_bo_handle_type_gem_flink_name;
      break;
   case DRM_API_HANDLE_TYPE_FD:
      type = amdgpu_bo_handle_type_dma_buf_fd;
      break;
   case DRM_API_HANDLE_TYPE_KMS:
      type = amdgpu_bo_handle_type_kms;
      break;
   default:
      return false;
   }

   r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
   if (r)
      return false;

   whandle->stride = stride;
   whandle->offset = offset;
   whandle->offset += slice_size * whandle->layer;
   bo->is_shared = true;
   return true;
}

static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
                                            void *pointer, uint64_t size)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   amdgpu_bo_handle buf_handle;
   struct amdgpu_winsys_bo *bo;
   uint64_t va;
   amdgpu_va_handle va_handle;
   /* Avoid failure when the size is not page aligned. */
   uint64_t aligned_size = align64(size, ws->info.gart_page_size);

   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   if (!bo)
      return NULL;

   if (amdgpu_create_bo_from_user_mem(ws->dev, pointer,
                                      aligned_size, &buf_handle))
      goto error;

   if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                             aligned_size, 1 << 12, 0, &va, &va_handle, 0))
      goto error_va_alloc;

   if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP))
      goto error_va_map;

   /* Initialize it. */
   pipe_reference_init(&bo->base.reference, 1);
   bo->bo = buf_handle;
   bo->base.alignment = 0;
   bo->base.size = size;
   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
   bo->ws = ws;
   bo->user_ptr = pointer;
   bo->va = va;
   bo->u.real.va_handle = va_handle;
   bo->initial_domain = RADEON_DOMAIN_GTT;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);

   ws->allocated_gtt += aligned_size;

   amdgpu_add_buffer_to_global_list(bo);

   return (struct pb_buffer*)bo;

error_va_map:
   amdgpu_va_range_free(va_handle);

error_va_alloc:
   amdgpu_bo_free(buf_handle);

error:
   FREE(bo);
   return NULL;
}

static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
{
   return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL;
}

static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
{
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;

   return !bo->bo && !bo->sparse;
}

static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
{
   return ((struct amdgpu_winsys_bo*)buf)->va;
}

void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
{
   ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
   ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
   ws->base.buffer_map = amdgpu_bo_map;
   ws->base.buffer_unmap = amdgpu_bo_unmap;
   ws->base.buffer_wait = amdgpu_bo_wait;
   ws->base.buffer_create = amdgpu_bo_create;
   ws->base.buffer_from_handle = amdgpu_bo_from_handle;
   ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
   ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
   ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
   ws->base.buffer_get_handle = amdgpu_bo_get_handle;
   ws->base.buffer_commit = amdgpu_bo_sparse_commit;
   ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
   ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
}