      1 /*
      2  * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
      3  * Copyright © 2015 Advanced Micro Devices, Inc.
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining
      7  * a copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sub license, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
     16  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     17  * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
     18  * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     22  *
     23  * The above copyright notice and this permission notice (including the
     24  * next paragraph) shall be included in all copies or substantial portions
     25  * of the Software.
     26  */
     27 
     28 #include "amdgpu_cs.h"
     29 
     30 #include "util/os_time.h"
     31 #include "state_tracker/drm_driver.h"
     32 #include <amdgpu_drm.h>
     33 #include <xf86drm.h>
     34 #include <stdio.h>
     35 #include <inttypes.h>
     36 
     37 #ifndef AMDGPU_GEM_CREATE_VM_ALWAYS_VALID
     38 #define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6)
     39 #endif
     40 
     41 /* Set to 1 for verbose output showing committed sparse buffer ranges. */
     42 #define DEBUG_SPARSE_COMMITS 0
     43 
     44 struct amdgpu_sparse_backing_chunk {
     45    uint32_t begin, end;
     46 };
     47 
     48 static struct pb_buffer *
     49 amdgpu_bo_create(struct radeon_winsys *rws,
     50                  uint64_t size,
     51                  unsigned alignment,
     52                  enum radeon_bo_domain domain,
     53                  enum radeon_bo_flag flags);
     54 
     55 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
     56                            enum radeon_bo_usage usage)
     57 {
     58    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
     59    struct amdgpu_winsys *ws = bo->ws;
     60    int64_t abs_timeout;
     61 
     62    if (timeout == 0) {
     63       if (p_atomic_read(&bo->num_active_ioctls))
     64          return false;
     65 
     66    } else {
     67       abs_timeout = os_time_get_absolute_timeout(timeout);
     68 
     69       /* Wait if any ioctl is being submitted with this buffer. */
     70       if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
     71          return false;
     72    }
     73 
     74    if (bo->is_shared) {
     75       /* We can't use user fences for shared buffers, because user fences
     76        * are local to this process only. If we want to wait for all buffer
     77        * uses in all processes, we have to use amdgpu_bo_wait_for_idle.
     78        */
     79       bool buffer_busy = true;
     80       int r;
     81 
     82       r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
     83       if (r)
     84          fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
     85                  r);
     86       return !buffer_busy;
     87    }
     88 
     89    if (timeout == 0) {
     90       unsigned idle_fences;
     91       bool buffer_idle;
     92 
     93       simple_mtx_lock(&ws->bo_fence_lock);
     94 
     95       for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
     96          if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
     97             break;
     98       }
     99 
    100       /* Release the idle fences to avoid checking them again later. */
    101       for (unsigned i = 0; i < idle_fences; ++i)
    102          amdgpu_fence_reference(&bo->fences[i], NULL);
    103 
    104       memmove(&bo->fences[0], &bo->fences[idle_fences],
    105               (bo->num_fences - idle_fences) * sizeof(*bo->fences));
    106       bo->num_fences -= idle_fences;
    107 
    108       buffer_idle = !bo->num_fences;
    109       simple_mtx_unlock(&ws->bo_fence_lock);
    110 
    111       return buffer_idle;
    112    } else {
    113       bool buffer_idle = true;
    114 
    115       simple_mtx_lock(&ws->bo_fence_lock);
    116       while (bo->num_fences && buffer_idle) {
    117          struct pipe_fence_handle *fence = NULL;
    118          bool fence_idle = false;
    119 
    120          amdgpu_fence_reference(&fence, bo->fences[0]);
    121 
    122          /* Wait for the fence. */
    123          simple_mtx_unlock(&ws->bo_fence_lock);
    124          if (amdgpu_fence_wait(fence, abs_timeout, true))
    125             fence_idle = true;
    126          else
    127             buffer_idle = false;
    128          simple_mtx_lock(&ws->bo_fence_lock);
    129 
    130          /* Release an idle fence to avoid checking it again later, keeping in
    131           * mind that the fence array may have been modified by other threads.
    132           */
    133          if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
    134             amdgpu_fence_reference(&bo->fences[0], NULL);
    135             memmove(&bo->fences[0], &bo->fences[1],
    136                     (bo->num_fences - 1) * sizeof(*bo->fences));
    137             bo->num_fences--;
    138          }
    139 
    140          amdgpu_fence_reference(&fence, NULL);
    141       }
    142       simple_mtx_unlock(&ws->bo_fence_lock);
    143 
    144       return buffer_idle;
    145    }
    146 }
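
/* Illustrative usage sketch (not part of the driver): callers reach this
 * function through the radeon_winsys::buffer_wait hook set up in
 * amdgpu_bo_init_functions() at the end of this file. A zero timeout polls,
 * PIPE_TIMEOUT_INFINITE blocks until idle; "ws" and "buf" are assumed to be a
 * winsys and a buffer the caller already owns.
 *
 *    if (!ws->base.buffer_wait(buf, 0, RADEON_USAGE_READWRITE)) {
 *       // Still busy: block until all fences have signaled.
 *       ws->base.buffer_wait(buf, PIPE_TIMEOUT_INFINITE, RADEON_USAGE_READWRITE);
 *    }
 */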
    147 
    148 static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
    149       struct pb_buffer *buf)
    150 {
    151    return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
    152 }
    153 
    154 static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
    155 {
    156    for (unsigned i = 0; i < bo->num_fences; ++i)
    157       amdgpu_fence_reference(&bo->fences[i], NULL);
    158 
    159    FREE(bo->fences);
    160    bo->num_fences = 0;
    161    bo->max_fences = 0;
    162 }
    163 
    164 void amdgpu_bo_destroy(struct pb_buffer *_buf)
    165 {
    166    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
    167 
    168    assert(bo->bo && "must not be called for slab entries");
    169 
    170    if (bo->ws->debug_all_bos) {
    171       simple_mtx_lock(&bo->ws->global_bo_list_lock);
    172       LIST_DEL(&bo->u.real.global_list_item);
    173       bo->ws->num_buffers--;
    174       simple_mtx_unlock(&bo->ws->global_bo_list_lock);
    175    }
    176 
    177    amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
    178    amdgpu_va_range_free(bo->u.real.va_handle);
    179    amdgpu_bo_free(bo->bo);
    180 
    181    amdgpu_bo_remove_fences(bo);
    182 
    183    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
    184       bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->info.gart_page_size);
    185    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
    186       bo->ws->allocated_gtt -= align64(bo->base.size, bo->ws->info.gart_page_size);
    187 
    188    if (bo->u.real.map_count >= 1) {
    189       if (bo->initial_domain & RADEON_DOMAIN_VRAM)
    190          bo->ws->mapped_vram -= bo->base.size;
    191       else if (bo->initial_domain & RADEON_DOMAIN_GTT)
    192          bo->ws->mapped_gtt -= bo->base.size;
    193       bo->ws->num_mapped_buffers--;
    194    }
    195 
    196    FREE(bo);
    197 }
    198 
    199 static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)
    200 {
    201    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
    202 
    203    assert(bo->bo); /* slab buffers have a separate vtbl */
    204 
    205    if (bo->u.real.use_reusable_pool)
    206       pb_cache_add_buffer(&bo->u.real.cache_entry);
    207    else
    208       amdgpu_bo_destroy(_buf);
    209 }
    210 
    211 static void *amdgpu_bo_map(struct pb_buffer *buf,
    212                            struct radeon_winsys_cs *rcs,
    213                            enum pipe_transfer_usage usage)
    214 {
    215    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
    216    struct amdgpu_winsys_bo *real;
    217    struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
    218    int r;
    219    void *cpu = NULL;
    220    uint64_t offset = 0;
    221 
    222    assert(!bo->sparse);
    223 
    224    /* If this is not an unsynchronized map, flush the CS if needed, then wait. */
    225    if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
    226       /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
    227       if (usage & PIPE_TRANSFER_DONTBLOCK) {
    228          if (!(usage & PIPE_TRANSFER_WRITE)) {
    229             /* Mapping for read.
    230              *
    231              * Since we are mapping for read, we don't need to wait
    232              * if the GPU is using the buffer for read too
    233              * (neither one is changing it).
    234              *
    235              * Only check whether the buffer is being used for write. */
    236             if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
    237                                                                RADEON_USAGE_WRITE)) {
    238                cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
    239                return NULL;
    240             }
    241 
    242             if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
    243                                 RADEON_USAGE_WRITE)) {
    244                return NULL;
    245             }
    246          } else {
    247             if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
    248                cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
    249                return NULL;
    250             }
    251 
    252             if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
    253                                 RADEON_USAGE_READWRITE)) {
    254                return NULL;
    255             }
    256          }
    257       } else {
    258          uint64_t time = os_time_get_nano();
    259 
    260          if (!(usage & PIPE_TRANSFER_WRITE)) {
    261             /* Mapping for read.
    262              *
    263              * Since we are mapping for read, we don't need to wait
    264              * if the GPU is using the buffer for read too
    265              * (neither one is changing it).
    266              *
    267              * Only check whether the buffer is being used for write. */
    268             if (cs) {
    269                if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
    270                                                             RADEON_USAGE_WRITE)) {
    271                   cs->flush_cs(cs->flush_data, 0, NULL);
    272                } else {
    273                   /* Try to avoid busy-waiting in amdgpu_bo_wait. */
    274                   if (p_atomic_read(&bo->num_active_ioctls))
    275                      amdgpu_cs_sync_flush(rcs);
    276                }
    277             }
    278 
    279             amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
    280                            RADEON_USAGE_WRITE);
    281          } else {
    282             /* Mapping for write. */
    283             if (cs) {
    284                if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
    285                   cs->flush_cs(cs->flush_data, 0, NULL);
    286                } else {
    287                   /* Try to avoid busy-waiting in amdgpu_bo_wait. */
    288                   if (p_atomic_read(&bo->num_active_ioctls))
    289                      amdgpu_cs_sync_flush(rcs);
    290                }
    291             }
    292 
    293             amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
    294                            RADEON_USAGE_READWRITE);
    295          }
    296 
    297          bo->ws->buffer_wait_time += os_time_get_nano() - time;
    298       }
    299    }
    300 
    301    /* If the buffer is created from user memory, return the user pointer. */
    302    if (bo->user_ptr)
    303       return bo->user_ptr;
    304 
    305    if (bo->bo) {
    306       real = bo;
    307    } else {
    308       real = bo->u.slab.real;
    309       offset = bo->va - real->va;
    310    }
    311 
    312    r = amdgpu_bo_cpu_map(real->bo, &cpu);
    313    if (r) {
    314       /* Clear the cache and try again. */
    315       pb_cache_release_all_buffers(&real->ws->bo_cache);
    316       r = amdgpu_bo_cpu_map(real->bo, &cpu);
    317       if (r)
    318          return NULL;
    319    }
    320 
    321    if (p_atomic_inc_return(&real->u.real.map_count) == 1) {
    322       if (real->initial_domain & RADEON_DOMAIN_VRAM)
    323          real->ws->mapped_vram += real->base.size;
    324       else if (real->initial_domain & RADEON_DOMAIN_GTT)
    325          real->ws->mapped_gtt += real->base.size;
    326       real->ws->num_mapped_buffers++;
    327    }
    328    return (uint8_t*)cpu + offset;
    329 }
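
/* Illustrative usage sketch (not part of the driver): mapping through the
 * buffer_map/buffer_unmap hooks set up in amdgpu_bo_init_functions(). Passing
 * a NULL command stream is allowed (the cs checks above are skipped), and
 * PIPE_TRANSFER_UNSYNCHRONIZED bypasses the wait entirely; "ws", "buf",
 * "data" and "size" are assumed to exist in the caller.
 *
 *    void *ptr = ws->base.buffer_map(buf, NULL,
 *                                    PIPE_TRANSFER_WRITE |
 *                                    PIPE_TRANSFER_UNSYNCHRONIZED);
 *    if (ptr) {
 *       memcpy(ptr, data, size);
 *       ws->base.buffer_unmap(buf);
 *    }
 */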
    330 
    331 static void amdgpu_bo_unmap(struct pb_buffer *buf)
    332 {
    333    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
    334    struct amdgpu_winsys_bo *real;
    335 
    336    assert(!bo->sparse);
    337 
    338    if (bo->user_ptr)
    339       return;
    340 
    341    real = bo->bo ? bo : bo->u.slab.real;
    342 
    343    if (p_atomic_dec_zero(&real->u.real.map_count)) {
    344       if (real->initial_domain & RADEON_DOMAIN_VRAM)
    345          real->ws->mapped_vram -= real->base.size;
    346       else if (real->initial_domain & RADEON_DOMAIN_GTT)
    347          real->ws->mapped_gtt -= real->base.size;
    348       real->ws->num_mapped_buffers--;
    349    }
    350 
    351    amdgpu_bo_cpu_unmap(real->bo);
    352 }
    353 
    354 static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
    355    amdgpu_bo_destroy_or_cache
    356    /* other functions are never called */
    357 };
    358 
    359 static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo)
    360 {
    361    struct amdgpu_winsys *ws = bo->ws;
    362 
    363    assert(bo->bo);
    364 
    365    if (ws->debug_all_bos) {
    366       simple_mtx_lock(&ws->global_bo_list_lock);
    367       LIST_ADDTAIL(&bo->u.real.global_list_item, &ws->global_bo_list);
    368       ws->num_buffers++;
    369       simple_mtx_unlock(&ws->global_bo_list_lock);
    370    }
    371 }
    372 
    373 static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
    374                                                  uint64_t size,
    375                                                  unsigned alignment,
    376                                                  unsigned usage,
    377                                                  enum radeon_bo_domain initial_domain,
    378                                                  unsigned flags,
    379                                                  unsigned pb_cache_bucket)
    380 {
    381    struct amdgpu_bo_alloc_request request = {0};
    382    amdgpu_bo_handle buf_handle;
    383    uint64_t va = 0;
    384    struct amdgpu_winsys_bo *bo;
    385    amdgpu_va_handle va_handle;
    386    unsigned va_gap_size;
    387    int r;
    388 
    389    /* VRAM or GTT must be specified, but not both at the same time. */
    390    assert(util_bitcount(initial_domain & RADEON_DOMAIN_VRAM_GTT) == 1);
    391 
    392    bo = CALLOC_STRUCT(amdgpu_winsys_bo);
    393    if (!bo) {
    394       return NULL;
    395    }
    396 
    397    pb_cache_init_entry(&ws->bo_cache, &bo->u.real.cache_entry, &bo->base,
    398                        pb_cache_bucket);
    399    request.alloc_size = size;
    400    request.phys_alignment = alignment;
    401 
    402    if (initial_domain & RADEON_DOMAIN_VRAM)
    403       request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
    404    if (initial_domain & RADEON_DOMAIN_GTT)
    405       request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
    406 
    407    /* If VRAM is just stolen system memory, allow both VRAM and
    408     * GTT, whichever has free space. If a buffer is evicted from
    409     * VRAM to GTT, it will stay there.
    410     *
    411     * DRM 3.6.0 has good BO move throttling, so we can allow VRAM-only
    412     * placements even with a low amount of stolen VRAM.
    413     */
    414    if (!ws->info.has_dedicated_vram && ws->info.drm_minor < 6)
    415       request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
    416 
    417    if (flags & RADEON_FLAG_NO_CPU_ACCESS)
    418       request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
    419    if (flags & RADEON_FLAG_GTT_WC)
    420       request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
    421    /* TODO: Enable this once the kernel handles it efficiently. */
    422    /*if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
    423        ws->info.drm_minor >= 20)
    424       request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;*/
    425 
    426    r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
    427    if (r) {
    428       fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
    429       fprintf(stderr, "amdgpu:    size      : %"PRIu64" bytes\n", size);
    430       fprintf(stderr, "amdgpu:    alignment : %u bytes\n", alignment);
    431       fprintf(stderr, "amdgpu:    domains   : %u\n", initial_domain);
    432       goto error_bo_alloc;
    433    }
    434 
    435    va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
    436    if (size > ws->info.pte_fragment_size)
    437       alignment = MAX2(alignment, ws->info.pte_fragment_size);
    438    r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
    439                              size + va_gap_size, alignment, 0, &va, &va_handle, 0);
    440    if (r)
    441       goto error_va_alloc;
    442 
    443    unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
    444                        AMDGPU_VM_PAGE_EXECUTABLE;
    445 
    446    if (!(flags & RADEON_FLAG_READ_ONLY))
    447       vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;
    448 
    449    r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,
    450                            AMDGPU_VA_OP_MAP);
    451    if (r)
    452       goto error_va_map;
    453 
    454    pipe_reference_init(&bo->base.reference, 1);
    455    bo->base.alignment = alignment;
    456    bo->base.usage = usage;
    457    bo->base.size = size;
    458    bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
    459    bo->ws = ws;
    460    bo->bo = buf_handle;
    461    bo->va = va;
    462    bo->u.real.va_handle = va_handle;
    463    bo->initial_domain = initial_domain;
    464    bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
    465    bo->is_local = !!(request.flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID);
    466 
    467    if (initial_domain & RADEON_DOMAIN_VRAM)
    468       ws->allocated_vram += align64(size, ws->info.gart_page_size);
    469    else if (initial_domain & RADEON_DOMAIN_GTT)
    470       ws->allocated_gtt += align64(size, ws->info.gart_page_size);
    471 
    472    amdgpu_add_buffer_to_global_list(bo);
    473 
    474    return bo;
    475 
    476 error_va_map:
    477    amdgpu_va_range_free(va_handle);
    478 
    479 error_va_alloc:
    480    amdgpu_bo_free(buf_handle);
    481 
    482 error_bo_alloc:
    483    FREE(bo);
    484    return NULL;
    485 }
    486 
    487 bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
    488 {
    489    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
    490 
    491    if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
    492       return false;
    493    }
    494 
    495    return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
    496 }
    497 
    498 bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
    499 {
    500    struct amdgpu_winsys_bo *bo = NULL; /* typed NULL sample pointer for container_of */
    501    bo = container_of(entry, bo, u.slab.entry);
    502 
    503    return amdgpu_bo_can_reclaim(&bo->base);
    504 }
    505 
    506 static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
    507 {
    508    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
    509 
    510    assert(!bo->bo);
    511 
    512    pb_slab_free(&bo->ws->bo_slabs, &bo->u.slab.entry);
    513 }
    514 
    515 static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
    516    amdgpu_bo_slab_destroy
    517    /* other functions are never called */
    518 };
    519 
    520 struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
    521                                      unsigned entry_size,
    522                                      unsigned group_index)
    523 {
    524    struct amdgpu_winsys *ws = priv;
    525    struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
    526    enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
    527    enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
    528    uint32_t base_id;
    529 
    530    if (!slab)
    531       return NULL;
    532 
    533    unsigned slab_size = 1 << AMDGPU_SLAB_BO_SIZE_LOG2;
    534    slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base,
    535                                                     slab_size, slab_size,
    536                                                     domains, flags));
    537    if (!slab->buffer)
    538       goto fail;
    539 
    540    assert(slab->buffer->bo);
    541 
    542    slab->base.num_entries = slab->buffer->base.size / entry_size;
    543    slab->base.num_free = slab->base.num_entries;
    544    slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
    545    if (!slab->entries)
    546       goto fail_buffer;
    547 
    548    LIST_INITHEAD(&slab->base.free);
    549 
    550    base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);
    551 
    552    for (unsigned i = 0; i < slab->base.num_entries; ++i) {
    553       struct amdgpu_winsys_bo *bo = &slab->entries[i];
    554 
    555       bo->base.alignment = entry_size;
    556       bo->base.usage = slab->buffer->base.usage;
    557       bo->base.size = entry_size;
    558       bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
    559       bo->ws = ws;
    560       bo->va = slab->buffer->va + i * entry_size;
    561       bo->initial_domain = domains;
    562       bo->unique_id = base_id + i;
    563       bo->u.slab.entry.slab = &slab->base;
    564       bo->u.slab.entry.group_index = group_index;
    565       bo->u.slab.real = slab->buffer;
    566 
    567       LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free);
    568    }
    569 
    570    return &slab->base;
    571 
    572 fail_buffer:
    573    amdgpu_winsys_bo_reference(&slab->buffer, NULL);
    574 fail:
    575    FREE(slab);
    576    return NULL;
    577 }
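
/* Worked example for the slab entry layout above (illustrative numbers only):
 * with entry_size = 4096 and a slab buffer whose GPU VA is 0x100000, entry
 * i = 3 gets bo->va = 0x100000 + 3 * 4096 = 0x103000. Sub-allocations are
 * therefore packed back to back inside one real buffer and share its backing
 * storage and lifetime.
 */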
    578 
    579 void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
    580 {
    581    struct amdgpu_slab *slab = amdgpu_slab(pslab);
    582 
    583    for (unsigned i = 0; i < slab->base.num_entries; ++i)
    584       amdgpu_bo_remove_fences(&slab->entries[i]);
    585 
    586    FREE(slab->entries);
    587    amdgpu_winsys_bo_reference(&slab->buffer, NULL);
    588    FREE(slab);
    589 }
    590 
    591 #if DEBUG_SPARSE_COMMITS
    592 static void
    593 sparse_dump(struct amdgpu_winsys_bo *bo, const char *func)
    594 {
    595    fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n"
    596                    "Commitments:\n",
    597            __func__, bo, bo->base.size, bo->u.sparse.num_va_pages, func);
    598 
    599    struct amdgpu_sparse_backing *span_backing = NULL;
    600    uint32_t span_first_backing_page = 0;
    601    uint32_t span_first_va_page = 0;
    602    uint32_t va_page = 0;
    603 
    604    for (;;) {
    605       struct amdgpu_sparse_backing *backing = NULL;
    606       uint32_t backing_page = 0;
    607 
    608       if (va_page < bo->u.sparse.num_va_pages) {
    609          backing = bo->u.sparse.commitments[va_page].backing;
    610          backing_page = bo->u.sparse.commitments[va_page].page;
    611       }
    612 
    613       if (span_backing &&
    614           (backing != span_backing ||
    615            backing_page != span_first_backing_page + (va_page - span_first_va_page))) {
    616          fprintf(stderr, " %u..%u: backing=%p:%u..%u\n",
    617                  span_first_va_page, va_page - 1, span_backing,
    618                  span_first_backing_page,
    619                  span_first_backing_page + (va_page - span_first_va_page) - 1);
    620 
    621          span_backing = NULL;
    622       }
    623 
    624       if (va_page >= bo->u.sparse.num_va_pages)
    625          break;
    626 
    627       if (backing && !span_backing) {
    628          span_backing = backing;
    629          span_first_backing_page = backing_page;
    630          span_first_va_page = va_page;
    631       }
    632 
    633       va_page++;
    634    }
    635 
    636    fprintf(stderr, "Backing:\n");
    637 
    638    list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
    639       fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->base.size);
    640       for (unsigned i = 0; i < backing->num_chunks; ++i)
    641          fprintf(stderr, "   %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end);
    642    }
    643 }
    644 #endif
    645 
    646 /*
    647  * Attempt to allocate the given number of backing pages. Fewer pages may be
    648  * allocated (depending on the fragmentation of existing backing buffers),
    649  * which will be reflected by a change to *pnum_pages.
    650  */
    651 static struct amdgpu_sparse_backing *
    652 sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages)
    653 {
    654    struct amdgpu_sparse_backing *best_backing;
    655    unsigned best_idx;
    656    uint32_t best_num_pages;
    657 
    658    best_backing = NULL;
    659    best_idx = 0;
    660    best_num_pages = 0;
    661 
    662    /* This is a very simple and inefficient best-fit algorithm. */
    663    list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
    664       for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
    665          uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
    666          if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
    667             (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
    668             best_backing = backing;
    669             best_idx = idx;
    670             best_num_pages = cur_num_pages;
    671          }
    672       }
    673    }
    674 
    675    /* Allocate a new backing buffer if necessary. */
    676    if (!best_backing) {
    677       struct pb_buffer *buf;
    678       uint64_t size;
    679       uint32_t pages;
    680 
    681       best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
    682       if (!best_backing)
    683          return NULL;
    684 
    685       best_backing->max_chunks = 4;
    686       best_backing->chunks = CALLOC(best_backing->max_chunks,
    687                                     sizeof(*best_backing->chunks));
    688       if (!best_backing->chunks) {
    689          FREE(best_backing);
    690          return NULL;
    691       }
    692 
    693       assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE));
    694 
    695       size = MIN3(bo->base.size / 16,
    696                   8 * 1024 * 1024,
    697                   bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
    698       size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);
    699 
    700       buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE,
    701                              bo->initial_domain,
    702                              bo->u.sparse.flags | RADEON_FLAG_NO_SUBALLOC);
    703       if (!buf) {
    704          FREE(best_backing->chunks);
    705          FREE(best_backing);
    706          return NULL;
    707       }
    708 
    709       /* We might have gotten a bigger buffer than requested via caching. */
    710       pages = buf->size / RADEON_SPARSE_PAGE_SIZE;
    711 
    712       best_backing->bo = amdgpu_winsys_bo(buf);
    713       best_backing->num_chunks = 1;
    714       best_backing->chunks[0].begin = 0;
    715       best_backing->chunks[0].end = pages;
    716 
    717       list_add(&best_backing->list, &bo->u.sparse.backing);
    718       bo->u.sparse.num_backing_pages += pages;
    719 
    720       best_idx = 0;
    721       best_num_pages = pages;
    722    }
    723 
    724    *pnum_pages = MIN2(*pnum_pages, best_num_pages);
    725    *pstart_page = best_backing->chunks[best_idx].begin;
    726    best_backing->chunks[best_idx].begin += *pnum_pages;
    727 
    728    if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
    729       memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
    730               sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
    731       best_backing->num_chunks--;
    732    }
    733 
    734    return best_backing;
    735 }
    736 
    737 static void
    738 sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo,
    739                            struct amdgpu_sparse_backing *backing)
    740 {
    741    struct amdgpu_winsys *ws = backing->bo->ws;
    742 
    743    bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE;
    744 
    745    simple_mtx_lock(&ws->bo_fence_lock);
    746    amdgpu_add_fences(backing->bo, bo->num_fences, bo->fences);
    747    simple_mtx_unlock(&ws->bo_fence_lock);
    748 
    749    list_del(&backing->list);
    750    amdgpu_winsys_bo_reference(&backing->bo, NULL);
    751    FREE(backing->chunks);
    752    FREE(backing);
    753 }
    754 
    755 /*
    756  * Return a range of pages from the given backing buffer back into the
    757  * free structure.
    758  */
    759 static bool
    760 sparse_backing_free(struct amdgpu_winsys_bo *bo,
    761                     struct amdgpu_sparse_backing *backing,
    762                     uint32_t start_page, uint32_t num_pages)
    763 {
    764    uint32_t end_page = start_page + num_pages;
    765    unsigned low = 0;
    766    unsigned high = backing->num_chunks;
    767 
    768    /* Find the first chunk with begin >= start_page. */
    769    while (low < high) {
    770       unsigned mid = low + (high - low) / 2;
    771 
    772       if (backing->chunks[mid].begin >= start_page)
    773          high = mid;
    774       else
    775          low = mid + 1;
    776    }
    777 
    778    assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
    779    assert(low == 0 || backing->chunks[low - 1].end <= start_page);
    780 
    781    if (low > 0 && backing->chunks[low - 1].end == start_page) {
    782       backing->chunks[low - 1].end = end_page;
    783 
    784       if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
    785          backing->chunks[low - 1].end = backing->chunks[low].end;
    786          memmove(&backing->chunks[low], &backing->chunks[low + 1],
    787                  sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
    788          backing->num_chunks--;
    789       }
    790    } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
    791       backing->chunks[low].begin = start_page;
    792    } else {
    793       if (backing->num_chunks >= backing->max_chunks) {
    794          unsigned new_max_chunks = 2 * backing->max_chunks;
    795          struct amdgpu_sparse_backing_chunk *new_chunks =
    796             REALLOC(backing->chunks,
    797                     sizeof(*backing->chunks) * backing->max_chunks,
    798                     sizeof(*backing->chunks) * new_max_chunks);
    799          if (!new_chunks)
    800             return false;
    801 
    802          backing->max_chunks = new_max_chunks;
    803          backing->chunks = new_chunks;
    804       }
    805 
    806       memmove(&backing->chunks[low + 1], &backing->chunks[low],
    807               sizeof(*backing->chunks) * (backing->num_chunks - low));
    808       backing->chunks[low].begin = start_page;
    809       backing->chunks[low].end = end_page;
    810       backing->num_chunks++;
    811    }
    812 
    813    if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
    814        backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE)
    815       sparse_free_backing_buffer(bo, backing);
    816 
    817    return true;
    818 }
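
/* Worked example for the coalescing logic above (illustrative numbers only):
 * suppose backing->chunks holds the free ranges {begin=0, end=4} and
 * {begin=8, end=12} (ends exclusive) and sparse_backing_free() is called with
 * start_page = 4, num_pages = 4. The binary search yields low = 1;
 * chunks[0].end == start_page, so chunk 0 is extended to end = 8, and because
 * end_page == chunks[1].begin the two chunks are merged into a single {0, 12}
 * range. If that range then spans the whole backing buffer, the buffer itself
 * is released via sparse_free_backing_buffer().
 */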
    819 
    820 static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf)
    821 {
    822    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
    823    int r;
    824 
    825    assert(!bo->bo && bo->sparse);
    826 
    827    r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
    828                            (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
    829                            bo->va, 0, AMDGPU_VA_OP_CLEAR);
    830    if (r) {
    831       fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
    832    }
    833 
    834    while (!list_empty(&bo->u.sparse.backing)) {
    835       struct amdgpu_sparse_backing *dummy = NULL;
    836       sparse_free_backing_buffer(bo,
    837                                  container_of(bo->u.sparse.backing.next,
    838                                               dummy, list));
    839    }
    840 
    841    amdgpu_va_range_free(bo->u.sparse.va_handle);
    842    simple_mtx_destroy(&bo->u.sparse.commit_lock);
    843    FREE(bo->u.sparse.commitments);
    844    FREE(bo);
    845 }
    846 
    847 static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = {
    848    amdgpu_bo_sparse_destroy
    849    /* other functions are never called */
    850 };
    851 
    852 static struct pb_buffer *
    853 amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
    854                         enum radeon_bo_domain domain,
    855                         enum radeon_bo_flag flags)
    856 {
    857    struct amdgpu_winsys_bo *bo;
    858    uint64_t map_size;
    859    uint64_t va_gap_size;
    860    int r;
    861 
    862    /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
    863     * that exceed this limit. This is not really a restriction: we don't have
    864     * that much virtual address space anyway.
    865     */
    866    if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
    867       return NULL;
    868 
    869    bo = CALLOC_STRUCT(amdgpu_winsys_bo);
    870    if (!bo)
    871       return NULL;
    872 
    873    pipe_reference_init(&bo->base.reference, 1);
    874    bo->base.alignment = RADEON_SPARSE_PAGE_SIZE;
    875    bo->base.size = size;
    876    bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl;
    877    bo->ws = ws;
    878    bo->initial_domain = domain;
    879    bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
    880    bo->sparse = true;
    881    bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE;
    882 
    883    bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
    884    bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages,
    885                                      sizeof(*bo->u.sparse.commitments));
    886    if (!bo->u.sparse.commitments)
    887       goto error_alloc_commitments;
    888 
    889    simple_mtx_init(&bo->u.sparse.commit_lock, mtx_plain);
    890    LIST_INITHEAD(&bo->u.sparse.backing);
    891 
    892    /* For simplicity, we always map a multiple of the page size. */
    893    map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
    894    va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
    895    r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
    896                              map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
    897                              0, &bo->va, &bo->u.sparse.va_handle, 0);
    898    if (r)
    899       goto error_va_alloc;
    900 
    901    r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va,
    902                            AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
    903    if (r)
    904       goto error_va_map;
    905 
    906    return &bo->base;
    907 
    908 error_va_map:
    909    amdgpu_va_range_free(bo->u.sparse.va_handle);
    910 error_va_alloc:
    911    simple_mtx_destroy(&bo->u.sparse.commit_lock);
    912    FREE(bo->u.sparse.commitments);
    913 error_alloc_commitments:
    914    FREE(bo);
    915    return NULL;
    916 }
    917 
    918 static bool
    919 amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
    920                         bool commit)
    921 {
    922    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
    923    struct amdgpu_sparse_commitment *comm;
    924    uint32_t va_page, end_va_page;
    925    bool ok = true;
    926    int r;
    927 
    928    assert(bo->sparse);
    929    assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
    930    assert(offset <= bo->base.size);
    931    assert(size <= bo->base.size - offset);
    932    assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size);
    933 
    934    comm = bo->u.sparse.commitments;
    935    va_page = offset / RADEON_SPARSE_PAGE_SIZE;
    936    end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
    937 
    938    simple_mtx_lock(&bo->u.sparse.commit_lock);
    939 
    940 #if DEBUG_SPARSE_COMMITS
    941    sparse_dump(bo, __func__);
    942 #endif
    943 
    944    if (commit) {
    945       while (va_page < end_va_page) {
    946          uint32_t span_va_page;
    947 
    948          /* Skip pages that are already committed. */
    949          if (comm[va_page].backing) {
    950             va_page++;
    951             continue;
    952          }
    953 
    954          /* Determine length of uncommitted span. */
    955          span_va_page = va_page;
    956          while (va_page < end_va_page && !comm[va_page].backing)
    957             va_page++;
    958 
    959          /* Fill the uncommitted span with chunks of backing memory. */
    960          while (span_va_page < va_page) {
    961             struct amdgpu_sparse_backing *backing;
    962             uint32_t backing_start, backing_size;
    963 
    964             backing_size = va_page - span_va_page;
    965             backing = sparse_backing_alloc(bo, &backing_start, &backing_size);
    966             if (!backing) {
    967                ok = false;
    968                goto out;
    969             }
    970 
    971             r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo,
    972                                     (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
    973                                     (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
    974                                     bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
    975                                     AMDGPU_VM_PAGE_READABLE |
    976                                     AMDGPU_VM_PAGE_WRITEABLE |
    977                                     AMDGPU_VM_PAGE_EXECUTABLE,
    978                                     AMDGPU_VA_OP_REPLACE);
    979             if (r) {
    980                ok = sparse_backing_free(bo, backing, backing_start, backing_size);
    981                assert(ok && "sufficient memory should already be allocated");
    982 
    983                ok = false;
    984                goto out;
    985             }
    986 
    987             while (backing_size) {
    988                comm[span_va_page].backing = backing;
    989                comm[span_va_page].page = backing_start;
    990                span_va_page++;
    991                backing_start++;
    992                backing_size--;
    993             }
    994          }
    995       }
    996    } else {
    997       r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
    998                               (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
    999                               bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
   1000                               AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
   1001       if (r) {
   1002          ok = false;
   1003          goto out;
   1004       }
   1005 
   1006       while (va_page < end_va_page) {
   1007          struct amdgpu_sparse_backing *backing;
   1008          uint32_t backing_start;
   1009          uint32_t span_pages;
   1010 
   1011          /* Skip pages that are already uncommitted. */
   1012          if (!comm[va_page].backing) {
   1013             va_page++;
   1014             continue;
   1015          }
   1016 
   1017          /* Group contiguous spans of pages. */
   1018          backing = comm[va_page].backing;
   1019          backing_start = comm[va_page].page;
   1020          comm[va_page].backing = NULL;
   1021 
   1022          span_pages = 1;
   1023          va_page++;
   1024 
   1025          while (va_page < end_va_page &&
   1026                 comm[va_page].backing == backing &&
   1027                 comm[va_page].page == backing_start + span_pages) {
   1028             comm[va_page].backing = NULL;
   1029             va_page++;
   1030             span_pages++;
   1031          }
   1032 
   1033          if (!sparse_backing_free(bo, backing, backing_start, span_pages)) {
   1034             /* Couldn't allocate tracking data structures, so we have to leak */
   1035             fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
   1036             ok = false;
   1037          }
   1038       }
   1039    }
   1040 out:
   1041 
   1042    simple_mtx_unlock(&bo->u.sparse.commit_lock);
   1043 
   1044    return ok;
   1045 }
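
/* Illustrative usage sketch (not part of the driver): committing and later
 * decommitting a page-aligned range of a sparse buffer through the
 * buffer_commit hook. Offset and size must be multiples of
 * RADEON_SPARSE_PAGE_SIZE (see the asserts above); "ws" and "buf" are assumed
 * to be a winsys and a buffer created with RADEON_FLAG_SPARSE.
 *
 *    // Back the first 16 pages with physical memory.
 *    if (!ws->base.buffer_commit(buf, 0, 16 * RADEON_SPARSE_PAGE_SIZE, true))
 *       fprintf(stderr, "amdgpu: sparse commit failed\n");
 *
 *    // Later, return that memory to the backing pool.
 *    ws->base.buffer_commit(buf, 0, 16 * RADEON_SPARSE_PAGE_SIZE, false);
 */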
   1046 
   1047 static unsigned eg_tile_split(unsigned tile_split)
   1048 {
   1049    switch (tile_split) {
   1050    case 0:     tile_split = 64;    break;
   1051    case 1:     tile_split = 128;   break;
   1052    case 2:     tile_split = 256;   break;
   1053    case 3:     tile_split = 512;   break;
   1054    default:
   1055    case 4:     tile_split = 1024;  break;
   1056    case 5:     tile_split = 2048;  break;
   1057    case 6:     tile_split = 4096;  break;
   1058    }
   1059    return tile_split;
   1060 }
   1061 
   1062 static unsigned eg_tile_split_rev(unsigned eg_tile_split)
   1063 {
   1064    switch (eg_tile_split) {
   1065    case 64:    return 0;
   1066    case 128:   return 1;
   1067    case 256:   return 2;
   1068    case 512:   return 3;
   1069    default:
   1070    case 1024:  return 4;
   1071    case 2048:  return 5;
   1072    case 4096:  return 6;
   1073    }
   1074 }
   1075 
   1076 static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf,
   1077                                        struct radeon_bo_metadata *md)
   1078 {
   1079    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   1080    struct amdgpu_bo_info info = {0};
   1081    uint64_t tiling_flags;
   1082    int r;
   1083 
   1084    assert(bo->bo && "must not be called for slab entries");
   1085 
   1086    r = amdgpu_bo_query_info(bo->bo, &info);
   1087    if (r)
   1088       return;
   1089 
   1090    tiling_flags = info.metadata.tiling_info;
   1091 
   1092    if (bo->ws->info.chip_class >= GFX9) {
   1093       md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE);
   1094    } else {
   1095       md->u.legacy.microtile = RADEON_LAYOUT_LINEAR;
   1096       md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR;
   1097 
   1098       if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4)  /* 2D_TILED_THIN1 */
   1099          md->u.legacy.macrotile = RADEON_LAYOUT_TILED;
   1100       else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
   1101          md->u.legacy.microtile = RADEON_LAYOUT_TILED;
   1102 
   1103       md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG);
   1104       md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
   1105       md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
   1106       md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
   1107       md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
   1108       md->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS);
   1109       md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
   1110    }
   1111 
   1112    md->size_metadata = info.metadata.size_metadata;
   1113    memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
   1114 }
   1115 
   1116 static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
   1117                                        struct radeon_bo_metadata *md)
   1118 {
   1119    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   1120    struct amdgpu_bo_metadata metadata = {0};
   1121    uint64_t tiling_flags = 0;
   1122 
   1123    assert(bo->bo && "must not be called for slab entries");
   1124 
   1125    if (bo->ws->info.chip_class >= GFX9) {
   1126       tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode);
   1127    } else {
   1128       if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED)
   1129          tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
   1130       else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED)
   1131          tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
   1132       else
   1133          tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */
   1134 
   1135       tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config);
   1136       tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw));
   1137       tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh));
   1138       if (md->u.legacy.tile_split)
   1139          tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->u.legacy.tile_split));
   1140       tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea));
   1141       tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks)-1);
   1142 
   1143       if (md->u.legacy.scanout)
   1144          tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
   1145       else
   1146          tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
   1147    }
   1148 
   1149    metadata.tiling_info = tiling_flags;
   1150    metadata.size_metadata = md->size_metadata;
   1151    memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));
   1152 
   1153    amdgpu_bo_set_metadata(bo->bo, &metadata);
   1154 }
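
/* Illustrative usage sketch (not part of the driver): the two metadata
 * callbacks round-trip tiling state through struct radeon_bo_metadata, e.g.
 * for shared surfaces. "ws" and "buf" are assumed to exist; the gfx9 field
 * shown only applies when info.chip_class >= GFX9.
 *
 *    struct radeon_bo_metadata md = {0};
 *    ws->base.buffer_get_metadata(buf, &md);
 *    // ... inspect or adjust md, e.g. md.u.gfx9.swizzle_mode ...
 *    ws->base.buffer_set_metadata(buf, &md);
 */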
   1155 
   1156 static struct pb_buffer *
   1157 amdgpu_bo_create(struct radeon_winsys *rws,
   1158                  uint64_t size,
   1159                  unsigned alignment,
   1160                  enum radeon_bo_domain domain,
   1161                  enum radeon_bo_flag flags)
   1162 {
   1163    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   1164    struct amdgpu_winsys_bo *bo;
   1165    unsigned usage = 0, pb_cache_bucket = 0;
   1166 
   1167    /* VRAM implies WC. This is not optional. */
   1168    assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
   1169 
   1170    /* NO_CPU_ACCESS is valid with VRAM only. */
   1171    assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
   1172 
   1173    /* Sparse buffers must have NO_CPU_ACCESS set. */
   1174    assert(!(flags & RADEON_FLAG_SPARSE) || flags & RADEON_FLAG_NO_CPU_ACCESS);
   1175 
   1176    /* Sub-allocate small buffers from slabs. */
   1177    if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
   1178        size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
   1179        alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
   1180       struct pb_slab_entry *entry;
   1181       int heap = radeon_get_heap_index(domain, flags);
   1182 
   1183       if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS)
   1184          goto no_slab;
   1185 
   1186       entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
   1187       if (!entry) {
   1188          /* Clear the cache and try again. */
   1189          pb_cache_release_all_buffers(&ws->bo_cache);
   1190 
   1191          entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
   1192       }
   1193       if (!entry)
   1194          return NULL;
   1195 
   1196       bo = NULL; /* typed NULL sample pointer for container_of */
   1197       bo = container_of(entry, bo, u.slab.entry);
   1198 
   1199       pipe_reference_init(&bo->base.reference, 1);
   1200 
   1201       return &bo->base;
   1202    }
   1203 no_slab:
   1204 
   1205    if (flags & RADEON_FLAG_SPARSE) {
   1206       assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
   1207 
   1208       return amdgpu_bo_sparse_create(ws, size, domain, flags);
   1209    }
   1210 
   1211    /* This flag is irrelevant for the cache. */
   1212    flags &= ~RADEON_FLAG_NO_SUBALLOC;
   1213 
   1214    /* Align size to page size. This is the minimum alignment for normal
   1215     * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
   1216     * like constant/uniform buffers, can benefit from better and more reuse.
   1217     */
   1218    size = align64(size, ws->info.gart_page_size);
   1219    alignment = align(alignment, ws->info.gart_page_size);
   1220 
   1221    bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
   1222 
   1223    if (use_reusable_pool) {
   1224        int heap = radeon_get_heap_index(domain, flags);
   1225        assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
   1226        usage = 1 << heap; /* Only set one usage bit for each heap. */
   1227 
   1228        pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
   1229        assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
   1230 
   1231        /* Get a buffer from the cache. */
   1232        bo = (struct amdgpu_winsys_bo*)
   1233             pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
   1234                                     pb_cache_bucket);
   1235        if (bo)
   1236           return &bo->base;
   1237    }
   1238 
   1239    /* Create a new one. */
   1240    bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
   1241                          pb_cache_bucket);
   1242    if (!bo) {
   1243       /* Clear the cache and try again. */
   1244       pb_slabs_reclaim(&ws->bo_slabs);
   1245       pb_cache_release_all_buffers(&ws->bo_cache);
   1246       bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
   1247                             pb_cache_bucket);
   1248       if (!bo)
   1249          return NULL;
   1250    }
   1251 
   1252    bo->u.real.use_reusable_pool = use_reusable_pool;
   1253    return &bo->base;
   1254 }
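
/* Illustrative usage sketch (not part of the driver): given the flag
 * invariants asserted at the top of amdgpu_bo_create(), a caller might
 * request buffers like this ("ws" assumed to be the winsys):
 *
 *    // Cached, suballocatable VRAM buffer (VRAM implies GTT_WC).
 *    struct pb_buffer *b1 =
 *       ws->base.buffer_create(&ws->base, 64 * 1024, 4096, RADEON_DOMAIN_VRAM,
 *                              RADEON_FLAG_GTT_WC |
 *                              RADEON_FLAG_NO_INTERPROCESS_SHARING);
 *
 *    // Sparse VRAM buffer: must also set NO_CPU_ACCESS.
 *    struct pb_buffer *b2 =
 *       ws->base.buffer_create(&ws->base, 256 * 1024 * 1024,
 *                              RADEON_SPARSE_PAGE_SIZE, RADEON_DOMAIN_VRAM,
 *                              RADEON_FLAG_SPARSE | RADEON_FLAG_GTT_WC |
 *                              RADEON_FLAG_NO_CPU_ACCESS);
 */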
   1255 
   1256 static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
   1257                                                struct winsys_handle *whandle,
   1258                                                unsigned *stride,
   1259                                                unsigned *offset)
   1260 {
   1261    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   1262    struct amdgpu_winsys_bo *bo;
   1263    enum amdgpu_bo_handle_type type;
   1264    struct amdgpu_bo_import_result result = {0};
   1265    uint64_t va;
   1266    amdgpu_va_handle va_handle;
   1267    struct amdgpu_bo_info info = {0};
   1268    enum radeon_bo_domain initial = 0;
   1269    int r;
   1270 
   1271    /* Initialize the structure. */
   1272    bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   1273    if (!bo) {
   1274       return NULL;
   1275    }
   1276 
   1277    switch (whandle->type) {
   1278    case DRM_API_HANDLE_TYPE_SHARED:
   1279       type = amdgpu_bo_handle_type_gem_flink_name;
   1280       break;
   1281    case DRM_API_HANDLE_TYPE_FD:
   1282       type = amdgpu_bo_handle_type_dma_buf_fd;
   1283       break;
   1284    default:
   1285       goto error; /* free the freshly allocated bo */
   1286    }
   1287 
   1288    r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
   1289    if (r)
   1290       goto error;
   1291 
   1292    /* Get initial domains. */
   1293    r = amdgpu_bo_query_info(result.buf_handle, &info);
   1294    if (r)
   1295       goto error_query;
   1296 
   1297    r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
   1298                              result.alloc_size, 1 << 20, 0, &va, &va_handle, 0);
   1299    if (r)
   1300       goto error_query;
   1301 
   1302    r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
   1303    if (r)
   1304       goto error_va_map;
   1305 
   1306    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
   1307       initial |= RADEON_DOMAIN_VRAM;
   1308    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
   1309       initial |= RADEON_DOMAIN_GTT;
   1310 
   1311 
   1312    pipe_reference_init(&bo->base.reference, 1);
   1313    bo->base.alignment = info.phys_alignment;
   1314    bo->bo = result.buf_handle;
   1315    bo->base.size = result.alloc_size;
   1316    bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
   1317    bo->ws = ws;
   1318    bo->va = va;
   1319    bo->u.real.va_handle = va_handle;
   1320    bo->initial_domain = initial;
   1321    bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
   1322    bo->is_shared = true;
   1323 
   1324    if (stride)
   1325       *stride = whandle->stride;
   1326    if (offset)
   1327       *offset = whandle->offset;
   1328 
   1329    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
   1330       ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size);
   1331    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
   1332       ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);
   1333 
   1334    amdgpu_add_buffer_to_global_list(bo);
   1335 
   1336    return &bo->base;
   1337 
   1338 error_va_map:
   1339    amdgpu_va_range_free(va_handle);
   1340 
   1341 error_query:
   1342    amdgpu_bo_free(result.buf_handle);
   1343 
   1344 error:
   1345    FREE(bo);
   1346    return NULL;
   1347 }
   1348 
   1349 static bool amdgpu_bo_get_handle(struct pb_buffer *buffer,
   1350                                  unsigned stride, unsigned offset,
   1351                                  unsigned slice_size,
   1352                                  struct winsys_handle *whandle)
   1353 {
   1354    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
   1355    enum amdgpu_bo_handle_type type;
   1356    int r;
   1357 
   1358    /* Don't allow exports of slab entries and sparse buffers. */
   1359    if (!bo->bo)
   1360       return false;
   1361 
   1362    bo->u.real.use_reusable_pool = false;
   1363 
   1364    switch (whandle->type) {
   1365    case DRM_API_HANDLE_TYPE_SHARED:
   1366       type = amdgpu_bo_handle_type_gem_flink_name;
   1367       break;
   1368    case DRM_API_HANDLE_TYPE_FD:
   1369       type = amdgpu_bo_handle_type_dma_buf_fd;
   1370       break;
   1371    case DRM_API_HANDLE_TYPE_KMS:
   1372       type = amdgpu_bo_handle_type_kms;
   1373       break;
   1374    default:
   1375       return false;
   1376    }
   1377 
   1378    r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
   1379    if (r)
   1380       return false;
   1381 
   1382    whandle->stride = stride;
   1383    whandle->offset = offset;
   1384    whandle->offset += slice_size * whandle->layer;
   1385    bo->is_shared = true;
   1386    return true;
   1387 }
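
/* Illustrative usage sketch (not part of the driver): exporting a buffer as a
 * dma-buf fd through the buffer_get_handle hook. Only real BOs can be
 * exported (slab entries and sparse buffers are rejected above); "ws", "buf",
 * "stride", "offset" and "slice_size" are assumed to exist in the caller.
 *
 *    struct winsys_handle wh = {0};
 *    wh.type = DRM_API_HANDLE_TYPE_FD;
 *    if (ws->base.buffer_get_handle(buf, stride, offset, slice_size, &wh)) {
 *       // wh.handle now holds the dma-buf file descriptor.
 *    }
 */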
   1388 
   1389 static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
   1390 					    void *pointer, uint64_t size)
   1391 {
   1392     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   1393     amdgpu_bo_handle buf_handle;
   1394     struct amdgpu_winsys_bo *bo;
   1395     uint64_t va;
   1396     amdgpu_va_handle va_handle;
   1397     /* Avoid failure when the size is not page aligned */
   1398     uint64_t aligned_size = align64(size, ws->info.gart_page_size);
   1399 
   1400     bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   1401     if (!bo)
   1402         return NULL;
   1403 
   1404     if (amdgpu_create_bo_from_user_mem(ws->dev, pointer,
   1405                                        aligned_size, &buf_handle))
   1406         goto error;
   1407 
   1408     if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
   1409                               aligned_size, 1 << 12, 0, &va, &va_handle, 0))
   1410         goto error_va_alloc;
   1411 
   1412     if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP))
   1413         goto error_va_map;
   1414 
   1415     /* Initialize it. */
   1416     pipe_reference_init(&bo->base.reference, 1);
   1417     bo->bo = buf_handle;
   1418     bo->base.alignment = 0;
   1419     bo->base.size = size;
   1420     bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
   1421     bo->ws = ws;
   1422     bo->user_ptr = pointer;
   1423     bo->va = va;
   1424     bo->u.real.va_handle = va_handle;
   1425     bo->initial_domain = RADEON_DOMAIN_GTT;
   1426     bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
   1427 
   1428     ws->allocated_gtt += aligned_size;
   1429 
   1430     amdgpu_add_buffer_to_global_list(bo);
   1431 
   1432     return (struct pb_buffer*)bo;
   1433 
   1434 error_va_map:
   1435     amdgpu_va_range_free(va_handle);
   1436 
   1437 error_va_alloc:
   1438     amdgpu_bo_free(buf_handle);
   1439 
   1440 error:
   1441     FREE(bo);
   1442     return NULL;
   1443 }
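
/* Illustrative usage sketch (not part of the driver): wrapping user memory in
 * a GTT buffer object. The sketch assumes a page-aligned allocation, which
 * the kernel userptr path generally expects, and assumes 4096 is a valid host
 * page size; "ws" is assumed to be the winsys.
 *
 *    void *user_mem = NULL;
 *    if (posix_memalign(&user_mem, 4096, 1024 * 1024) == 0) {
 *       struct pb_buffer *buf =
 *          ws->base.buffer_from_ptr(&ws->base, user_mem, 1024 * 1024);
 *       // On success, buf aliases user_mem and reports RADEON_DOMAIN_GTT.
 *    }
 */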
   1444 
   1445 static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
   1446 {
   1447    return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL;
   1448 }
   1449 
   1450 static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
   1451 {
   1452    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
   1453 
   1454    return !bo->bo && !bo->sparse;
   1455 }
   1456 
   1457 static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
   1458 {
   1459    return ((struct amdgpu_winsys_bo*)buf)->va;
   1460 }
   1461 
   1462 void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
   1463 {
   1464    ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
   1465    ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
   1466    ws->base.buffer_map = amdgpu_bo_map;
   1467    ws->base.buffer_unmap = amdgpu_bo_unmap;
   1468    ws->base.buffer_wait = amdgpu_bo_wait;
   1469    ws->base.buffer_create = amdgpu_bo_create;
   1470    ws->base.buffer_from_handle = amdgpu_bo_from_handle;
   1471    ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
   1472    ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
   1473    ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
   1474    ws->base.buffer_get_handle = amdgpu_bo_get_handle;
   1475    ws->base.buffer_commit = amdgpu_bo_sparse_commit;
   1476    ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
   1477    ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
   1478 }
   1479