      1 /*
      2  * Copyright © 2008 Jérôme Glisse
      3  * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
      4  * Copyright © 2015 Advanced Micro Devices, Inc.
      5  * All Rights Reserved.
      6  *
      7  * Permission is hereby granted, free of charge, to any person obtaining
      8  * a copy of this software and associated documentation files (the
      9  * "Software"), to deal in the Software without restriction, including
     10  * without limitation the rights to use, copy, modify, merge, publish,
     11  * distribute, sub license, and/or sell copies of the Software, and to
     12  * permit persons to whom the Software is furnished to do so, subject to
     13  * the following conditions:
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
     17  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     18  * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
     19  * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     23  *
     24  * The above copyright notice and this permission notice (including the
     25  * next paragraph) shall be included in all copies or substantial portions
     26  * of the Software.
     27  */
     28 /*
     29  * Authors:
     30  *      Marek Olšák <maraeo@gmail.com>
     31  */
     32 
     33 #include "amdgpu_cs.h"
     34 #include "os/os_time.h"
     35 #include <stdio.h>
     36 #include <amdgpu_drm.h>
     37 
     38 #include "amd/common/sid.h"
     39 
     40 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
     41 
     42 /* FENCES */
     43 
     44 static struct pipe_fence_handle *
     45 amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
     46                     unsigned ip_instance, unsigned ring)
     47 {
     48    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
     49 
     50    fence->reference.count = 1;
     51    fence->ctx = ctx;
     52    fence->fence.context = ctx->ctx;
     53    fence->fence.ip_type = ip_type;
     54    fence->fence.ip_instance = ip_instance;
     55    fence->fence.ring = ring;
     56    fence->submission_in_progress = true;
     57    p_atomic_inc(&ctx->refcount);
     58    return (struct pipe_fence_handle *)fence;
     59 }
     60 
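         /* Called after a successful submission: record the sequence number returned
          * by the kernel and the CPU address of the user fence, and clear the
          * submission-in-progress flag so that waiters can proceed. */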
     61 static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
      62                                    struct amdgpu_cs_request* request,
      63                                    uint64_t *user_fence_cpu_address)
     64 {
     65    struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
     66 
     67    rfence->fence.fence = request->seq_no;
     68    rfence->user_fence_cpu_address = user_fence_cpu_address;
     69    rfence->submission_in_progress = false;
     70 }
     71 
     72 static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
     73 {
     74    struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
     75 
     76    rfence->signalled = true;
     77    rfence->submission_in_progress = false;
     78 }
     79 
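         /* Wait for a fence with the given timeout (absolute or relative). The user
          * fence memory is checked first so that the kernel query can be skipped
          * whenever possible. */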
     80 bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
     81                        bool absolute)
     82 {
     83    struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
     84    uint32_t expired;
     85    int64_t abs_timeout;
     86    uint64_t *user_fence_cpu;
     87    int r;
     88 
     89    if (rfence->signalled)
     90       return true;
     91 
     92    if (absolute)
     93       abs_timeout = timeout;
     94    else
     95       abs_timeout = os_time_get_absolute_timeout(timeout);
     96 
      97    /* The fence might not have a number assigned yet if its IB is still
      98     * being submitted by the submission thread. Wait until the submission
      99     * is done. */
    100    if (!os_wait_until_zero_abs_timeout(&rfence->submission_in_progress,
    101                                        abs_timeout))
    102       return false;
    103 
    104    user_fence_cpu = rfence->user_fence_cpu_address;
    105    if (user_fence_cpu) {
    106       if (*user_fence_cpu >= rfence->fence.fence) {
    107          rfence->signalled = true;
    108          return true;
    109       }
    110 
    111       /* No timeout, just query: no need for the ioctl. */
    112       if (!absolute && !timeout)
    113          return false;
    114    }
    115 
    116    /* Now use the libdrm query. */
    117    r = amdgpu_cs_query_fence_status(&rfence->fence,
    118 				    abs_timeout,
    119 				    AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE,
    120 				    &expired);
    121    if (r) {
    122       fprintf(stderr, "amdgpu: amdgpu_cs_query_fence_status failed.\n");
    123       return false;
    124    }
    125 
    126    if (expired) {
    127       /* This variable can only transition from false to true, so it doesn't
    128        * matter if threads race for it. */
    129       rfence->signalled = true;
    130       return true;
    131    }
    132    return false;
    133 }
    134 
    135 static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
    136                                           struct pipe_fence_handle *fence,
    137                                           uint64_t timeout)
    138 {
    139    return amdgpu_fence_wait(fence, timeout, false);
    140 }
    141 
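         /* Return a reference to the fence that the next flush of this CS will signal,
          * creating the fence on demand. */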
    142 static struct pipe_fence_handle *
    143 amdgpu_cs_get_next_fence(struct radeon_winsys_cs *rcs)
    144 {
    145    struct amdgpu_cs *cs = amdgpu_cs(rcs);
    146    struct pipe_fence_handle *fence = NULL;
    147 
    148    if (debug_get_option_noop())
    149       return NULL;
    150 
    151    if (cs->next_fence) {
    152       amdgpu_fence_reference(&fence, cs->next_fence);
    153       return fence;
    154    }
    155 
    156    fence = amdgpu_fence_create(cs->ctx,
    157                                cs->csc->request.ip_type,
    158                                cs->csc->request.ip_instance,
    159                                cs->csc->request.ring);
    160    if (!fence)
    161       return NULL;
    162 
    163    amdgpu_fence_reference(&cs->next_fence, fence);
    164    return fence;
    165 }
    166 
    167 /* CONTEXTS */
    168 
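         /* Create an amdgpu submission context along with one GTT page that holds the
          * user fence values written by the GPU. */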
    169 static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws)
    170 {
    171    struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
    172    int r;
    173    struct amdgpu_bo_alloc_request alloc_buffer = {};
    174    amdgpu_bo_handle buf_handle;
    175 
    176    if (!ctx)
    177       return NULL;
    178 
    179    ctx->ws = amdgpu_winsys(ws);
    180    ctx->refcount = 1;
    181 
    182    r = amdgpu_cs_ctx_create(ctx->ws->dev, &ctx->ctx);
    183    if (r) {
    184       fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create failed. (%i)\n", r);
    185       goto error_create;
    186    }
    187 
    188    alloc_buffer.alloc_size = ctx->ws->info.gart_page_size;
    189    alloc_buffer.phys_alignment = ctx->ws->info.gart_page_size;
    190    alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
    191 
    192    r = amdgpu_bo_alloc(ctx->ws->dev, &alloc_buffer, &buf_handle);
    193    if (r) {
    194       fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
    195       goto error_user_fence_alloc;
    196    }
    197 
    198    r = amdgpu_bo_cpu_map(buf_handle, (void**)&ctx->user_fence_cpu_address_base);
    199    if (r) {
    200       fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
    201       goto error_user_fence_map;
    202    }
    203 
    204    memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
    205    ctx->user_fence_bo = buf_handle;
    206 
    207    return (struct radeon_winsys_ctx*)ctx;
    208 
    209 error_user_fence_map:
    210    amdgpu_bo_free(buf_handle);
    211 error_user_fence_alloc:
    212    amdgpu_cs_ctx_free(ctx->ctx);
    213 error_create:
    214    FREE(ctx);
    215    return NULL;
    216 }
    217 
    218 static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
    219 {
    220    amdgpu_ctx_unref((struct amdgpu_ctx*)rwctx);
    221 }
    222 
    223 static enum pipe_reset_status
    224 amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx)
    225 {
    226    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
    227    uint32_t result, hangs;
    228    int r;
    229 
    230    r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs);
    231    if (r) {
    232       fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r);
    233       return PIPE_NO_RESET;
    234    }
    235 
    236    switch (result) {
    237    case AMDGPU_CTX_GUILTY_RESET:
    238       return PIPE_GUILTY_CONTEXT_RESET;
    239    case AMDGPU_CTX_INNOCENT_RESET:
    240       return PIPE_INNOCENT_CONTEXT_RESET;
    241    case AMDGPU_CTX_UNKNOWN_RESET:
    242       return PIPE_UNKNOWN_CONTEXT_RESET;
    243    case AMDGPU_CTX_NO_RESET:
    244    default:
    245       return PIPE_NO_RESET;
    246    }
    247 }
    248 
    249 /* COMMAND SUBMISSION */
    250 
    251 static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs)
    252 {
    253    return cs->request.ip_type != AMDGPU_HW_IP_UVD &&
    254           cs->request.ip_type != AMDGPU_HW_IP_VCE;
    255 }
    256 
    257 static bool amdgpu_cs_has_chaining(struct amdgpu_cs *cs)
    258 {
    259    return cs->ctx->ws->info.chip_class >= CIK &&
    260           cs->ring_type == RING_GFX;
    261 }
    262 
    263 static unsigned amdgpu_cs_epilog_dws(enum ring_type ring_type)
    264 {
    265    if (ring_type == RING_GFX)
    266       return 4; /* for chaining */
    267 
    268    return 0;
    269 }
    270 
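         /* Find the index of a BO in the current buffer list (real or slab list,
          * depending on the BO type). A small hash cache is consulted first; on a
          * miss, the list is scanned linearly. Returns -1 if the BO is not listed. */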
    271 int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
    272 {
    273    unsigned hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
    274    int i = cs->buffer_indices_hashlist[hash];
    275    struct amdgpu_cs_buffer *buffers;
    276    int num_buffers;
    277 
    278    if (bo->bo) {
    279       buffers = cs->real_buffers;
    280       num_buffers = cs->num_real_buffers;
    281    } else {
    282       buffers = cs->slab_buffers;
    283       num_buffers = cs->num_slab_buffers;
    284    }
    285 
     286    /* Fast path: the cached index is either a miss (-1) or a hit for this BO. */
    287    if (i < 0 || (i < num_buffers && buffers[i].bo == bo))
    288       return i;
    289 
    290    /* Hash collision, look for the BO in the list of buffers linearly. */
    291    for (i = num_buffers - 1; i >= 0; i--) {
    292       if (buffers[i].bo == bo) {
    293          /* Put this buffer in the hash list.
    294           * This will prevent additional hash collisions if there are
    295           * several consecutive lookup_buffer calls for the same buffer.
    296           *
    297           * Example: Assuming buffers A,B,C collide in the hash list,
    298           * the following sequence of buffers:
    299           *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
    300           * will collide here: ^ and here:   ^,
    301           * meaning that we should get very few collisions in the end. */
    302          cs->buffer_indices_hashlist[hash] = i;
    303          return i;
    304       }
    305    }
    306    return -1;
    307 }
    308 
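         /* Return the index of a real (non-suballocated) BO in the buffer list, adding
          * it if necessary. The parallel arrays of buffers, kernel handles, and flags
          * grow together, and the VRAM/GTT usage of the CS is updated. */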
    309 static int
    310 amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo *bo)
    311 {
    312    struct amdgpu_cs_context *cs = acs->csc;
    313    struct amdgpu_cs_buffer *buffer;
    314    unsigned hash;
    315    int idx = amdgpu_lookup_buffer(cs, bo);
    316 
    317    if (idx >= 0)
    318       return idx;
    319 
    320    /* New buffer, check if the backing array is large enough. */
    321    if (cs->num_real_buffers >= cs->max_real_buffers) {
    322       unsigned new_max =
    323          MAX2(cs->max_real_buffers + 16, (unsigned)(cs->max_real_buffers * 1.3));
    324       struct amdgpu_cs_buffer *new_buffers;
    325       amdgpu_bo_handle *new_handles;
    326       uint8_t *new_flags;
    327 
    328       new_buffers = MALLOC(new_max * sizeof(*new_buffers));
    329       new_handles = MALLOC(new_max * sizeof(*new_handles));
    330       new_flags = MALLOC(new_max * sizeof(*new_flags));
    331 
    332       if (!new_buffers || !new_handles || !new_flags) {
    333          fprintf(stderr, "amdgpu_lookup_or_add_buffer: allocation failed\n");
    334          FREE(new_buffers);
    335          FREE(new_handles);
    336          FREE(new_flags);
    337          return -1;
    338       }
    339 
    340       memcpy(new_buffers, cs->real_buffers, cs->num_real_buffers * sizeof(*new_buffers));
    341       memcpy(new_handles, cs->handles, cs->num_real_buffers * sizeof(*new_handles));
    342       memcpy(new_flags, cs->flags, cs->num_real_buffers * sizeof(*new_flags));
    343 
    344       FREE(cs->real_buffers);
    345       FREE(cs->handles);
    346       FREE(cs->flags);
    347 
    348       cs->max_real_buffers = new_max;
    349       cs->real_buffers = new_buffers;
    350       cs->handles = new_handles;
    351       cs->flags = new_flags;
    352    }
    353 
    354    idx = cs->num_real_buffers;
    355    buffer = &cs->real_buffers[idx];
    356 
    357    memset(buffer, 0, sizeof(*buffer));
    358    amdgpu_winsys_bo_reference(&buffer->bo, bo);
    359    cs->handles[idx] = bo->bo;
    360    cs->flags[idx] = 0;
    361    p_atomic_inc(&bo->num_cs_references);
    362    cs->num_real_buffers++;
    363 
    364    hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
    365    cs->buffer_indices_hashlist[hash] = idx;
    366 
    367    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
    368       acs->main.base.used_vram += bo->base.size;
    369    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
    370       acs->main.base.used_gart += bo->base.size;
    371 
    372    return idx;
    373 }
    374 
    375 static int amdgpu_lookup_or_add_slab_buffer(struct amdgpu_cs *acs,
    376                                             struct amdgpu_winsys_bo *bo)
    377 {
    378    struct amdgpu_cs_context *cs = acs->csc;
    379    struct amdgpu_cs_buffer *buffer;
    380    unsigned hash;
    381    int idx = amdgpu_lookup_buffer(cs, bo);
    382    int real_idx;
    383 
    384    if (idx >= 0)
    385       return idx;
    386 
    387    real_idx = amdgpu_lookup_or_add_real_buffer(acs, bo->u.slab.real);
    388    if (real_idx < 0)
    389       return -1;
    390 
    391    /* New buffer, check if the backing array is large enough. */
    392    if (cs->num_slab_buffers >= cs->max_slab_buffers) {
    393       unsigned new_max =
    394          MAX2(cs->max_slab_buffers + 16, (unsigned)(cs->max_slab_buffers * 1.3));
    395       struct amdgpu_cs_buffer *new_buffers;
    396 
    397       new_buffers = REALLOC(cs->slab_buffers,
    398                             cs->max_slab_buffers * sizeof(*new_buffers),
    399                             new_max * sizeof(*new_buffers));
    400       if (!new_buffers) {
    401          fprintf(stderr, "amdgpu_lookup_or_add_slab_buffer: allocation failed\n");
    402          return -1;
    403       }
    404 
    405       cs->max_slab_buffers = new_max;
    406       cs->slab_buffers = new_buffers;
    407    }
    408 
    409    idx = cs->num_slab_buffers;
    410    buffer = &cs->slab_buffers[idx];
    411 
    412    memset(buffer, 0, sizeof(*buffer));
    413    amdgpu_winsys_bo_reference(&buffer->bo, bo);
    414    buffer->u.slab.real_idx = real_idx;
    415    p_atomic_inc(&bo->num_cs_references);
    416    cs->num_slab_buffers++;
    417 
    418    hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
    419    cs->buffer_indices_hashlist[hash] = idx;
    420 
    421    return idx;
    422 }
    423 
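         /* Add a buffer to the CS buffer list and accumulate its usage and priority.
          * Suballocated (slab) buffers are tracked separately and also add their
          * backing real buffer, which is what the kernel actually sees. */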
    424 static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
    425                                     struct pb_buffer *buf,
    426                                     enum radeon_bo_usage usage,
    427                                     enum radeon_bo_domain domains,
    428                                     enum radeon_bo_priority priority)
    429 {
    430    /* Don't use the "domains" parameter. Amdgpu doesn't support changing
    431     * the buffer placement during command submission.
    432     */
    433    struct amdgpu_cs *acs = amdgpu_cs(rcs);
    434    struct amdgpu_cs_context *cs = acs->csc;
    435    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
    436    struct amdgpu_cs_buffer *buffer;
    437    int index;
    438 
    439    if (!bo->bo) {
    440       index = amdgpu_lookup_or_add_slab_buffer(acs, bo);
    441       if (index < 0)
    442          return 0;
    443 
    444       buffer = &cs->slab_buffers[index];
    445       buffer->usage |= usage;
    446 
    447       usage &= ~RADEON_USAGE_SYNCHRONIZED;
    448       index = buffer->u.slab.real_idx;
    449    } else {
    450       index = amdgpu_lookup_or_add_real_buffer(acs, bo);
    451       if (index < 0)
    452          return 0;
    453    }
    454 
    455    buffer = &cs->real_buffers[index];
    456    buffer->u.real.priority_usage |= 1llu << priority;
    457    buffer->usage |= usage;
    458    cs->flags[index] = MAX2(cs->flags[index], priority / 4);
    459    return index;
    460 }
    461 
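         /* Allocate and map a new backing buffer for an IB, sized from the largest IB
          * seen so far, with a per-type minimum and a global upper limit. */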
    462 static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
    463 {
    464    struct pb_buffer *pb;
    465    uint8_t *mapped;
    466    unsigned buffer_size;
    467 
    468    /* Always create a buffer that is at least as large as the maximum seen IB
    469     * size, aligned to a power of two (and multiplied by 4 to reduce internal
    470     * fragmentation if chaining is not available). Limit to 512k dwords, which
    471     * is the largest power of two that fits into the size field of the
    472     * INDIRECT_BUFFER packet.
    473     */
    474    if (amdgpu_cs_has_chaining(amdgpu_cs_from_ib(ib)))
     475       buffer_size = 4 * util_next_power_of_two(ib->max_ib_size);
     476    else
     477       buffer_size = 4 * util_next_power_of_two(4 * ib->max_ib_size);
    478 
    479    buffer_size = MIN2(buffer_size, 4 * 512 * 1024);
    480 
    481    switch (ib->ib_type) {
    482    case IB_CONST_PREAMBLE:
    483       buffer_size = MAX2(buffer_size, 4 * 1024);
    484       break;
    485    case IB_CONST:
    486       buffer_size = MAX2(buffer_size, 16 * 1024 * 4);
    487       break;
    488    case IB_MAIN:
    489       buffer_size = MAX2(buffer_size, 8 * 1024 * 4);
    490       break;
    491    default:
    492       unreachable("unhandled IB type");
    493    }
    494 
    495    pb = ws->base.buffer_create(&ws->base, buffer_size,
    496                                ws->info.gart_page_size,
    497                                RADEON_DOMAIN_GTT,
    498                                RADEON_FLAG_CPU_ACCESS);
    499    if (!pb)
    500       return false;
    501 
    502    mapped = ws->base.buffer_map(pb, NULL, PIPE_TRANSFER_WRITE);
    503    if (!mapped) {
    504       pb_reference(&pb, NULL);
    505       return false;
    506    }
    507 
    508    pb_reference(&ib->big_ib_buffer, pb);
    509    pb_reference(&pb, NULL);
    510 
    511    ib->ib_mapped = mapped;
    512    ib->used_ib_space = 0;
    513 
    514    return true;
    515 }
    516 
    517 static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type)
    518 {
    519    switch (ib_type) {
    520    case IB_MAIN:
    521       /* Smaller submits means the GPU gets busy sooner and there is less
    522        * waiting for buffers and fences. Proof:
    523        *   http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
    524        */
    525       return 20 * 1024;
    526    case IB_CONST_PREAMBLE:
    527    case IB_CONST:
    528       /* There isn't really any reason to limit CE IB size beyond the natural
    529        * limit implied by the main IB, except perhaps GTT size. Just return
    530        * an extremely large value that we never get anywhere close to.
    531        */
    532       return 16 * 1024 * 1024;
    533    default:
    534       unreachable("bad ib_type");
    535    }
    536 }
    537 
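         /* Set up the given IB for a new submission: reset the command stream, point
          * it at free space in the backing buffer (allocating a new buffer if the
          * current one is exhausted), and add that buffer to the CS buffer list. */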
    538 static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
    539                               enum ib_type ib_type)
    540 {
    541    struct amdgpu_winsys *aws = (struct amdgpu_winsys*)ws;
    542    /* Small IBs are better than big IBs, because the GPU goes idle quicker
    543     * and there is less waiting for buffers and fences. Proof:
    544     *   http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
    545     */
    546    struct amdgpu_ib *ib = NULL;
    547    struct amdgpu_cs_ib_info *info = &cs->csc->ib[ib_type];
    548    unsigned ib_size = 0;
    549 
    550    switch (ib_type) {
    551    case IB_CONST_PREAMBLE:
    552       ib = &cs->const_preamble_ib;
    553       ib_size = 256 * 4;
    554       break;
    555    case IB_CONST:
    556       ib = &cs->const_ib;
    557       ib_size = 8 * 1024 * 4;
    558       break;
    559    case IB_MAIN:
    560       ib = &cs->main;
    561       ib_size = 4 * 1024 * 4;
    562       break;
    563    default:
    564       unreachable("unhandled IB type");
    565    }
    566 
    567    if (!amdgpu_cs_has_chaining(cs)) {
    568       ib_size = MAX2(ib_size,
    569                      4 * MIN2(util_next_power_of_two(ib->max_ib_size),
    570                               amdgpu_ib_max_submit_dwords(ib_type)));
    571    }
    572 
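            /* Gradually decay the tracked maximum IB size so that it can shrink again
             * when the application starts submitting smaller IBs. */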
    573    ib->max_ib_size = ib->max_ib_size - ib->max_ib_size / 32;
    574 
    575    ib->base.prev_dw = 0;
    576    ib->base.num_prev = 0;
    577    ib->base.current.cdw = 0;
    578    ib->base.current.buf = NULL;
    579 
    580    /* Allocate a new buffer for IBs if the current buffer is all used. */
    581    if (!ib->big_ib_buffer ||
    582        ib->used_ib_space + ib_size > ib->big_ib_buffer->size) {
    583       if (!amdgpu_ib_new_buffer(aws, ib))
    584          return false;
    585    }
    586 
    587    info->ib_mc_address = amdgpu_winsys_bo(ib->big_ib_buffer)->va +
    588                          ib->used_ib_space;
    589    info->size = 0;
    590    ib->ptr_ib_size = &info->size;
    591 
    592    amdgpu_cs_add_buffer(&cs->main.base, ib->big_ib_buffer,
    593                         RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
    594 
    595    ib->base.current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
    596 
    597    ib_size = ib->big_ib_buffer->size - ib->used_ib_space;
    598    ib->base.current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs->ring_type);
    599    return true;
    600 }
    601 
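         /* Finish the current IB: patch its final size in dwords, account for the
          * space used in the backing buffer, and update the maximum IB size. */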
    602 static void amdgpu_ib_finalize(struct amdgpu_ib *ib)
    603 {
    604    *ib->ptr_ib_size |= ib->base.current.cdw;
    605    ib->used_ib_space += ib->base.current.cdw * 4;
    606    ib->max_ib_size = MAX2(ib->max_ib_size, ib->base.prev_dw + ib->base.current.cdw);
    607 }
    608 
    609 static bool amdgpu_init_cs_context(struct amdgpu_cs_context *cs,
    610                                    enum ring_type ring_type)
    611 {
    612    int i;
    613 
    614    switch (ring_type) {
    615    case RING_DMA:
    616       cs->request.ip_type = AMDGPU_HW_IP_DMA;
    617       break;
    618 
    619    case RING_UVD:
    620       cs->request.ip_type = AMDGPU_HW_IP_UVD;
    621       break;
    622 
    623    case RING_VCE:
    624       cs->request.ip_type = AMDGPU_HW_IP_VCE;
    625       break;
    626 
    627    case RING_COMPUTE:
    628       cs->request.ip_type = AMDGPU_HW_IP_COMPUTE;
    629       break;
    630 
    631    default:
    632    case RING_GFX:
    633       cs->request.ip_type = AMDGPU_HW_IP_GFX;
    634       break;
    635    }
    636 
    637    for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) {
    638       cs->buffer_indices_hashlist[i] = -1;
    639    }
    640 
    641    cs->request.number_of_ibs = 1;
    642    cs->request.ibs = &cs->ib[IB_MAIN];
    643 
    644    cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE;
    645    cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE |
    646                                      AMDGPU_IB_FLAG_PREAMBLE;
    647 
    648    return true;
    649 }
    650 
    651 static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
    652 {
    653    unsigned i;
    654 
    655    for (i = 0; i < cs->num_real_buffers; i++) {
    656       p_atomic_dec(&cs->real_buffers[i].bo->num_cs_references);
    657       amdgpu_winsys_bo_reference(&cs->real_buffers[i].bo, NULL);
    658    }
    659    for (i = 0; i < cs->num_slab_buffers; i++) {
    660       p_atomic_dec(&cs->slab_buffers[i].bo->num_cs_references);
    661       amdgpu_winsys_bo_reference(&cs->slab_buffers[i].bo, NULL);
    662    }
    663 
    664    cs->num_real_buffers = 0;
    665    cs->num_slab_buffers = 0;
    666    amdgpu_fence_reference(&cs->fence, NULL);
    667 
    668    for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) {
    669       cs->buffer_indices_hashlist[i] = -1;
    670    }
    671 }
    672 
    673 static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
    674 {
    675    amdgpu_cs_context_cleanup(cs);
    676    FREE(cs->flags);
    677    FREE(cs->real_buffers);
    678    FREE(cs->handles);
    679    FREE(cs->slab_buffers);
    680    FREE(cs->request.dependencies);
    681 }
    682 
    683 
    684 static struct radeon_winsys_cs *
    685 amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
    686                  enum ring_type ring_type,
    687                  void (*flush)(void *ctx, unsigned flags,
    688                                struct pipe_fence_handle **fence),
    689                  void *flush_ctx)
    690 {
    691    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
    692    struct amdgpu_cs *cs;
    693 
    694    cs = CALLOC_STRUCT(amdgpu_cs);
    695    if (!cs) {
    696       return NULL;
    697    }
    698 
    699    util_queue_fence_init(&cs->flush_completed);
    700 
    701    cs->ctx = ctx;
    702    cs->flush_cs = flush;
    703    cs->flush_data = flush_ctx;
    704    cs->ring_type = ring_type;
    705 
    706    cs->main.ib_type = IB_MAIN;
    707    cs->const_ib.ib_type = IB_CONST;
    708    cs->const_preamble_ib.ib_type = IB_CONST_PREAMBLE;
    709 
    710    if (!amdgpu_init_cs_context(&cs->csc1, ring_type)) {
    711       FREE(cs);
    712       return NULL;
    713    }
    714 
    715    if (!amdgpu_init_cs_context(&cs->csc2, ring_type)) {
    716       amdgpu_destroy_cs_context(&cs->csc1);
    717       FREE(cs);
    718       return NULL;
    719    }
    720 
    721    /* Set the first submission context as current. */
    722    cs->csc = &cs->csc1;
    723    cs->cst = &cs->csc2;
    724 
    725    if (!amdgpu_get_new_ib(&ctx->ws->base, cs, IB_MAIN)) {
    726       amdgpu_destroy_cs_context(&cs->csc2);
    727       amdgpu_destroy_cs_context(&cs->csc1);
    728       FREE(cs);
    729       return NULL;
    730    }
    731 
    732    p_atomic_inc(&ctx->ws->num_cs);
    733    return &cs->main.base;
    734 }
    735 
    736 static struct radeon_winsys_cs *
    737 amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs)
    738 {
    739    struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
    740    struct amdgpu_winsys *ws = cs->ctx->ws;
    741 
    742    /* only one const IB can be added */
    743    if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped)
    744       return NULL;
    745 
    746    if (!amdgpu_get_new_ib(&ws->base, cs, IB_CONST))
    747       return NULL;
    748 
    749    cs->csc->request.number_of_ibs = 2;
    750    cs->csc->request.ibs = &cs->csc->ib[IB_CONST];
    751 
    752    cs->cst->request.number_of_ibs = 2;
    753    cs->cst->request.ibs = &cs->cst->ib[IB_CONST];
    754 
    755    return &cs->const_ib.base;
    756 }
    757 
    758 static struct radeon_winsys_cs *
    759 amdgpu_cs_add_const_preamble_ib(struct radeon_winsys_cs *rcs)
    760 {
    761    struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
    762    struct amdgpu_winsys *ws = cs->ctx->ws;
    763 
    764    /* only one const preamble IB can be added and only when the const IB has
    765     * also been mapped */
    766    if (cs->ring_type != RING_GFX || !cs->const_ib.ib_mapped ||
    767        cs->const_preamble_ib.ib_mapped)
    768       return NULL;
    769 
    770    if (!amdgpu_get_new_ib(&ws->base, cs, IB_CONST_PREAMBLE))
    771       return NULL;
    772 
    773    cs->csc->request.number_of_ibs = 3;
    774    cs->csc->request.ibs = &cs->csc->ib[IB_CONST_PREAMBLE];
    775 
    776    cs->cst->request.number_of_ibs = 3;
    777    cs->cst->request.ibs = &cs->cst->ib[IB_CONST_PREAMBLE];
    778 
    779    return &cs->const_preamble_ib.base;
    780 }
    781 
    782 static bool amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
    783 {
    784    return true;
    785 }
    786 
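         /* Make sure at least "dw" more dwords can be emitted. If the current chunk
          * is too small and chaining is supported, switch to a fresh buffer and chain
          * to it with an INDIRECT_BUFFER packet written into the epilog space reserved
          * in the old chunk. */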
    787 static bool amdgpu_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
    788 {
    789    struct amdgpu_ib *ib = amdgpu_ib(rcs);
    790    struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib);
    791    unsigned requested_size = rcs->prev_dw + rcs->current.cdw + dw;
    792    uint64_t va;
    793    uint32_t *new_ptr_ib_size;
    794 
    795    assert(rcs->current.cdw <= rcs->current.max_dw);
    796 
    797    if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type))
    798       return false;
    799 
    800    ib->max_ib_size = MAX2(ib->max_ib_size, requested_size);
    801 
    802    if (rcs->current.max_dw - rcs->current.cdw >= dw)
    803       return true;
    804 
    805    if (!amdgpu_cs_has_chaining(cs))
    806       return false;
    807 
    808    /* Allocate a new chunk */
    809    if (rcs->num_prev >= rcs->max_prev) {
    810       unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
    811       struct radeon_winsys_cs_chunk *new_prev;
    812 
    813       new_prev = REALLOC(rcs->prev,
    814                          sizeof(*new_prev) * rcs->max_prev,
    815                          sizeof(*new_prev) * new_max_prev);
    816       if (!new_prev)
    817          return false;
    818 
    819       rcs->prev = new_prev;
    820       rcs->max_prev = new_max_prev;
    821    }
    822 
    823    if (!amdgpu_ib_new_buffer(cs->ctx->ws, ib))
    824       return false;
    825 
    826    assert(ib->used_ib_space == 0);
    827    va = amdgpu_winsys_bo(ib->big_ib_buffer)->va;
    828 
    829    /* This space was originally reserved. */
    830    rcs->current.max_dw += 4;
    831    assert(ib->used_ib_space + 4 * rcs->current.max_dw <= ib->big_ib_buffer->size);
    832 
    833    /* Pad with NOPs and add INDIRECT_BUFFER packet */
    834    while ((rcs->current.cdw & 7) != 4)
    835       radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
    836 
    837    radeon_emit(rcs, PKT3(ib->ib_type == IB_MAIN ? PKT3_INDIRECT_BUFFER_CIK
    838                                            : PKT3_INDIRECT_BUFFER_CONST, 2, 0));
    839    radeon_emit(rcs, va);
    840    radeon_emit(rcs, va >> 32);
    841    new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw];
    842    radeon_emit(rcs, S_3F2_CHAIN(1) | S_3F2_VALID(1));
    843 
    844    assert((rcs->current.cdw & 7) == 0);
    845    assert(rcs->current.cdw <= rcs->current.max_dw);
    846 
    847    *ib->ptr_ib_size |= rcs->current.cdw;
    848    ib->ptr_ib_size = new_ptr_ib_size;
    849 
    850    /* Hook up the new chunk */
    851    rcs->prev[rcs->num_prev].buf = rcs->current.buf;
    852    rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
    853    rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
    854    rcs->num_prev++;
    855 
    856    ib->base.prev_dw += ib->base.current.cdw;
    857    ib->base.current.cdw = 0;
    858 
    859    ib->base.current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
    860    ib->base.current.max_dw = ib->big_ib_buffer->size / 4 - amdgpu_cs_epilog_dws(cs->ring_type);
    861 
    862    amdgpu_cs_add_buffer(&cs->main.base, ib->big_ib_buffer,
    863                         RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
    864 
    865    return true;
    866 }
    867 
    868 static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
    869                                           struct radeon_bo_list_item *list)
    870 {
    871     struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
    872     int i;
    873 
    874     if (list) {
    875         for (i = 0; i < cs->num_real_buffers; i++) {
    876             list[i].bo_size = cs->real_buffers[i].bo->base.size;
    877             list[i].vm_address = cs->real_buffers[i].bo->va;
    878             list[i].priority_usage = cs->real_buffers[i].u.real.priority_usage;
    879         }
    880     }
    881     return cs->num_real_buffers;
    882 }
    883 
    884 DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)
    885 
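         /* Prune fences that are already signalled or that belong to this context and
          * ring from the buffer's fence list; the remaining fences are added as
          * submission dependencies when the buffer usage requires synchronization. */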
    886 static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,
    887                                         struct amdgpu_cs_buffer *buffer)
    888 {
    889    struct amdgpu_cs_context *cs = acs->csc;
    890    struct amdgpu_winsys_bo *bo = buffer->bo;
    891    struct amdgpu_cs_fence *dep;
    892    unsigned new_num_fences = 0;
    893 
    894    for (unsigned j = 0; j < bo->num_fences; ++j) {
    895       struct amdgpu_fence *bo_fence = (void *)bo->fences[j];
    896       unsigned idx;
    897 
    898       if (bo_fence->ctx == acs->ctx &&
    899          bo_fence->fence.ip_type == cs->request.ip_type &&
    900          bo_fence->fence.ip_instance == cs->request.ip_instance &&
    901          bo_fence->fence.ring == cs->request.ring)
    902          continue;
    903 
    904       if (amdgpu_fence_wait((void *)bo_fence, 0, false))
    905          continue;
    906 
    907       amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
    908       new_num_fences++;
    909 
    910       if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
    911          continue;
    912 
    913       if (bo_fence->submission_in_progress)
    914          os_wait_until_zero(&bo_fence->submission_in_progress,
    915                             PIPE_TIMEOUT_INFINITE);
    916 
    917       idx = cs->request.number_of_dependencies++;
    918       if (idx >= cs->max_dependencies) {
    919          unsigned size;
    920 
    921          cs->max_dependencies = idx + 8;
    922          size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
    923          cs->request.dependencies = realloc(cs->request.dependencies, size);
    924       }
    925 
    926       dep = &cs->request.dependencies[idx];
    927       memcpy(dep, &bo_fence->fence, sizeof(*dep));
    928    }
    929 
    930    for (unsigned j = new_num_fences; j < bo->num_fences; ++j)
    931       amdgpu_fence_reference(&bo->fences[j], NULL);
    932 
    933    bo->num_fences = new_num_fences;
    934 }
    935 
    936 /* Since the kernel driver doesn't synchronize execution between different
    937  * rings automatically, we have to add fence dependencies manually.
    938  */
    939 static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
    940 {
    941    struct amdgpu_cs_context *cs = acs->csc;
    942    int i;
    943 
    944    cs->request.number_of_dependencies = 0;
    945 
    946    for (i = 0; i < cs->num_real_buffers; i++)
    947       amdgpu_add_fence_dependency(acs, &cs->real_buffers[i]);
    948    for (i = 0; i < cs->num_slab_buffers; i++)
    949       amdgpu_add_fence_dependency(acs, &cs->slab_buffers[i]);
    950 }
    951 
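         /* Append a fence to a buffer's fence list, growing the array as needed. If
          * the array cannot be grown, the newest existing fence is dropped to make
          * room (or the incoming fence is dropped if the list is empty). */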
    952 static void amdgpu_add_fence(struct amdgpu_winsys_bo *bo,
    953                              struct pipe_fence_handle *fence)
    954 {
    955    if (bo->num_fences >= bo->max_fences) {
    956       unsigned new_max_fences = MAX2(1, bo->max_fences * 2);
    957       struct pipe_fence_handle **new_fences =
    958          REALLOC(bo->fences,
    959                  bo->num_fences * sizeof(*new_fences),
    960                  new_max_fences * sizeof(*new_fences));
    961       if (new_fences) {
    962          bo->fences = new_fences;
    963          bo->max_fences = new_max_fences;
    964       } else {
    965          fprintf(stderr, "amdgpu_add_fence: allocation failure, dropping fence\n");
    966          if (!bo->num_fences)
    967             return;
    968 
    969          bo->num_fences--; /* prefer to keep a more recent fence if possible */
    970          amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL);
    971       }
    972    }
    973 
    974    bo->fences[bo->num_fences] = NULL;
    975    amdgpu_fence_reference(&bo->fences[bo->num_fences], fence);
    976    bo->num_fences++;
    977 }
    978 
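         /* Submission job, executed in the winsys queue thread: build the kernel
          * buffer list (optionally from all allocated BOs for debugging), submit the
          * request, and mark the fence as submitted, or as signalled on failure. */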
    979 void amdgpu_cs_submit_ib(void *job, int thread_index)
    980 {
    981    struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
    982    struct amdgpu_winsys *ws = acs->ctx->ws;
    983    struct amdgpu_cs_context *cs = acs->cst;
    984    int i, r;
    985 
    986    cs->request.fence_info.handle = NULL;
    987    if (amdgpu_cs_has_user_fence(cs)) {
     988       cs->request.fence_info.handle = acs->ctx->user_fence_bo;
     989       cs->request.fence_info.offset = acs->ring_type;
    990    }
    991 
    992    /* Create the buffer list.
    993     * Use a buffer list containing all allocated buffers if requested.
    994     */
    995    if (debug_get_option_all_bos()) {
    996       struct amdgpu_winsys_bo *bo;
    997       amdgpu_bo_handle *handles;
    998       unsigned num = 0;
    999 
   1000       pipe_mutex_lock(ws->global_bo_list_lock);
   1001 
   1002       handles = malloc(sizeof(handles[0]) * ws->num_buffers);
   1003       if (!handles) {
   1004          pipe_mutex_unlock(ws->global_bo_list_lock);
   1005          amdgpu_cs_context_cleanup(cs);
   1006          cs->error_code = -ENOMEM;
   1007          return;
   1008       }
   1009 
   1010       LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, u.real.global_list_item) {
   1011          assert(num < ws->num_buffers);
   1012          handles[num++] = bo->bo;
   1013       }
   1014 
   1015       r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
   1016                                 handles, NULL,
   1017                                 &cs->request.resources);
   1018       free(handles);
   1019       pipe_mutex_unlock(ws->global_bo_list_lock);
   1020    } else {
   1021       r = amdgpu_bo_list_create(ws->dev, cs->num_real_buffers,
   1022                                 cs->handles, cs->flags,
   1023                                 &cs->request.resources);
   1024    }
   1025 
   1026    if (r) {
   1027       fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
   1028       cs->request.resources = NULL;
   1029       amdgpu_fence_signalled(cs->fence);
   1030       cs->error_code = r;
   1031       goto cleanup;
   1032    }
   1033 
   1034    r = amdgpu_cs_submit(acs->ctx->ctx, 0, &cs->request, 1);
   1035    cs->error_code = r;
   1036    if (r) {
   1037       if (r == -ENOMEM)
   1038          fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
   1039       else
   1040          fprintf(stderr, "amdgpu: The CS has been rejected, "
   1041                  "see dmesg for more information (%i).\n", r);
   1042 
   1043       amdgpu_fence_signalled(cs->fence);
   1044    } else {
   1045       /* Success. */
   1046       uint64_t *user_fence = NULL;
   1047       if (amdgpu_cs_has_user_fence(cs))
   1048          user_fence = acs->ctx->user_fence_cpu_address_base +
   1049                       cs->request.fence_info.offset;
   1050       amdgpu_fence_submitted(cs->fence, &cs->request, user_fence);
   1051    }
   1052 
   1053    /* Cleanup. */
   1054    if (cs->request.resources)
   1055       amdgpu_bo_list_destroy(cs->request.resources);
   1056 
   1057 cleanup:
   1058    for (i = 0; i < cs->num_real_buffers; i++)
   1059       p_atomic_dec(&cs->real_buffers[i].bo->num_active_ioctls);
   1060    for (i = 0; i < cs->num_slab_buffers; i++)
   1061       p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls);
   1062 
   1063    amdgpu_cs_context_cleanup(cs);
   1064 }
   1065 
   1066 /* Make sure the previous submission is completed. */
   1067 void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
   1068 {
   1069    struct amdgpu_cs *cs = amdgpu_cs(rcs);
   1070 
   1071    /* Wait for any pending ioctl of this CS to complete. */
   1072    util_queue_job_wait(&cs->flush_completed);
   1073 }
   1074 
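         /* Flush the command stream: pad the IBs to the required alignment, attach a
          * fence, record fence dependencies for all referenced buffers, hand the
          * submission off to the queue thread, and start a new IB. */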
   1075 static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
   1076                            unsigned flags,
   1077                            struct pipe_fence_handle **fence)
   1078 {
   1079    struct amdgpu_cs *cs = amdgpu_cs(rcs);
   1080    struct amdgpu_winsys *ws = cs->ctx->ws;
   1081    int error_code = 0;
   1082 
   1083    rcs->current.max_dw += amdgpu_cs_epilog_dws(cs->ring_type);
   1084 
   1085    switch (cs->ring_type) {
   1086    case RING_DMA:
   1087       /* pad DMA ring to 8 DWs */
   1088       if (ws->info.chip_class <= SI) {
   1089          while (rcs->current.cdw & 7)
   1090             radeon_emit(rcs, 0xf0000000); /* NOP packet */
   1091       } else {
   1092          while (rcs->current.cdw & 7)
   1093             radeon_emit(rcs, 0x00000000); /* NOP packet */
   1094       }
   1095       break;
   1096    case RING_GFX:
   1097       /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */
   1098       if (ws->info.gfx_ib_pad_with_type2) {
   1099          while (rcs->current.cdw & 7)
   1100             radeon_emit(rcs, 0x80000000); /* type2 nop packet */
   1101       } else {
   1102          while (rcs->current.cdw & 7)
   1103             radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
   1104       }
   1105 
   1106       /* Also pad the const IB. */
   1107       if (cs->const_ib.ib_mapped)
   1108          while (!cs->const_ib.base.current.cdw || (cs->const_ib.base.current.cdw & 7))
   1109             radeon_emit(&cs->const_ib.base, 0xffff1000); /* type3 nop packet */
   1110 
   1111       if (cs->const_preamble_ib.ib_mapped)
   1112          while (!cs->const_preamble_ib.base.current.cdw || (cs->const_preamble_ib.base.current.cdw & 7))
   1113             radeon_emit(&cs->const_preamble_ib.base, 0xffff1000);
   1114       break;
   1115    case RING_UVD:
   1116       while (rcs->current.cdw & 15)
   1117          radeon_emit(rcs, 0x80000000); /* type2 nop packet */
   1118       break;
   1119    default:
   1120       break;
   1121    }
   1122 
   1123    if (rcs->current.cdw > rcs->current.max_dw) {
   1124       fprintf(stderr, "amdgpu: command stream overflowed\n");
   1125    }
   1126 
    1127    /* Submit only if the CS is non-empty, has not overflowed, and noop mode is off. */
   1128    if (radeon_emitted(&cs->main.base, 0) &&
   1129        cs->main.base.current.cdw <= cs->main.base.current.max_dw &&
   1130        !debug_get_option_noop()) {
   1131       struct amdgpu_cs_context *cur = cs->csc;
   1132       unsigned i, num_buffers;
   1133 
   1134       /* Set IB sizes. */
   1135       amdgpu_ib_finalize(&cs->main);
   1136 
   1137       if (cs->const_ib.ib_mapped)
   1138          amdgpu_ib_finalize(&cs->const_ib);
   1139 
   1140       if (cs->const_preamble_ib.ib_mapped)
   1141          amdgpu_ib_finalize(&cs->const_preamble_ib);
   1142 
   1143       /* Create a fence. */
   1144       amdgpu_fence_reference(&cur->fence, NULL);
   1145       if (cs->next_fence) {
   1146          /* just move the reference */
   1147          cur->fence = cs->next_fence;
   1148          cs->next_fence = NULL;
   1149       } else {
   1150          cur->fence = amdgpu_fence_create(cs->ctx,
   1151                                           cur->request.ip_type,
   1152                                           cur->request.ip_instance,
   1153                                           cur->request.ring);
   1154       }
   1155       if (fence)
   1156          amdgpu_fence_reference(fence, cur->fence);
   1157 
   1158       amdgpu_cs_sync_flush(rcs);
   1159 
   1160       /* Prepare buffers.
   1161        *
    1162        * The bo_fence_lock must be held until the submission is queued to ensure
   1163        * that the order of fence dependency updates matches the order of
   1164        * submissions.
   1165        */
   1166       pipe_mutex_lock(ws->bo_fence_lock);
   1167       amdgpu_add_fence_dependencies(cs);
   1168 
   1169       num_buffers = cur->num_real_buffers;
   1170       for (i = 0; i < num_buffers; i++) {
   1171          struct amdgpu_winsys_bo *bo = cur->real_buffers[i].bo;
   1172          p_atomic_inc(&bo->num_active_ioctls);
   1173          amdgpu_add_fence(bo, cur->fence);
   1174       }
   1175 
   1176       num_buffers = cur->num_slab_buffers;
   1177       for (i = 0; i < num_buffers; i++) {
   1178          struct amdgpu_winsys_bo *bo = cur->slab_buffers[i].bo;
   1179          p_atomic_inc(&bo->num_active_ioctls);
   1180          amdgpu_add_fence(bo, cur->fence);
   1181       }
   1182 
   1183       /* Swap command streams. "cst" is going to be submitted. */
   1184       cs->csc = cs->cst;
   1185       cs->cst = cur;
   1186 
   1187       /* Submit. */
   1188       util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
   1189                          amdgpu_cs_submit_ib, NULL);
    1190       /* The submission has been queued, so bo_fence_lock can be released now. */
   1191       pipe_mutex_unlock(ws->bo_fence_lock);
   1192 
   1193       if (!(flags & RADEON_FLUSH_ASYNC)) {
   1194          amdgpu_cs_sync_flush(rcs);
   1195          error_code = cur->error_code;
   1196       }
   1197    } else {
   1198       amdgpu_cs_context_cleanup(cs->csc);
   1199    }
   1200 
   1201    amdgpu_get_new_ib(&ws->base, cs, IB_MAIN);
   1202    if (cs->const_ib.ib_mapped)
   1203       amdgpu_get_new_ib(&ws->base, cs, IB_CONST);
   1204    if (cs->const_preamble_ib.ib_mapped)
   1205       amdgpu_get_new_ib(&ws->base, cs, IB_CONST_PREAMBLE);
   1206 
   1207    cs->main.base.used_gart = 0;
   1208    cs->main.base.used_vram = 0;
   1209 
   1210    if (cs->ring_type == RING_GFX)
   1211       ws->num_gfx_IBs++;
   1212    else if (cs->ring_type == RING_DMA)
   1213       ws->num_sdma_IBs++;
   1214 
   1215    return error_code;
   1216 }
   1217 
   1218 static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
   1219 {
   1220    struct amdgpu_cs *cs = amdgpu_cs(rcs);
   1221 
   1222    amdgpu_cs_sync_flush(rcs);
   1223    util_queue_fence_destroy(&cs->flush_completed);
   1224    p_atomic_dec(&cs->ctx->ws->num_cs);
   1225    pb_reference(&cs->main.big_ib_buffer, NULL);
   1226    FREE(cs->main.base.prev);
   1227    pb_reference(&cs->const_ib.big_ib_buffer, NULL);
   1228    FREE(cs->const_ib.base.prev);
   1229    pb_reference(&cs->const_preamble_ib.big_ib_buffer, NULL);
   1230    FREE(cs->const_preamble_ib.base.prev);
   1231    amdgpu_destroy_cs_context(&cs->csc1);
   1232    amdgpu_destroy_cs_context(&cs->csc2);
   1233    amdgpu_fence_reference(&cs->next_fence, NULL);
   1234    FREE(cs);
   1235 }
   1236 
   1237 static bool amdgpu_bo_is_referenced(struct radeon_winsys_cs *rcs,
   1238                                     struct pb_buffer *_buf,
   1239                                     enum radeon_bo_usage usage)
   1240 {
   1241    struct amdgpu_cs *cs = amdgpu_cs(rcs);
   1242    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;
   1243 
   1244    return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
   1245 }
   1246 
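         /* Illustrative call sequence from a driver's point of view (a sketch only;
          * the ws/ctx/cs/bo/fence variables below are hypothetical):
          *
          *    struct radeon_winsys_ctx *ctx = ws->ctx_create(ws);
          *    struct radeon_winsys_cs *cs =
          *       ws->cs_create(ctx, RING_GFX, flush_callback, flush_ctx);
          *    ws->cs_add_buffer(cs, bo, RADEON_USAGE_READWRITE,
          *                      RADEON_DOMAIN_VRAM, RADEON_PRIO_COLOR_BUFFER);
          *    ... emit packets into cs ...
          *    ws->cs_flush(cs, 0, &fence);
          *    ws->fence_wait(ws, fence, PIPE_TIMEOUT_INFINITE);
          */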
   1247 void amdgpu_cs_init_functions(struct amdgpu_winsys *ws)
   1248 {
   1249    ws->base.ctx_create = amdgpu_ctx_create;
   1250    ws->base.ctx_destroy = amdgpu_ctx_destroy;
   1251    ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
   1252    ws->base.cs_create = amdgpu_cs_create;
   1253    ws->base.cs_add_const_ib = amdgpu_cs_add_const_ib;
   1254    ws->base.cs_add_const_preamble_ib = amdgpu_cs_add_const_preamble_ib;
   1255    ws->base.cs_destroy = amdgpu_cs_destroy;
   1256    ws->base.cs_add_buffer = amdgpu_cs_add_buffer;
   1257    ws->base.cs_validate = amdgpu_cs_validate;
   1258    ws->base.cs_check_space = amdgpu_cs_check_space;
   1259    ws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
   1260    ws->base.cs_flush = amdgpu_cs_flush;
   1261    ws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
   1262    ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
   1263    ws->base.cs_sync_flush = amdgpu_cs_sync_flush;
   1264    ws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
   1265    ws->base.fence_reference = amdgpu_fence_reference;
   1266 }
   1267