/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 *
 * Based on work from libdrm_radeon by:
 *      Aapo Tahkola <aet@rasterburn.org>
 *      Nicolai Haehnle <prefect_@gmx.net>
 *      Jérôme Glisse <glisse@freedesktop.org>
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The add is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
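
/* A minimal, hypothetical usage sketch of the flow described above, from the
 * pipe driver's point of view. The names my_flush, my_flush_ctx and tex_buf,
 * as well as the chosen priority value, are made up for illustration; the
 * real callers live in the pipe drivers:
 *
 *    ws->cs_add_buffer(cs, tex_buf, RADEON_USAGE_READ,
 *                      RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT,
 *                      RADEON_PRIO_SAMPLER_TEXTURE);
 *
 *    if (!ws->cs_validate(cs)) {
 *        // Over the 80% watermark: flush and validate this one operation again.
 *        my_flush(my_flush_ctx, RADEON_FLUSH_ASYNC, NULL);
 *        ws->cs_add_buffer(cs, tex_buf, RADEON_USAGE_READ,
 *                          RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT,
 *                          RADEON_PRIO_SAMPLER_TEXTURE);
 *        if (!ws->cs_validate(cs)) {
 *            // Still over the limit: the pipe driver drops the operation.
 *        }
 *    }
 */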

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "os/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    /* No context support here. Just return the winsys pointer
     * as the "context". */
    return (struct radeon_winsys_ctx*)ws;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    /* No context support here. */
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return true;
}
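
/* For reference, the chunk tree built above is what the CS ioctl consumes.
 * A hypothetical flat initialization of the same layout (illustration only;
 * the real code fills the persistent csc fields so that only length_dw and
 * the flags change per submission, and num_chunks is set at flush time):
 *
 *    struct drm_radeon_cs_chunk chunks[3] = {
 *        { .chunk_id = RADEON_CHUNK_ID_IB,     .length_dw = ib_dwords,
 *          .chunk_data = (uint64_t)(uintptr_t)ib },
 *        { .chunk_id = RADEON_CHUNK_ID_RELOCS, .length_dw = reloc_dwords,
 *          .chunk_data = (uint64_t)(uintptr_t)relocs },
 *        { .chunk_id = RADEON_CHUNK_ID_FLAGS,  .length_dw = 2,
 *          .chunk_data = (uint64_t)(uintptr_t)flags },
 *    };
 *    uint64_t chunk_array[3] = {
 *        (uint64_t)(uintptr_t)&chunks[0],
 *        (uint64_t)(uintptr_t)&chunks[1],
 *        (uint64_t)(uintptr_t)&chunks[2],
 *    };
 *    struct drm_radeon_cs args = {
 *        .num_chunks = 3,
 *        .chunks = (uint64_t)(uintptr_t)chunk_array,
 *    };
 */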

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->num_relocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }
    for (i = 0; i < csc->num_slab_buffers; ++i) {
        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
    }

    csc->num_relocs = 0;
    csc->num_validated_relocs = 0;
    csc->num_slab_buffers = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->slab_buffers);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    util_queue_fence_init(&cs->flush_completed);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    struct radeon_bo_item *buffers;
    unsigned num_buffers;
    int i = csc->reloc_indices_hashlist[hash];

    if (bo->handle) {
        buffers = csc->relocs_bo;
        num_buffers = csc->num_relocs;
    } else {
        buffers = csc->slab_buffers;
        num_buffers = csc->num_slab_buffers;
    }

    /* Fast path: either the hash slot is empty (not found) or it caches
     * exactly this buffer (found). */
    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = num_buffers - 1; i >= 0; i--) {
        if (buffers[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}
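
/* A small usage sketch (hypothetical caller): checking whether a BO is
 * already part of the current CS and, for real buffers, peeking at its
 * relocation entry. Slab (suballocated) buffers return an index into
 * csc->slab_buffers instead:
 *
 *    int i = radeon_lookup_buffer(csc, bo);
 *    if (i >= 0 && bo->handle) {
 *        struct drm_radeon_cs_reloc *reloc = &csc->relocs[i];
 *        assert(reloc->handle == bo->handle);
 *    }
 */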

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->num_relocs >= csc->max_relocs) {
        uint32_t size;
        csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

        size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->num_relocs].bo = NULL;
    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->num_relocs];
    reloc->handle = bo->handle;
    reloc->read_domains = 0;
    reloc->write_domain = 0;
    reloc->flags = 0;

    csc->reloc_indices_hashlist[hash] = csc->num_relocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    return csc->num_relocs++;
}
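
/* The relocation array grows by whichever is larger: 16 entries or a factor
 * of 1.3. Starting from an empty CS (the context is calloc'd, so max_relocs
 * begins at 0), the capacity evolves roughly as
 *
 *    0 -> 16 -> 32 -> 48 -> 64 -> 83 -> 107 -> 139 -> ...
 *
 * i.e. linear growth for small lists and geometric growth afterwards, which
 * keeps repeated add_buffer calls amortized O(1). */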

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    unsigned hash;
    struct radeon_bo_item *item;
    int idx;
    int real_idx;

    idx = radeon_lookup_buffer(csc, bo);
    if (idx >= 0)
        return idx;

    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

    /* Check if the backing array is large enough. */
    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                                (unsigned)(csc->max_slab_buffers * 1.3));
        struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
        if (!new_buffers) {
            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
            return -1;
        }

        csc->max_slab_buffers = new_max;
        csc->slab_buffers = new_buffers;
    }

    /* Initialize the new slab buffer entry; it records the index of its
     * backing real buffer in the relocation list. */
    idx = csc->num_slab_buffers++;
    item = &csc->slab_buffers[idx];

    item->bo = NULL;
    item->u.slab.real_idx = real_idx;
    radeon_bo_reference(&item->bo, bo);
    p_atomic_inc(&bo->num_cs_references);

    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    csc->reloc_indices_hashlist[hash] = idx;

    return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1llu << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}
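
/* Worked example of the accounting above (hypothetical sizes): adding a
 * 1 MiB buffer with RADEON_USAGE_READWRITE and VRAM|GTT domains for the
 * first time gives added_domains = VRAM|GTT, so the full 1 MiB is charged
 * to used_vram only (VRAM takes precedence in the if/else above). Adding
 * the same buffer again with the same domains yields added_domains = 0 and
 * changes neither counter. */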

static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    bool status =
        cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->base.used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->num_validated_relocs = cs->csc->num_relocs;
    } else {
        /* Remove the recently-added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->num_relocs = cs->csc->num_validated_relocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->num_relocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);
            cs->base.used_vram = 0;
            cs->base.used_gart = 0;

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}
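
/* Numeric example of the 80% watermark above (hypothetical board with
 * 1024 MiB VRAM and 2048 MiB GTT): validation succeeds while
 * used_vram < 819.2 MiB and used_gart < 1638.4 MiB; the remaining 20% is
 * slack for memory fragmentation, as described at the top of this file. */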

static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
{
   assert(rcs->current.cdw <= rcs->current.max_dw);
   return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->num_relocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
        }
    }
    return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information (%i).\n", r);
        }
    }

    for (i = 0; i < csc->num_relocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
    for (i = 0; i < csc->num_slab_buffers; i++)
        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}
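
/* Debugging note: if the kernel rejects a CS, setting the environment
 * variable RADEON_DUMP_CS (read above via debug_get_bool_option) makes this
 * function print every IB dword to stderr, while RADEON_NOOP (declared
 * below) makes radeon_drm_cs_flush skip the submission entirely. */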

/*
 * Make sure any previous submission of this CS has completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_job_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when a buffer participates in
 * submissions on two or more threads simultaneously. Since we do not know
 * which of the submissions will be sent to the GPU first, we have to keep
 * the fences of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
    unsigned dst;

    assert(fence->num_cs_references);

    /* Drop fences belonging to submissions that have already completed
     * their ioctl. */
    dst = 0;
    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
        if (bo->u.slab.fences[src]->num_cs_references) {
            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
            dst++;
        } else {
            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
        }
    }
    bo->u.slab.num_fences = dst;

    /* Check available space for the new fence */
    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
        unsigned new_max_fences = bo->u.slab.max_fences + 1;
        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                                bo->u.slab.max_fences * sizeof(*new_fences),
                                                new_max_fences * sizeof(*new_fences));
        if (!new_fences) {
            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
            return;
        }

        bo->u.slab.fences = new_fences;
        bo->u.slab.max_fences = new_max_fences;
    }

    /* Add the new fence */
    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
    bo->u.slab.num_fences++;
}
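
/* Example of the compaction above (hypothetical state): if bo->u.slab.fences
 * holds { F0, F1, F2 } and only F1 still has num_cs_references != 0, the
 * loop releases F0 and F2 and keeps { F1 }; the new fence is then appended,
 * leaving { F1, fence }. */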

DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)

static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= SI) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4 DW alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }
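
    /* Example: with rcs->current.cdw == 243 on the GFX ring, the loop above
     * emits 5 NOP dwords so that the IB is submitted with 248 dwords, a
     * multiple of 8 (the UVD ring pads to a multiple of 16 instead). */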

    if (rcs->current.cdw > rcs->current.max_dw) {
       fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (pfence || cs->csc->num_slab_buffers) {
        struct pipe_fence_handle *fence;

        if (cs->next_fence) {
            fence = cs->next_fence;
            cs->next_fence = NULL;
        } else {
            fence = radeon_cs_create_fence(rcs);
        }

        if (fence) {
            if (pfence)
                radeon_fence_reference(pfence, fence);

            pipe_mutex_lock(cs->ws->bo_fence_lock);
            for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
                struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
                p_atomic_inc(&bo->num_active_ioctls);
                radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
            }
            pipe_mutex_unlock(cs->ws->bo_fence_lock);

            radeon_fence_reference(&fence, NULL);
        }
    } else {
        radeon_fence_reference(&cs->next_fence, NULL);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty and hasn't overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
        unsigned i, num_relocs;

        num_relocs = cs->cst->num_relocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < num_relocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & RADEON_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL);
            if (!(flags & RADEON_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    if (cs->ring_type == RING_GFX)
        cs->ws->num_gfx_IBs++;
    else if (cs->ring_type == RING_DMA)
        cs->ws->num_sdma_IBs++;
    return 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    radeon_fence_reference(&cs->next_fence, NULL);
    FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

    if (!bo->handle)
        index = cs->csc->slab_buffers[index].u.slab.real_idx;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}

/* FENCES */

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT, RADEON_FLAG_HANDLE);
    if (!fence)
       return NULL;

    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *
radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

   if (cs->next_fence) {
      radeon_fence_reference(&fence, cs->next_fence);
      return fence;
   }

   fence = radeon_cs_create_fence(rcs);
   if (!fence)
      return NULL;

   radeon_fence_reference(&cs->next_fence, fence);
   return fence;
}

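/* Hypothetical caller-side sketch of the dummy-BO fence mechanism: the fence
 * returned by cs_flush or cs_get_next_fence is just a 1-byte GTT buffer that
 * was added as a relocation, so waiting on it waits for the whole IB
 * (timeout units follow the winsys buffer_wait contract):
 *
 *    struct pipe_fence_handle *fence = NULL;
 *    ws->cs_flush(cs, 0, &fence);
 *    if (fence && !ws->fence_wait(ws, fence, timeout))
 *        fprintf(stderr, "radeon: fence not signaled yet\n");
 *    ws->fence_reference(&fence, NULL);
 */
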
void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}