Home | History | Annotate | Download | only in i965
      1 /*
      2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
      3  Intel funded Tungsten Graphics to
      4  develop this 3D driver.
      5 
      6  Permission is hereby granted, free of charge, to any person obtaining
      7  a copy of this software and associated documentation files (the
      8  "Software"), to deal in the Software without restriction, including
      9  without limitation the rights to use, copy, modify, merge, publish,
     10  distribute, sublicense, and/or sell copies of the Software, and to
     11  permit persons to whom the Software is furnished to do so, subject to
     12  the following conditions:
     13 
     14  The above copyright notice and this permission notice (including the
     15  next paragraph) shall be included in all copies or substantial
     16  portions of the Software.
     17 
     18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
     21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
     22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
     24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25 
     26  **********************************************************************/
     27  /*
     28   * Authors:
     29   *   Keith Whitwell <keithw (at) vmware.com>
     30   */
     31 
/** @file brw_program_cache.c
 *
 * This file implements a simple program cache for 965.  The consumers can
 * query the hash table of programs using a cache_id and program key, and
 * receive the corresponding program buffer object (plus associated auxiliary
 * data) in return.  Objects in the cache may not have relocations
 * (pointers to other BOs) in them.
 *
 * The inner workings are a simple hash table based on a CRC of the
 * key data.
 *
 * Replacement is not implemented.  Instead, when the cache gets too
 * big we throw out all of the cache data and let it get regenerated.
 */
     46 
     47 #include "main/imports.h"
     48 #include "intel_batchbuffer.h"
     49 #include "brw_state.h"
     50 #include "brw_vs.h"
     51 #include "brw_wm.h"
     52 #include "brw_gs.h"
     53 #include "brw_cs.h"
     54 #include "brw_program.h"
     55 
     56 #define FILE_DEBUG_FLAG DEBUG_STATE
     57 
struct brw_cache_item {
   /**
    * Effectively part of the key, cache_id identifies what kind of state
    * buffer is involved, and also which dirty flag should be set.
    */
   enum brw_cache_id cache_id;

   /** 32-bit hash of the key data */
   GLuint hash;

   /** for variable-sized keys */
   GLuint key_size;
   GLuint aux_size;
   /* Heap copy of the key; the aux data is stored immediately after it in
    * the same allocation (see brw_upload_cache).
    */
   const void *key;

   /* Byte offset of this program within the cache's BO. */
   uint32_t offset;
   /* Size in bytes of the program data at @offset. */
   uint32_t size;

   /* Next item in the same hash bucket (chained hashing). */
   struct brw_cache_item *next;
};
     78 
     79 static unsigned
     80 get_program_string_id(enum brw_cache_id cache_id, const void *key)
     81 {
     82    switch (cache_id) {
     83    case BRW_CACHE_VS_PROG:
     84       return ((struct brw_vs_prog_key *) key)->program_string_id;
     85    case BRW_CACHE_TCS_PROG:
     86       return ((struct brw_tcs_prog_key *) key)->program_string_id;
     87    case BRW_CACHE_TES_PROG:
     88       return ((struct brw_tes_prog_key *) key)->program_string_id;
     89    case BRW_CACHE_GS_PROG:
     90       return ((struct brw_gs_prog_key *) key)->program_string_id;
     91    case BRW_CACHE_CS_PROG:
     92       return ((struct brw_cs_prog_key *) key)->program_string_id;
     93    case BRW_CACHE_FS_PROG:
     94       return ((struct brw_wm_prog_key *) key)->program_string_id;
     95    default:
     96       unreachable("no program string id for this kind of program");
     97    }
     98 }
     99 
    100 static GLuint
    101 hash_key(struct brw_cache_item *item)
    102 {
    103    GLuint *ikey = (GLuint *)item->key;
    104    GLuint hash = item->cache_id, i;
    105 
    106    assert(item->key_size % 4 == 0);
    107 
    108    /* I'm sure this can be improved on:
    109     */
    110    for (i = 0; i < item->key_size/4; i++) {
    111       hash ^= ikey[i];
    112       hash = (hash << 5) | (hash >> 27);
    113    }
    114 
    115    return hash;
    116 }
    117 
    118 static int
    119 brw_cache_item_equals(const struct brw_cache_item *a,
    120                       const struct brw_cache_item *b)
    121 {
    122    return a->cache_id == b->cache_id &&
    123       a->hash == b->hash &&
    124       a->key_size == b->key_size &&
    125       (memcmp(a->key, b->key, a->key_size) == 0);
    126 }
    127 
    128 static struct brw_cache_item *
    129 search_cache(struct brw_cache *cache, GLuint hash,
    130              struct brw_cache_item *lookup)
    131 {
    132    struct brw_cache_item *c;
    133 
    134 #if 0
    135    int bucketcount = 0;
    136 
    137    for (c = cache->items[hash % cache->size]; c; c = c->next)
    138       bucketcount++;
    139 
    140    fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
    141            cache->size, bucketcount, cache->n_items);
    142 #endif
    143 
    144    for (c = cache->items[hash % cache->size]; c; c = c->next) {
    145       if (brw_cache_item_equals(lookup, c))
    146          return c;
    147    }
    148 
    149    return NULL;
    150 }
    151 
    152 
    153 static void
    154 rehash(struct brw_cache *cache)
    155 {
    156    struct brw_cache_item **items;
    157    struct brw_cache_item *c, *next;
    158    GLuint size, i;
    159 
    160    size = cache->size * 3;
    161    items = calloc(size, sizeof(*items));
    162 
    163    for (i = 0; i < cache->size; i++)
    164       for (c = cache->items[i]; c; c = next) {
    165          next = c->next;
    166          c->next = items[c->hash % size];
    167          items[c->hash % size] = c;
    168       }
    169 
    170    free(cache->items);
    171    cache->items = items;
    172    cache->size = size;
    173 }
    174 
    175 
    176 /**
    177  * Returns the buffer object matching cache_id and key, or NULL.
    178  */
    179 bool
    180 brw_search_cache(struct brw_cache *cache,
    181                  enum brw_cache_id cache_id,
    182                  const void *key, GLuint key_size,
    183                  uint32_t *inout_offset, void *inout_aux)
    184 {
    185    struct brw_context *brw = cache->brw;
    186    struct brw_cache_item *item;
    187    struct brw_cache_item lookup;
    188    GLuint hash;
    189 
    190    lookup.cache_id = cache_id;
    191    lookup.key = key;
    192    lookup.key_size = key_size;
    193    hash = hash_key(&lookup);
    194    lookup.hash = hash;
    195 
    196    item = search_cache(cache, hash, &lookup);
    197 
    198    if (item == NULL)
    199       return false;
    200 
    201    void *aux = ((char *) item->key) + item->key_size;
    202 
    203    if (item->offset != *inout_offset || aux != *((void **) inout_aux)) {
    204       brw->ctx.NewDriverState |= (1 << cache_id);
    205       *inout_offset = item->offset;
    206       *((void **) inout_aux) = aux;
    207    }
    208 
    209    return true;
    210 }
    211 
/**
 * Replaces the cache's program BO with a freshly allocated one of
 * @new_size bytes, preserving any already-uploaded program data, and
 * flags the state that depends on the program BO for re-emission.
 */
static void
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
   struct brw_context *brw = cache->brw;
   drm_intel_bo *new_bo;

   new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
   /* On LLC systems the cache BO is kept persistently mapped; map the new
    * one the same way (unsynchronized, so CPU writes don't stall on the
    * GPU).
    */
   if (brw->has_llc)
      drm_intel_gem_bo_map_unsynchronized(new_bo);

   /* Copy any existing data that needs to be saved. */
   if (cache->next_offset != 0) {
      if (brw->has_llc) {
         memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
      } else {
         /* Without LLC the old BO isn't persistently mapped; map it just
          * long enough to stream its contents into the new BO.
          */
         drm_intel_bo_map(cache->bo, false);
         drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
                              cache->bo->virtual);
         drm_intel_bo_unmap(cache->bo);
      }
   }

   /* Drop the old BO's persistent mapping (LLC only) and release our
    * reference to it before installing the new BO.
    */
   if (brw->has_llc)
      drm_intel_bo_unmap(cache->bo);
   drm_intel_bo_unreference(cache->bo);
   cache->bo = new_bo;
   cache->bo_used_by_gpu = false;

   /* Since we have a new BO in place, we need to signal the units
    * that depend on it (state base address on gen5+, or unit state before).
    */
   brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
   brw->batch.state_base_address_emitted = false;
}
    246 
    247 /**
    248  * Attempts to find an item in the cache with identical data.
    249  */
    250 static const struct brw_cache_item *
    251 brw_lookup_prog(const struct brw_cache *cache,
    252                 enum brw_cache_id cache_id,
    253                 const void *data, unsigned data_size)
    254 {
    255    const struct brw_context *brw = cache->brw;
    256    unsigned i;
    257    const struct brw_cache_item *item;
    258 
    259    for (i = 0; i < cache->size; i++) {
    260       for (item = cache->items[i]; item; item = item->next) {
    261          int ret;
    262 
    263          if (item->cache_id != cache_id || item->size != data_size)
    264             continue;
    265 
    266          if (!brw->has_llc)
    267             drm_intel_bo_map(cache->bo, false);
    268          ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
    269          if (!brw->has_llc)
    270             drm_intel_bo_unmap(cache->bo);
    271          if (ret)
    272             continue;
    273 
    274          return item;
    275       }
    276    }
    277 
    278    return NULL;
    279 }
    280 
    281 static uint32_t
    282 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
    283 {
    284    uint32_t offset;
    285    struct brw_context *brw = cache->brw;
    286 
    287    /* Allocate space in the cache BO for our new program. */
    288    if (cache->next_offset + size > cache->bo->size) {
    289       uint32_t new_size = cache->bo->size * 2;
    290 
    291       while (cache->next_offset + size > new_size)
    292          new_size *= 2;
    293 
    294       brw_cache_new_bo(cache, new_size);
    295    }
    296 
    297    /* If we would block on writing to an in-use program BO, just
    298     * recreate it.
    299     */
    300    if (!brw->has_llc && cache->bo_used_by_gpu) {
    301       perf_debug("Copying busy program cache buffer.\n");
    302       brw_cache_new_bo(cache, cache->bo->size);
    303    }
    304 
    305    offset = cache->next_offset;
    306 
    307    /* Programs are always 64-byte aligned, so set up the next one now */
    308    cache->next_offset = ALIGN(offset + size, 64);
    309 
    310    return offset;
    311 }
    312 
    313 const void *
    314 brw_find_previous_compile(struct brw_cache *cache,
    315                           enum brw_cache_id cache_id,
    316                           unsigned program_string_id)
    317 {
    318    for (unsigned i = 0; i < cache->size; i++) {
    319       for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
    320          if (c->cache_id == cache_id &&
    321              get_program_string_id(cache_id, c->key) == program_string_id) {
    322             return c->key;
    323          }
    324       }
    325    }
    326 
    327    return NULL;
    328 }
    329 
    330 void
    331 brw_upload_cache(struct brw_cache *cache,
    332                  enum brw_cache_id cache_id,
    333                  const void *key,
    334                  GLuint key_size,
    335                  const void *data,
    336                  GLuint data_size,
    337                  const void *aux,
    338                  GLuint aux_size,
    339                  uint32_t *out_offset,
    340                  void *out_aux)
    341 {
    342    struct brw_context *brw = cache->brw;
    343    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
    344    const struct brw_cache_item *matching_data =
    345       brw_lookup_prog(cache, cache_id, data, data_size);
    346    GLuint hash;
    347    void *tmp;
    348 
    349    item->cache_id = cache_id;
    350    item->size = data_size;
    351    item->key = key;
    352    item->key_size = key_size;
    353    item->aux_size = aux_size;
    354    hash = hash_key(item);
    355    item->hash = hash;
    356 
    357    /* If we can find a matching prog in the cache already, then reuse the
    358     * existing stuff without creating new copy into the underlying buffer
    359     * object. This is notably useful for programs generating shaders at
    360     * runtime, where multiple shaders may compile to the same thing in our
    361     * backend.
    362     */
    363    if (matching_data) {
    364       item->offset = matching_data->offset;
    365    } else {
    366       item->offset = brw_alloc_item_data(cache, data_size);
    367 
    368       /* Copy data to the buffer */
    369       if (brw->has_llc) {
    370          memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
    371       } else {
    372          drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
    373       }
    374    }
    375 
    376    /* Set up the memory containing the key and aux_data */
    377    tmp = malloc(key_size + aux_size);
    378 
    379    memcpy(tmp, key, key_size);
    380    memcpy(tmp + key_size, aux, aux_size);
    381 
    382    item->key = tmp;
    383 
    384    if (cache->n_items > cache->size * 1.5f)
    385       rehash(cache);
    386 
    387    hash %= cache->size;
    388    item->next = cache->items[hash];
    389    cache->items[hash] = item;
    390    cache->n_items++;
    391 
    392    *out_offset = item->offset;
    393    *(void **)out_aux = (void *)((char *)item->key + item->key_size);
    394    cache->brw->ctx.NewDriverState |= 1 << cache_id;
    395 }
    396 
    397 void
    398 brw_init_caches(struct brw_context *brw)
    399 {
    400    struct brw_cache *cache = &brw->cache;
    401 
    402    cache->brw = brw;
    403 
    404    cache->size = 7;
    405    cache->n_items = 0;
    406    cache->items =
    407       calloc(cache->size, sizeof(struct brw_cache_item *));
    408 
    409    cache->bo = drm_intel_bo_alloc(brw->bufmgr, "program cache",  4096, 64);
    410    if (brw->has_llc)
    411       drm_intel_gem_bo_map_unsynchronized(cache->bo);
    412 }
    413 
/**
 * Empties the program cache: frees every item (including, for the
 * shader-stage cache ids, the prog_data stored in the aux area), rewinds
 * the BO allocator, and dirties all derived state so programs and the
 * offsets cached in brw_context get regenerated.
 */
static void
brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
{
   struct brw_cache_item *c, *next;
   GLuint i;

   DBG("%s\n", __func__);

   for (i = 0; i < cache->size; i++) {
      for (c = cache->items[i]; c; c = next) {
         next = c->next;
         if (c->cache_id == BRW_CACHE_VS_PROG ||
             c->cache_id == BRW_CACHE_TCS_PROG ||
             c->cache_id == BRW_CACHE_TES_PROG ||
             c->cache_id == BRW_CACHE_GS_PROG ||
             c->cache_id == BRW_CACHE_FS_PROG ||
             c->cache_id == BRW_CACHE_CS_PROG) {
            /* For these cache ids the aux data (stored right after the
             * key) is a stage prog_data with heap allocations of its own.
             */
            const void *item_aux = c->key + c->key_size;
            brw_stage_prog_data_free(item_aux);
         }
         free((void *)c->key);
         free(c);
      }
      cache->items[i] = NULL;
   }

   cache->n_items = 0;

   /* Start putting programs into the start of the BO again, since
    * we'll never find the old results.
    */
   cache->next_offset = 0;

   /* We need to make sure that the programs get regenerated, since
    * any offsets leftover in brw_context will no longer be valid.
    */
   brw->NewGLState = ~0;
   brw->ctx.NewDriverState = ~0ull;
   brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull;

   /* Also, NULL out any stale program pointers. */
   brw->vs.base.prog_data = NULL;
   brw->tcs.base.prog_data = NULL;
   brw->tes.base.prog_data = NULL;
   brw->gs.base.prog_data = NULL;
   brw->wm.base.prog_data = NULL;
   brw->cs.base.prog_data = NULL;

   /* NOTE(review): flushing presumably ensures no in-flight batch still
    * references program offsets that are about to be reused — confirm.
    */
   intel_batchbuffer_flush(brw);
}
    467 
    468 void
    469 brw_program_cache_check_size(struct brw_context *brw)
    470 {
    471    /* un-tuned guess.  Each object is generally a page, so 2000 of them is 8 MB of
    472     * state cache.
    473     */
    474    if (brw->cache.n_items > 2000) {
    475       perf_debug("Exceeded state cache size limit.  Clearing the set "
    476                  "of compiled programs, which will trigger recompiles\n");
    477       brw_clear_cache(brw, &brw->cache);
    478    }
    479 }
    480 
    481 
/**
 * Tears a cache down completely: releases the program BO, frees every
 * item via brw_clear_cache(), then frees the bucket array itself.
 */
static void
brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
{

   DBG("%s\n", __func__);

   /* Drop the persistent LLC mapping before releasing the BO. */
   if (brw->has_llc)
      drm_intel_bo_unmap(cache->bo);
   drm_intel_bo_unreference(cache->bo);
   cache->bo = NULL;
   brw_clear_cache(brw, cache);
   free(cache->items);
   cache->items = NULL;
   cache->size = 0;
}
    497 
    498 
/**
 * Destroys the context's (single) program cache at context teardown.
 */
void
brw_destroy_caches(struct brw_context *brw)
{
   brw_destroy_cache(brw, &brw->cache);
}
    504 
/**
 * Returns a human-readable name for @cache_id, for debug output only.
 */
static const char *
cache_name(enum brw_cache_id cache_id)
{
   switch (cache_id) {
   case BRW_CACHE_VS_PROG:
      return "VS kernel";
   case BRW_CACHE_TCS_PROG:
      return "TCS kernel";
   case BRW_CACHE_TES_PROG:
      return "TES kernel";
   case BRW_CACHE_FF_GS_PROG:
      return "Fixed-function GS kernel";
   case BRW_CACHE_GS_PROG:
      return "GS kernel";
   case BRW_CACHE_CLIP_PROG:
      return "CLIP kernel";
   case BRW_CACHE_SF_PROG:
      return "SF kernel";
   case BRW_CACHE_FS_PROG:
      return "FS kernel";
   case BRW_CACHE_CS_PROG:
      return "CS kernel";
   default:
      return "unknown";
   }
}
    531 
    532 void
    533 brw_print_program_cache(struct brw_context *brw)
    534 {
    535    const struct brw_cache *cache = &brw->cache;
    536    struct brw_cache_item *item;
    537 
    538    if (!brw->has_llc)
    539       drm_intel_bo_map(cache->bo, false);
    540 
    541    for (unsigned i = 0; i < cache->size; i++) {
    542       for (item = cache->items[i]; item; item = item->next) {
    543          fprintf(stderr, "%s:\n", cache_name(i));
    544          brw_disassemble(&brw->screen->devinfo, cache->bo->virtual,
    545                          item->offset, item->size, stderr);
    546       }
    547    }
    548 
    549    if (!brw->has_llc)
    550       drm_intel_bo_unmap(cache->bo);
    551 }
    552