Home | History | Annotate | Download | only in i965
      1 /*
      2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
      3  Intel funded Tungsten Graphics to
      4  develop this 3D driver.
      5 
      6  Permission is hereby granted, free of charge, to any person obtaining
      7  a copy of this software and associated documentation files (the
      8  "Software"), to deal in the Software without restriction, including
      9  without limitation the rights to use, copy, modify, merge, publish,
     10  distribute, sublicense, and/or sell copies of the Software, and to
     11  permit persons to whom the Software is furnished to do so, subject to
     12  the following conditions:
     13 
     14  The above copyright notice and this permission notice (including the
     15  next paragraph) shall be included in all copies or substantial
     16  portions of the Software.
     17 
     18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
     21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
     22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
     24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25 
     26  **********************************************************************/
     27  /*
     28   * Authors:
     29   *   Keith Whitwell <keithw (at) vmware.com>
     30   */
     31 
     32 /** @file brw_program_cache.c
     33  *
     34  * This file implements a simple program cache for 965.  The consumers can
     35  *  query the hash table of programs using a cache_id and program key, and
     36  * receive the corresponding program buffer object (plus associated auxiliary
     37  *  data) in return.  Objects in the cache may not have relocations
     38  * (pointers to other BOs) in them.
     39  *
 * The inner workings are a simple chained hash table keyed on a 32-bit
 * xor-and-rotate hash of the cache_id and the key data.
     42  *
     43  * Replacement is not implemented.  Instead, when the cache gets too
     44  * big we throw out all of the cache data and let it get regenerated.
     45  */
     46 
     47 #include "main/imports.h"
     48 #include "main/streaming-load-memcpy.h"
     49 #include "x86/common_x86_asm.h"
     50 #include "intel_batchbuffer.h"
     51 #include "brw_state.h"
     52 #include "brw_wm.h"
     53 #include "brw_gs.h"
     54 #include "brw_cs.h"
     55 #include "brw_program.h"
     56 #include "compiler/brw_eu.h"
     57 
     58 #define FILE_DEBUG_FLAG DEBUG_STATE
     59 
     60 struct brw_cache_item {
     61    /**
     62     * Effectively part of the key, cache_id identifies what kind of state
     63     * buffer is involved, and also which dirty flag should set.
     64     */
     65    enum brw_cache_id cache_id;
     66 
     67    /** 32-bit hash of the key data */
     68    GLuint hash;
     69 
     70    /** for variable-sized keys */
     71    GLuint key_size;
     72    GLuint prog_data_size;
     73    const void *key;
     74 
     75    uint32_t offset;
     76    uint32_t size;
     77 
     78    struct brw_cache_item *next;
     79 };
     80 
     81 static unsigned
     82 get_program_string_id(enum brw_cache_id cache_id, const void *key)
     83 {
     84    switch (cache_id) {
     85    case BRW_CACHE_VS_PROG:
     86       return ((struct brw_vs_prog_key *) key)->program_string_id;
     87    case BRW_CACHE_TCS_PROG:
     88       return ((struct brw_tcs_prog_key *) key)->program_string_id;
     89    case BRW_CACHE_TES_PROG:
     90       return ((struct brw_tes_prog_key *) key)->program_string_id;
     91    case BRW_CACHE_GS_PROG:
     92       return ((struct brw_gs_prog_key *) key)->program_string_id;
     93    case BRW_CACHE_CS_PROG:
     94       return ((struct brw_cs_prog_key *) key)->program_string_id;
     95    case BRW_CACHE_FS_PROG:
     96       return ((struct brw_wm_prog_key *) key)->program_string_id;
     97    default:
     98       unreachable("no program string id for this kind of program");
     99    }
    100 }
    101 
    102 static GLuint
    103 hash_key(struct brw_cache_item *item)
    104 {
    105    GLuint *ikey = (GLuint *)item->key;
    106    GLuint hash = item->cache_id, i;
    107 
    108    assert(item->key_size % 4 == 0);
    109 
    110    /* I'm sure this can be improved on:
    111     */
    112    for (i = 0; i < item->key_size/4; i++) {
    113       hash ^= ikey[i];
    114       hash = (hash << 5) | (hash >> 27);
    115    }
    116 
    117    return hash;
    118 }
    119 
    120 static int
    121 brw_cache_item_equals(const struct brw_cache_item *a,
    122                       const struct brw_cache_item *b)
    123 {
    124    return a->cache_id == b->cache_id &&
    125       a->hash == b->hash &&
    126       a->key_size == b->key_size &&
    127       (memcmp(a->key, b->key, a->key_size) == 0);
    128 }
    129 
    130 static struct brw_cache_item *
    131 search_cache(struct brw_cache *cache, GLuint hash,
    132              struct brw_cache_item *lookup)
    133 {
    134    struct brw_cache_item *c;
    135 
    136 #if 0
    137    int bucketcount = 0;
    138 
    139    for (c = cache->items[hash % cache->size]; c; c = c->next)
    140       bucketcount++;
    141 
    142    fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
    143            cache->size, bucketcount, cache->n_items);
    144 #endif
    145 
    146    for (c = cache->items[hash % cache->size]; c; c = c->next) {
    147       if (brw_cache_item_equals(lookup, c))
    148          return c;
    149    }
    150 
    151    return NULL;
    152 }
    153 
    154 
    155 static void
    156 rehash(struct brw_cache *cache)
    157 {
    158    struct brw_cache_item **items;
    159    struct brw_cache_item *c, *next;
    160    GLuint size, i;
    161 
    162    size = cache->size * 3;
    163    items = calloc(size, sizeof(*items));
    164 
    165    for (i = 0; i < cache->size; i++)
    166       for (c = cache->items[i]; c; c = next) {
    167          next = c->next;
    168          c->next = items[c->hash % size];
    169          items[c->hash % size] = c;
    170       }
    171 
    172    free(cache->items);
    173    cache->items = items;
    174    cache->size = size;
    175 }
    176 
    177 
    178 /**
    179  * Returns the buffer object matching cache_id and key, or NULL.
    180  */
    181 bool
    182 brw_search_cache(struct brw_cache *cache,
    183                  enum brw_cache_id cache_id,
    184                  const void *key, GLuint key_size,
    185                  uint32_t *inout_offset, void *inout_prog_data)
    186 {
    187    struct brw_context *brw = cache->brw;
    188    struct brw_cache_item *item;
    189    struct brw_cache_item lookup;
    190    GLuint hash;
    191 
    192    lookup.cache_id = cache_id;
    193    lookup.key = key;
    194    lookup.key_size = key_size;
    195    hash = hash_key(&lookup);
    196    lookup.hash = hash;
    197 
    198    item = search_cache(cache, hash, &lookup);
    199 
    200    if (item == NULL)
    201       return false;
    202 
    203    void *prog_data = ((char *) item->key) + item->key_size;
    204 
    205    if (item->offset != *inout_offset ||
    206        prog_data != *((void **) inout_prog_data)) {
    207       brw->ctx.NewDriverState |= (1 << cache_id);
    208       *inout_offset = item->offset;
    209       *((void **) inout_prog_data) = prog_data;
    210    }
    211 
    212    return true;
    213 }
    214 
    215 static void
    216 brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
    217 {
    218    struct brw_context *brw = cache->brw;
    219    struct brw_bo *new_bo;
    220 
    221    perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
    222               (unsigned) cache->bo->size / 1024, new_size / 1024);
    223 
    224    new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
    225    if (can_do_exec_capture(brw->screen))
    226       new_bo->kflags = EXEC_OBJECT_CAPTURE;
    227 
    228    void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
    229                                        MAP_ASYNC | MAP_PERSISTENT);
    230 
    231    /* Copy any existing data that needs to be saved. */
    232    if (cache->next_offset != 0) {
    233 #ifdef USE_SSE41
    234       if (!cache->bo->cache_coherent && cpu_has_sse4_1)
    235          _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
    236       else
    237 #endif
    238          memcpy(map, cache->map, cache->next_offset);
    239    }
    240 
    241    brw_bo_unmap(cache->bo);
    242    brw_bo_unreference(cache->bo);
    243    cache->bo = new_bo;
    244    cache->map = map;
    245 
    246    /* Since we have a new BO in place, we need to signal the units
    247     * that depend on it (state base address on gen5+, or unit state before).
    248     */
    249    brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
    250    brw->batch.state_base_address_emitted = false;
    251 }
    252 
    253 /**
    254  * Attempts to find an item in the cache with identical data.
    255  */
    256 static const struct brw_cache_item *
    257 brw_lookup_prog(const struct brw_cache *cache,
    258                 enum brw_cache_id cache_id,
    259                 const void *data, unsigned data_size)
    260 {
    261    unsigned i;
    262    const struct brw_cache_item *item;
    263 
    264    for (i = 0; i < cache->size; i++) {
    265       for (item = cache->items[i]; item; item = item->next) {
    266          if (item->cache_id != cache_id || item->size != data_size ||
    267              memcmp(cache->map + item->offset, data, item->size) != 0)
    268             continue;
    269 
    270          return item;
    271       }
    272    }
    273 
    274    return NULL;
    275 }
    276 
    277 static uint32_t
    278 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
    279 {
    280    uint32_t offset;
    281 
    282    /* Allocate space in the cache BO for our new program. */
    283    if (cache->next_offset + size > cache->bo->size) {
    284       uint32_t new_size = cache->bo->size * 2;
    285 
    286       while (cache->next_offset + size > new_size)
    287          new_size *= 2;
    288 
    289       brw_cache_new_bo(cache, new_size);
    290    }
    291 
    292    offset = cache->next_offset;
    293 
    294    /* Programs are always 64-byte aligned, so set up the next one now */
    295    cache->next_offset = ALIGN(offset + size, 64);
    296 
    297    return offset;
    298 }
    299 
    300 const void *
    301 brw_find_previous_compile(struct brw_cache *cache,
    302                           enum brw_cache_id cache_id,
    303                           unsigned program_string_id)
    304 {
    305    for (unsigned i = 0; i < cache->size; i++) {
    306       for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
    307          if (c->cache_id == cache_id &&
    308              get_program_string_id(cache_id, c->key) == program_string_id) {
    309             return c->key;
    310          }
    311       }
    312    }
    313 
    314    return NULL;
    315 }
    316 
    317 void
    318 brw_upload_cache(struct brw_cache *cache,
    319                  enum brw_cache_id cache_id,
    320                  const void *key,
    321                  GLuint key_size,
    322                  const void *data,
    323                  GLuint data_size,
    324                  const void *prog_data,
    325                  GLuint prog_data_size,
    326                  uint32_t *out_offset,
    327                  void *out_prog_data)
    328 {
    329    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
    330    const struct brw_cache_item *matching_data =
    331       brw_lookup_prog(cache, cache_id, data, data_size);
    332    GLuint hash;
    333    void *tmp;
    334 
    335    item->cache_id = cache_id;
    336    item->size = data_size;
    337    item->key = key;
    338    item->key_size = key_size;
    339    item->prog_data_size = prog_data_size;
    340    hash = hash_key(item);
    341    item->hash = hash;
    342 
    343    /* If we can find a matching prog in the cache already, then reuse the
    344     * existing stuff without creating new copy into the underlying buffer
    345     * object. This is notably useful for programs generating shaders at
    346     * runtime, where multiple shaders may compile to the same thing in our
    347     * backend.
    348     */
    349    if (matching_data) {
    350       item->offset = matching_data->offset;
    351    } else {
    352       item->offset = brw_alloc_item_data(cache, data_size);
    353 
    354       /* Copy data to the buffer */
    355       memcpy(cache->map + item->offset, data, data_size);
    356    }
    357 
    358    /* Set up the memory containing the key and prog_data */
    359    tmp = malloc(key_size + prog_data_size);
    360 
    361    memcpy(tmp, key, key_size);
    362    memcpy(tmp + key_size, prog_data, prog_data_size);
    363 
    364    item->key = tmp;
    365 
    366    if (cache->n_items > cache->size * 1.5f)
    367       rehash(cache);
    368 
    369    hash %= cache->size;
    370    item->next = cache->items[hash];
    371    cache->items[hash] = item;
    372    cache->n_items++;
    373 
    374    *out_offset = item->offset;
    375    *(void **)out_prog_data = (void *)((char *)item->key + item->key_size);
    376    cache->brw->ctx.NewDriverState |= 1 << cache_id;
    377 }
    378 
    379 void
    380 brw_init_caches(struct brw_context *brw)
    381 {
    382    struct brw_cache *cache = &brw->cache;
    383 
    384    cache->brw = brw;
    385 
    386    cache->size = 7;
    387    cache->n_items = 0;
    388    cache->items =
    389       calloc(cache->size, sizeof(struct brw_cache_item *));
    390 
    391    cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384, 64);
    392    if (can_do_exec_capture(brw->screen))
    393       cache->bo->kflags = EXEC_OBJECT_CAPTURE;
    394 
    395    cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
    396                                            MAP_ASYNC | MAP_PERSISTENT);
    397 }
    398 
    399 static void
    400 brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
    401 {
    402    struct brw_cache_item *c, *next;
    403    GLuint i;
    404 
    405    DBG("%s\n", __func__);
    406 
    407    for (i = 0; i < cache->size; i++) {
    408       for (c = cache->items[i]; c; c = next) {
    409          next = c->next;
    410          if (c->cache_id == BRW_CACHE_VS_PROG ||
    411              c->cache_id == BRW_CACHE_TCS_PROG ||
    412              c->cache_id == BRW_CACHE_TES_PROG ||
    413              c->cache_id == BRW_CACHE_GS_PROG ||
    414              c->cache_id == BRW_CACHE_FS_PROG ||
    415              c->cache_id == BRW_CACHE_CS_PROG) {
    416             const void *item_prog_data = c->key + c->key_size;
    417             brw_stage_prog_data_free(item_prog_data);
    418          }
    419          free((void *)c->key);
    420          free(c);
    421       }
    422       cache->items[i] = NULL;
    423    }
    424 
    425    cache->n_items = 0;
    426 
    427    /* Start putting programs into the start of the BO again, since
    428     * we'll never find the old results.
    429     */
    430    cache->next_offset = 0;
    431 
    432    /* We need to make sure that the programs get regenerated, since
    433     * any offsets leftover in brw_context will no longer be valid.
    434     */
    435    brw->NewGLState = ~0;
    436    brw->ctx.NewDriverState = ~0ull;
    437    brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0;
    438    brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull;
    439    brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0;
    440    brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull;
    441 
    442    /* Also, NULL out any stale program pointers. */
    443    brw->vs.base.prog_data = NULL;
    444    brw->tcs.base.prog_data = NULL;
    445    brw->tes.base.prog_data = NULL;
    446    brw->gs.base.prog_data = NULL;
    447    brw->wm.base.prog_data = NULL;
    448    brw->cs.base.prog_data = NULL;
    449 
    450    intel_batchbuffer_flush(brw);
    451 }
    452 
    453 void
    454 brw_program_cache_check_size(struct brw_context *brw)
    455 {
    456    /* un-tuned guess.  Each object is generally a page, so 2000 of them is 8 MB of
    457     * state cache.
    458     */
    459    if (brw->cache.n_items > 2000) {
    460       perf_debug("Exceeded state cache size limit.  Clearing the set "
    461                  "of compiled programs, which will trigger recompiles\n");
    462       brw_clear_cache(brw, &brw->cache);
    463       brw_cache_new_bo(&brw->cache, brw->cache.bo->size);
    464    }
    465 }
    466 
    467 
    468 static void
    469 brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
    470 {
    471 
    472    DBG("%s\n", __func__);
    473 
    474    /* This can be NULL if context creation failed early on */
    475    if (cache->bo) {
    476       brw_bo_unmap(cache->bo);
    477       brw_bo_unreference(cache->bo);
    478       cache->bo = NULL;
    479       cache->map = NULL;
    480    }
    481    brw_clear_cache(brw, cache);
    482    free(cache->items);
    483    cache->items = NULL;
    484    cache->size = 0;
    485 }
    486 
    487 
    488 void
    489 brw_destroy_caches(struct brw_context *brw)
    490 {
    491    brw_destroy_cache(brw, &brw->cache);
    492 }
    493 
    494 static const char *
    495 cache_name(enum brw_cache_id cache_id)
    496 {
    497    switch (cache_id) {
    498    case BRW_CACHE_VS_PROG:
    499       return "VS kernel";
    500    case BRW_CACHE_TCS_PROG:
    501       return "TCS kernel";
    502    case BRW_CACHE_TES_PROG:
    503       return "TES kernel";
    504    case BRW_CACHE_FF_GS_PROG:
    505       return "Fixed-function GS kernel";
    506    case BRW_CACHE_GS_PROG:
    507       return "GS kernel";
    508    case BRW_CACHE_CLIP_PROG:
    509       return "CLIP kernel";
    510    case BRW_CACHE_SF_PROG:
    511       return "SF kernel";
    512    case BRW_CACHE_FS_PROG:
    513       return "FS kernel";
    514    case BRW_CACHE_CS_PROG:
    515       return "CS kernel";
    516    default:
    517       return "unknown";
    518    }
    519 }
    520 
    521 void
    522 brw_print_program_cache(struct brw_context *brw)
    523 {
    524    const struct brw_cache *cache = &brw->cache;
    525    struct brw_cache_item *item;
    526 
    527    for (unsigned i = 0; i < cache->size; i++) {
    528       for (item = cache->items[i]; item; item = item->next) {
    529          fprintf(stderr, "%s:\n", cache_name(i));
    530          brw_disassemble(&brw->screen->devinfo, cache->map,
    531                          item->offset, item->size, stderr);
    532       }
    533    }
    534 }
    535