/**************************************************************************
 *
 * Copyright 2017 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/* This is a wrapper for pipe_context that executes all pipe_context calls
 * in another thread.
 *
 *
 * Guidelines for adopters and deviations from Gallium
 * ---------------------------------------------------
 *
 * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
 *    driver functions that take a context (fence_finish, texture_get_handle)
 *    should manually unwrap pipe_context by doing:
 *      pipe = threaded_context_unwrap_sync(pipe);
 *
 *    pipe_context::priv is used to unwrap the context, so drivers and state
 *    trackers shouldn't use it.
 *
 *    No other objects are wrapped.
 *
 * 2) Drivers must subclass and initialize these structures:
 *    - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
 *    - threaded_query for pipe_query (zero memory)
 *    - threaded_transfer for pipe_transfer (zero memory)
 *
 * 3) The threaded context must not be enabled for contexts that can use video
 *    codecs.
 *
 * 4) Changes in driver behavior:
 *    - begin_query and end_query always return true; return values from
 *      the driver are ignored.
 *    - generate_mipmap uses is_format_supported to determine success;
 *      the return value from the driver is ignored.
 *    - resource_commit always returns true; failures are ignored.
 *    - set_debug_callback is skipped if the callback is synchronous.
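 *
 *    A minimal sketch of points 1) and 2) above; the mydrv_* names and
 *    types are hypothetical and only illustrate the pattern:
 *
 *      struct mydrv_resource {
 *         struct threaded_resource b; // wraps pipe_resource, must be first
 *         // ... driver-private fields ...
 *      };
 *
 *      // after allocating a resource (and symmetrically before freeing it):
 *      //   threaded_resource_init(&res->b.b);
 *      //   threaded_resource_deinit(&res->b.b);
 *
 *      // a pipe_screen function that takes a context unwraps it first:
 *      static boolean mydrv_fence_finish(struct pipe_screen *screen,
 *                                        struct pipe_context *ctx,
 *                                        struct pipe_fence_handle *fence,
 *                                        uint64_t timeout)
 *      {
 *         ctx = threaded_context_unwrap_sync(ctx);
 *         return mydrv_fence_wait(screen, ctx, fence, timeout);
 *      }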
 *
 *
 * Thread-safety requirements on context functions
 * -----------------------------------------------
 *
 * These pipe_context functions are executed directly, so they shouldn't use
 * pipe_context in an unsafe way. They are de-facto screen functions now:
 * - create_query
 * - create_batch_query
 * - create_*_state (all CSOs and shaders)
 *     - Make sure the shader compiler doesn't use any per-context stuff.
 *       (e.g. LLVM target machine)
 *     - Only pipe_context's debug callback for shader dumps is guaranteed to
 *       be up to date, because set_debug_callback synchronizes execution.
 * - create_surface
 * - surface_destroy
 * - create_sampler_view
 * - sampler_view_destroy
 * - stream_output_target_destroy
 * - transfer_map (only unsynchronized buffer mappings)
 * - get_query_result (when threaded_query::flushed == true)
 *
 * Create calls causing a sync that can't be async due to driver limitations:
 * - create_stream_output_target
 *
 *
 * Transfer_map rules for buffer mappings
 * --------------------------------------
 *
 * 1) If transfer_map has PIPE_TRANSFER_UNSYNCHRONIZED, the call is made
 *    in the non-driver thread without flushing the queue. The driver will
 *    receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to PIPE_TRANSFER_-
 *    UNSYNCHRONIZED to indicate this.
 *    Note that transfer_unmap is always enqueued and called from the driver
 *    thread.
 *
 * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
 *    the valid buffer range. The threaded context always sends TC_TRANSFER_-
 *    MAP_NO_INFER_UNSYNCHRONIZED to indicate this. Ignoring the flag will lead
 *    to failures.
 *    The threaded context does its own detection of unsynchronized mappings.
 *
 * 3) The driver isn't allowed to do buffer invalidations by itself under any
 *    circumstances. This is necessary for unsynchronized maps to access the
 *    latest version of the buffer. (Invalidations can be queued, while
 *    unsynchronized maps are not queued, and they should return the latest
 *    storage after invalidation.) The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
 *    indicate this. Ignoring the flag will lead to failures.
 *    The threaded context uses its own buffer invalidation mechanism.
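 *
 * A minimal sketch of how a driver's buffer transfer_map might honor these
 * rules ("usage", "res" and "box" are the transfer_map arguments;
 * mydrv_range_is_unused is a hypothetical driver helper):
 *
 *      if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) {
 *         // running in the non-driver thread; PIPE_TRANSFER_UNSYNCHRONIZED
 *         // is also set, so don't wait, flush, or touch unsafe state here
 *      }
 *
 *      // rule 2): any "this range was never written" fast path must be
 *      // disabled when the flag is present (it always is with the TC):
 *      if (!(usage & TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED) &&
 *          mydrv_range_is_unused(res, box))
 *         usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
 *
 *      // rule 3): the same goes for driver-side buffer invalidation:
 *      if ((usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) &&
 *          !(usage & TC_TRANSFER_MAP_NO_INVALIDATE)) {
 *         // only now may the driver reallocate the storage itself
 *      }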
 *
 *
 * Rules for fences
 * ----------------
 *
 * Flushes will be executed asynchronously in the driver thread if a
 * create_fence callback is provided. This affects fence semantics as follows.
 *
 * When the threaded context wants to perform an asynchronous flush, it will
 * use the create_fence callback to pre-create the fence from the calling
 * thread. This pre-created fence will be passed to pipe_context::flush
 * together with the TC_FLUSH_ASYNC flag.
 *
 * The callback receives the unwrapped context as a parameter, but must use it
 * in a thread-safe way because it is called from a non-driver thread.
 *
 * If the threaded_context does not immediately flush the current batch, the
 * callback also receives a tc_unflushed_batch_token. If fence_finish is called
 * on the returned fence in the context that created the fence,
 * threaded_context_flush must be called.
 *
 * The driver must implement pipe_context::fence_server_sync properly, since
 * the threaded context handles PIPE_FLUSH_ASYNC.
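 *
 * A minimal sketch of a create_fence callback; the mydrv_fence type and
 * mydrv_fence_alloc are hypothetical, and the fence struct is assumed to be
 * zero-initialized with a tc_token field:
 *
 *      static struct pipe_fence_handle *
 *      mydrv_create_fence(struct pipe_context *ctx,
 *                         struct tc_unflushed_batch_token *token)
 *      {
 *         struct mydrv_fence *fence = mydrv_fence_alloc(ctx);
 *
 *         // Runs in the calling (non-driver) thread, so "ctx" may only be
 *         // used in a thread-safe way. Keep a reference to "token" so that
 *         // fence_finish can call threaded_context_flush if the batch
 *         // containing the flush hasn't been submitted yet.
 *         tc_unflushed_batch_token_reference(&fence->tc_token, token);
 *         return (struct pipe_fence_handle *)fence;
 *      }
 *
 *      // passed as the create_fence argument of threaded_context_create()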
 *
 *
 * Additional requirements
 * -----------------------
 *
 * get_query_result:
 *    If threaded_query::flushed == true, get_query_result should assume that
 *    it's called from a non-driver thread, in which case the driver shouldn't
 *    use the context in an unsafe way.
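 *
 *    A minimal sketch; mydrv_flush_if_needed and mydrv_query_read_result are
 *    hypothetical driver helpers:
 *
 *      static boolean
 *      mydrv_get_query_result(struct pipe_context *ctx,
 *                             struct pipe_query *query, boolean wait,
 *                             union pipe_query_result *result)
 *      {
 *         // flushed == true means this may run in a non-driver thread,
 *         // so skip anything that uses the context in an unsafe way
 *         if (!threaded_query(query)->flushed)
 *            mydrv_flush_if_needed(ctx);
 *
 *         return mydrv_query_read_result(query, wait, result);
 *      }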
 *
 * replace_buffer_storage:
 *    The driver has to implement this callback, which will be called when
 *    the threaded context wants to replace a resource's backing storage with
 *    another resource's backing storage. The threaded context uses it to
 *    implement buffer invalidation. This call is always queued.
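 *
 *    A minimal sketch (the mydrv_* name is hypothetical):
 *
 *      static void
 *      mydrv_replace_buffer_storage(struct pipe_context *ctx,
 *                                   struct pipe_resource *dst,
 *                                   struct pipe_resource *src)
 *      {
 *         // make "dst" use the backing storage of "src", e.g. by
 *         // re-pointing dst's allocation; runs in the driver thread
 *      }
 *
 *      // passed as the replace_buffer argument of threaded_context_create()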
 *
 *
 * Performance gotchas
 * -------------------
 *
 * Buffer invalidations are done unconditionally - they don't check whether
 * the buffer is busy. This can cause drivers to have more live allocations
 * and CPU mappings than necessary.
 *
 *
 * How it works (queue architecture)
 * ---------------------------------
 *
 * There is a multithreaded queue consisting of batches, each batch consisting
 * of call slots. Each call slot consists of an 8-byte header (call ID +
 * call size + constant 32-bit marker for integrity checking) and an 8-byte
 * body for per-call data. That is 16 bytes per call slot.
 *
 * Simple calls such as bind_xx_state(CSO) occupy only one call slot. Bigger
 * calls occupy multiple call slots depending on the size needed by call
 * parameters. That means that calls can have a variable size in the batch.
 * For example, set_vertex_buffers(count = any, buffers = NULL) occupies only
 * 1 call slot, but set_vertex_buffers(count = 5) occupies 6 call slots.
 * Even though the first call slot can use only 8 bytes for data, additional
 * call slots used by the same call can use all 16 bytes for data.
 * For example, a call using 2 call slots has 24 bytes of space for data.
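 *
 * One way to express the slot math implied above (DIV_ROUND_UP as in Mesa's
 * u_math.h; this is illustrative, not a definition from this header):
 *
 *      // 8 bytes of header + payload, rounded up to whole 16-byte slots:
 *      num_call_slots = DIV_ROUND_UP(8 + payload_size, 16);
 *
 *      // e.g. payload_size == 24  ->  2 call slots (8 + 16 data bytes)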
 *
 * Once a batch is full and there is no space for the next call, it's flushed,
 * meaning that it's added to the queue for execution in the other thread.
 * The batches are ordered in a ring and reused once they are idle again.
 * The batching is necessary for low queue/mutex overhead.
 *
 */

#ifndef U_THREADED_CONTEXT_H
#define U_THREADED_CONTEXT_H

#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include "util/u_inlines.h"
#include "util/u_queue.h"
#include "util/u_range.h"
#include "util/slab.h"

struct threaded_context;
struct tc_unflushed_batch_token;

/* These are transfer flags sent to drivers. */
/* Never infer whether it's safe to use unsynchronized mappings: */
#define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
/* Don't invalidate buffers: */
#define TC_TRANSFER_MAP_NO_INVALIDATE        (1u << 30)
/* transfer_map is called from a non-driver thread: */
#define TC_TRANSFER_MAP_THREADED_UNSYNC      (1u << 31)

/* Custom flush flags sent to drivers. */
/* fence is pre-populated with a fence created by the create_fence callback */
#define TC_FLUSH_ASYNC        (1u << 31)
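
/* Example: how a driver's flush might handle an asynchronous flush from the
 * threaded context. A minimal sketch; mydrv_submit is hypothetical:
 *
 *    static void mydrv_flush(struct pipe_context *ctx,
 *                            struct pipe_fence_handle **fence,
 *                            unsigned flags)
 *    {
 *       // With TC_FLUSH_ASYNC, this runs in the driver thread and *fence
 *       // already holds the fence pre-created by the create_fence callback,
 *       // so attach that fence to the submission instead of creating one.
 *       bool async = (flags & TC_FLUSH_ASYNC) != 0;
 *
 *       mydrv_submit(ctx, fence, async);
 *    }
 */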

/* Size of the queue = number of batch slots in memory.
 * - 1 batch is always idle and records new commands
 * - 1 batch is being executed
 * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
 *
 * Use a size as small as possible for low CPU L2 cache usage but large enough
 * so that the queue isn't stalled too often for not having enough idle batch
 * slots.
 */
#define TC_MAX_BATCHES        10

/* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
 * can occupy multiple call slots.
 *
 * The idea is to have batches as small as possible but large enough so that
 * the queuing and mutex overhead is negligible.
 */
#define TC_CALLS_PER_BATCH    192

/* Threshold for when to use the queue or sync. */
#define TC_MAX_STRING_MARKER_BYTES  512

/* Threshold for when to enqueue buffer/texture_subdata as-is.
 * If the upload size is greater than this, it will do instead:
 * - for buffers: DISCARD_RANGE is done by the threaded context
 * - for textures: sync and call the driver directly
 */
#define TC_MAX_SUBDATA_BYTES        320

typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
                                               struct pipe_resource *dst,
                                               struct pipe_resource *src);
typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
                                                          struct tc_unflushed_batch_token *token);

struct threaded_resource {
   struct pipe_resource b;
   const struct u_resource_vtbl *vtbl;

   /* Since buffer invalidations are queued, we can't use the base resource
    * for unsynchronized mappings. This points to the latest version of
    * the buffer after the latest invalidation. It's only used for
    * unsynchronized mappings in the non-driver thread. Initially it's set
    * to &b.
    */
   struct pipe_resource *latest;

   /* The buffer range which is initialized (with a write transfer, streamout,
    * or writable shader resources). The remainder of the buffer is considered
    * invalid and can be mapped unsynchronized.
    *
    * This allows unsynchronized mapping of a buffer range which hasn't been
    * used yet. It's for applications which forget to use the unsynchronized
    * map flag and expect the driver to figure it out.
    *
    * Drivers should set this to the full range for buffers backed by user
    * memory.
    */
   struct util_range valid_buffer_range;

   /* If "this" is not the base instance of the buffer, but it's one of its
    * reallocations (set in "latest" of the base instance), this points to
    * the valid range of the base instance. It's used for transfers after
    * a buffer invalidation, because such transfers operate on "latest", not
    * the base instance. Initially it's set to &valid_buffer_range.
    */
   struct util_range *base_valid_buffer_range;

   /* Drivers are required to update this for shared resources and user
    * pointers. */
   bool is_shared;
   bool is_user_ptr;

   /* If positive, prefer DISCARD_RANGE with a staging buffer over any other
    * method of CPU access when map flags allow it. Useful for buffers that
    * are too large for the visible VRAM window.
    */
   int max_forced_staging_uploads;
};
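
/* Example: how a driver might satisfy the valid_buffer_range and user-pointer
 * requirements above when creating a buffer from user memory. A minimal
 * sketch, assuming util_range_add from u_range.h and a hypothetical
 * mydrv_resource type whose first member "b" is a threaded_resource:
 *
 *    struct mydrv_resource *res = mydrv_alloc_user_buffer(...);
 *
 *    threaded_resource_init(&res->b.b);
 *    res->b.is_user_ptr = true;
 *    // user-memory buffers are considered fully initialized:
 *    util_range_add(&res->b.valid_buffer_range, 0, res->b.b.width0);
 */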

struct threaded_transfer {
   struct pipe_transfer b;

   /* Staging buffer for DISCARD_RANGE transfers. */
   struct pipe_resource *staging;

   /* Offset into the staging buffer, because the backing buffer is
    * sub-allocated. */
   unsigned offset;
};

struct threaded_query {
   /* The query is added to the list in end_query and removed in flush. */
   struct list_head head_unflushed;

   /* Whether pipe->flush has been called in non-deferred mode after end_query. */
   bool flushed;
};

/* This is the second half of tc_call containing call data.
 * Most calls will typecast this to the type they need, typically larger
 * than 8 bytes.
 */
union tc_payload {
   struct pipe_query *query;
   struct pipe_resource *resource;
   struct pipe_transfer *transfer;
   struct pipe_fence_handle *fence;
   uint64_t handle;
};

#ifdef _MSC_VER
#define ALIGN16 __declspec(align(16))
#else
#define ALIGN16 __attribute__((aligned(16)))
#endif

/* Each call slot should be aligned to its own size for optimal cache usage. */
struct ALIGN16 tc_call {
   unsigned sentinel;
   ushort num_call_slots;
   ushort call_id;
   union tc_payload payload;
};
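
/* With this layout, one call slot is 16 bytes: an 8-byte header
 * (sentinel + num_call_slots + call_id) followed by the 8-byte payload,
 * matching the "queue architecture" description at the top of this file.
 * The implementation could verify this at compile time with something like:
 *
 *    STATIC_ASSERT(sizeof(struct tc_call) == 16);
 *
 * (assuming Mesa's STATIC_ASSERT macro; this is an illustrative sketch, not
 * a requirement imposed by this header).
 */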

/**
 * A token representing an unflushed batch.
 *
 * See the general rules for fences for an explanation.
 */
struct tc_unflushed_batch_token {
   struct pipe_reference ref;
   struct threaded_context *tc;
};

struct tc_batch {
   struct pipe_context *pipe;
   unsigned sentinel;
   unsigned num_total_call_slots;
   struct tc_unflushed_batch_token *token;
   struct util_queue_fence fence;
   struct tc_call call[TC_CALLS_PER_BATCH];
};

struct threaded_context {
   struct pipe_context base;
   struct pipe_context *pipe;
   struct slab_child_pool pool_transfers;
   tc_replace_buffer_storage_func replace_buffer_storage;
   tc_create_fence_func create_fence;
   unsigned map_buffer_alignment;

   struct list_head unflushed_queries;

   /* Counters for the HUD. */
   unsigned num_offloaded_slots;
   unsigned num_direct_slots;
   unsigned num_syncs;

   struct util_queue queue;
   struct util_queue_fence *fence;

   unsigned last, next;
   struct tc_batch batch_slots[TC_MAX_BATCHES];
};

void threaded_resource_init(struct pipe_resource *res);
void threaded_resource_deinit(struct pipe_resource *res);
struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);

struct pipe_context *
threaded_context_create(struct pipe_context *pipe,
                        struct slab_parent_pool *parent_transfer_pool,
                        tc_replace_buffer_storage_func replace_buffer,
                        tc_create_fence_func create_fence,
                        struct threaded_context **out);

void
threaded_context_flush(struct pipe_context *_pipe,
                       struct tc_unflushed_batch_token *token,
                       bool prefer_async);
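
/* Example: wiring the threaded context into a driver's context creation.
 * A minimal sketch; the mydrv_* names, the pool_transfers field in the
 * screen, and the tc pointer in the driver context are hypothetical:
 *
 *    static struct pipe_context *
 *    mydrv_screen_context_create(struct pipe_screen *screen, void *priv,
 *                                unsigned flags)
 *    {
 *       struct pipe_context *pipe = mydrv_context_create(screen, priv, flags);
 *
 *       if (!pipe)
 *          return NULL;
 *
 *       // per guideline 3), don't wrap contexts that can use video codecs
 *
 *       return threaded_context_create(pipe,
 *                                      &mydrv_screen(screen)->pool_transfers,
 *                                      mydrv_replace_buffer_storage,
 *                                      mydrv_create_fence,
 *                                      &mydrv_context(pipe)->tc);
 *    }
 *
 * The return value is exposed as the frontend-facing context; it may be the
 * original pipe if the wrapper isn't created.
 */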

static inline struct threaded_context *
threaded_context(struct pipe_context *pipe)
{
   return (struct threaded_context*)pipe;
}

static inline struct threaded_resource *
threaded_resource(struct pipe_resource *res)
{
   return (struct threaded_resource*)res;
}

static inline struct threaded_query *
threaded_query(struct pipe_query *q)
{
   return (struct threaded_query*)q;
}

static inline struct threaded_transfer *
threaded_transfer(struct pipe_transfer *transfer)
{
   return (struct threaded_transfer*)transfer;
}

static inline struct pipe_context *
threaded_context_unwrap_unsync(struct pipe_context *pipe)
{
   if (!pipe || !pipe->priv)
      return pipe;
   return (struct pipe_context*)pipe->priv;
}

static inline void
tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
                                   struct tc_unflushed_batch_token *src)
{
   if (pipe_reference((struct pipe_reference *)*dst, (struct pipe_reference *)src))
      free(*dst);
   *dst = src;
}

#endif