/**************************************************************************
 *
 * Copyright 2017 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/* This is a wrapper for pipe_context that executes all pipe_context calls
 * in another thread.
 *
 *
 * Guidelines for adopters and deviations from Gallium
 * ---------------------------------------------------
 *
 * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
 *    driver functions that take a context (fence_finish, texture_get_handle)
 *    should manually unwrap pipe_context by doing:
 *      pipe = threaded_context_unwrap_sync(pipe);
 *    A sketch of such a function is given below this list.
 *
 *    pipe_context::priv is used to unwrap the context, so drivers and state
 *    trackers shouldn't use it.
 *
 *    No other objects are wrapped.
 *
 * 2) Drivers must subclass and initialize these structures:
 *    - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
 *    - threaded_query for pipe_query (zero memory)
 *    - threaded_transfer for pipe_transfer (zero memory)
 *
 * 3) The threaded context must not be enabled for contexts that can use video
 *    codecs.
 *
 * 4) Changes in driver behavior:
 *    - begin_query and end_query always return true; return values from
 *      the driver are ignored.
 *    - generate_mipmap uses is_format_supported to determine success;
 *      the return value from the driver is ignored.
 *    - resource_commit always returns true; failures are ignored.
 *    - set_debug_callback is skipped if the callback is synchronous.
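 *
 *    As an illustration of guideline 1, a screen function that takes a
 *    context can unwrap it like this (a minimal sketch; the mydrv_* names
 *    are hypothetical and the body is abbreviated):
 *
 *      static boolean mydrv_fence_finish(struct pipe_screen *screen,
 *                                        struct pipe_context *ctx,
 *                                        struct pipe_fence_handle *fence,
 *                                        uint64_t timeout)
 *      {
 *         // Sync the threaded context and get the wrapped driver context.
 *         // After this, ctx is never the threaded_context wrapper.
 *         ctx = ctx ? threaded_context_unwrap_sync(ctx) : NULL;
 *         ... wait on the fence using the driver context ...
 *      }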
 *
 *
 * Thread-safety requirements on context functions
 * -----------------------------------------------
 *
 * These pipe_context functions are executed directly, so they shouldn't use
 * pipe_context in an unsafe way. They are de-facto screen functions now:
 * - create_query
 * - create_batch_query
 * - create_*_state (all CSOs and shaders)
 *     - Make sure the shader compiler doesn't use any per-context stuff.
 *       (e.g. LLVM target machine)
 *     - Only pipe_context's debug callback for shader dumps is guaranteed to
 *       be up to date, because set_debug_callback synchronizes execution.
 * - create_surface
 * - surface_destroy
 * - create_sampler_view
 * - sampler_view_destroy
 * - stream_output_target_destroy
 * - transfer_map (only unsynchronized buffer mappings)
 * - get_query_result (when threaded_query::flushed == true)
 *
 * Create calls causing a sync that can't be async due to driver limitations:
 * - create_stream_output_target
 *
 *
 * Transfer_map rules for buffer mappings
 * --------------------------------------
 *
 * 1) If transfer_map has PIPE_TRANSFER_UNSYNCHRONIZED, the call is made
 *    in the non-driver thread without flushing the queue. The driver will
 *    receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to
 *    PIPE_TRANSFER_UNSYNCHRONIZED to indicate this.
 *    Note that transfer_unmap is always enqueued and called from the driver
 *    thread.
 *
 * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
 *    the valid buffer range. The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED to indicate this. Ignoring
 *    the flag will lead to failures.
 *    The threaded context does its own detection of unsynchronized mappings.
 *
 * 3) The driver isn't allowed to do buffer invalidations by itself under any
 *    circumstances. This is necessary for unsynchronized maps to map the
 *    latest version of the buffer: invalidations can be queued, while
 *    unsynchronized maps are not queued, so they must return the latest
 *    storage after an invalidation. The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
 *    indicate this. Ignoring the flag will lead to failures.
 *    The threaded context uses its own buffer invalidation mechanism.
 *
 *
 * Rules for fences
 * ----------------
 *
 * Flushes will be executed asynchronously in the driver thread if a
 * create_fence callback is provided. This affects fence semantics as follows.
 *
 * When the threaded context wants to perform an asynchronous flush, it will
 * use the create_fence callback to pre-create the fence from the calling
 * thread. This pre-created fence will be passed to pipe_context::flush
 * together with the TC_FLUSH_ASYNC flag.
 *
 * The callback receives the unwrapped context as a parameter, but must use it
 * in a thread-safe way because it is called from a non-driver thread.
 *
 * If the threaded_context does not immediately flush the current batch, the
 * callback also receives a tc_unflushed_batch_token. If fence_finish is called
 * on the returned fence in the context that created the fence,
 * threaded_context_flush must be called.
 *
 * The driver must implement pipe_context::fence_server_sync properly, since
 * the threaded context handles PIPE_FLUSH_ASYNC.
 *
 *
 * Additional requirements
 * -----------------------
 *
 * get_query_result:
 *    If threaded_query::flushed == true, get_query_result should assume that
 *    it's called from a non-driver thread, in which case the driver shouldn't
 *    use the context in an unsafe way.
 *
 * replace_buffer_storage:
 *    The driver has to implement this callback, which will be called when
 *    the threaded context wants to replace a resource's backing storage with
 *    another resource's backing storage. The threaded context uses it to
 *    implement buffer invalidation. This call is always queued.
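 *
 *    A minimal sketch of such a callback (the mydrv_* names and fields are
 *    hypothetical; the actual storage swap is driver-specific):
 *
 *      static void mydrv_replace_buffer_storage(struct pipe_context *ctx,
 *                                               struct pipe_resource *dst,
 *                                               struct pipe_resource *src)
 *      {
 *         struct mydrv_resource *mdst = mydrv_resource(dst);
 *         struct mydrv_resource *msrc = mydrv_resource(src);
 *
 *         // Runs in the driver thread (the call is always queued):
 *         // make dst alias src's backing storage.
 *         mydrv_buffer_reference(&mdst->buf, msrc->buf);
 *         mdst->gpu_address = msrc->gpu_address;
 *      }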
 *
 *
 * Performance gotchas
 * -------------------
 *
 * Buffer invalidations are done unconditionally; they don't check whether
 * the buffer is busy. This can cause drivers to have more live allocations
 * and CPU mappings than necessary.
 *
 *
 * How it works (queue architecture)
 * ---------------------------------
 *
 * There is a multithreaded queue consisting of batches, each batch consisting
 * of call slots. Each call slot consists of an 8-byte header (call ID +
 * call size + constant 32-bit marker for integrity checking) and an 8-byte
 * body for per-call data. That is 16 bytes per call slot.
 *
 * Simple calls such as bind_xx_state(CSO) occupy only one call slot. Bigger
 * calls occupy multiple call slots depending on the size needed by call
 * parameters. That means that calls can have a variable size in the batch.
 * For example, set_vertex_buffers(count = any, buffers = NULL) occupies only
 * 1 call slot, but set_vertex_buffers(count = 5) occupies 6 call slots.
 * Even though the first call slot can use only 8 bytes for data, additional
 * call slots used by the same call can use all 16 bytes for data.
 * For example, a call using 2 call slots has 24 bytes of space for data.
 *
 * Once a batch is full and there is no space for the next call, it's flushed,
 * meaning that it's added to the queue for execution in the other thread.
 * The batches are ordered in a ring and reused once they are idle again.
 * The batching is necessary for low queue/mutex overhead.
 */

#ifndef U_THREADED_CONTEXT_H
#define U_THREADED_CONTEXT_H

#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include "util/u_inlines.h"
#include "util/u_queue.h"
#include "util/u_range.h"
#include "util/slab.h"

struct threaded_context;
struct tc_unflushed_batch_token;

/* These are transfer flags sent to drivers. */
/* Never infer whether it's safe to use unsynchronized mappings: */
#define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
/* Don't invalidate buffers: */
#define TC_TRANSFER_MAP_NO_INVALIDATE (1u << 30)
/* transfer_map is called from a non-driver thread: */
#define TC_TRANSFER_MAP_THREADED_UNSYNC (1u << 31)

/* Custom flush flags sent to drivers. */
/* The fence is pre-populated with a fence created by the create_fence
 * callback: */
#define TC_FLUSH_ASYNC (1u << 31)
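/* A sketch of how a driver's transfer_map might honor the TC_TRANSFER_MAP_*
 * flags above (mydrv_transfer_map is hypothetical and the logic is
 * abbreviated):
 *
 *   static void *mydrv_transfer_map(struct pipe_context *ctx, ...)
 *   {
 *      if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) {
 *         // Called from the non-driver thread (always together with
 *         // PIPE_TRANSFER_UNSYNCHRONIZED): only use the context in
 *         // a thread-safe way.
 *      }
 *      // TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED is always set: don't
 *      // promote this mapping to unsynchronized via valid-range tracking.
 *      // TC_TRANSFER_MAP_NO_INVALIDATE is always set: never reallocate
 *      // the buffer storage here; tc does its own invalidation.
 *      ...
 *   }
 */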
/* Size of the queue = number of batch slots in memory.
 * - 1 batch is always idle and records new commands
 * - 1 batch is being executed
 * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
 *
 * Use a size as small as possible for low CPU L2 cache usage but large enough
 * so that the queue isn't stalled too often for not having enough idle batch
 * slots.
 */
#define TC_MAX_BATCHES 10

/* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
 * can occupy multiple call slots.
 *
 * The idea is to have batches as small as possible but large enough so that
 * the queuing and mutex overhead is negligible.
 */
#define TC_CALLS_PER_BATCH 192

/* Threshold for when to use the queue or sync. */
#define TC_MAX_STRING_MARKER_BYTES 512

/* Threshold for when to enqueue buffer/texture_subdata as-is.
 * If the upload size is greater than this, the threaded context does this
 * instead:
 * - for buffers: DISCARD_RANGE is done by the threaded context
 * - for textures: sync and call the driver directly
 */
#define TC_MAX_SUBDATA_BYTES 320

typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
                                               struct pipe_resource *dst,
                                               struct pipe_resource *src);
typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
                                                          struct tc_unflushed_batch_token *token);

struct threaded_resource {
   struct pipe_resource b;
   const struct u_resource_vtbl *vtbl;

   /* Since buffer invalidations are queued, we can't use the base resource
    * for unsynchronized mappings. This points to the latest version of
    * the buffer after the latest invalidation. It's only used for
    * unsynchronized mappings in the non-driver thread. Initially it's set
    * to &b.
    */
   struct pipe_resource *latest;

   /* The buffer range which is initialized (with a write transfer, streamout,
    * or writable shader resources). The remainder of the buffer is considered
    * invalid and can be mapped unsynchronized.
    *
    * This allows unsynchronized mapping of a buffer range which hasn't been
    * used yet. It's for applications which forget to use the unsynchronized
    * map flag and expect the driver to figure it out.
    *
    * Drivers should set this to the full range for buffers backed by user
    * memory.
    */
   struct util_range valid_buffer_range;

   /* If "this" is not the base instance of the buffer, but it's one of its
    * reallocations (set in "latest" of the base instance), this points to
    * the valid range of the base instance. It's used for transfers after
    * a buffer invalidation, because such transfers operate on "latest", not
    * the base instance. Initially it's set to &valid_buffer_range.
    */
   struct util_range *base_valid_buffer_range;

   /* Drivers are required to update this for shared resources and user
    * pointers. */
   bool is_shared;
   bool is_user_ptr;

   /* If positive, prefer DISCARD_RANGE with a staging buffer over any other
    * method of CPU access when map flags allow it. Useful for buffers that
    * are too large for the visible VRAM window.
    */
   int max_forced_staging_uploads;
};

struct threaded_transfer {
   struct pipe_transfer b;

   /* Staging buffer for DISCARD_RANGE transfers. */
   struct pipe_resource *staging;

   /* Offset into the staging buffer, because the backing buffer is
    * sub-allocated. */
   unsigned offset;
};

struct threaded_query {
   /* The query is added to the list in end_query and removed in flush. */
   struct list_head head_unflushed;

   /* Whether pipe->flush has been called in non-deferred mode after end_query. */
   bool flushed;
};
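/* A minimal sketch of the required subclassing (mydrv_resource is
 * hypothetical). The threaded_resource must be the first member so that
 * the threaded_resource() cast below works:
 *
 *   struct mydrv_resource {
 *      struct threaded_resource b;
 *      ... driver-private fields ...
 *   };
 *
 *   // At creation time, after the pipe_resource fields are filled in:
 *   threaded_resource_init(&res->b.b);
 *
 *   // At destruction time, before the resource is freed:
 *   threaded_resource_deinit(&res->b.b);
 */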
/* This is the second half of tc_call containing call data.
 * Most calls will typecast this to the type they need, typically larger
 * than 8 bytes.
 */
union tc_payload {
   struct pipe_query *query;
   struct pipe_resource *resource;
   struct pipe_transfer *transfer;
   struct pipe_fence_handle *fence;
   uint64_t handle;
};

#ifdef _MSC_VER
#define ALIGN16 __declspec(align(16))
#else
#define ALIGN16 __attribute__((aligned(16)))
#endif

/* Each call slot should be aligned to its own size for optimal cache usage. */
struct ALIGN16 tc_call {
   unsigned sentinel;
   ushort num_call_slots;
   ushort call_id;
   union tc_payload payload;
};

/**
 * A token representing an unflushed batch.
 *
 * See the general rules for fences for an explanation.
 */
struct tc_unflushed_batch_token {
   struct pipe_reference ref;
   struct threaded_context *tc;
};

struct tc_batch {
   struct pipe_context *pipe;
   unsigned sentinel;
   unsigned num_total_call_slots;
   struct tc_unflushed_batch_token *token;
   struct util_queue_fence fence;
   struct tc_call call[TC_CALLS_PER_BATCH];
};

struct threaded_context {
   struct pipe_context base;
   struct pipe_context *pipe;
   struct slab_child_pool pool_transfers;
   tc_replace_buffer_storage_func replace_buffer_storage;
   tc_create_fence_func create_fence;
   unsigned map_buffer_alignment;

   struct list_head unflushed_queries;

   /* Counters for the HUD. */
   unsigned num_offloaded_slots;
   unsigned num_direct_slots;
   unsigned num_syncs;

   struct util_queue queue;
   struct util_queue_fence *fence;

   unsigned last, next;
   struct tc_batch batch_slots[TC_MAX_BATCHES];
};

void threaded_resource_init(struct pipe_resource *res);
void threaded_resource_deinit(struct pipe_resource *res);
struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);

struct pipe_context *
threaded_context_create(struct pipe_context *pipe,
                        struct slab_parent_pool *parent_transfer_pool,
                        tc_replace_buffer_storage_func replace_buffer,
                        tc_create_fence_func create_fence,
                        struct threaded_context **out);

void
threaded_context_flush(struct pipe_context *_pipe,
                       struct tc_unflushed_batch_token *token,
                       bool prefer_async);

static inline struct threaded_context *
threaded_context(struct pipe_context *pipe)
{
   return (struct threaded_context*)pipe;
}

static inline struct threaded_resource *
threaded_resource(struct pipe_resource *res)
{
   return (struct threaded_resource*)res;
}

static inline struct threaded_query *
threaded_query(struct pipe_query *q)
{
   return (struct threaded_query*)q;
}

static inline struct threaded_transfer *
threaded_transfer(struct pipe_transfer *transfer)
{
   return (struct threaded_transfer*)transfer;
}

static inline struct pipe_context *
threaded_context_unwrap_unsync(struct pipe_context *pipe)
{
   if (!pipe || !pipe->priv)
      return pipe;
   return (struct pipe_context*)pipe->priv;
}

static inline void
tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
                                   struct tc_unflushed_batch_token *src)
{
   if (pipe_reference((struct pipe_reference *)*dst, (struct pipe_reference *)src))
      free(*dst);
   *dst = src;
}

#endif