/*
 * Copyright 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen8_pack.h"

#include "util/debug.h"

/** \file anv_batch_chain.c
 *
 * This file contains functions related to anv_cmd_buffer as a data
 * structure.  This involves everything required to create and destroy
 * the actual batch buffers as well as link them together and handle
 * relocations and surface state.  It specifically does *not* contain any
 * handling of actual vkCmd calls beyond vkCmdExecuteCommands.
 */

/*-----------------------------------------------------------------------*
 * Functions related to anv_reloc_list
 *-----------------------------------------------------------------------*/

static VkResult
anv_reloc_list_init_clone(struct anv_reloc_list *list,
                          const VkAllocationCallbacks *alloc,
                          const struct anv_reloc_list *other_list)
{
   if (other_list) {
      list->num_relocs = other_list->num_relocs;
      list->array_length = other_list->array_length;
   } else {
      list->num_relocs = 0;
      list->array_length = 256;
   }

   list->relocs =
      vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (list->relocs == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   list->reloc_bos =
      vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (list->reloc_bos == NULL) {
      vk_free(alloc, list->relocs);
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   if (other_list) {
      memcpy(list->relocs, other_list->relocs,
             list->array_length * sizeof(*list->relocs));
      memcpy(list->reloc_bos, other_list->reloc_bos,
             list->array_length * sizeof(*list->reloc_bos));
   }

   return VK_SUCCESS;
}

VkResult
anv_reloc_list_init(struct anv_reloc_list *list,
                    const VkAllocationCallbacks *alloc)
{
   return anv_reloc_list_init_clone(list, alloc, NULL);
}

void
anv_reloc_list_finish(struct anv_reloc_list *list,
                      const VkAllocationCallbacks *alloc)
{
   vk_free(alloc, list->relocs);
   vk_free(alloc, list->reloc_bos);
}

static VkResult
anv_reloc_list_grow(struct anv_reloc_list *list,
                    const VkAllocationCallbacks *alloc,
                    size_t num_additional_relocs)
{
   if (list->num_relocs + num_additional_relocs <= list->array_length)
      return VK_SUCCESS;

   size_t new_length = list->array_length * 2;
   while (new_length < list->num_relocs + num_additional_relocs)
      new_length *= 2;

   struct drm_i915_gem_relocation_entry *new_relocs =
      vk_alloc(alloc, new_length * sizeof(*list->relocs), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (new_relocs == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   struct anv_bo **new_reloc_bos =
      vk_alloc(alloc, new_length * sizeof(*list->reloc_bos), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (new_reloc_bos == NULL) {
      vk_free(alloc, new_relocs);
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   memcpy(new_relocs, list->relocs, list->num_relocs * sizeof(*list->relocs));
   memcpy(new_reloc_bos, list->reloc_bos,
          list->num_relocs * sizeof(*list->reloc_bos));

   vk_free(alloc, list->relocs);
   vk_free(alloc, list->reloc_bos);

   list->array_length = new_length;
   list->relocs = new_relocs;
   list->reloc_bos = new_reloc_bos;

   return VK_SUCCESS;
}
VkResult
anv_reloc_list_add(struct anv_reloc_list *list,
                   const VkAllocationCallbacks *alloc,
                   uint32_t offset, struct anv_bo *target_bo, uint32_t delta)
{
   struct drm_i915_gem_relocation_entry *entry;
   int index;

   VkResult result = anv_reloc_list_grow(list, alloc, 1);
   if (result != VK_SUCCESS)
      return result;

   /* XXX: Can we use I915_EXEC_HANDLE_LUT? */
   index = list->num_relocs++;
   list->reloc_bos[index] = target_bo;
   entry = &list->relocs[index];
   entry->target_handle = target_bo->gem_handle;
   entry->delta = delta;
   entry->offset = offset;
   entry->presumed_offset = target_bo->offset;
   entry->read_domains = 0;
   entry->write_domain = 0;
   VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry)));

   return VK_SUCCESS;
}

static VkResult
anv_reloc_list_append(struct anv_reloc_list *list,
                      const VkAllocationCallbacks *alloc,
                      struct anv_reloc_list *other, uint32_t offset)
{
   VkResult result = anv_reloc_list_grow(list, alloc, other->num_relocs);
   if (result != VK_SUCCESS)
      return result;

   memcpy(&list->relocs[list->num_relocs], &other->relocs[0],
          other->num_relocs * sizeof(other->relocs[0]));
   memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0],
          other->num_relocs * sizeof(other->reloc_bos[0]));

   for (uint32_t i = 0; i < other->num_relocs; i++)
      list->relocs[i + list->num_relocs].offset += offset;

   list->num_relocs += other->num_relocs;
   return VK_SUCCESS;
}

/*-----------------------------------------------------------------------*
 * Functions related to anv_batch
 *-----------------------------------------------------------------------*/

void *
anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
{
   if (batch->next + num_dwords * 4 > batch->end) {
      VkResult result = batch->extend_cb(batch, batch->user_data);
      if (result != VK_SUCCESS) {
         anv_batch_set_error(batch, result);
         return NULL;
      }
   }

   void *p = batch->next;

   batch->next += num_dwords * 4;
   assert(batch->next <= batch->end);

   return p;
}

uint64_t
anv_batch_emit_reloc(struct anv_batch *batch,
                     void *location, struct anv_bo *bo, uint32_t delta)
{
   VkResult result = anv_reloc_list_add(batch->relocs, batch->alloc,
                                        location - batch->start, bo, delta);
   if (result != VK_SUCCESS) {
      anv_batch_set_error(batch, result);
      return 0;
   }

   return bo->offset + delta;
}

void
anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
{
   uint32_t size, offset;

   size = other->next - other->start;
   assert(size % 4 == 0);

   if (batch->next + size > batch->end) {
      VkResult result = batch->extend_cb(batch, batch->user_data);
      if (result != VK_SUCCESS) {
         anv_batch_set_error(batch, result);
         return;
      }
   }

   assert(batch->next + size <= batch->end);

   VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size));
   memcpy(batch->next, other->start, size);

   offset = batch->next - batch->start;
   VkResult result = anv_reloc_list_append(batch->relocs, batch->alloc,
                                           other->relocs, offset);
   if (result != VK_SUCCESS) {
      anv_batch_set_error(batch, result);
      return;
   }

   batch->next += size;
}

/*-----------------------------------------------------------------------*
 * Functions related to anv_batch_bo
 *-----------------------------------------------------------------------*/

static VkResult
anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_batch_bo **bbo_out)
{
   VkResult result;

   struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
                                       8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (bbo == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo,
                              ANV_CMD_BUFFER_BATCH_SIZE);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->pool->alloc);
   if (result != VK_SUCCESS)
      goto fail_bo_alloc;

   *bbo_out = bbo;

   return VK_SUCCESS;

 fail_bo_alloc:
   anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
 fail_alloc:
   vk_free(&cmd_buffer->pool->alloc, bbo);

   return result;
}

static VkResult
anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
                   const struct anv_batch_bo *other_bbo,
                   struct anv_batch_bo **bbo_out)
{
   VkResult result;

   struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
                                       8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (bbo == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo,
                              other_bbo->bo.size);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   result = anv_reloc_list_init_clone(&bbo->relocs, &cmd_buffer->pool->alloc,
                                      &other_bbo->relocs);
   if (result != VK_SUCCESS)
      goto fail_bo_alloc;

   bbo->length = other_bbo->length;
   memcpy(bbo->bo.map, other_bbo->bo.map, other_bbo->length);

   *bbo_out = bbo;

   return VK_SUCCESS;

 fail_bo_alloc:
   anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
 fail_alloc:
   vk_free(&cmd_buffer->pool->alloc, bbo);

   return result;
}

static void
anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch,
                   size_t batch_padding)
{
   batch->next = batch->start = bbo->bo.map;
   batch->end = bbo->bo.map + bbo->bo.size - batch_padding;
   batch->relocs = &bbo->relocs;
   bbo->relocs.num_relocs = 0;
}

static void
anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch,
                      size_t batch_padding)
{
   batch->start = bbo->bo.map;
   batch->next = bbo->bo.map + bbo->length;
   batch->end = bbo->bo.map + bbo->bo.size - batch_padding;
   batch->relocs = &bbo->relocs;
}

static void
anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch)
{
   assert(batch->start == bbo->bo.map);
   bbo->length = batch->next - batch->start;
   VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length));
}

static VkResult
anv_batch_bo_grow(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *bbo,
                  struct anv_batch *batch, size_t additional,
                  size_t batch_padding)
{
   assert(batch->start == bbo->bo.map);
   bbo->length = batch->next - batch->start;

   size_t new_size = bbo->bo.size;
   while (new_size <= bbo->length + additional + batch_padding)
      new_size *= 2;

   if (new_size == bbo->bo.size)
      return VK_SUCCESS;

   struct anv_bo new_bo;
   VkResult result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
                                       &new_bo, new_size);
   if (result != VK_SUCCESS)
      return result;

   memcpy(new_bo.map, bbo->bo.map, bbo->length);

   anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);

   bbo->bo = new_bo;
   anv_batch_bo_continue(bbo, batch, batch_padding);

   return VK_SUCCESS;
}
static void
anv_batch_bo_destroy(struct anv_batch_bo *bbo,
                     struct anv_cmd_buffer *cmd_buffer)
{
   anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->pool->alloc);
   anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
   vk_free(&cmd_buffer->pool->alloc, bbo);
}

static VkResult
anv_batch_bo_list_clone(const struct list_head *list,
                        struct anv_cmd_buffer *cmd_buffer,
                        struct list_head *new_list)
{
   VkResult result = VK_SUCCESS;

   list_inithead(new_list);

   struct anv_batch_bo *prev_bbo = NULL;
   list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
      struct anv_batch_bo *new_bbo = NULL;
      result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo);
      if (result != VK_SUCCESS)
         break;
      list_addtail(&new_bbo->link, new_list);

      if (prev_bbo) {
         /* As we clone this list of batch_bo's, they chain one to the
          * other using MI_BATCH_BUFFER_START commands.  We need to fix up
          * those relocations as we go.  Fortunately, this is pretty easy
          * as it will always be the last relocation in the list.
          */
         uint32_t last_idx = prev_bbo->relocs.num_relocs - 1;
         assert(prev_bbo->relocs.reloc_bos[last_idx] == &bbo->bo);
         prev_bbo->relocs.reloc_bos[last_idx] = &new_bbo->bo;
      }

      prev_bbo = new_bbo;
   }

   if (result != VK_SUCCESS) {
      list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link)
         anv_batch_bo_destroy(bbo, cmd_buffer);
   }

   return result;
}

/*-----------------------------------------------------------------------*
 * Functions related to anv_cmd_buffer
 *-----------------------------------------------------------------------*/

static struct anv_batch_bo *
anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)
{
   return LIST_ENTRY(struct anv_batch_bo, cmd_buffer->batch_bos.prev, link);
}

struct anv_address
anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
   return (struct anv_address) {
      .bo = &cmd_buffer->device->surface_state_pool.block_pool.bo,
      .offset = bt_block->offset,
   };
}

static void
emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   /* In gen8+ the address field grew to two dwords to accommodate 48 bit
    * offsets.  The high 16 bits are in the last dword, so we can use the
    * gen8 version in either case, as long as we set the instruction length
    * in the header accordingly.  This means that we always emit three
    * dwords here and all the padding and adjustment we do in this file
    * works for all gens.
    */

#define GEN7_MI_BATCH_BUFFER_START_length 2
#define GEN7_MI_BATCH_BUFFER_START_length_bias 2

   const uint32_t gen7_length =
      GEN7_MI_BATCH_BUFFER_START_length - GEN7_MI_BATCH_BUFFER_START_length_bias;
   const uint32_t gen8_length =
      GEN8_MI_BATCH_BUFFER_START_length - GEN8_MI_BATCH_BUFFER_START_length_bias;

   anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_START, bbs) {
      bbs.DWordLength = cmd_buffer->device->info.gen < 8 ?
                        gen7_length : gen8_length;
      bbs._2ndLevelBatchBuffer = _1stlevelbatch;
      bbs.AddressSpaceIndicator = ASI_PPGTT;
      bbs.BatchBufferStartAddress = (struct anv_address) { bo, offset };
   }
}
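/* A worked example of the DWordLength math above (just arithmetic on the
 * constants, not a statement from the PRM): GEN8_MI_BATCH_BUFFER_START is
 * 3 dwords with a length bias of 2, so gen8_length = 3 - 2 = 1; the gen7
 * form is only 2 dwords, so gen7_length = 2 - 2 = 0.  Either way we emit
 * the full 3-dword gen8 layout.  On gen7 the command parser consumes only
 * the first two dwords, and since MI_BATCH_BUFFER_START transfers
 * execution to the new buffer, the trailing pad dword is never executed.
 */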
static void
cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_batch_bo *bbo)
{
   struct anv_batch *batch = &cmd_buffer->batch;
   struct anv_batch_bo *current_bbo =
      anv_cmd_buffer_current_batch_bo(cmd_buffer);

   /* We set the end of the batch a little short so we would be sure we
    * have room for the chaining command.  Since we're about to emit the
    * chaining command, let's set it back where it should go.
    */
   batch->end += GEN8_MI_BATCH_BUFFER_START_length * 4;
   assert(batch->end == current_bbo->bo.map + current_bbo->bo.size);

   emit_batch_buffer_start(cmd_buffer, &bbo->bo, 0);

   anv_batch_bo_finish(current_bbo, batch);
}

static VkResult
anv_cmd_buffer_chain_batch(struct anv_batch *batch, void *_data)
{
   struct anv_cmd_buffer *cmd_buffer = _data;
   struct anv_batch_bo *new_bbo;

   VkResult result = anv_batch_bo_create(cmd_buffer, &new_bbo);
   if (result != VK_SUCCESS)
      return result;

   struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
   if (seen_bbo == NULL) {
      anv_batch_bo_destroy(new_bbo, cmd_buffer);
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   }
   *seen_bbo = new_bbo;

   cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo);

   list_addtail(&new_bbo->link, &cmd_buffer->batch_bos);

   anv_batch_bo_start(new_bbo, batch, GEN8_MI_BATCH_BUFFER_START_length * 4);

   return VK_SUCCESS;
}

static VkResult
anv_cmd_buffer_grow_batch(struct anv_batch *batch, void *_data)
{
   struct anv_cmd_buffer *cmd_buffer = _data;
   struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);

   anv_batch_bo_grow(cmd_buffer, bbo, &cmd_buffer->batch, 4096,
                     GEN8_MI_BATCH_BUFFER_START_length * 4);

   return VK_SUCCESS;
}

/** Allocate a binding table
 *
 * This function allocates a binding table.  This is a bit more complicated
 * than one would think due to a combination of Vulkan driver design and some
 * unfortunate hardware restrictions.
 *
 * The 3DSTATE_BINDING_TABLE_POINTERS_* packets only have a 16-bit field for
 * the binding table pointer which means that all binding tables need to live
 * in the bottom 64k of surface state base address.  The way the GL driver
 * has classically dealt with this restriction is to emit all surface states
 * on-the-fly into the batch and have a batch buffer smaller than 64k.  This
 * isn't really an option in Vulkan for a couple of reasons:
 *
 *  1) In Vulkan, we have growing (or chaining) batches so surface states
 *     have to live in their own buffer and we have to be able to re-emit
 *     STATE_BASE_ADDRESS as needed, which requires a full pipeline stall.
 *     In order to avoid emitting STATE_BASE_ADDRESS any more often than
 *     needed (it's not that hard to hit 64k of just binding tables), we
 *     allocate surface state objects up-front when VkImageView is created.
 *     In order for this to work, surface state objects need to be
 *     allocated from a global buffer.
 *
 *  2) We tried to design the surface state system in such a way that it's
 *     already ready for bindless texturing.  The way bindless texturing
 *     works on our hardware is that you have a big pool of surface state
 *     objects (with its own state base address) and the bindless handles
 *     are simply offsets into that pool.  With the architecture we chose,
 *     we already have that pool and it's exactly the same pool that we use
 *     for regular surface states, so we should already be ready for
 *     bindless.
 *
 *  3) For render targets, we need to be able to fill out the surface
 *     states later, in vkCmdBeginRenderPass, so that we can assign clear
 *     colors correctly.  One way to do this would be to just create the
 *     surface state data and then repeatedly copy it into the surface
 *     state BO every time we have to re-emit STATE_BASE_ADDRESS.  While
 *     this works, it's rather annoying; it's much simpler to just allocate
 *     them up-front and re-use them for the entire render pass.
 *
 * While none of these are technically blockers for emitting state on the
 * fly like we do in GL, the ability to have a single surface state pool
 * simplifies things greatly.  Unfortunately, it comes at a cost...
 *
 * Because of the 64k limitation of 3DSTATE_BINDING_TABLE_POINTERS_*, we
 * can't place the binding tables just anywhere in surface state base
 * address.  Because 64k isn't a whole lot of space, we can't simply
 * restrict the surface state buffer to 64k; we have to be more clever.
 * The solution we've chosen is to have a block pool with a maximum size of
 * 2G that starts at zero and grows in both directions.  All surface states
 * are allocated from the top of the pool (positive offsets) and we
 * allocate blocks (< 64k) of binding tables from the bottom of the pool
 * (negative offsets).  Every time we allocate a new binding table block,
 * we set surface state base address to point to the bottom of the binding
 * table block.  This way all of the binding tables in the block are in the
 * bottom 64k of surface state base address.  When we fill out the binding
 * table, we add the distance between the bottom of our binding table block
 * and zero of the block pool to the surface state offsets so that they are
 * correct relative to our new surface state base address at the bottom of
 * the binding table block.
 *
 * \see adjust_relocations_from_state_pool()
 * \see adjust_relocations_to_state_pool()
 *
 * \param[in]  entries        The number of surface state entries the
 *                            binding table should be able to hold.
 *
 * \param[out] state_offset   The offset from surface state base address
 *                            where the surface states live.  This must be
 *                            added to the surface state offset when it is
 *                            written into the binding table entry.
 *
 * \return                    An anv_state representing the binding table
 */
struct anv_state
anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
                                   uint32_t entries, uint32_t *state_offset)
{
   struct anv_state_pool *state_pool = &cmd_buffer->device->surface_state_pool;
   struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
   struct anv_state state;

   state.alloc_size = align_u32(entries * 4, 32);

   if (cmd_buffer->bt_next + state.alloc_size > state_pool->block_size)
      return (struct anv_state) { 0 };

   state.offset = cmd_buffer->bt_next;
   state.map = state_pool->block_pool.map + bt_block->offset + state.offset;

   cmd_buffer->bt_next += state.alloc_size;

   assert(bt_block->offset < 0);
   *state_offset = -bt_block->offset;

   return state;
}
struct anv_state
anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer)
{
   struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
   return anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
                                 isl_dev->ss.size, isl_dev->ss.align);
}

struct anv_state
anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
                                   uint32_t size, uint32_t alignment)
{
   return anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
                                 size, alignment);
}

VkResult
anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_state_pool *state_pool = &cmd_buffer->device->surface_state_pool;

   struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states);
   if (bt_block == NULL) {
      anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   *bt_block = anv_state_pool_alloc_back(state_pool);
   cmd_buffer->bt_next = 0;

   return VK_SUCCESS;
}

VkResult
anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_batch_bo *batch_bo;
   VkResult result;

   list_inithead(&cmd_buffer->batch_bos);

   result = anv_batch_bo_create(cmd_buffer, &batch_bo);
   if (result != VK_SUCCESS)
      return result;

   list_addtail(&batch_bo->link, &cmd_buffer->batch_bos);

   cmd_buffer->batch.alloc = &cmd_buffer->pool->alloc;
   cmd_buffer->batch.user_data = cmd_buffer;

   if (cmd_buffer->device->can_chain_batches) {
      cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
   } else {
      cmd_buffer->batch.extend_cb = anv_cmd_buffer_grow_batch;
   }

   anv_batch_bo_start(batch_bo, &cmd_buffer->batch,
                      GEN8_MI_BATCH_BUFFER_START_length * 4);

   int success = u_vector_init(&cmd_buffer->seen_bbos,
                               sizeof(struct anv_bo *),
                               8 * sizeof(struct anv_bo *));
   if (!success)
      goto fail_batch_bo;

   *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo;

   /* u_vector requires power-of-two size elements */
   unsigned pow2_state_size = util_next_power_of_two(sizeof(struct anv_state));
   success = u_vector_init(&cmd_buffer->bt_block_states,
                           pow2_state_size, 8 * pow2_state_size);
   if (!success)
      goto fail_seen_bbos;

   result = anv_reloc_list_init(&cmd_buffer->surface_relocs,
                                &cmd_buffer->pool->alloc);
   if (result != VK_SUCCESS)
      goto fail_bt_blocks;
   cmd_buffer->last_ss_pool_center = 0;

   result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
   if (result != VK_SUCCESS)
      goto fail_bt_blocks;

   return VK_SUCCESS;

 fail_bt_blocks:
   u_vector_finish(&cmd_buffer->bt_block_states);
 fail_seen_bbos:
   u_vector_finish(&cmd_buffer->seen_bbos);
 fail_batch_bo:
   anv_batch_bo_destroy(batch_bo, cmd_buffer);

   return result;
}

void
anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_state *bt_block;
   u_vector_foreach(bt_block, &cmd_buffer->bt_block_states)
      anv_state_pool_free(&cmd_buffer->device->surface_state_pool, *bt_block);
   u_vector_finish(&cmd_buffer->bt_block_states);

   anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc);

   u_vector_finish(&cmd_buffer->seen_bbos);

   /* Destroy all of the batch buffers */
   list_for_each_entry_safe(struct anv_batch_bo, bbo,
                            &cmd_buffer->batch_bos, link) {
      anv_batch_bo_destroy(bbo, cmd_buffer);
   }
}

void
anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
{
   /* Delete all but the first batch bo */
   assert(!list_empty(&cmd_buffer->batch_bos));
   while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) {
      struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
      list_del(&bbo->link);
      anv_batch_bo_destroy(bbo, cmd_buffer);
   }
   assert(!list_empty(&cmd_buffer->batch_bos));

   anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer),
                      &cmd_buffer->batch,
                      GEN8_MI_BATCH_BUFFER_START_length * 4);

   while (u_vector_length(&cmd_buffer->bt_block_states) > 1) {
      struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states);
      anv_state_pool_free(&cmd_buffer->device->surface_state_pool, *bt_block);
   }
   assert(u_vector_length(&cmd_buffer->bt_block_states) == 1);
   cmd_buffer->bt_next = 0;

   cmd_buffer->surface_relocs.num_relocs = 0;
   cmd_buffer->last_ss_pool_center = 0;

   /* Reset the list of seen buffers */
   cmd_buffer->seen_bbos.head = 0;
   cmd_buffer->seen_bbos.tail = 0;

   *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) =
      anv_cmd_buffer_current_batch_bo(cmd_buffer);
}

void
anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);

   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      /* When we start a batch buffer, we subtract a certain amount of
       * padding from the end to ensure that we always have room to emit a
       * BATCH_BUFFER_START to chain to the next BO.  We need to remove
       * that padding before we end the batch; otherwise, we may end up
       * with our BATCH_BUFFER_END in another BO.
       */
      cmd_buffer->batch.end += GEN8_MI_BATCH_BUFFER_START_length * 4;
      assert(cmd_buffer->batch.end == batch_bo->bo.map + batch_bo->bo.size);

      anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_END, bbe);

      /* Round batch up to an even number of dwords. */
      if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)
         anv_batch_emit(&cmd_buffer->batch, GEN8_MI_NOOP, noop);

      cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
   }

   anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);

   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      /* If this is a secondary command buffer, we need to determine the
       * mode in which it will be executed with vkCmdExecuteCommands.  We
       * determine this statically here so that this stays in sync with the
       * actual ExecuteCommands implementation.
       */
      if (!cmd_buffer->device->can_chain_batches) {
         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT;
      } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) &&
                 (batch_bo->length < ANV_CMD_BUFFER_BATCH_SIZE / 2)) {
         /* If the secondary has exactly one batch buffer in its list *and*
          * that batch buffer is less than half of the maximum size, we're
          * probably better off simply copying it into our batch.
          */
         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT;
      } else if (!(cmd_buffer->usage_flags &
                   VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;

         /* When we chain, we need to add an MI_BATCH_BUFFER_START command
          * with its relocation.  In order to handle this we'll increment
          * here so we can unconditionally decrement right before adding
          * the MI_BATCH_BUFFER_START command.
          */
         batch_bo->relocs.num_relocs++;
         cmd_buffer->batch.next += GEN8_MI_BATCH_BUFFER_START_length * 4;
      } else {
         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;
      }
   }
}
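/* A summary of the secondary execution modes chosen above (restating the
 * logic for reference, not adding to it):
 *
 *   GROW_AND_EMIT  - device can't chain batches; the primary grows its
 *                    current batch_bo and memcpys the secondary in.
 *   EMIT           - one batch_bo smaller than ANV_CMD_BUFFER_BATCH_SIZE/2;
 *                    simply copied into the primary's batch.
 *   CHAIN          - no SIMULTANEOUS_USE; the primary jumps into the
 *                    secondary's batch_bos and the final
 *                    MI_BATCH_BUFFER_START is patched to return.
 *   COPY_AND_CHAIN - SIMULTANEOUS_USE; the batch_bo list is cloned first
 *                    so each execution gets its own copy to patch.
 */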
static VkResult
anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer,
                             struct list_head *list)
{
   list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
      struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos);
      if (bbo_ptr == NULL)
         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

      *bbo_ptr = bbo;
   }

   return VK_SUCCESS;
}

void
anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
                             struct anv_cmd_buffer *secondary)
{
   switch (secondary->exec_mode) {
   case ANV_CMD_BUFFER_EXEC_MODE_EMIT:
      anv_batch_emit_batch(&primary->batch, &secondary->batch);
      break;
   case ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT: {
      struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(primary);
      unsigned length = secondary->batch.end - secondary->batch.start;
      anv_batch_bo_grow(primary, bbo, &primary->batch, length,
                        GEN8_MI_BATCH_BUFFER_START_length * 4);
      anv_batch_emit_batch(&primary->batch, &secondary->batch);
      break;
   }
   case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: {
      struct anv_batch_bo *first_bbo =
         list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
      struct anv_batch_bo *last_bbo =
         list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link);

      emit_batch_buffer_start(primary, &first_bbo->bo, 0);

      struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);
      assert(primary->batch.start == this_bbo->bo.map);
      uint32_t offset = primary->batch.next - primary->batch.start;
      const uint32_t inst_size = GEN8_MI_BATCH_BUFFER_START_length * 4;

      /* Roll back the previous MI_BATCH_BUFFER_START and its relocation so
       * we can emit a new command and relocation for the current splice.
       * In order to handle the initial-use case, we incremented next and
       * num_relocs in end_batch_buffer() so we can always just subtract
       * here.
       */
      last_bbo->relocs.num_relocs--;
      secondary->batch.next -= inst_size;
      emit_batch_buffer_start(secondary, &this_bbo->bo, offset);
      anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);

      /* After patching up the secondary buffer, we need to clflush the
       * modified instruction in case we're on a !llc platform.  We use a
       * little loop to handle the case where the instruction crosses a
       * cache line boundary.
       */
      if (!primary->device->info.has_llc) {
         void *inst = secondary->batch.next - inst_size;
         void *p = (void *) (((uintptr_t) inst) & ~CACHELINE_MASK);
         __builtin_ia32_mfence();
         while (p < secondary->batch.next) {
            __builtin_ia32_clflush(p);
            p += CACHELINE_SIZE;
         }
      }
      break;
   }
   case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: {
      struct list_head copy_list;
      VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos,
                                                secondary,
                                                &copy_list);
      if (result != VK_SUCCESS)
         return; /* FIXME */

      anv_cmd_buffer_add_seen_bbos(primary, &copy_list);

      struct anv_batch_bo *first_bbo =
         list_first_entry(&copy_list, struct anv_batch_bo, link);
      struct anv_batch_bo *last_bbo =
         list_last_entry(&copy_list, struct anv_batch_bo, link);

      cmd_buffer_chain_to_batch_bo(primary, first_bbo);

      list_splicetail(&copy_list, &primary->batch_bos);

      anv_batch_bo_continue(last_bbo, &primary->batch,
                            GEN8_MI_BATCH_BUFFER_START_length * 4);
      break;
   }
   default:
      assert(!"Invalid execution mode");
   }

   anv_reloc_list_append(&primary->surface_relocs, &primary->pool->alloc,
                         &secondary->surface_relocs, 0);
}

struct anv_execbuf {
   struct drm_i915_gem_execbuffer2           execbuf;

   struct drm_i915_gem_exec_object2 *        objects;
   uint32_t                                  bo_count;
   struct anv_bo **                          bos;

   /* Allocated length of the 'objects' and 'bos' arrays */
   uint32_t                                  array_length;

   uint32_t                                  fence_count;
   uint32_t                                  fence_array_length;
   struct drm_i915_gem_exec_fence *          fences;
   struct anv_syncobj **                     syncobjs;
};

static void
anv_execbuf_init(struct anv_execbuf *exec)
{
   memset(exec, 0, sizeof(*exec));
}

static void
anv_execbuf_finish(struct anv_execbuf *exec,
                   const VkAllocationCallbacks *alloc)
{
   vk_free(alloc, exec->objects);
   vk_free(alloc, exec->bos);
   vk_free(alloc, exec->fences);
   vk_free(alloc, exec->syncobjs);
}

static VkResult
anv_execbuf_add_bo(struct anv_execbuf *exec,
                   struct anv_bo *bo,
                   struct anv_reloc_list *relocs,
                   uint32_t extra_flags,
                   const VkAllocationCallbacks *alloc)
{
   struct drm_i915_gem_exec_object2 *obj = NULL;

   if (bo->index < exec->bo_count && exec->bos[bo->index] == bo)
      obj = &exec->objects[bo->index];

   if (obj == NULL) {
      /* We've never seen this one before.  Add it to the list and assign
       * an id that we can use later.
       */
      if (exec->bo_count >= exec->array_length) {
         uint32_t new_len = exec->objects ? exec->array_length * 2 : 64;

         struct drm_i915_gem_exec_object2 *new_objects =
            vk_alloc(alloc, new_len * sizeof(*new_objects),
                     8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
         if (new_objects == NULL)
            return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

         struct anv_bo **new_bos =
            vk_alloc(alloc, new_len * sizeof(*new_bos),
                     8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
         if (new_bos == NULL) {
            vk_free(alloc, new_objects);
            return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
         }

         if (exec->objects) {
            memcpy(new_objects, exec->objects,
                   exec->bo_count * sizeof(*new_objects));
            memcpy(new_bos, exec->bos,
                   exec->bo_count * sizeof(*new_bos));
         }

         vk_free(alloc, exec->objects);
         vk_free(alloc, exec->bos);

         exec->objects = new_objects;
         exec->bos = new_bos;
         exec->array_length = new_len;
      }

      assert(exec->bo_count < exec->array_length);

      bo->index = exec->bo_count++;
      obj = &exec->objects[bo->index];
      exec->bos[bo->index] = bo;

      obj->handle = bo->gem_handle;
      obj->relocation_count = 0;
      obj->relocs_ptr = 0;
      obj->alignment = 0;
      obj->offset = bo->offset;
      obj->flags = bo->flags | extra_flags;
      obj->rsvd1 = 0;
      obj->rsvd2 = 0;
   }

   if (relocs != NULL && obj->relocation_count == 0) {
      /* This is the first time we've ever seen a list of relocations for
       * this BO.  Go ahead and set the relocations and then walk the list
       * of relocations and add them all.
       */
      obj->relocation_count = relocs->num_relocs;
      obj->relocs_ptr = (uintptr_t) relocs->relocs;

      for (size_t i = 0; i < relocs->num_relocs; i++) {
         VkResult result;

         /* A quick sanity check on relocations */
         assert(relocs->relocs[i].offset < bo->size);
         result = anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL,
                                     extra_flags, alloc);

         if (result != VK_SUCCESS)
            return result;
      }
   }

   return VK_SUCCESS;
}

static VkResult
anv_execbuf_add_syncobj(struct anv_execbuf *exec,
                        uint32_t handle, uint32_t flags,
                        const VkAllocationCallbacks *alloc)
{
   assert(flags != 0);

   if (exec->fence_count >= exec->fence_array_length) {
      uint32_t new_len = MAX2(exec->fence_array_length * 2, 64);

      exec->fences = vk_realloc(alloc, exec->fences,
                                new_len * sizeof(*exec->fences),
                                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (exec->fences == NULL)
         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

      exec->fence_array_length = new_len;
   }

   exec->fences[exec->fence_count] = (struct drm_i915_gem_exec_fence) {
      .handle = handle,
      .flags = flags,
   };

   exec->fence_count++;

   return VK_SUCCESS;
}

static void
anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,
                              struct anv_reloc_list *list)
{
   for (size_t i = 0; i < list->num_relocs; i++)
      list->relocs[i].target_handle = list->reloc_bos[i]->index;
}

static void
write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush)
{
   unsigned reloc_size = 0;
   if (device->info.gen >= 8) {
      /* From the Broadwell PRM Vol. 2a, MI_LOAD_REGISTER_MEM::MemoryAddress:
       *
       *    "This field specifies the address of the memory location where
       *    the register value specified in the DWord above will read from.
       *    The address specifies the DWord location of the data.  Range =
       *    GraphicsVirtualAddress[63:2] for a DWord register GraphicsAddress
       *    [63:48] are ignored by the HW and assumed to be in correct
       *    canonical form [63:48] == [47]."
       */
      const int shift = 63 - 47;
      reloc_size = sizeof(uint64_t);
      *(uint64_t *)p = (((int64_t)v) << shift) >> shift;
   } else {
      reloc_size = sizeof(uint32_t);
      *(uint32_t *)p = v;
   }

   if (flush && !device->info.has_llc)
      gen_flush_range(p, reloc_size);
}
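/* A worked example of the sign-extension above (illustrative only): with
 * shift = 63 - 47 = 16, an address with bit 47 set, e.g.
 * v = 0x0000800000000000, becomes 0x8000000000000000 after the left shift,
 * and the arithmetic right shift replicates the sign bit back down,
 * yielding the canonical form 0xffff800000000000 ([63:48] == [47]).
 * Addresses with bit 47 clear pass through unchanged.
 */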
static void
adjust_relocations_from_state_pool(struct anv_state_pool *pool,
                                   struct anv_reloc_list *relocs,
                                   uint32_t last_pool_center_bo_offset)
{
   assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);
   uint32_t delta = pool->block_pool.center_bo_offset -
                    last_pool_center_bo_offset;

   for (size_t i = 0; i < relocs->num_relocs; i++) {
      /* All of the relocations from this block pool to other BO's should
       * have been emitted relative to the surface block pool center.  We
       * need to add the center offset to make them relative to the
       * beginning of the actual GEM bo.
       */
      relocs->relocs[i].offset += delta;
   }
}

static void
adjust_relocations_to_state_pool(struct anv_state_pool *pool,
                                 struct anv_bo *from_bo,
                                 struct anv_reloc_list *relocs,
                                 uint32_t last_pool_center_bo_offset)
{
   assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);
   uint32_t delta = pool->block_pool.center_bo_offset -
                    last_pool_center_bo_offset;

   /* When we initially emit relocations into a block pool, we don't
    * actually know what the final center_bo_offset will be so we just emit
    * it as if center_bo_offset == 0.  Now that we know what the center
    * offset is, we need to walk the list of relocations and adjust any
    * relocations that point to the pool bo with the correct offset.
    */
   for (size_t i = 0; i < relocs->num_relocs; i++) {
      if (relocs->reloc_bos[i] == &pool->block_pool.bo) {
         /* Adjust the delta value in the relocation to correctly
          * correspond to the new delta.  Initially, this value may have
          * been negative (if treated as unsigned), but we trust in
          * uint32_t roll-over to fix that for us at this point.
          */
         relocs->relocs[i].delta += delta;

         /* Since the delta has changed, we need to update the actual
          * relocated value with the new presumed value.  This function
          * should only be called on batch buffers, so we know it isn't in
          * use by the GPU at the moment.
          */
         assert(relocs->relocs[i].offset < from_bo->size);
         write_reloc(pool->block_pool.device,
                     from_bo->map + relocs->relocs[i].offset,
                     relocs->relocs[i].presumed_offset +
                     relocs->relocs[i].delta, false);
      }
   }
}
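/* A worked example of the two adjustments above (illustrative numbers
 * only): suppose relocations were recorded when center_bo_offset was 0 and
 * the pool has since grown downward so that center_bo_offset is now 4096;
 * then delta = 4096.  A relocation *within* the state pool BO recorded at
 * offset 100 (relative to the center) actually lives at byte 4196 of the
 * GEM BO, so adjust_relocations_from_state_pool() adds delta to its
 * offset.  A relocation in a batch BO *pointing at* the state pool BO must
 * likewise have 4096 added to its delta, and the value already written in
 * the batch is rewritten to match; that is what
 * adjust_relocations_to_state_pool() does.
 */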
static void
anv_reloc_list_apply(struct anv_device *device,
                     struct anv_reloc_list *list,
                     struct anv_bo *bo,
                     bool always_relocate)
{
   for (size_t i = 0; i < list->num_relocs; i++) {
      struct anv_bo *target_bo = list->reloc_bos[i];
      if (list->relocs[i].presumed_offset == target_bo->offset &&
          !always_relocate)
         continue;

      void *p = bo->map + list->relocs[i].offset;
      write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true);
      list->relocs[i].presumed_offset = target_bo->offset;
   }
}

/**
 * This function applies the relocations for a command buffer and writes the
 * actual addresses into the buffers as per what we were told by the kernel
 * on the previous execbuf2 call.  This should be safe to do because, for
 * each relocated address, we have two cases:
 *
 *  1) The target BO is inactive (as seen by the kernel).  In this case, it
 *     is not in use by the GPU so updating the address is 100% ok.  It
 *     won't be in-use by the GPU (from our context) again until the next
 *     execbuf2 happens.  If the kernel decides to move it in the next
 *     execbuf2, it will have to do the relocations itself, but that's ok
 *     because it should have all of the information needed to do so.
 *
 *  2) The target BO is active (as seen by the kernel).  In this case, it
 *     hasn't moved since the last execbuffer2 call because GTT shuffling
 *     *only* happens when the BO is idle.  (From our perspective, it only
 *     happens inside the execbuffer2 ioctl, but the shuffling may be
 *     triggered by another ioctl, with full-ppgtt this is limited to only
 *     execbuffer2 ioctls on the same context, or memory pressure.)  Since
 *     the target BO hasn't moved, our anv_bo::offset exactly matches the
 *     BO's GTT address and the relocated value we are writing into the BO
 *     will be the same as the value that is already there.
 *
 *     There is also a possibility that the target BO is active but the
 *     exact RENDER_SURFACE_STATE object we are writing the relocation into
 *     isn't in use.  In this case, the address currently in the
 *     RENDER_SURFACE_STATE may be stale but it's still safe to write the
 *     relocation because that particular RENDER_SURFACE_STATE object isn't
 *     in-use by the GPU and won't be until the next execbuf2 call.
 *
 * By doing relocations on the CPU, we can tell the kernel that it doesn't
 * need to bother.  We want to do this because the surface state buffer is
 * used by every command buffer so, if the kernel does the relocations, it
 * will always be busy and the kernel will always stall.  This is also
 * probably the fastest mechanism for doing relocations since the kernel
 * would have to make a full copy of all the relocation lists.
 */
static bool
relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_execbuf *exec)
{
   static int userspace_relocs = -1;
   if (userspace_relocs < 0)
      userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true);
   if (!userspace_relocs)
      return false;

   /* First, we have to check to see whether or not we can even do the
    * relocation.  New buffers which have never been submitted to the
    * kernel don't have a valid offset so we need to let the kernel do
    * relocations so that we can get offsets for them.  On future execbuf2
    * calls, those buffers will have offsets and we will be able to skip
    * relocating.  Invalid offsets are indicated by
    * anv_bo::offset == (uint64_t)-1.
    */
   for (uint32_t i = 0; i < exec->bo_count; i++) {
      if (exec->bos[i]->offset == (uint64_t)-1)
         return false;
   }

   /* Since surface states are shared between command buffers and we don't
    * know what order they will be submitted to the kernel, we don't know
    * what address is actually written in the surface state object at any
    * given time.  The only option is to always relocate them.
    */
   anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs,
                        &cmd_buffer->device->surface_state_pool.block_pool.bo,
                        true /* always relocate surface states */);

   /* Since we own all of the batch buffers, we know what values are stored
    * in the relocated addresses and only have to update them if the
    * offsets have changed.
    */
   struct anv_batch_bo **bbo;
   u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
      anv_reloc_list_apply(cmd_buffer->device,
                           &(*bbo)->relocs, &(*bbo)->bo, false);
   }

   for (uint32_t i = 0; i < exec->bo_count; i++)
      exec->objects[i].offset = exec->bos[i]->offset;

   return true;
}

static VkResult
setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
                             struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_batch *batch = &cmd_buffer->batch;
   struct anv_state_pool *ss_pool =
      &cmd_buffer->device->surface_state_pool;

   adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs,
                                      cmd_buffer->last_ss_pool_center);
   VkResult result = anv_execbuf_add_bo(execbuf, &ss_pool->block_pool.bo,
                                        &cmd_buffer->surface_relocs, 0,
                                        &cmd_buffer->device->alloc);
   if (result != VK_SUCCESS)
      return result;

   /* First, we walk over all of the bos we've seen and add them and their
    * relocations to the validate list.
    */
   struct anv_batch_bo **bbo;
   u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
      adjust_relocations_to_state_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs,
                                       cmd_buffer->last_ss_pool_center);

      result = anv_execbuf_add_bo(execbuf, &(*bbo)->bo, &(*bbo)->relocs, 0,
                                  &cmd_buffer->device->alloc);
      if (result != VK_SUCCESS)
         return result;
   }

   /* Now that we've adjusted all of the surface state relocations, we need
    * to record the surface state pool center so future executions of the
    * command buffer can adjust correctly.
    */
   cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset;

   struct anv_batch_bo *first_batch_bo =
      list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);

   /* The kernel requires that the last entry in the validation list be the
    * batch buffer to execute.  We can simply swap the element
    * corresponding to the first batch_bo in the chain with the last
    * element in the list.
    */
   if (first_batch_bo->bo.index != execbuf->bo_count - 1) {
      uint32_t idx = first_batch_bo->bo.index;
      uint32_t last_idx = execbuf->bo_count - 1;

      struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
      assert(execbuf->bos[idx] == &first_batch_bo->bo);

      execbuf->objects[idx] = execbuf->objects[last_idx];
      execbuf->bos[idx] = execbuf->bos[last_idx];
      execbuf->bos[idx]->index = idx;

      execbuf->objects[last_idx] = tmp_obj;
      execbuf->bos[last_idx] = &first_batch_bo->bo;
      first_batch_bo->bo.index = last_idx;
   }

   /* Now we go through and fixup all of the relocation lists to point to
    * the correct indices in the object array.  We have to do this after we
    * reorder the list above as some of the indices may have changed.
    */
   u_vector_foreach(bbo, &cmd_buffer->seen_bbos)
      anv_cmd_buffer_process_relocs(cmd_buffer, &(*bbo)->relocs);

   anv_cmd_buffer_process_relocs(cmd_buffer, &cmd_buffer->surface_relocs);

   if (!cmd_buffer->device->info.has_llc) {
      __builtin_ia32_mfence();
      u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
         for (uint32_t i = 0; i < (*bbo)->length; i += CACHELINE_SIZE)
            __builtin_ia32_clflush((*bbo)->bo.map + i);
      }
   }

   execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
      .buffers_ptr = (uintptr_t) execbuf->objects,
      .buffer_count = execbuf->bo_count,
      .batch_start_offset = 0,
      .batch_len = batch->next - batch->start,
      .cliprects_ptr = 0,
      .num_cliprects = 0,
      .DR1 = 0,
      .DR4 = 0,
      .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER |
               I915_EXEC_CONSTANTS_REL_GENERAL,
      .rsvd1 = cmd_buffer->device->context_id,
      .rsvd2 = 0,
   };

   if (relocate_cmd_buffer(cmd_buffer, execbuf)) {
      /* If we were able to successfully relocate everything, tell the
       * kernel that it can skip doing relocations.  The requirements for
       * using NO_RELOC are:
       *
       *  1) The addresses written in the objects must match the
       *     corresponding reloc.presumed_offset which in turn must match
       *     the corresponding execobject.offset.
       *
       *  2) To avoid stalling, execobject.offset should match the current
       *     address of that object within the active context.
       *
       * In order to satisfy all of the invariants that make userspace
       * relocations safe (see relocate_cmd_buffer()), we need to further
       * ensure that the addresses we use match those used by the kernel
       * for the most recent execbuf2.
       *
       * The kernel may still choose to do relocations anyway if something
       * has moved in the GTT.  In this case, the relocation list still
       * needs to be valid.  All relocations on the batch buffers are
       * already valid and kept up-to-date.  For surface state relocations,
       * by applying the relocations in relocate_cmd_buffer, we ensured
       * that the address in the RENDER_SURFACE_STATE matches
       * presumed_offset, so it should be safe for the kernel to relocate
       * them as needed.
       */
      execbuf->execbuf.flags |= I915_EXEC_NO_RELOC;
   } else {
      /* In the case where we fall back to doing kernel relocations, we
       * need to ensure that the relocation list is valid.  All relocations
       * on the batch buffers are already valid and kept up-to-date.  Since
       * surface states are shared between command buffers and we don't
       * know what order they will be submitted to the kernel, we don't
       * know what address is actually written in the surface state object
       * at any given time.  The only option is to set a bogus presumed
       * offset and let the kernel relocate them.
       */
      for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
         cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
   }

   return VK_SUCCESS;
}

static VkResult
setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_device *device)
{
   VkResult result = anv_execbuf_add_bo(execbuf, &device->trivial_batch_bo,
                                        NULL, 0, &device->alloc);
   if (result != VK_SUCCESS)
      return result;

   execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
      .buffers_ptr = (uintptr_t) execbuf->objects,
      .buffer_count = execbuf->bo_count,
      .batch_start_offset = 0,
      .batch_len = 8, /* GEN7_MI_BATCH_BUFFER_END and NOOP */
      .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER,
      .rsvd1 = device->context_id,
      .rsvd2 = 0,
   };

   return VK_SUCCESS;
}

VkResult
anv_cmd_buffer_execbuf(struct anv_device *device,
                       struct anv_cmd_buffer *cmd_buffer,
                       const VkSemaphore *in_semaphores,
                       uint32_t num_in_semaphores,
                       const VkSemaphore *out_semaphores,
                       uint32_t num_out_semaphores,
                       VkFence _fence)
{
   ANV_FROM_HANDLE(anv_fence, fence, _fence);

   struct anv_execbuf execbuf;
   anv_execbuf_init(&execbuf);

   int in_fence = -1;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < num_in_semaphores; i++) {
      ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]);
      struct anv_semaphore_impl *impl =
         semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
         &semaphore->temporary : &semaphore->permanent;

      switch (impl->type) {
      case ANV_SEMAPHORE_TYPE_BO:
         result = anv_execbuf_add_bo(&execbuf, impl->bo, NULL,
                                     0, &device->alloc);
         if (result != VK_SUCCESS)
            return result;
         break;

      case ANV_SEMAPHORE_TYPE_SYNC_FILE:
         if (in_fence == -1) {
            in_fence = impl->fd;
         } else {
            int merge = anv_gem_sync_file_merge(device, in_fence, impl->fd);
            if (merge == -1)
               return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);

            close(impl->fd);
            close(in_fence);
            in_fence = merge;
         }

         impl->fd = -1;
         break;

      case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ:
         result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj,
                                          I915_EXEC_FENCE_WAIT,
                                          &device->alloc);
         if (result != VK_SUCCESS)
            return result;
         break;

      default:
         break;
      }
   }

   bool need_out_fence = false;
   for (uint32_t i = 0; i < num_out_semaphores; i++) {
      ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]);

      /* Under most circumstances, out fences won't be temporary.  However,
       * the spec does allow it for opaque_fd.  From the Vulkan 1.0.53 spec:
       *
       *    "If the import is temporary, the implementation must restore the
       *    semaphore to its prior permanent state after submitting the next
       *    semaphore wait operation."
       *
       * The spec says nothing whatsoever about signal operations on
       * temporarily imported semaphores so it appears they are allowed.
       * There are also CTS tests that require this to work.
       */
      struct anv_semaphore_impl *impl =
         semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
         &semaphore->temporary : &semaphore->permanent;

      switch (impl->type) {
      case ANV_SEMAPHORE_TYPE_BO:
         result = anv_execbuf_add_bo(&execbuf, impl->bo, NULL,
                                     EXEC_OBJECT_WRITE, &device->alloc);
         if (result != VK_SUCCESS)
            return result;
         break;

      case ANV_SEMAPHORE_TYPE_SYNC_FILE:
         need_out_fence = true;
         break;

      case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ:
         result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj,
                                          I915_EXEC_FENCE_SIGNAL,
                                          &device->alloc);
         if (result != VK_SUCCESS)
            return result;
         break;

      default:
         break;
      }
   }

   if (fence) {
      /* Under most circumstances, out fences won't be temporary.  However,
       * the spec does allow it for opaque_fd.  From the Vulkan 1.0.53 spec:
       *
       *    "If the import is temporary, the implementation must restore the
       *    semaphore to its prior permanent state after submitting the next
       *    semaphore wait operation."
       *
       * The spec says nothing whatsoever about signal operations on
       * temporarily imported semaphores so it appears they are allowed.
       * There are also CTS tests that require this to work.
       */
      struct anv_fence_impl *impl =
         fence->temporary.type != ANV_FENCE_TYPE_NONE ?
         &fence->temporary : &fence->permanent;

      switch (impl->type) {
      case ANV_FENCE_TYPE_BO:
         result = anv_execbuf_add_bo(&execbuf, &impl->bo.bo, NULL,
                                     EXEC_OBJECT_WRITE, &device->alloc);
         if (result != VK_SUCCESS)
            return result;
         break;

      case ANV_FENCE_TYPE_SYNCOBJ:
         result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj,
                                          I915_EXEC_FENCE_SIGNAL,
                                          &device->alloc);
         if (result != VK_SUCCESS)
            return result;
         break;

      default:
         unreachable("Invalid fence type");
      }
   }

   if (cmd_buffer)
      result = setup_execbuf_for_cmd_buffer(&execbuf, cmd_buffer);
   else
      result = setup_empty_execbuf(&execbuf, device);

   if (result != VK_SUCCESS)
      return result;

   if (execbuf.fence_count > 0) {
      assert(device->instance->physicalDevice.has_syncobj);
      execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY;
      execbuf.execbuf.num_cliprects = execbuf.fence_count;
      execbuf.execbuf.cliprects_ptr = (uintptr_t) execbuf.fences;
   }

   if (in_fence != -1) {
      execbuf.execbuf.flags |= I915_EXEC_FENCE_IN;
      execbuf.execbuf.rsvd2 |= (uint32_t)in_fence;
   }

   if (need_out_fence)
      execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT;

   result = anv_device_execbuf(device, &execbuf.execbuf, execbuf.bos);

   /* Execbuf does not consume the in_fence.  It's our job to close it. */
   if (in_fence != -1)
      close(in_fence);

   for (uint32_t i = 0; i < num_in_semaphores; i++) {
      ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]);
      /* From the Vulkan 1.0.53 spec:
       *
       *    "If the import is temporary, the implementation must restore the
       *    semaphore to its prior permanent state after submitting the next
       *    semaphore wait operation."
       *
       * This has to happen after the execbuf in case we close any syncobjs
       * in the process.
       */
      anv_semaphore_reset_temporary(device, semaphore);
   }

   if (fence && fence->permanent.type == ANV_FENCE_TYPE_BO) {
      /* BO fences can't be shared, so they can't be temporary. */
      assert(fence->temporary.type == ANV_FENCE_TYPE_NONE);

      /* Once the execbuf has returned, we need to set the fence state to
       * SUBMITTED.  We can't do this before calling execbuf because
       * anv_GetFenceStatus does take the global device lock before checking
       * fence->state.
       *
       * We set the fence state to SUBMITTED regardless of whether or not
       * the execbuf succeeds because we need to ensure that
       * vkWaitForFences() and vkGetFenceStatus() return a valid result
       * (VK_ERROR_DEVICE_LOST or VK_SUCCESS) in a finite amount of time
       * even if execbuf fails.
       */
      fence->permanent.bo.state = ANV_BO_FENCE_STATE_SUBMITTED;
   }

   if (result == VK_SUCCESS && need_out_fence) {
      int out_fence = execbuf.execbuf.rsvd2 >> 32;
      for (uint32_t i = 0; i < num_out_semaphores; i++) {
         ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]);
         /* Out fences can't have temporary state because that would imply
          * that we imported a sync file and are trying to signal it.
          */
         assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE);
         struct anv_semaphore_impl *impl = &semaphore->permanent;

         if (impl->type == ANV_SEMAPHORE_TYPE_SYNC_FILE) {
            assert(impl->fd == -1);
            impl->fd = dup(out_fence);
         }
      }
      close(out_fence);
   }

   anv_execbuf_finish(&execbuf, &device->alloc);

   return result;
}