/*
 * Copyright 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
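   /* The slot layouts implied by the sizes computed above (one slot per
    * query, every value 64 bits, availability first):
    *
    *    occlusion:  { availability, begin depth count, end depth count }
    *    timestamp:  { availability, timestamp }
    *    statistics: { availability, { begin, end } per enabled statistic,
    *                  in increasing bit order of pipelineStatistics }
    *
    * The begin/end pairs are written by CmdBeginQuery/CmdEndQuery below and
    * every reported counter result is end - begin.
    */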
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and end value for every statistic */
      uint64s_per_slot += 2 * _mesa_bitcount(pipeline_statistics);
      break;
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
    * platforms, this does nothing. On non-LLC platforms, this means snooping
    * which comes at a slight cost. However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         device->lost = true;
         return vk_errorf(device->instance, device, VK_ERROR_DEVICE_LOST,
                          "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will. This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet. The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

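/* Results are written to pData in the layout the spec requires: for each
 * query we write the result values first (a single value for occlusion and
 * timestamp queries, one value per enabled statistic for pipeline statistics
 * queries), then an availability value if
 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.  Each element is 32 or 64
 * bits depending on VK_QUERY_RESULT_64_BIT, and consecutive queries are
 * separated by the caller-provided stride.
 */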
VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      if (write_results) {
         switch (pool->type) {
         case VK_QUERY_TYPE_OCCLUSION: {
            cpu_write_query_result(pData, flags, 0, slot[2] - slot[1]);
            break;
         }

         case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
            uint32_t statistics = pool->pipeline_statistics;
            uint32_t idx = 0;
            while (statistics) {
               uint32_t stat = u_bit_scan(&statistics);
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);

               idx++;
            }
            assert(idx == _mesa_bitcount(pool->pipeline_statistics));
            break;
         }

         case VK_QUERY_TYPE_TIMESTAMP: {
            cpu_write_query_result(pData, flags, 0, slot[1]);
            break;
         }
         default:
            unreachable("invalid pool type");
         }
      } else {
         status = VK_NOT_READY;
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;
         cpu_write_query_result(pData, flags, idx, available);
      }

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

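/* Emit a PIPE_CONTROL that stalls on depth and writes the current
 * pixel-shader depth count (the counter occlusion queries are based on) to
 * the given offset in the query BO.  Occlusion queries snapshot this counter
 * at begin and end and report the difference.
 */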
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) { bo, offset };
      pc.ImmediateData = 1;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool,
 * setting all element values to 0 and marking each query as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   const uint32_t num_elements = pool->stride / sizeof(uint64_t);

   for (uint32_t i = 0; i < num_queries; i++) {
      uint32_t slot_offset = (first_index + i) * pool->stride;
      for (uint32_t j = 1; j < num_elements; j++) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address.bo = &pool->bo;
            sdi.Address.offset = slot_offset + j * sizeof(uint64_t);
            sdi.ImmediateData = 0ull;
         }
      }
      emit_query_availability(cmd_buffer, &pool->bo, slot_offset);
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = (struct anv_address) {
            .bo = &pool->bo,
            .offset = (firstQuery + i) * pool->stride,
         };
         sdm.ImmediateData = 0;
      }
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

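/* The table above is indexed by the bit position of the corresponding
 * VkQueryPipelineStatisticFlagBits flag: bit 0 (input assembly vertices)
 * maps to IA_VERTICES_COUNT and bit 10 (compute shader invocations) maps to
 * CS_INVOCATION_COUNT.  The STATIC_ASSERT below keeps the table and
 * ANV_PIPELINE_STATISTICS_MASK in sync.  Each counter is a 64-bit register
 * that we store as two 32-bit halves.
 */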
static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_bo *bo, uint32_t offset)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   uint32_t reg = vk_pipeline_stat_to_reg[stat];

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 8);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 16);
      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }

      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as being already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         _mesa_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

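/* MMIO offset of the command streamer timestamp register; for top-of-pipe
 * timestamps it is read below in two 32-bit halves with
 * MI_STORE_REGISTER_MEM.
 */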
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   uint32_t offset = query * pool->stride;

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 8 };
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP + 4;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 12 };
      }
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = (struct anv_address) { &pool->bo, offset + 8 };

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, &pool->bo, offset);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as being already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         _mesa_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

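/* Everything below computes query results on the GPU with MI_MATH, which is
 * only available on Haswell and gen8+; older gens get the
 * CmdCopyQueryPoolResults stub at the end of the file instead.
 */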
#if GEN_GEN > 7 || GEN_IS_HASWELL

static uint32_t
mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .ALUOpcode = opcode,
      .Operand1 = operand1,
      .Operand2 = operand2,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}

#define CS_GPR(n) (0x2600 + (n) * 8)

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   assert(n < 64);
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      if (!dw) {
         anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
         return;
      }

      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
         dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
         dw[2] = mi_alu(MI_ALU_ADD, 0, 0);
         dw[3] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR are lost!
 */
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}
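
/* Taken together, the helpers above implement GPR0 >>= 2 without a native
 * shift: keep only bits [33:0] so the following shift cannot overflow, add
 * GPR0 to itself 30 times to shift left by 30 (placing bits [33:2] in the
 * upper dword), copy the upper dword into the lower dword, then clear the
 * upper dword.  GPR0 then holds the original value shifted right by two,
 * truncated to 32 bits.
 */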

static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_buffer *dst_buffer, uint32_t dst_offset,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT)
      dst_offset += value_index * 8;
   else
      dst_offset += value_index * 4;

   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = (struct anv_address) {
         .bo = dst_buffer->bo,
         .offset = dst_buffer->offset + dst_offset,
      };
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = reg + 4;
         srm.MemoryAddress = (struct anv_address) {
            .bo = dst_buffer->bo,
            .offset = dst_buffer->offset + dst_offset + 4,
         };
      }
   }
}

static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_bo *bo, uint32_t offset)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG1);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
   dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
}

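/* CmdCopyQueryPoolResults below computes each result on the GPU:
 * compute_query_result() subtracts the begin snapshot from the end snapshot
 * into a GPR, the HSW/BDW WaDividePSInvocationCountBy4 workaround divides by
 * four with shr_gpr0_by_2_bits(), and gpu_write_query_result() stores the
 * GPR into the destination buffer, optionally followed by the availability
 * word.
 */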
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
   }

   for (uint32_t i = 0; i < queryCount; i++) {
      slot_offset = (firstQuery + i) * pool->stride;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         uint32_t idx = 0;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, MI_ALU_REG0,
                                 &pool->bo, slot_offset + idx * 16 + 8);

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                   flags, idx, CS_GPR(0));

            idx++;
         }
         assert(idx == _mesa_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;

         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, idx, CS_GPR(0));
      }

      destOffset += destStride;
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif