/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together.  The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
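   /* For illustration (derived from the switch below), the resulting slot
    * layouts are:
    *
    *    Occlusion:           { available, begin, end }         3 x uint64_t
    *    Timestamp:           { available, timestamp }          2 x uint64_t
    *    Pipeline statistics: { available, (begin, end) x N }   (1 + 2N) x uint64_t
    *
    * where N is the number of statistics enabled in pipelineStatistics.
    */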
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and an end value for every statistic */
      uint64s_per_slot += 2 * _mesa_bitcount(pipeline_statistics);
      break;
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   /* For query pools, we set the caching mode to I915_CACHING_CACHED.  On LLC
    * platforms, this does nothing.  On non-LLC platforms, this means snooping
    * which comes at a slight cost.  However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}
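
/* Example (client side, illustrative only, not part of the driver): a typical
 * occlusion query pool would be created with something like
 *
 *    VkQueryPoolCreateInfo info = {
 *       .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
 *       .queryType = VK_QUERY_TYPE_OCCLUSION,
 *       .queryCount = 64,
 *    };
 *    VkQueryPool pool;
 *    vkCreateQueryPool(device, &info, NULL, &pool);
 *
 * which, given the 3 x uint64_t occlusion layout above, backs the pool with a
 * 64 * 24 byte BO.
 */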

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

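/* Poll the availability value of a single query slot.  The GPU sets it to 1
 * with a post-sync PIPE_CONTROL write (see emit_query_availability), so a
 * plain volatile read is sufficient on the CPU side.  Between polls we check
 * whether the pool's BO is still busy so that a GPU hang, or a query that was
 * never submitted, doesn't turn into an infinite spin.
 */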
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         device->lost = true;
         return vk_errorf(device->instance, device, VK_ERROR_DEVICE_LOST,
                          "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will.  This
             * can only happen if the client made an error and called
             * GetQueryPoolResults on a query that was never submitted to the
             * GPU.  The spec allows us to do anything in this case, but
             * returning VK_SUCCESS doesn't seem right and we shouldn't just
             * keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
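      /* To illustrate the layout this loop produces: with
       * VK_QUERY_RESULT_64_BIT and a pipeline statistics pool with three
       * counters enabled, each query writes three uint64_t results,
       * optionally followed by a fourth uint64_t of availability when
       * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, at consecutive
       * 'stride' offsets into pData.
       */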
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      if (write_results) {
         switch (pool->type) {
         case VK_QUERY_TYPE_OCCLUSION: {
            cpu_write_query_result(pData, flags, 0, slot[2] - slot[1]);
            break;
         }

         case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
            uint32_t statistics = pool->pipeline_statistics;
            uint32_t idx = 0;
            while (statistics) {
               uint32_t stat = u_bit_scan(&statistics);
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);

               idx++;
            }
            assert(idx == _mesa_bitcount(pool->pipeline_statistics));
            break;
         }

         case VK_QUERY_TYPE_TIMESTAMP: {
            cpu_write_query_result(pData, flags, 0, slot[1]);
            break;
         }
         default:
            unreachable("invalid pool type");
         }
      } else {
         status = VK_NOT_READY;
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;
         cpu_write_query_result(pData, flags, idx, available);
      }

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

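/* Write the current pixel-shader depth count (the counter backing occlusion
 * queries) to the given BO offset, using a depth-stalling PIPE_CONTROL with a
 * WritePSDepthCount post-sync operation.
 */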
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

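/* Mark a query slot as available by using a PIPE_CONTROL post-sync operation
 * to write an immediate 1 into the slot's availability value (the first
 * 64-bit value of the slot).
 */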
static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = (struct anv_address) { bo, offset };
      pc.ImmediateData           = 1;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool,
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   const uint32_t num_elements = pool->stride / sizeof(uint64_t);

   for (uint32_t i = 0; i < num_queries; i++) {
      uint32_t slot_offset = (first_index + i) * pool->stride;
      for (uint32_t j = 1; j < num_elements; j++) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address.bo = &pool->bo;
            sdi.Address.offset = slot_offset + j * sizeof(uint64_t);
            sdi.ImmediateData = 0ull;
         }
      }
      emit_query_availability(cmd_buffer, &pool->bo, slot_offset);
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = (struct anv_address) {
            .bo = &pool->bo,
            .offset = (firstQuery + i) * pool->stride,
         };
         sdm.ImmediateData = 0;
      }
   }
}

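/* Maps each VkQueryPipelineStatisticFlagBits bit position, in bit order, to
 * the MMIO address of the corresponding hardware statistics counter register.
 * ANV_PIPELINE_STATISTICS_MASK is expected to cover exactly these entries;
 * see the STATIC_ASSERT in emit_pipeline_stat below.
 */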
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_bo *bo, uint32_t offset)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   uint32_t reg = vk_pipeline_stat_to_reg[stat];

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg;
      lrm.MemoryAddress    = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg + 4;
      lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
   }
}

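/* CmdBeginQuery and CmdEndQuery write the raw counter snapshots into the
 * query slot: for occlusion queries the begin value lands at slot offset 8
 * and the end value at offset 16; for pipeline statistics each enabled
 * counter gets a (begin, end) pair of 64-bit values starting at offset 8.
 * Offset 0 is always the availability value, written by CmdEndQuery.
 */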
void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 8);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }
      break;
   }

   default:
      unreachable("invalid query type");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 16);
      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }

      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;
   }

   default:
      unreachable("invalid query type");
   }

   /* When multiview is active, the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as being already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         _mesa_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

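/* MMIO address of the command streamer's timestamp register; the top-of-pipe
 * path below reads it as two 32-bit halves with MI_STORE_REGISTER_MEM.
 */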
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   uint32_t offset = query * pool->stride;

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress  = TIMESTAMP;
         srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 8 };
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress  = TIMESTAMP + 4;
         srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 12 };
      }
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address = (struct anv_address) { &pool->bo, offset + 8 };

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, &pool->bo, offset);

   /* When multiview is active, the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as being already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         _mesa_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

#if GEN_GEN > 7 || GEN_IS_HASWELL
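
/* The result-copy path below relies on MI_MATH to compute end - begin on the
 * GPU, which is only available on Haswell and gen8+; older gens fall through
 * to the anv_finishme() stub at the bottom of the file.
 */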

static uint32_t
mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .ALUOpcode = opcode,
      .Operand1 = operand1,
      .Operand2 = operand2,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}

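/* MMIO offset of command streamer general purpose register n; each GPR is 64
 * bits wide and is used as scratch space by the MI_MATH sequences below.
 */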
#define CS_GPR(n) (0x2600 + (n) * 8)

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg;
      lrm.MemoryAddress    = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg + 4;
      lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset   = reg;
      lri.DataDWord        = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress      = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   assert(n < 64);
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
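   /* MI_MATH has no shift operation on these gens, so each one-bit left shift
    * is done as GPR0 = GPR0 + GPR0; the six chained LOAD/ADD/STORE groups in
    * each MI_MATH packet below therefore multiply by 2^6.
    */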
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      if (!dw) {
         anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
         return;
      }

      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
         dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
         dw[2] = mi_alu(MI_ALU_ADD, 0, 0);
         dw[3] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
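   /* There is no right-shift either: shl_gpr0_by_30_bits() leaves bits [33:2]
    * of the original value in the high dword of GPR0, so copying that high
    * dword into the low dword and then clearing the high dword yields
    * GPR0 >> 2 for any value that fits in 34 bits.
    */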
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}

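/* Copy a query result from an MMIO register pair into the destination buffer
 * at the requested value index, honoring the 32- vs 64-bit result layout
 * implied by VK_QUERY_RESULT_64_BIT.
 */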
static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_buffer *dst_buffer, uint32_t dst_offset,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT)
      dst_offset += value_index * 8;
   else
      dst_offset += value_index * 4;

   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress  = reg;
      srm.MemoryAddress    = (struct anv_address) {
         .bo = dst_buffer->bo,
         .offset = dst_buffer->offset + dst_offset,
      };
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress  = reg + 4;
         srm.MemoryAddress    = (struct anv_address) {
            .bo = dst_buffer->bo,
            .offset = dst_buffer->offset + dst_offset + 4,
         };
      }
   }
}

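/* Load the (begin, end) pair stored at 'offset' into GPR0/GPR1 and use
 * MI_MATH to leave end - begin in dst_reg.
 */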
static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_bo *bo, uint32_t offset)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG1);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
   dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard     = true;
      }
   }

   for (uint32_t i = 0; i < queryCount; i++) {
      slot_offset = (firstQuery + i) * pool->stride;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         uint32_t idx = 0;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, MI_ALU_REG0,
                                 &pool->bo, slot_offset + idx * 16 + 8);

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                   flags, idx, CS_GPR(0));

            idx++;
         }
         assert(idx == _mesa_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;

         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, idx, CS_GPR(0));
      }

      destOffset += destStride;
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif