/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 2012-2015 LunarG, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chia-I Wu <olv@lunarg.com>
 */

#include "ilo_debug.h"
#include "ilo_state_compute.h"

struct compute_urb_configuration {
   int idrt_entry_count;
   int curbe_entry_count;

   int urb_entry_count;
   /* in 256-bit register increments */
   int urb_entry_size;
};

static int
get_gen6_rob_entry_count(const struct ilo_dev *dev)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 60:
    *
    *     "ROB has 64KB of storage; 2048 entries."
    *
    * From the valid ranges of "CURBE Allocation Size", we can also conclude
    * that interface entries and CURBE data must be in ROB.  And that ROB
    * should be 16KB, or 512 entries, on Gen7 GT1.
    */
   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
      return 2048;
   else if (ilo_dev_gen(dev) >= ILO_GEN(7))
      return (dev->gt == 2) ? 2048 : 512;
   else
      return (dev->gt == 2) ? 2048 : 1024;
}
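/*
 * Note (added for clarity): each ROB/CURBE/URB entry counted in this file is
 * one 256-bit (32-byte) register row, so the 2048 entries quoted above
 * correspond to 64KB of ROB storage, and 512 entries to 16KB on Gen7 GT1.
 */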

static int
get_gen6_idrt_entry_count(const struct ilo_dev *dev)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 21:
    *
    *     "The first 32 URB entries are reserved for the interface
    *      descriptor..."
    *
    * From the Haswell PRM, volume 7, page 836:
    *
    *     "The first 64 URB entries are reserved for the interface
    *      description..."
    */
   return (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 64 : 32;
}

static int
get_gen6_curbe_entry_count(const struct ilo_dev *dev, uint32_t curbe_size)
{
   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 21:
    *
    *     "(CURBE Allocation Size) Specifies the total length allocated for
    *      CURBE, in 256-bit register increments."
    */
   const int entry_count = (curbe_size + 31) / 32;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(get_gen6_idrt_entry_count(dev) + entry_count <=
         get_gen6_rob_entry_count(dev));

   return entry_count;
}
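/*
 * Illustrative example (not in the original source): a curbe_size of 100
 * bytes maps to (100 + 31) / 32 = 4 CURBE entries, i.e. four 32-byte
 * register rows.
 */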

static bool
compute_get_gen6_urb_configuration(const struct ilo_dev *dev,
                                   const struct ilo_state_compute_info *info,
                                   struct compute_urb_configuration *urb)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   urb->idrt_entry_count = get_gen6_idrt_entry_count(dev);
   urb->curbe_entry_count =
      get_gen6_curbe_entry_count(dev, info->curbe_alloc_size);

   /*
    * From the Broadwell PRM, volume 2b, page 451:
    *
    *     "Please note that 0 is not allowed for this field (Number of URB
    *      Entries)."
    */
   urb->urb_entry_count = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 1 : 0;

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 52:
    *
    *     "(URB Entry Allocation Size) Specifies the length of each URB entry
    *      used by the unit, in 256-bit register increments - 1."
    */
   urb->urb_entry_size = 1;

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 22:
    *
    *     "MEDIA_VFE_STATE specifies the amount of CURBE space, the URB handle
    *      size and the number of URB handles. The driver must ensure that
    *      ((URB_handle_size * URB_num_handle) - CURBE - 32) <=
    *      URB_allocation_in_L3."
    */
   assert(urb->idrt_entry_count + urb->curbe_entry_count +
         urb->urb_entry_count * urb->urb_entry_size <=
         info->cv_urb_alloc_size / 32);

   return true;
}
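/*
 * Illustrative check (not in the original source): with a cv_urb_alloc_size
 * of 64KB there are 64 * 1024 / 32 = 2048 32-byte rows available, which must
 * cover the 32 or 64 IDRT entries, the CURBE entries, and the URB entries
 * reserved above.
 */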

static int
compute_interface_get_gen6_read_end(const struct ilo_dev *dev,
                                    const struct ilo_state_compute_interface_info *interface)
{
   const int per_thread_read = (interface->curbe_read_length + 31) / 32;
   const int cross_thread_read =
      (interface->cross_thread_curbe_read_length + 31) / 32;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(interface->curbe_read_offset % 32 == 0);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 60:
    *
    *     "(Constant URB Entry Read Length) [0,63]"
    */
   assert(per_thread_read <= 63);

   /*
    * From the Haswell PRM, volume 2d, page 199:
    *
    *     "(Cross-Thread Constant Data Read Length) [0,127]"
    */
   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
      assert(cross_thread_read <= 127);
   else
      assert(!cross_thread_read);

   if (per_thread_read || cross_thread_read) {
      return interface->curbe_read_offset / 32 + cross_thread_read +
         per_thread_read * interface->thread_group_size;
   } else {
      return 0;
   }
}
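/*
 * Illustrative example (not in the original source): with a curbe_read_offset
 * of 64 bytes, a cross_thread_curbe_read_length of 32 bytes, a
 * curbe_read_length of 96 bytes, and a thread_group_size of 8, the read end
 * is 64 / 32 + 1 + 3 * 8 = 27 CURBE entries.
 */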

static bool
compute_validate_gen6(const struct ilo_dev *dev,
                      const struct ilo_state_compute_info *info,
                      const struct compute_urb_configuration *urb)
{
   int min_curbe_entry_count;
   uint8_t i;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(info->interface_count <= urb->idrt_entry_count);

   min_curbe_entry_count = 0;
   for (i = 0; i < info->interface_count; i++) {
      const int read_end =
         compute_interface_get_gen6_read_end(dev, &info->interfaces[i]);

      if (min_curbe_entry_count < read_end)
         min_curbe_entry_count = read_end;
   }

   assert(min_curbe_entry_count <= urb->curbe_entry_count);

   /*
    * From the Broadwell PRM, volume 2b, page 452:
    *
    *     "CURBE Allocation Size should be 0 for GPGPU workloads that uses
    *      indirect instead of CURBE."
    */
   if (!min_curbe_entry_count)
      assert(!urb->curbe_entry_count);

   return true;
}

static uint32_t
compute_get_gen6_per_thread_scratch_size(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_info *info,
                                         uint8_t *per_thread_space)
{
   ILO_DEV_ASSERT(dev, 6, 7);

   /*
    * From the Sandy Bridge PRM, volume 2 part 2, page 30:
    *
    *     "(Per Thread Scratch Space)
    *      Range = [0,11] indicating [1k bytes, 12k bytes] [DevSNB]"
    */
   assert(info->per_thread_scratch_size <= 12 * 1024);

   if (!info->per_thread_scratch_size) {
      *per_thread_space = 0;
      return 0;
   }

   *per_thread_space = (info->per_thread_scratch_size > 1024) ?
      (info->per_thread_scratch_size - 1) / 1024 : 0;

   return 1024 * (1 + *per_thread_space);
}
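/*
 * Illustrative example (not in the original source): a
 * per_thread_scratch_size of 5000 bytes encodes as *per_thread_space =
 * (5000 - 1) / 1024 = 4, and the function returns 1024 * (1 + 4) = 5120
 * bytes per thread.
 */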

static uint32_t
compute_get_gen75_per_thread_scratch_size(const struct ilo_dev *dev,
                                          const struct ilo_state_compute_info *info,
                                          uint8_t *per_thread_space)
{
   ILO_DEV_ASSERT(dev, 7.5, 8);

   /*
    * From the Haswell PRM, volume 2b, page 407:
    *
    *     "(Per Thread Scratch Space)
    *      [0,10]  Indicating [2k bytes, 2 Mbytes]"
    *
    *     "Note: The scratch space should be declared as 2x the desired
    *      scratch space. The stack will start at the half-way point instead
    *      of the end. The upper half of scratch space will not be accessed
    *      and so does not have to be allocated in memory."
    *
    * From the Broadwell PRM, volume 2a, page 450:
    *
    *     "(Per Thread Scratch Space)
    *      [0,11]  indicating [1k bytes, 2 Mbytes]"
    */
   assert(info->per_thread_scratch_size <=
         ((ilo_dev_gen(dev) >= ILO_GEN(8)) ? 2 : 1) * 1024 * 1024);

   if (!info->per_thread_scratch_size) {
      *per_thread_space = 0;
      return 0;
   }

   /* next power of two, starting from 1KB */
   *per_thread_space = (info->per_thread_scratch_size > 1024) ?
      (util_last_bit(info->per_thread_scratch_size - 1) - 10) : 0;

   return 1 << (10 + *per_thread_space);
}
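/*
 * Illustrative example (not in the original source): a
 * per_thread_scratch_size of 5000 bytes gives util_last_bit(4999) = 13, so
 * *per_thread_space = 13 - 10 = 3 and the function returns 1 << 13 = 8192
 * bytes, the next power of two at or above the request (starting from 1KB).
 */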

static bool
compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
                                 const struct ilo_dev *dev,
                                 const struct ilo_state_compute_info *info)
{
   struct compute_urb_configuration urb;
   uint32_t per_thread_size;
   uint8_t per_thread_space;

   uint32_t dw1, dw2, dw4;

   ILO_DEV_ASSERT(dev, 6, 8);

   if (!compute_get_gen6_urb_configuration(dev, info, &urb) ||
       !compute_validate_gen6(dev, info, &urb))
      return false;

   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
      per_thread_size = compute_get_gen75_per_thread_scratch_size(dev,
            info, &per_thread_space);
   } else {
      per_thread_size = compute_get_gen6_per_thread_scratch_size(dev,
            info, &per_thread_space);
   }

   dw1 = per_thread_space << GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT;

   dw2 = (dev->thread_count - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT |
         urb.urb_entry_count << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT |
         GEN6_VFE_DW2_RESET_GATEWAY_TIMER |
         GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL;

   if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5))
      dw2 |= GEN7_VFE_DW2_GPGPU_MODE;

   assert(urb.urb_entry_size);

   dw4 = (urb.urb_entry_size - 1) << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT |
         urb.curbe_entry_count << GEN6_VFE_DW4_CURBE_SIZE__SHIFT;

   STATIC_ASSERT(ARRAY_SIZE(compute->vfe) >= 3);
   compute->vfe[0] = dw1;
   compute->vfe[1] = dw2;
   compute->vfe[2] = dw4;

   compute->scratch_size = per_thread_size * dev->thread_count;

   return true;
}
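/*
 * Note (added for clarity): vfe[0..2] hold the values packed for DW1, DW2,
 * and DW4 of MEDIA_VFE_STATE.  The total scratch allocation is the rounded
 * per-thread size times the device thread count; for example, a hypothetical
 * 8KB per thread on a 70-thread configuration would need 70 * 8KB = 560KB.
 */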

static uint8_t
compute_interface_get_gen6_sampler_count(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 6, 8);
   return (interface->sampler_count <= 12) ?
      (interface->sampler_count + 3) / 4 : 4;
}
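/*
 * Illustrative example (not in the original source): the field is encoded in
 * groups of four samplers, so a sampler_count of 5 is reported as
 * (5 + 3) / 4 = 2, and anything above 12 is clamped to 4.
 */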

static uint8_t
compute_interface_get_gen6_surface_count(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 6, 8);
   return (interface->surface_count <= 31) ? interface->surface_count : 31;
}

static uint8_t
compute_interface_get_gen7_slm_size(const struct ilo_dev *dev,
                                    const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 7, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 61:
    *
    *     "The amount is specified in 4k blocks, but only powers of 2 are
    *      allowed: 0, 4k, 8k, 16k, 32k and 64k per half-slice."
    */
   assert(interface->slm_size <= 64 * 1024);

   return util_next_power_of_two((interface->slm_size + 4095) / 4096);
}
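/*
 * Illustrative example (not in the original source): an slm_size of 5000
 * bytes rounds up to (5000 + 4095) / 4096 = 2 4KB blocks, and
 * util_next_power_of_two(2) = 2, i.e. 8KB of shared local memory.
 */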

static bool
compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_state_compute *compute,
                                           const struct ilo_dev *dev,
                                           const struct ilo_state_compute_info *info)
{
   uint8_t i;

   ILO_DEV_ASSERT(dev, 6, 8);

   for (i = 0; i < info->interface_count; i++) {
      const struct ilo_state_compute_interface_info *interface =
         &info->interfaces[i];
      uint16_t read_offset, per_thread_read_len, cross_thread_read_len;
      uint8_t sampler_count, surface_count;
      uint32_t dw0, dw2, dw3, dw4, dw5, dw6;

      assert(interface->kernel_offset % 64 == 0);
      assert(interface->thread_group_size);

      read_offset = interface->curbe_read_offset / 32;
      per_thread_read_len = (interface->curbe_read_length + 31) / 32;
      cross_thread_read_len =
         (interface->cross_thread_curbe_read_length + 31) / 32;

      sampler_count =
         compute_interface_get_gen6_sampler_count(dev, interface);
      surface_count =
         compute_interface_get_gen6_surface_count(dev, interface);

      dw0 = interface->kernel_offset;
      dw2 = sampler_count << GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT;
      dw3 = surface_count << GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT;
      dw4 = per_thread_read_len << GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT |
            read_offset << GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT;

      dw5 = 0;
      dw6 = 0;
      if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
         const uint8_t slm_size =
            compute_interface_get_gen7_slm_size(dev, interface);

         dw5 |= GEN7_IDRT_DW5_ROUNDING_MODE_RTNE;

         if (slm_size) {
            dw5 |= GEN7_IDRT_DW5_BARRIER_ENABLE |
                   slm_size << GEN7_IDRT_DW5_SLM_SIZE__SHIFT;
         }

         /*
          * From the Haswell PRM, volume 2d, page 199:
          *
          *     "(Number of Threads in GPGPU Thread Group) Specifies the
          *      number of threads that are in this thread group.  Used to
          *      program the barrier for the number of messages to expect. The
          *      minimum value is 0 (which will disable the barrier), while
          *      the maximum value is the number of threads in a subslice for
          *      local barriers."
          *
          * From the Broadwell PRM, volume 2d, page 183:
          *
          *     "(Number of Threads in GPGPU Thread Group) Specifies the
          *      number of threads that are in this thread group.  The minimum
          *      value is 1, while the maximum value is the number of threads
          *      in a subslice for local barriers. See vol1b Configurations
          *      for the number of threads per subslice for different
          *      products.  The maximum value for global barriers is limited
          *      by the number of threads in the system, or by 511, whichever
          *      is lower. This field should not be set to 0 even if the
          *      barrier is disabled, since an accurate value is needed for
          *      proper pre-emption."
          */
         if (slm_size || ilo_dev_gen(dev) >= ILO_GEN(8)) {
            dw5 |= interface->thread_group_size <<
               GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT;
         }

         if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
            dw6 |= cross_thread_read_len <<
               GEN75_IDRT_DW6_CROSS_THREAD_CURBE_READ_LEN__SHIFT;
         }
      }

      STATIC_ASSERT(ARRAY_SIZE(compute->idrt[i]) >= 6);
      compute->idrt[i][0] = dw0;
      compute->idrt[i][1] = dw2;
      compute->idrt[i][2] = dw3;
      compute->idrt[i][3] = dw4;
      compute->idrt[i][4] = dw5;
      compute->idrt[i][5] = dw6;
   }

   return true;
}

bool
ilo_state_compute_init(struct ilo_state_compute *compute,
                       const struct ilo_dev *dev,
                       const struct ilo_state_compute_info *info)
{
   bool ret = true;

   assert(ilo_is_zeroed(compute, sizeof(*compute)));
   assert(ilo_is_zeroed(info->data, info->data_size));

   assert(ilo_state_compute_data_size(dev, info->interface_count) <=
         info->data_size);
   compute->idrt = (uint32_t (*)[6]) info->data;

   ret &= compute_set_gen6_MEDIA_VFE_STATE(compute, dev, info);
   ret &= compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(compute, dev, info);

   assert(ret);

   return ret;
}
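/*
 * Illustrative usage sketch (added; not part of the original file).  Field
 * values are hypothetical, "dev" is assumed to be an initialized ilo_dev, and
 * info->data must be zeroed storage of at least
 * ilo_state_compute_data_size(dev, interface_count) bytes:
 *
 *    struct ilo_state_compute_interface_info iface = {
 *       .kernel_offset = 0,
 *       .thread_group_size = 8,
 *       .surface_count = 2,
 *    };
 *    uint32_t data[32] = { 0 };
 *    struct ilo_state_compute_info info = {
 *       .data = data,
 *       .data_size = sizeof(data),
 *       .interfaces = &iface,
 *       .interface_count = 1,
 *       .cv_urb_alloc_size = 64 * 1024,
 *    };
 *    struct ilo_state_compute compute = { 0 };
 *
 *    ilo_state_compute_init(&compute, dev, &info);
 */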