1 /* 2 * Mesa 3-D graphics library 3 * 4 * Copyright (C) 2012-2015 LunarG, Inc. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included 14 * in all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 * DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: 25 * Chia-I Wu <olv (at) lunarg.com> 26 */ 27 28 #include "ilo_debug.h" 29 #include "ilo_state_compute.h" 30 31 struct compute_urb_configuration { 32 int idrt_entry_count; 33 int curbe_entry_count; 34 35 int urb_entry_count; 36 /* in 256-bit register increments */ 37 int urb_entry_size; 38 }; 39 40 static int 41 get_gen6_rob_entry_count(const struct ilo_dev *dev) 42 { 43 ILO_DEV_ASSERT(dev, 6, 8); 44 45 /* 46 * From the Ivy Bridge PRM, volume 2 part 2, page 60: 47 * 48 * "ROB has 64KB of storage; 2048 entries." 49 * 50 * From the valid ranges of "CURBE Allocation Size", we can also conclude 51 * that interface entries and CURBE data must be in ROB. And that ROB 52 * should be 16KB, or 512 entries, on Gen7 GT1. 53 */ 54 if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) 55 return 2048; 56 else if (ilo_dev_gen(dev) >= ILO_GEN(7)) 57 return (dev->gt == 2) ? 2048 : 512; 58 else 59 return (dev->gt == 2) ? 2048 : 1024; 60 } 61 62 static int 63 get_gen6_idrt_entry_count(const struct ilo_dev *dev) 64 { 65 ILO_DEV_ASSERT(dev, 6, 8); 66 67 /* 68 * From the Ivy Bridge PRM, volume 2 part 2, page 21: 69 * 70 * "The first 32 URB entries are reserved for the interface 71 * descriptor..." 72 * 73 * From the Haswell PRM, volume 7, page 836: 74 * 75 * "The first 64 URB entries are reserved for the interface 76 * description..." 77 */ 78 return (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 64 : 32; 79 } 80 81 static int 82 get_gen6_curbe_entry_count(const struct ilo_dev *dev, uint32_t curbe_size) 83 { 84 /* 85 * From the Ivy Bridge PRM, volume 2 part 2, page 21: 86 * 87 * "(CURBE Allocation Size) Specifies the total length allocated for 88 * CURBE, in 256-bit register increments. 89 */ 90 const int entry_count = (curbe_size + 31) / 32; 91 92 ILO_DEV_ASSERT(dev, 6, 8); 93 94 assert(get_gen6_idrt_entry_count(dev) + entry_count <= 95 get_gen6_rob_entry_count(dev)); 96 97 return entry_count; 98 } 99 100 static bool 101 compute_get_gen6_urb_configuration(const struct ilo_dev *dev, 102 const struct ilo_state_compute_info *info, 103 struct compute_urb_configuration *urb) 104 { 105 ILO_DEV_ASSERT(dev, 6, 8); 106 107 urb->idrt_entry_count = get_gen6_idrt_entry_count(dev); 108 urb->curbe_entry_count = 109 get_gen6_curbe_entry_count(dev, info->curbe_alloc_size); 110 111 /* 112 * From the Broadwell PRM, volume 2b, page 451: 113 * 114 * "Please note that 0 is not allowed for this field (Number of URB 115 * Entries)." 116 */ 117 urb->urb_entry_count = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 1 : 0; 118 119 /* 120 * From the Ivy Bridge PRM, volume 2 part 2, page 52: 121 * 122 * "(URB Entry Allocation Size) Specifies the length of each URB entry 123 * used by the unit, in 256-bit register increments - 1." 124 */ 125 urb->urb_entry_size = 1; 126 127 /* 128 * From the Ivy Bridge PRM, volume 2 part 2, page 22: 129 * 130 * MEDIA_VFE_STATE specifies the amount of CURBE space, the URB handle 131 * size and the number of URB handles. The driver must ensure that 132 * ((URB_handle_size * URB_num_handle) - CURBE - 32) <= 133 * URB_allocation_in_L3." 134 */ 135 assert(urb->idrt_entry_count + urb->curbe_entry_count + 136 urb->urb_entry_count * urb->urb_entry_size <= 137 info->cv_urb_alloc_size / 32); 138 139 return true; 140 } 141 142 static int 143 compute_interface_get_gen6_read_end(const struct ilo_dev *dev, 144 const struct ilo_state_compute_interface_info *interface) 145 { 146 const int per_thread_read = (interface->curbe_read_length + 31) / 32; 147 const int cross_thread_read = 148 (interface->cross_thread_curbe_read_length + 31) / 32; 149 150 ILO_DEV_ASSERT(dev, 6, 8); 151 152 assert(interface->curbe_read_offset % 32 == 0); 153 154 /* 155 * From the Ivy Bridge PRM, volume 2 part 2, page 60: 156 * 157 * "(Constant URB Entry Read Length) [0,63]" 158 */ 159 assert(per_thread_read <= 63); 160 161 /* 162 * From the Haswell PRM, volume 2d, page 199: 163 * 164 * "(Cross-Thread Constant Data Read Length) [0,127]" 165 */ 166 if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) 167 assert(cross_thread_read <= 127); 168 else 169 assert(!cross_thread_read); 170 171 if (per_thread_read || cross_thread_read) { 172 return interface->curbe_read_offset / 32 + cross_thread_read + 173 per_thread_read * interface->thread_group_size; 174 } else { 175 return 0; 176 } 177 } 178 179 static bool 180 compute_validate_gen6(const struct ilo_dev *dev, 181 const struct ilo_state_compute_info *info, 182 const struct compute_urb_configuration *urb) 183 { 184 int min_curbe_entry_count; 185 uint8_t i; 186 187 ILO_DEV_ASSERT(dev, 6, 8); 188 189 assert(info->interface_count <= urb->idrt_entry_count); 190 191 min_curbe_entry_count = 0; 192 for (i = 0; i < info->interface_count; i++) { 193 const int read_end = 194 compute_interface_get_gen6_read_end(dev, &info->interfaces[i]); 195 196 if (min_curbe_entry_count < read_end) 197 min_curbe_entry_count = read_end; 198 } 199 200 assert(min_curbe_entry_count <= urb->curbe_entry_count); 201 202 /* 203 * From the Broadwell PRM, volume 2b, page 452: 204 * 205 * "CURBE Allocation Size should be 0 for GPGPU workloads that uses 206 * indirect instead of CURBE." 207 */ 208 if (!min_curbe_entry_count) 209 assert(!urb->curbe_entry_count); 210 211 return true; 212 } 213 214 static uint32_t 215 compute_get_gen6_per_thread_scratch_size(const struct ilo_dev *dev, 216 const struct ilo_state_compute_info *info, 217 uint8_t *per_thread_space) 218 { 219 ILO_DEV_ASSERT(dev, 6, 7); 220 221 /* 222 * From the Sandy Bridge PRM, volume 2 part 2, page 30: 223 * 224 * "(Per Thread Scratch Space) 225 * Range = [0,11] indicating [1k bytes, 12k bytes] [DevSNB]" 226 */ 227 assert(info->per_thread_scratch_size <= 12 * 1024); 228 229 if (!info->per_thread_scratch_size) { 230 *per_thread_space = 0; 231 return 0; 232 } 233 234 *per_thread_space = (info->per_thread_scratch_size > 1024) ? 235 (info->per_thread_scratch_size - 1) / 1024 : 0; 236 237 return 1024 * (1 + *per_thread_space); 238 } 239 240 static uint32_t 241 compute_get_gen75_per_thread_scratch_size(const struct ilo_dev *dev, 242 const struct ilo_state_compute_info *info, 243 uint8_t *per_thread_space) 244 { 245 ILO_DEV_ASSERT(dev, 7.5, 8); 246 247 /* 248 * From the Haswell PRM, volume 2b, page 407: 249 * 250 * "(Per Thread Scratch Space) 251 * [0,10] Indicating [2k bytes, 2 Mbytes]" 252 * 253 * "Note: The scratch space should be declared as 2x the desired 254 * scratch space. The stack will start at the half-way point instead 255 * of the end. The upper half of scratch space will not be accessed 256 * and so does not have to be allocated in memory." 257 * 258 * From the Broadwell PRM, volume 2a, page 450: 259 * 260 * "(Per Thread Scratch Space) 261 * [0,11] indicating [1k bytes, 2 Mbytes]" 262 */ 263 assert(info->per_thread_scratch_size <= 264 ((ilo_dev_gen(dev) >= ILO_GEN(8)) ? 2 : 1) * 1024 * 1024); 265 266 if (!info->per_thread_scratch_size) { 267 *per_thread_space = 0; 268 return 0; 269 } 270 271 /* next power of two, starting from 1KB */ 272 *per_thread_space = (info->per_thread_scratch_size > 1024) ? 273 (util_last_bit(info->per_thread_scratch_size - 1) - 10) : 0; 274 275 return 1 << (10 + *per_thread_space); 276 } 277 278 static bool 279 compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute, 280 const struct ilo_dev *dev, 281 const struct ilo_state_compute_info *info) 282 { 283 struct compute_urb_configuration urb; 284 uint32_t per_thread_size; 285 uint8_t per_thread_space; 286 287 uint32_t dw1, dw2, dw4; 288 289 ILO_DEV_ASSERT(dev, 6, 8); 290 291 if (!compute_get_gen6_urb_configuration(dev, info, &urb) || 292 !compute_validate_gen6(dev, info, &urb)) 293 return false; 294 295 if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) { 296 per_thread_size = compute_get_gen75_per_thread_scratch_size(dev, 297 info, &per_thread_space); 298 } else { 299 per_thread_size = compute_get_gen6_per_thread_scratch_size(dev, 300 info, &per_thread_space); 301 } 302 303 dw1 = per_thread_space << GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT; 304 305 dw2 = (dev->thread_count - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT | 306 urb.urb_entry_count << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT | 307 GEN6_VFE_DW2_RESET_GATEWAY_TIMER | 308 GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL; 309 310 if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5)) 311 dw2 |= GEN7_VFE_DW2_GPGPU_MODE; 312 313 assert(urb.urb_entry_size); 314 315 dw4 = (urb.urb_entry_size - 1) << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT | 316 urb.curbe_entry_count << GEN6_VFE_DW4_CURBE_SIZE__SHIFT; 317 318 STATIC_ASSERT(ARRAY_SIZE(compute->vfe) >= 3); 319 compute->vfe[0] = dw1; 320 compute->vfe[1] = dw2; 321 compute->vfe[2] = dw4; 322 323 compute->scratch_size = per_thread_size * dev->thread_count; 324 325 return true; 326 } 327 328 static uint8_t 329 compute_interface_get_gen6_sampler_count(const struct ilo_dev *dev, 330 const struct ilo_state_compute_interface_info *interface) 331 { 332 ILO_DEV_ASSERT(dev, 6, 8); 333 return (interface->sampler_count <= 12) ? 334 (interface->sampler_count + 3) / 4 : 4; 335 } 336 337 static uint8_t 338 compute_interface_get_gen6_surface_count(const struct ilo_dev *dev, 339 const struct ilo_state_compute_interface_info *interface) 340 { 341 ILO_DEV_ASSERT(dev, 6, 8); 342 return (interface->surface_count <= 31) ? interface->surface_count : 31; 343 } 344 345 static uint8_t 346 compute_interface_get_gen7_slm_size(const struct ilo_dev *dev, 347 const struct ilo_state_compute_interface_info *interface) 348 { 349 ILO_DEV_ASSERT(dev, 7, 8); 350 351 /* 352 * From the Ivy Bridge PRM, volume 2 part 2, page 61: 353 * 354 * "The amount is specified in 4k blocks, but only powers of 2 are 355 * allowed: 0, 4k, 8k, 16k, 32k and 64k per half-slice." 356 */ 357 assert(interface->slm_size <= 64 * 1024); 358 359 return util_next_power_of_two((interface->slm_size + 4095) / 4096); 360 } 361 362 static bool 363 compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_state_compute *compute, 364 const struct ilo_dev *dev, 365 const struct ilo_state_compute_info *info) 366 { 367 uint8_t i; 368 369 ILO_DEV_ASSERT(dev, 6, 8); 370 371 for (i = 0; i < info->interface_count; i++) { 372 const struct ilo_state_compute_interface_info *interface = 373 &info->interfaces[i]; 374 uint16_t read_offset, per_thread_read_len, cross_thread_read_len; 375 uint8_t sampler_count, surface_count; 376 uint32_t dw0, dw2, dw3, dw4, dw5, dw6; 377 378 assert(interface->kernel_offset % 64 == 0); 379 assert(interface->thread_group_size); 380 381 read_offset = interface->curbe_read_offset / 32; 382 per_thread_read_len = (interface->curbe_read_length + 31) / 32; 383 cross_thread_read_len = 384 (interface->cross_thread_curbe_read_length + 31) / 32; 385 386 sampler_count = 387 compute_interface_get_gen6_sampler_count(dev, interface); 388 surface_count = 389 compute_interface_get_gen6_surface_count(dev, interface); 390 391 dw0 = interface->kernel_offset; 392 dw2 = sampler_count << GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT; 393 dw3 = surface_count << GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT; 394 dw4 = per_thread_read_len << GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT | 395 read_offset << GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT; 396 397 dw5 = 0; 398 dw6 = 0; 399 if (ilo_dev_gen(dev) >= ILO_GEN(7)) { 400 const uint8_t slm_size = 401 compute_interface_get_gen7_slm_size(dev, interface); 402 403 dw5 |= GEN7_IDRT_DW5_ROUNDING_MODE_RTNE; 404 405 if (slm_size) { 406 dw5 |= GEN7_IDRT_DW5_BARRIER_ENABLE | 407 slm_size << GEN7_IDRT_DW5_SLM_SIZE__SHIFT; 408 } 409 410 /* 411 * From the Haswell PRM, volume 2d, page 199: 412 * 413 * "(Number of Threads in GPGPU Thread Group) Specifies the 414 * number of threads that are in this thread group. Used to 415 * program the barrier for the number of messages to expect. The 416 * minimum value is 0 (which will disable the barrier), while 417 * the maximum value is the number of threads in a subslice for 418 * local barriers." 419 * 420 * From the Broadwell PRM, volume 2d, page 183: 421 * 422 * "(Number of Threads in GPGPU Thread Group) Specifies the 423 * number of threads that are in this thread group. The minimum 424 * value is 1, while the maximum value is the number of threads 425 * in a subslice for local barriers. See vol1b Configurations 426 * for the number of threads per subslice for different 427 * products. The maximum value for global barriers is limited 428 * by the number of threads in the system, or by 511, whichever 429 * is lower. This field should not be set to 0 even if the 430 * barrier is disabled, since an accurate value is needed for 431 * proper pre-emption." 432 */ 433 if (slm_size || ilo_dev_gen(dev) >= ILO_GEN(8)) { 434 dw5 |= interface->thread_group_size << 435 GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT; 436 } 437 438 if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) { 439 dw6 |= cross_thread_read_len << 440 GEN75_IDRT_DW6_CROSS_THREAD_CURBE_READ_LEN__SHIFT; 441 } 442 } 443 444 STATIC_ASSERT(ARRAY_SIZE(compute->idrt[i]) >= 6); 445 compute->idrt[i][0] = dw0; 446 compute->idrt[i][1] = dw2; 447 compute->idrt[i][2] = dw3; 448 compute->idrt[i][3] = dw4; 449 compute->idrt[i][4] = dw5; 450 compute->idrt[i][5] = dw6; 451 } 452 453 return true; 454 } 455 456 bool 457 ilo_state_compute_init(struct ilo_state_compute *compute, 458 const struct ilo_dev *dev, 459 const struct ilo_state_compute_info *info) 460 { 461 bool ret = true; 462 463 assert(ilo_is_zeroed(compute, sizeof(*compute))); 464 assert(ilo_is_zeroed(info->data, info->data_size)); 465 466 assert(ilo_state_compute_data_size(dev, info->interface_count) <= 467 info->data_size); 468 compute->idrt = (uint32_t (*)[6]) info->data; 469 470 ret &= compute_set_gen6_MEDIA_VFE_STATE(compute, dev, info); 471 ret &= compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(compute, dev, info); 472 473 assert(ret); 474 475 return ret; 476 } 477