/*
 * Copyright 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "brw_context.h"
#include "brw_cs.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_shader.h"
#include "intel_mipmap_tree.h"
#include "intel_batchbuffer.h"
#include "brw_state.h"
#include "program/prog_statevars.h"
#include "compiler/glsl/ir_uniform.h"
#include "main/shaderapi.h"

static void
brw_upload_cs_state(struct brw_context *brw)
{
   if (!brw->cs.base.prog_data)
      return;

   uint32_t offset;
   uint32_t *desc = (uint32_t*) brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
                                                8 * 4, 64, &offset);
   struct brw_stage_state *stage_state = &brw->cs.base;
   struct brw_stage_prog_data *prog_data = stage_state->prog_data;
   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      brw_emit_buffer_surface_state(
         brw, &stage_state->surf_offset[
                 prog_data->binding_table.shader_time_start],
         brw->shader_time.bo, 0, BRW_SURFACEFORMAT_RAW,
         brw->shader_time.bo->size, 1, true);
   }

   uint32_t *bind = (uint32_t*) brw_state_batch(brw, AUB_TRACE_BINDING_TABLE,
                                                prog_data->binding_table.size_bytes,
                                                32, &stage_state->bind_bo_offset);

   uint32_t dwords = brw->gen < 8 ? 8 : 9;
   BEGIN_BATCH(dwords);
   OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));

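   /* A worked example of the per-thread scratch encodings used below,
    * assuming a hypothetical power-of-two allocation of
    * per_thread_scratch = 2048 bytes:
    *
    *    gen8+:    ffs(2048) - 11 = 12 - 11 = 1   (encoding 1 = 2kB)
    *    Haswell:  ffs(2048) - 12 = 12 - 12 = 0   (encoding 0 = 2kB)
    *    earlier:  2048 / 1024 - 1 = 1            (encoding 1 = 2kB)
    */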
   if (prog_data->total_scratch) {
      if (brw->gen >= 8) {
         /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
          * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
          */
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                     ffs(stage_state->per_thread_scratch) - 11);
      } else if (brw->is_haswell) {
         /* Haswell's Per Thread Scratch Space is in the range [0, 10]
          * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
          */
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   ffs(stage_state->per_thread_scratch) - 12);
      } else {
         /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
          * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
          */
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   stage_state->per_thread_scratch / 1024 - 1);
      }
   } else {
      OUT_BATCH(0);
      if (brw->gen >= 8)
         OUT_BATCH(0);
   }

   const uint32_t vfe_num_urb_entries = brw->gen >= 8 ? 2 : 0;
   const uint32_t vfe_gpgpu_mode =
      brw->gen == 7 ? SET_FIELD(1, GEN7_MEDIA_VFE_STATE_GPGPU_MODE) : 0;
   const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
   OUT_BATCH(SET_FIELD(devinfo->max_cs_threads * subslices - 1,
                       MEDIA_VFE_STATE_MAX_THREADS) |
             SET_FIELD(vfe_num_urb_entries, MEDIA_VFE_STATE_URB_ENTRIES) |
             SET_FIELD(1, MEDIA_VFE_STATE_RESET_GTW_TIMER) |
             SET_FIELD(1, MEDIA_VFE_STATE_BYPASS_GTW) |
             vfe_gpgpu_mode);

   OUT_BATCH(0);
   const uint32_t vfe_urb_allocation = brw->gen >= 8 ? 2 : 0;

   /* We are uploading duplicated copies of push constant uniforms for each
    * thread. Although the local id data needs to vary per thread, it won't
    * change for other uniform data. Unfortunately this duplication is
    * required for gen7. As of Haswell, this duplication can be avoided,
    * but this older mechanism with duplicated data continues to work.
    *
    * FINISHME: As of Haswell, we could make use of the
    * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
    * field to only store one copy of uniform data.
    *
    * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
    * which is described in the GPGPU_WALKER command and in the Broadwell
    * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
    * Operations => GPGPU Mode => Indirect Payload Storage.
    *
    * Note: The constant data is built in brw_upload_cs_push_constants
    * below.
    */
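   /* For illustration only (hypothetical sizes): with cross_thread.regs = 1,
    * per_thread.regs = 2 and threads = 8, the allocation below comes to
    * ALIGN(2 * 8 + 1, 2) = 18 registers of CURBE space: one shared register
    * followed by a two-register copy of the per-thread data for each of the
    * eight threads, padded to an even register count.
    */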
   const uint32_t vfe_curbe_allocation =
      ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
            cs_prog_data->push.cross_thread.regs, 2);
   OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC) |
             SET_FIELD(vfe_curbe_allocation, MEDIA_VFE_STATE_CURBE_ALLOC));
   OUT_BATCH(0);
   OUT_BATCH(0);
   OUT_BATCH(0);
   ADVANCE_BATCH();

   if (cs_prog_data->push.total.size > 0) {
      BEGIN_BATCH(4);
      OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2));
      OUT_BATCH(0);
      OUT_BATCH(ALIGN(cs_prog_data->push.total.size, 64));
      OUT_BATCH(stage_state->push_const_offset);
      ADVANCE_BATCH();
   }

   /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
   memcpy(bind, stage_state->surf_offset,
          prog_data->binding_table.size_bytes);

   memset(desc, 0, 8 * 4);

   int dw = 0;
   desc[dw++] = brw->cs.base.prog_offset;
   if (brw->gen >= 8)
      desc[dw++] = 0; /* Kernel Start Pointer High */
   desc[dw++] = 0;
   desc[dw++] = stage_state->sampler_offset |
      ((stage_state->sampler_count + 3) / 4);
   desc[dw++] = stage_state->bind_bo_offset;
   desc[dw++] = SET_FIELD(cs_prog_data->push.per_thread.regs,
                          MEDIA_CURBE_READ_LENGTH);
   const uint32_t media_threads =
      brw->gen >= 8 ?
      SET_FIELD(cs_prog_data->threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
      SET_FIELD(cs_prog_data->threads, MEDIA_GPGPU_THREAD_COUNT);
   assert(cs_prog_data->threads <= devinfo->max_cs_threads);

   const uint32_t slm_size =
      encode_slm_size(devinfo->gen, prog_data->total_shared);

   desc[dw++] =
      SET_FIELD(cs_prog_data->uses_barrier, MEDIA_BARRIER_ENABLE) |
      SET_FIELD(slm_size, MEDIA_SHARED_LOCAL_MEMORY_SIZE) |
      media_threads;

   desc[dw++] =
      SET_FIELD(cs_prog_data->push.cross_thread.regs, CROSS_THREAD_READ_LENGTH);

   BEGIN_BATCH(4);
   OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));
   OUT_BATCH(0);
   OUT_BATCH(8 * 4);
   OUT_BATCH(offset);
   ADVANCE_BATCH();
}

const struct brw_tracked_state brw_cs_state = {
   .dirty = {
      .mesa = _NEW_PROGRAM_CONSTANTS,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_CS_PROG_DATA |
             BRW_NEW_PUSH_CONSTANT_ALLOCATION |
             BRW_NEW_SAMPLER_STATE_TABLE |
             BRW_NEW_SURFACES,
   },
   .emit = brw_upload_cs_state
};


/**
 * Creates a region containing the push constants for the CS on gen7+.
 *
 * Push constants are constant values (such as GLSL uniforms) that are
 * pre-loaded into a shader stage's register space at thread spawn time.
 *
 * For other stages, see brw_curbe.c:brw_upload_constant_buffer for the
 * equivalent gen4/5 code and gen6_vs_state.c:gen6_upload_push_constants
 * for gen6+.
 */
static void
brw_upload_cs_push_constants(struct brw_context *brw,
                             const struct gl_program *prog,
                             const struct brw_cs_prog_data *cs_prog_data,
                             struct brw_stage_state *stage_state,
                             enum aub_state_struct_type type)
{
   struct gl_context *ctx = &brw->ctx;
   const struct brw_stage_prog_data *prog_data =
      (struct brw_stage_prog_data*) cs_prog_data;

   /* Updates the ParameterValues[i] pointers for all parameters of the
    * basic type of PROGRAM_STATE_VAR.
    */
   /* XXX: Should this happen somewhere before to get our state flag set? */
   _mesa_load_state_parameters(ctx, prog->Parameters);

   if (cs_prog_data->push.total.size == 0) {
      stage_state->push_const_size = 0;
      return;
   }

   gl_constant_value *param = (gl_constant_value*)
      brw_state_batch(brw, type, ALIGN(cs_prog_data->push.total.size, 64),
                      64, &stage_state->push_const_offset);
   assert(param);

   STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));

   if (cs_prog_data->push.cross_thread.size > 0) {
      gl_constant_value *param_copy = param;
      assert(cs_prog_data->thread_local_id_index < 0 ||
             cs_prog_data->thread_local_id_index >=
                cs_prog_data->push.cross_thread.dwords);
      for (unsigned i = 0;
           i < cs_prog_data->push.cross_thread.dwords;
           i++) {
         param_copy[i] = *prog_data->param[i];
      }
   }

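   /* The loop below lays out one copy of the per-thread uniforms for each
    * thread, patching in that thread's local ID (t * simd_size). For
    * illustration, with hypothetical values cross_thread.regs = 1,
    * per_thread.regs = 1, threads = 4 and simd_size = 16, the buffer is:
    *
    *    dwords  0..7   cross-thread data, shared by all threads
    *    dwords  8..15  thread 0's copy, local-ID slot = 0
    *    dwords 16..23  thread 1's copy, local-ID slot = 16
    *    dwords 24..31  thread 2's copy, local-ID slot = 32
    *    dwords 32..39  thread 3's copy, local-ID slot = 48
    */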
   gl_constant_value thread_id;
   if (cs_prog_data->push.per_thread.size > 0) {
      for (unsigned t = 0; t < cs_prog_data->threads; t++) {
         unsigned dst =
            8 * (cs_prog_data->push.per_thread.regs * t +
                 cs_prog_data->push.cross_thread.regs);
         unsigned src = cs_prog_data->push.cross_thread.dwords;
         for ( ; src < prog_data->nr_params; src++, dst++) {
            if (src != cs_prog_data->thread_local_id_index)
               param[dst] = *prog_data->param[src];
            else {
               thread_id.u = t * cs_prog_data->simd_size;
               param[dst] = thread_id;
            }
         }
      }
   }

   stage_state->push_const_size =
      cs_prog_data->push.cross_thread.regs +
      cs_prog_data->push.per_thread.regs;
}


static void
gen7_upload_cs_push_constants(struct brw_context *brw)
{
   struct brw_stage_state *stage_state = &brw->cs.base;

   /* BRW_NEW_COMPUTE_PROGRAM */
   const struct brw_program *cp = (struct brw_program *) brw->compute_program;

   if (cp) {
      /* BRW_NEW_CS_PROG_DATA */
      struct brw_cs_prog_data *cs_prog_data =
         brw_cs_prog_data(brw->cs.base.prog_data);

      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
      brw_upload_cs_push_constants(brw, &cp->program, cs_prog_data,
                                   stage_state, AUB_TRACE_WM_CONSTANTS);
   }
}

const struct brw_tracked_state gen7_cs_push_constants = {
   .dirty = {
      .mesa = _NEW_PROGRAM_CONSTANTS,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_COMPUTE_PROGRAM |
             BRW_NEW_CS_PROG_DATA |
             BRW_NEW_PUSH_CONSTANT_ALLOCATION,
   },
   .emit = gen7_upload_cs_push_constants,
};

/**
 * Creates a new CS constant buffer reflecting the current CS program's
 * constants, if needed by the CS program.
 */
static void
brw_upload_cs_pull_constants(struct brw_context *brw)
{
   struct brw_stage_state *stage_state = &brw->cs.base;

   /* BRW_NEW_COMPUTE_PROGRAM */
   struct brw_program *cp = (struct brw_program *) brw->compute_program;

   /* BRW_NEW_CS_PROG_DATA */
   const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data;

   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
   /* _NEW_PROGRAM_CONSTANTS */
   brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program,
                             stage_state, prog_data);
}

const struct brw_tracked_state brw_cs_pull_constants = {
   .dirty = {
      .mesa = _NEW_PROGRAM_CONSTANTS,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_COMPUTE_PROGRAM |
             BRW_NEW_CS_PROG_DATA,
   },
   .emit = brw_upload_cs_pull_constants,
};