1 /* 2 * Copyright 2012 Nouveau Project 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 * Authors: Christoph Bumiller 23 */ 24 25 #include "nvc0/nvc0_context.h" 26 #include "nvc0/nve4_compute.h" 27 28 #include "codegen/nv50_ir_driver.h" 29 30 #ifdef DEBUG 31 static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *); 32 #endif 33 34 35 int 36 nve4_screen_compute_setup(struct nvc0_screen *screen, 37 struct nouveau_pushbuf *push) 38 { 39 struct nouveau_device *dev = screen->base.device; 40 struct nouveau_object *chan = screen->base.channel; 41 int i; 42 int ret; 43 uint32_t obj_class; 44 uint64_t address; 45 46 switch (dev->chipset & ~0xf) { 47 case 0x100: 48 case 0xf0: 49 obj_class = NVF0_COMPUTE_CLASS; /* GK110 */ 50 break; 51 case 0xe0: 52 obj_class = NVE4_COMPUTE_CLASS; /* GK104 */ 53 break; 54 case 0x110: 55 obj_class = GM107_COMPUTE_CLASS; 56 break; 57 case 0x120: 58 obj_class = GM200_COMPUTE_CLASS; 59 break; 60 default: 61 NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset); 62 return -1; 63 } 64 65 ret = nouveau_object_new(chan, 0xbeef00c0, obj_class, NULL, 0, 66 &screen->compute); 67 if (ret) { 68 NOUVEAU_ERR("Failed to allocate compute object: %d\n", ret); 69 return ret; 70 } 71 72 BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1); 73 PUSH_DATA (push, screen->compute->oclass); 74 75 BEGIN_NVC0(push, NVE4_CP(TEMP_ADDRESS_HIGH), 2); 76 PUSH_DATAh(push, screen->tls->offset); 77 PUSH_DATA (push, screen->tls->offset); 78 /* No idea why there are 2. Divide size by 2 to be safe. 79 * Actually this might be per-MP TEMP size and looks like I'm only using 80 * 2 MPs instead of all 8. 81 */ 82 BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(0)), 3); 83 PUSH_DATAh(push, screen->tls->size / screen->mp_count); 84 PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff); 85 PUSH_DATA (push, 0xff); 86 BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3); 87 PUSH_DATAh(push, screen->tls->size / screen->mp_count); 88 PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff); 89 PUSH_DATA (push, 0xff); 90 91 /* Unified address space ? Who needs that ? Certainly not OpenCL. 92 * 93 * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be 94 * accessible. We cannot prevent that at the moment, so expect failure. 95 */ 96 BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1); 97 PUSH_DATA (push, 0xff << 24); 98 BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1); 99 PUSH_DATA (push, 0xfe << 24); 100 101 BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2); 102 PUSH_DATAh(push, screen->text->offset); 103 PUSH_DATA (push, screen->text->offset); 104 105 BEGIN_NVC0(push, SUBC_CP(0x0310), 1); 106 PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300); 107 108 /* NOTE: these do not affect the state used by the 3D object */ 109 BEGIN_NVC0(push, NVE4_CP(TIC_ADDRESS_HIGH), 3); 110 PUSH_DATAh(push, screen->txc->offset); 111 PUSH_DATA (push, screen->txc->offset); 112 PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1); 113 BEGIN_NVC0(push, NVE4_CP(TSC_ADDRESS_HIGH), 3); 114 PUSH_DATAh(push, screen->txc->offset + 65536); 115 PUSH_DATA (push, screen->txc->offset + 65536); 116 PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1); 117 118 if (obj_class >= NVF0_COMPUTE_CLASS) { 119 /* The blob calls GK110_COMPUTE.FIRMWARE[0x6], along with the args (0x1) 120 * passed with GK110_COMPUTE.GRAPH.SCRATCH[0x2]. This is currently 121 * disabled because our firmware doesn't support these commands and the 122 * GPU hangs if they are used. */ 123 BEGIN_NIC0(push, SUBC_CP(0x0248), 64); 124 for (i = 63; i >= 0; i--) 125 PUSH_DATA(push, 0x38000 | i); 126 IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0); 127 } 128 129 BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1); 130 PUSH_DATA (push, 7); /* does not interfere with 3D */ 131 132 /* Disabling this UNK command avoid a read fault when using texelFetch() 133 * from a compute shader for weird reasons. 134 if (obj_class == NVF0_COMPUTE_CLASS) 135 IMMED_NVC0(push, SUBC_CP(0x02c4), 1); 136 */ 137 138 address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5); 139 140 /* MS sample coordinate offsets: these do not work with _ALT modes ! */ 141 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 142 PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO); 143 PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO); 144 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 145 PUSH_DATA (push, 64); 146 PUSH_DATA (push, 1); 147 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 17); 148 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 149 PUSH_DATA (push, 0); /* 0 */ 150 PUSH_DATA (push, 0); 151 PUSH_DATA (push, 1); /* 1 */ 152 PUSH_DATA (push, 0); 153 PUSH_DATA (push, 0); /* 2 */ 154 PUSH_DATA (push, 1); 155 PUSH_DATA (push, 1); /* 3 */ 156 PUSH_DATA (push, 1); 157 PUSH_DATA (push, 2); /* 4 */ 158 PUSH_DATA (push, 0); 159 PUSH_DATA (push, 3); /* 5 */ 160 PUSH_DATA (push, 0); 161 PUSH_DATA (push, 2); /* 6 */ 162 PUSH_DATA (push, 1); 163 PUSH_DATA (push, 3); /* 7 */ 164 PUSH_DATA (push, 1); 165 166 #ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER 167 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 168 PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR); 169 PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR); 170 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 171 PUSH_DATA (push, 28); 172 PUSH_DATA (push, 1); 173 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 8); 174 PUSH_DATA (push, 1); 175 PUSH_DATA (push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO); 176 PUSH_DATAh(push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO); 177 PUSH_DATA (push, screen->tls->offset); 178 PUSH_DATAh(push, screen->tls->offset); 179 PUSH_DATA (push, screen->tls->size / 2); /* MP TEMP block size */ 180 PUSH_DATA (push, screen->tls->size / 2 / 64); /* warp TEMP block size */ 181 PUSH_DATA (push, 0); /* warp cfstack size */ 182 #endif 183 184 BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); 185 PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); 186 187 return 0; 188 } 189 190 static void 191 gm107_compute_validate_surfaces(struct nvc0_context *nvc0, 192 struct pipe_image_view *view, int slot) 193 { 194 struct nv04_resource *res = nv04_resource(view->resource); 195 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 196 struct nvc0_screen *screen = nvc0->screen; 197 struct nouveau_bo *txc = nvc0->screen->txc; 198 struct nv50_tic_entry *tic; 199 uint64_t address; 200 const int s = 5; 201 202 tic = nv50_tic_entry(nvc0->images_tic[s][slot]); 203 204 res = nv04_resource(tic->pipe.texture); 205 nvc0_update_tic(nvc0, tic, res); 206 207 if (tic->id < 0) { 208 tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic); 209 210 /* upload the texture view */ 211 PUSH_SPACE(push, 16); 212 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 213 PUSH_DATAh(push, txc->offset + (tic->id * 32)); 214 PUSH_DATA (push, txc->offset + (tic->id * 32)); 215 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 216 PUSH_DATA (push, 32); 217 PUSH_DATA (push, 1); 218 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9); 219 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 220 PUSH_DATAp(push, &tic->tic[0], 8); 221 222 BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), 1); 223 PUSH_DATA (push, (tic->id << 4) | 1); 224 } else 225 if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { 226 BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), 1); 227 PUSH_DATA (push, (tic->id << 4) | 1); 228 } 229 nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32); 230 231 res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; 232 res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING; 233 234 BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD); 235 236 address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); 237 238 /* upload the texture handle */ 239 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 240 PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(slot + 32)); 241 PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(slot + 32)); 242 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 243 PUSH_DATA (push, 4); 244 PUSH_DATA (push, 0x1); 245 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 2); 246 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 247 PUSH_DATA (push, tic->id); 248 249 BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); 250 PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); 251 } 252 253 static void 254 nve4_compute_validate_surfaces(struct nvc0_context *nvc0) 255 { 256 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 257 uint64_t address; 258 const int s = 5; 259 int i, j; 260 261 if (!nvc0->images_dirty[s]) 262 return; 263 264 address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); 265 266 for (i = 0; i < NVC0_MAX_IMAGES; ++i) { 267 struct pipe_image_view *view = &nvc0->images[s][i]; 268 269 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 270 PUSH_DATAh(push, address + NVC0_CB_AUX_SU_INFO(i)); 271 PUSH_DATA (push, address + NVC0_CB_AUX_SU_INFO(i)); 272 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 273 PUSH_DATA (push, 16 * 4); 274 PUSH_DATA (push, 0x1); 275 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 16); 276 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 277 278 if (view->resource) { 279 struct nv04_resource *res = nv04_resource(view->resource); 280 281 if (res->base.target == PIPE_BUFFER) { 282 if (view->access & PIPE_IMAGE_ACCESS_WRITE) 283 nvc0_mark_image_range_valid(view); 284 } 285 286 nve4_set_surface_info(push, view, nvc0); 287 BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR); 288 289 if (nvc0->screen->base.class_3d >= GM107_3D_CLASS) 290 gm107_compute_validate_surfaces(nvc0, view, i); 291 } else { 292 for (j = 0; j < 16; j++) 293 PUSH_DATA(push, 0); 294 } 295 } 296 } 297 298 /* Thankfully, textures with samplers follow the normal rules. */ 299 static void 300 nve4_compute_validate_samplers(struct nvc0_context *nvc0) 301 { 302 bool need_flush = nve4_validate_tsc(nvc0, 5); 303 if (need_flush) { 304 BEGIN_NVC0(nvc0->base.pushbuf, NVE4_CP(TSC_FLUSH), 1); 305 PUSH_DATA (nvc0->base.pushbuf, 0); 306 } 307 308 /* Invalidate all 3D samplers because they are aliased. */ 309 for (int s = 0; s < 5; s++) 310 nvc0->samplers_dirty[s] = ~0; 311 nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS; 312 } 313 314 /* (Code duplicated at bottom for various non-convincing reasons. 315 * E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC 316 * entries to avoid a subchannel switch. 317 * Same for texture cache flushes. 318 * Also, the bufctx differs, and more IFs in the 3D version looks ugly.) 319 */ 320 static void nve4_compute_validate_textures(struct nvc0_context *); 321 322 static void 323 nve4_compute_set_tex_handles(struct nvc0_context *nvc0) 324 { 325 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 326 struct nvc0_screen *screen = nvc0->screen; 327 uint64_t address; 328 const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE); 329 unsigned i, n; 330 uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s]; 331 332 if (!dirty) 333 return; 334 i = ffs(dirty) - 1; 335 n = util_logbase2(dirty) + 1 - i; 336 assert(n); 337 338 address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); 339 340 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 341 PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i)); 342 PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i)); 343 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 344 PUSH_DATA (push, n * 4); 345 PUSH_DATA (push, 0x1); 346 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + n); 347 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 348 PUSH_DATAp(push, &nvc0->tex_handles[s][i], n); 349 350 BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); 351 PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); 352 353 nvc0->textures_dirty[s] = 0; 354 nvc0->samplers_dirty[s] = 0; 355 } 356 357 static void 358 nve4_compute_validate_constbufs(struct nvc0_context *nvc0) 359 { 360 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 361 const int s = 5; 362 363 while (nvc0->constbuf_dirty[s]) { 364 int i = ffs(nvc0->constbuf_dirty[s]) - 1; 365 nvc0->constbuf_dirty[s] &= ~(1 << i); 366 367 if (nvc0->constbuf[s][i].user) { 368 struct nouveau_bo *bo = nvc0->screen->uniform_bo; 369 const unsigned base = NVC0_CB_USR_INFO(s); 370 const unsigned size = nvc0->constbuf[s][0].size; 371 assert(i == 0); /* we really only want OpenGL uniforms here */ 372 assert(nvc0->constbuf[s][0].u.data); 373 374 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 375 PUSH_DATAh(push, bo->offset + base); 376 PUSH_DATA (push, bo->offset + base); 377 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 378 PUSH_DATA (push, size); 379 PUSH_DATA (push, 0x1); 380 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4)); 381 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 382 PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4); 383 } 384 else { 385 struct nv04_resource *res = 386 nv04_resource(nvc0->constbuf[s][i].u.buf); 387 if (res) { 388 uint64_t address 389 = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); 390 391 assert(i > 0); /* we really only want uniform buffer objects */ 392 393 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 394 PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1)); 395 PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1)); 396 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 397 PUSH_DATA (push, 4 * 4); 398 PUSH_DATA (push, 0x1); 399 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4); 400 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 401 402 PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset); 403 PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset); 404 PUSH_DATA (push, nvc0->constbuf[5][i].size); 405 PUSH_DATA (push, 0); 406 BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD); 407 408 res->cb_bindings[s] |= 1 << i; 409 } 410 } 411 } 412 413 BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); 414 PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); 415 } 416 417 static void 418 nve4_compute_validate_buffers(struct nvc0_context *nvc0) 419 { 420 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 421 uint64_t address; 422 const int s = 5; 423 int i; 424 425 address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); 426 427 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 428 PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0)); 429 PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0)); 430 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 431 PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4); 432 PUSH_DATA (push, 0x1); 433 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS); 434 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 435 436 for (i = 0; i < NVC0_MAX_BUFFERS; i++) { 437 if (nvc0->buffers[s][i].buffer) { 438 struct nv04_resource *res = 439 nv04_resource(nvc0->buffers[s][i].buffer); 440 PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset); 441 PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset); 442 PUSH_DATA (push, nvc0->buffers[s][i].buffer_size); 443 PUSH_DATA (push, 0); 444 BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR); 445 util_range_add(&res->valid_buffer_range, 446 nvc0->buffers[s][i].buffer_offset, 447 nvc0->buffers[s][i].buffer_offset + 448 nvc0->buffers[s][i].buffer_size); 449 } else { 450 PUSH_DATA (push, 0); 451 PUSH_DATA (push, 0); 452 PUSH_DATA (push, 0); 453 PUSH_DATA (push, 0); 454 } 455 } 456 } 457 458 static struct nvc0_state_validate 459 validate_list_cp[] = { 460 { nvc0_compprog_validate, NVC0_NEW_CP_PROGRAM }, 461 { nve4_compute_validate_textures, NVC0_NEW_CP_TEXTURES }, 462 { nve4_compute_validate_samplers, NVC0_NEW_CP_SAMPLERS }, 463 { nve4_compute_set_tex_handles, NVC0_NEW_CP_TEXTURES | 464 NVC0_NEW_CP_SAMPLERS }, 465 { nve4_compute_validate_surfaces, NVC0_NEW_CP_SURFACES }, 466 { nvc0_compute_validate_globals, NVC0_NEW_CP_GLOBALS }, 467 { nve4_compute_validate_buffers, NVC0_NEW_CP_BUFFERS }, 468 { nve4_compute_validate_constbufs, NVC0_NEW_CP_CONSTBUF }, 469 }; 470 471 static bool 472 nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask) 473 { 474 bool ret; 475 476 ret = nvc0_state_validate(nvc0, mask, validate_list_cp, 477 ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp, 478 nvc0->bufctx_cp); 479 480 if (unlikely(nvc0->state.flushed)) 481 nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); 482 return ret; 483 } 484 485 static void 486 nve4_compute_upload_input(struct nvc0_context *nvc0, 487 const struct pipe_grid_info *info) 488 { 489 struct nvc0_screen *screen = nvc0->screen; 490 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 491 struct nvc0_program *cp = nvc0->compprog; 492 uint64_t address; 493 494 address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5); 495 496 if (cp->parm_size) { 497 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 498 PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5)); 499 PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5)); 500 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 501 PUSH_DATA (push, cp->parm_size); 502 PUSH_DATA (push, 0x1); 503 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4)); 504 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 505 PUSH_DATAp(push, info->input, cp->parm_size / 4); 506 } 507 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 508 PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO(0)); 509 PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO(0)); 510 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 511 PUSH_DATA (push, 8 * 4); 512 PUSH_DATA (push, 0x1); 513 514 if (unlikely(info->indirect)) { 515 struct nv04_resource *res = nv04_resource(info->indirect); 516 uint32_t offset = res->offset + info->indirect_offset; 517 518 nouveau_pushbuf_space(push, 32, 0, 1); 519 PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain); 520 521 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8); 522 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 523 PUSH_DATAp(push, info->block, 3); 524 nouveau_pushbuf_data(push, res->bo, offset, 525 NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4); 526 } else { 527 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8); 528 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 529 PUSH_DATAp(push, info->block, 3); 530 PUSH_DATAp(push, info->grid, 3); 531 } 532 PUSH_DATA (push, 0); 533 PUSH_DATA (push, info->work_dim); 534 535 BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); 536 PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); 537 } 538 539 static inline uint8_t 540 nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size) 541 { 542 if (shared_size > (32 << 10)) 543 return NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1; 544 if (shared_size > (16 << 10)) 545 return NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1; 546 return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1; 547 } 548 549 static void 550 nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, 551 struct nve4_cp_launch_desc *desc, 552 const struct pipe_grid_info *info) 553 { 554 const struct nvc0_screen *screen = nvc0->screen; 555 const struct nvc0_program *cp = nvc0->compprog; 556 557 nve4_cp_launch_desc_init_default(desc); 558 559 desc->entry = nvc0_program_symbol_offset(cp, info->pc); 560 561 desc->griddim_x = info->grid[0]; 562 desc->griddim_y = info->grid[1]; 563 desc->griddim_z = info->grid[2]; 564 desc->blockdim_x = info->block[0]; 565 desc->blockdim_y = info->block[1]; 566 desc->blockdim_z = info->block[2]; 567 568 desc->shared_size = align(cp->cp.smem_size, 0x100); 569 desc->local_size_p = (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10); 570 desc->local_size_n = 0; 571 desc->cstack_size = 0x800; 572 desc->cache_split = nve4_compute_derive_cache_split(nvc0, cp->cp.smem_size); 573 574 desc->gpr_alloc = cp->num_gprs; 575 desc->bar_alloc = cp->num_barriers; 576 577 // Only bind user uniforms and the driver constant buffer through the 578 // launch descriptor because UBOs are sticked to the driver cb to avoid the 579 // limitation of 8 CBs. 580 if (nvc0->constbuf[5][0].user || cp->parm_size) { 581 nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo, 582 NVC0_CB_USR_INFO(5), 1 << 16); 583 } 584 nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo, 585 NVC0_CB_AUX_INFO(5), 1 << 11); 586 } 587 588 static inline struct nve4_cp_launch_desc * 589 nve4_compute_alloc_launch_desc(struct nouveau_context *nv, 590 struct nouveau_bo **pbo, uint64_t *pgpuaddr) 591 { 592 uint8_t *ptr = nouveau_scratch_get(nv, 512, pgpuaddr, pbo); 593 if (!ptr) 594 return NULL; 595 if (*pgpuaddr & 255) { 596 unsigned adj = 256 - (*pgpuaddr & 255); 597 ptr += adj; 598 *pgpuaddr += adj; 599 } 600 return (struct nve4_cp_launch_desc *)ptr; 601 } 602 603 void 604 nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) 605 { 606 struct nvc0_context *nvc0 = nvc0_context(pipe); 607 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 608 struct nve4_cp_launch_desc *desc; 609 uint64_t desc_gpuaddr; 610 struct nouveau_bo *desc_bo; 611 int ret; 612 613 desc = nve4_compute_alloc_launch_desc(&nvc0->base, &desc_bo, &desc_gpuaddr); 614 if (!desc) { 615 ret = -1; 616 goto out; 617 } 618 BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD, 619 desc_bo); 620 621 ret = !nve4_state_validate_cp(nvc0, ~0); 622 if (ret) 623 goto out; 624 625 nve4_compute_setup_launch_desc(nvc0, desc, info); 626 627 nve4_compute_upload_input(nvc0, info); 628 629 #ifdef DEBUG 630 if (debug_get_num_option("NV50_PROG_DEBUG", 0)) 631 nve4_compute_dump_launch_desc(desc); 632 #endif 633 634 if (unlikely(info->indirect)) { 635 struct nv04_resource *res = nv04_resource(info->indirect); 636 uint32_t offset = res->offset + info->indirect_offset; 637 638 /* upload the descriptor */ 639 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 640 PUSH_DATAh(push, desc_gpuaddr); 641 PUSH_DATA (push, desc_gpuaddr); 642 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 643 PUSH_DATA (push, 256); 644 PUSH_DATA (push, 1); 645 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4)); 646 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1)); 647 PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4); 648 649 /* overwrite griddim_x and griddim_y as two 32-bits integers even 650 * if griddim_y must be a 16-bits integer */ 651 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 652 PUSH_DATAh(push, desc_gpuaddr + 48); 653 PUSH_DATA (push, desc_gpuaddr + 48); 654 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 655 PUSH_DATA (push, 8); 656 PUSH_DATA (push, 1); 657 658 nouveau_pushbuf_space(push, 32, 0, 1); 659 PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain); 660 661 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4)); 662 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1)); 663 nouveau_pushbuf_data(push, res->bo, offset, 664 NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4); 665 666 /* overwrite the 16 high bits of griddim_y with griddim_z because 667 * we need (z << 16) | x */ 668 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 669 PUSH_DATAh(push, desc_gpuaddr + 54); 670 PUSH_DATA (push, desc_gpuaddr + 54); 671 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 672 PUSH_DATA (push, 4); 673 PUSH_DATA (push, 1); 674 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4)); 675 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1)); 676 nouveau_pushbuf_data(push, res->bo, offset + 8, 677 NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4); 678 } 679 680 /* upload descriptor and flush */ 681 BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1); 682 PUSH_DATA (push, desc_gpuaddr >> 8); 683 BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1); 684 PUSH_DATA (push, 0x3); 685 BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1); 686 PUSH_DATA (push, 0); 687 688 out: 689 if (ret) 690 NOUVEAU_ERR("Failed to launch grid !\n"); 691 nouveau_scratch_done(&nvc0->base); 692 nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_DESC); 693 } 694 695 696 #define NVE4_TIC_ENTRY_INVALID 0x000fffff 697 698 static void 699 nve4_compute_validate_textures(struct nvc0_context *nvc0) 700 { 701 struct nouveau_bo *txc = nvc0->screen->txc; 702 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 703 const unsigned s = 5; 704 unsigned i; 705 uint32_t commands[2][32]; 706 unsigned n[2] = { 0, 0 }; 707 708 for (i = 0; i < nvc0->num_textures[s]; ++i) { 709 struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]); 710 struct nv04_resource *res; 711 const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i)); 712 713 if (!tic) { 714 nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID; 715 continue; 716 } 717 res = nv04_resource(tic->pipe.texture); 718 nvc0_update_tic(nvc0, tic, res); 719 720 if (tic->id < 0) { 721 tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic); 722 723 PUSH_SPACE(push, 16); 724 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 725 PUSH_DATAh(push, txc->offset + (tic->id * 32)); 726 PUSH_DATA (push, txc->offset + (tic->id * 32)); 727 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 728 PUSH_DATA (push, 32); 729 PUSH_DATA (push, 1); 730 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9); 731 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 732 PUSH_DATAp(push, &tic->tic[0], 8); 733 734 commands[0][n[0]++] = (tic->id << 4) | 1; 735 } else 736 if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { 737 commands[1][n[1]++] = (tic->id << 4) | 1; 738 } 739 nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32); 740 741 res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; 742 res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING; 743 744 nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID; 745 nvc0->tex_handles[s][i] |= tic->id; 746 if (dirty) 747 BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD); 748 } 749 for (; i < nvc0->state.num_textures[s]; ++i) { 750 nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID; 751 nvc0->textures_dirty[s] |= 1 << i; 752 } 753 754 if (n[0]) { 755 BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), n[0]); 756 PUSH_DATAp(push, commands[0], n[0]); 757 } 758 if (n[1]) { 759 BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), n[1]); 760 PUSH_DATAp(push, commands[1], n[1]); 761 } 762 763 nvc0->state.num_textures[s] = nvc0->num_textures[s]; 764 765 /* Invalidate all 3D textures because they are aliased. */ 766 for (int s = 0; s < 5; s++) { 767 for (int i = 0; i < nvc0->num_textures[s]; i++) 768 nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i)); 769 nvc0->textures_dirty[s] = ~0; 770 } 771 nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES; 772 } 773 774 775 #ifdef DEBUG 776 static const char *nve4_cache_split_name(unsigned value) 777 { 778 switch (value) { 779 case NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1: return "16K_SHARED_48K_L1"; 780 case NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1: return "32K_SHARED_32K_L1"; 781 case NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1: return "48K_SHARED_16K_L1"; 782 default: 783 return "(invalid)"; 784 } 785 } 786 787 static void 788 nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc) 789 { 790 const uint32_t *data = (const uint32_t *)desc; 791 unsigned i; 792 bool zero = false; 793 794 debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n"); 795 796 for (i = 0; i < sizeof(*desc); i += 4) { 797 if (data[i / 4]) { 798 debug_printf("[%x]: 0x%08x\n", i, data[i / 4]); 799 zero = false; 800 } else 801 if (!zero) { 802 debug_printf("...\n"); 803 zero = true; 804 } 805 } 806 807 debug_printf("entry = 0x%x\n", desc->entry); 808 debug_printf("grid dimensions = %ux%ux%u\n", 809 desc->griddim_x, desc->griddim_y, desc->griddim_z); 810 debug_printf("block dimensions = %ux%ux%u\n", 811 desc->blockdim_x, desc->blockdim_y, desc->blockdim_z); 812 debug_printf("s[] size: 0x%x\n", desc->shared_size); 813 debug_printf("l[] size: -0x%x / +0x%x\n", 814 desc->local_size_n, desc->local_size_p); 815 debug_printf("stack size: 0x%x\n", desc->cstack_size); 816 debug_printf("barrier count: %u\n", desc->bar_alloc); 817 debug_printf("$r count: %u\n", desc->gpr_alloc); 818 debug_printf("cache split: %s\n", nve4_cache_split_name(desc->cache_split)); 819 debug_printf("linked tsc: %d\n", desc->linked_tsc); 820 821 for (i = 0; i < 8; ++i) { 822 uint64_t address; 823 uint32_t size = desc->cb[i].size; 824 bool valid = !!(desc->cb_mask & (1 << i)); 825 826 address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l; 827 828 if (!valid && !address && !size) 829 continue; 830 debug_printf("CB[%u]: address = 0x%"PRIx64", size 0x%x%s\n", 831 i, address, size, valid ? "" : " (invalid)"); 832 } 833 } 834 #endif 835 836 #ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER 837 static void 838 nve4_compute_trap_info(struct nvc0_context *nvc0) 839 { 840 struct nvc0_screen *screen = nvc0->screen; 841 struct nouveau_bo *bo = screen->parm; 842 int ret, i; 843 volatile struct nve4_mp_trap_info *info; 844 uint8_t *map; 845 846 ret = nouveau_bo_map(bo, NOUVEAU_BO_RDWR, nvc0->base.client); 847 if (ret) 848 return; 849 map = (uint8_t *)bo->map; 850 info = (volatile struct nve4_mp_trap_info *)(map + NVE4_CP_PARAM_TRAP_INFO); 851 852 if (info->lock) { 853 debug_printf("trapstat = %08x\n", info->trapstat); 854 debug_printf("warperr = %08x\n", info->warperr); 855 debug_printf("PC = %x\n", info->pc); 856 debug_printf("tid = %u %u %u\n", 857 info->tid[0], info->tid[1], info->tid[2]); 858 debug_printf("ctaid = %u %u %u\n", 859 info->ctaid[0], info->ctaid[1], info->ctaid[2]); 860 for (i = 0; i <= 63; ++i) 861 debug_printf("$r%i = %08x\n", i, info->r[i]); 862 for (i = 0; i <= 6; ++i) 863 debug_printf("$p%i = %i\n", i, (info->flags >> i) & 1); 864 debug_printf("$c = %x\n", info->flags >> 12); 865 } 866 info->lock = 0; 867 } 868 #endif 869