1 /* 2 * Copyright 2010 Jerome Glisse <glisse (at) freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 */ 23 #include "r600_sq.h" 24 #include "r600_opcodes.h" 25 #include "r600_formats.h" 26 #include "r600_shader.h" 27 #include "r600d.h" 28 29 #include <errno.h> 30 #include "util/u_bitcast.h" 31 #include "util/u_dump.h" 32 #include "util/u_memory.h" 33 #include "util/u_math.h" 34 #include "pipe/p_shader_tokens.h" 35 36 #include "sb/sb_public.h" 37 38 #define NUM_OF_CYCLES 3 39 #define NUM_OF_COMPONENTS 4 40 41 static inline bool alu_writes(struct r600_bytecode_alu *alu) 42 { 43 return alu->dst.write || alu->is_op3; 44 } 45 46 static inline unsigned int r600_bytecode_get_num_operands( 47 struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 48 { 49 return r600_isa_alu(alu->op)->src_count; 50 } 51 52 int r700_bytecode_alu_build(struct r600_bytecode *bc, 53 struct r600_bytecode_alu *alu, unsigned id); 54 55 static struct r600_bytecode_cf *r600_bytecode_cf(void) 56 { 57 struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf); 58 59 if (!cf) 60 return NULL; 61 LIST_INITHEAD(&cf->list); 62 LIST_INITHEAD(&cf->alu); 63 LIST_INITHEAD(&cf->vtx); 64 LIST_INITHEAD(&cf->tex); 65 LIST_INITHEAD(&cf->gds); 66 return cf; 67 } 68 69 static struct r600_bytecode_alu *r600_bytecode_alu(void) 70 { 71 struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu); 72 73 if (!alu) 74 return NULL; 75 LIST_INITHEAD(&alu->list); 76 return alu; 77 } 78 79 static struct r600_bytecode_vtx *r600_bytecode_vtx(void) 80 { 81 struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx); 82 83 if (!vtx) 84 return NULL; 85 LIST_INITHEAD(&vtx->list); 86 return vtx; 87 } 88 89 static struct r600_bytecode_tex *r600_bytecode_tex(void) 90 { 91 struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex); 92 93 if (!tex) 94 return NULL; 95 LIST_INITHEAD(&tex->list); 96 return tex; 97 } 98 99 static struct r600_bytecode_gds *r600_bytecode_gds(void) 100 { 101 struct r600_bytecode_gds *gds = CALLOC_STRUCT(r600_bytecode_gds); 102 103 if (gds == NULL) 104 return NULL; 105 
LIST_INITHEAD(&gds->list); 106 return gds; 107 } 108 109 static unsigned stack_entry_size(enum radeon_family chip) { 110 /* Wavefront size: 111 * 64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/ 112 * Aruba/Sumo/Sumo2/redwood/juniper 113 * 32: R630/R730/R710/Palm/Cedar 114 * 16: R610/Rs780 115 * 116 * Stack row size: 117 * Wavefront Size 16 32 48 64 118 * Columns per Row (R6xx/R7xx/R8xx only) 8 8 4 4 119 * Columns per Row (R9xx+) 8 4 4 4 */ 120 121 switch (chip) { 122 /* FIXME: are some chips missing here? */ 123 /* wavefront size 16 */ 124 case CHIP_RV610: 125 case CHIP_RS780: 126 case CHIP_RV620: 127 case CHIP_RS880: 128 /* wavefront size 32 */ 129 case CHIP_RV630: 130 case CHIP_RV635: 131 case CHIP_RV730: 132 case CHIP_RV710: 133 case CHIP_PALM: 134 case CHIP_CEDAR: 135 return 8; 136 137 /* wavefront size 64 */ 138 default: 139 return 4; 140 } 141 } 142 143 void r600_bytecode_init(struct r600_bytecode *bc, 144 enum chip_class chip_class, 145 enum radeon_family family, 146 bool has_compressed_msaa_texturing) 147 { 148 static unsigned next_shader_id = 0; 149 150 bc->debug_id = ++next_shader_id; 151 152 if ((chip_class == R600) && 153 (family != CHIP_RV670 && family != CHIP_RS780 && family != CHIP_RS880)) { 154 bc->ar_handling = AR_HANDLE_RV6XX; 155 bc->r6xx_nop_after_rel_dst = 1; 156 } else { 157 bc->ar_handling = AR_HANDLE_NORMAL; 158 bc->r6xx_nop_after_rel_dst = 0; 159 } 160 161 LIST_INITHEAD(&bc->cf); 162 bc->chip_class = chip_class; 163 bc->family = family; 164 bc->has_compressed_msaa_texturing = has_compressed_msaa_texturing; 165 bc->stack.entry_size = stack_entry_size(family); 166 } 167 168 int r600_bytecode_add_cf(struct r600_bytecode *bc) 169 { 170 struct r600_bytecode_cf *cf = r600_bytecode_cf(); 171 172 if (!cf) 173 return -ENOMEM; 174 LIST_ADDTAIL(&cf->list, &bc->cf); 175 if (bc->cf_last) { 176 cf->id = bc->cf_last->id + 2; 177 if (bc->cf_last->eg_alu_extended) { 178 /* take into account extended alu size */ 179 cf->id += 2; 180 bc->ndw += 2; 181 } 
182 } 183 bc->cf_last = cf; 184 bc->ncf++; 185 bc->ndw += 2; 186 bc->force_add_cf = 0; 187 bc->ar_loaded = 0; 188 return 0; 189 } 190 191 int r600_bytecode_add_output(struct r600_bytecode *bc, 192 const struct r600_bytecode_output *output) 193 { 194 int r; 195 196 if (output->gpr >= bc->ngpr) 197 bc->ngpr = output->gpr + 1; 198 199 if (bc->cf_last && (bc->cf_last->op == output->op || 200 (bc->cf_last->op == CF_OP_EXPORT && 201 output->op == CF_OP_EXPORT_DONE)) && 202 output->type == bc->cf_last->output.type && 203 output->elem_size == bc->cf_last->output.elem_size && 204 output->swizzle_x == bc->cf_last->output.swizzle_x && 205 output->swizzle_y == bc->cf_last->output.swizzle_y && 206 output->swizzle_z == bc->cf_last->output.swizzle_z && 207 output->swizzle_w == bc->cf_last->output.swizzle_w && 208 output->comp_mask == bc->cf_last->output.comp_mask && 209 (output->burst_count + bc->cf_last->output.burst_count) <= 16) { 210 211 if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr && 212 (output->array_base + output->burst_count) == bc->cf_last->output.array_base) { 213 214 bc->cf_last->op = bc->cf_last->output.op = output->op; 215 bc->cf_last->output.gpr = output->gpr; 216 bc->cf_last->output.array_base = output->array_base; 217 bc->cf_last->output.burst_count += output->burst_count; 218 return 0; 219 220 } else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) && 221 output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) { 222 223 bc->cf_last->op = bc->cf_last->output.op = output->op; 224 bc->cf_last->output.burst_count += output->burst_count; 225 return 0; 226 } 227 } 228 229 r = r600_bytecode_add_cf(bc); 230 if (r) 231 return r; 232 bc->cf_last->op = output->op; 233 memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output)); 234 bc->cf_last->barrier = 1; 235 return 0; 236 } 237 238 /* alu instructions that can ony exits once per group */ 239 static int 
is_alu_once_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 240 { 241 return r600_isa_alu(alu->op)->flags & (AF_KILL | AF_PRED) || alu->is_lds_idx_op || alu->op == ALU_OP0_GROUP_BARRIER; 242 } 243 244 static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 245 { 246 return (r600_isa_alu(alu->op)->flags & AF_REPL) && 247 (r600_isa_alu_slots(bc->isa->hw_class, alu->op) == AF_4V); 248 } 249 250 static int is_alu_mova_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 251 { 252 return r600_isa_alu(alu->op)->flags & AF_MOVA; 253 } 254 255 static int alu_uses_rel(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 256 { 257 unsigned num_src = r600_bytecode_get_num_operands(bc, alu); 258 unsigned src; 259 260 if (alu->dst.rel) { 261 return 1; 262 } 263 264 for (src = 0; src < num_src; ++src) { 265 if (alu->src[src].rel) { 266 return 1; 267 } 268 } 269 return 0; 270 } 271 272 static int is_lds_read(int sel) 273 { 274 return sel == EG_V_SQ_ALU_SRC_LDS_OQ_A_POP || sel == EG_V_SQ_ALU_SRC_LDS_OQ_B_POP; 275 } 276 277 static int alu_uses_lds(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 278 { 279 unsigned num_src = r600_bytecode_get_num_operands(bc, alu); 280 unsigned src; 281 282 for (src = 0; src < num_src; ++src) { 283 if (is_lds_read(alu->src[src].sel)) { 284 return 1; 285 } 286 } 287 return 0; 288 } 289 290 static int is_alu_64bit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 291 { 292 const struct alu_op_info *op = r600_isa_alu(alu->op); 293 return (op->flags & AF_64); 294 } 295 296 static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 297 { 298 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 299 return !(slots & AF_S); 300 } 301 302 static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 303 { 304 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 305 return !(slots & AF_V); 306 } 
307 308 /* alu instructions that can execute on any unit */ 309 static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 310 { 311 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 312 return slots == AF_VS; 313 } 314 315 static int is_nop_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 316 { 317 return alu->op == ALU_OP0_NOP; 318 } 319 320 static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first, 321 struct r600_bytecode_alu *assignment[5]) 322 { 323 struct r600_bytecode_alu *alu; 324 unsigned i, chan, trans; 325 int max_slots = bc->chip_class == CAYMAN ? 4 : 5; 326 327 for (i = 0; i < max_slots; i++) 328 assignment[i] = NULL; 329 330 for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bytecode_alu, alu->list.next, list)) { 331 chan = alu->dst.chan; 332 if (max_slots == 4) 333 trans = 0; 334 else if (is_alu_trans_unit_inst(bc, alu)) 335 trans = 1; 336 else if (is_alu_vec_unit_inst(bc, alu)) 337 trans = 0; 338 else if (assignment[chan]) 339 trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */ 340 else 341 trans = 0; 342 343 if (trans) { 344 if (assignment[4]) { 345 assert(0); /* ALU.Trans has already been allocated. */ 346 return -1; 347 } 348 assignment[4] = alu; 349 } else { 350 if (assignment[chan]) { 351 assert(0); /* ALU.chan has already been allocated. 
*/ 352 return -1; 353 } 354 assignment[chan] = alu; 355 } 356 357 if (alu->last) 358 break; 359 } 360 return 0; 361 } 362 363 struct alu_bank_swizzle { 364 int hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS]; 365 int hw_cfile_addr[4]; 366 int hw_cfile_elem[4]; 367 }; 368 369 static const unsigned cycle_for_bank_swizzle_vec[][3] = { 370 [SQ_ALU_VEC_012] = { 0, 1, 2 }, 371 [SQ_ALU_VEC_021] = { 0, 2, 1 }, 372 [SQ_ALU_VEC_120] = { 1, 2, 0 }, 373 [SQ_ALU_VEC_102] = { 1, 0, 2 }, 374 [SQ_ALU_VEC_201] = { 2, 0, 1 }, 375 [SQ_ALU_VEC_210] = { 2, 1, 0 } 376 }; 377 378 static const unsigned cycle_for_bank_swizzle_scl[][3] = { 379 [SQ_ALU_SCL_210] = { 2, 1, 0 }, 380 [SQ_ALU_SCL_122] = { 1, 2, 2 }, 381 [SQ_ALU_SCL_212] = { 2, 1, 2 }, 382 [SQ_ALU_SCL_221] = { 2, 2, 1 } 383 }; 384 385 static void init_bank_swizzle(struct alu_bank_swizzle *bs) 386 { 387 int i, cycle, component; 388 /* set up gpr use */ 389 for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++) 390 for (component = 0; component < NUM_OF_COMPONENTS; component++) 391 bs->hw_gpr[cycle][component] = -1; 392 for (i = 0; i < 4; i++) 393 bs->hw_cfile_addr[i] = -1; 394 for (i = 0; i < 4; i++) 395 bs->hw_cfile_elem[i] = -1; 396 } 397 398 static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle) 399 { 400 if (bs->hw_gpr[cycle][chan] == -1) 401 bs->hw_gpr[cycle][chan] = sel; 402 else if (bs->hw_gpr[cycle][chan] != (int)sel) { 403 /* Another scalar operation has already used the GPR read port for the channel. 
*/ 404 return -1; 405 } 406 return 0; 407 } 408 409 static int reserve_cfile(struct r600_bytecode *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan) 410 { 411 int res, num_res = 4; 412 if (bc->chip_class >= R700) { 413 num_res = 2; 414 chan /= 2; 415 } 416 for (res = 0; res < num_res; ++res) { 417 if (bs->hw_cfile_addr[res] == -1) { 418 bs->hw_cfile_addr[res] = sel; 419 bs->hw_cfile_elem[res] = chan; 420 return 0; 421 } else if (bs->hw_cfile_addr[res] == sel && 422 bs->hw_cfile_elem[res] == chan) 423 return 0; /* Read for this scalar element already reserved, nothing to do here. */ 424 } 425 /* All cfile read ports are used, cannot reference vector element. */ 426 return -1; 427 } 428 429 static int is_gpr(unsigned sel) 430 { 431 return (sel <= 127); 432 } 433 434 /* CB constants start at 512, and get translated to a kcache index when ALU 435 * clauses are constructed. Note that we handle kcache constants the same way 436 * as (the now gone) cfile constants, is that really required? */ 437 static int is_cfile(unsigned sel) 438 { 439 return (sel > 255 && sel < 512) || 440 (sel > 511 && sel < 4607) || /* Kcache before translation. */ 441 (sel > 127 && sel < 192); /* Kcache after translation. */ 442 } 443 444 static int is_const(int sel) 445 { 446 return is_cfile(sel) || 447 (sel >= V_SQ_ALU_SRC_0 && 448 sel <= V_SQ_ALU_SRC_LITERAL); 449 } 450 451 static int check_vector(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, 452 struct alu_bank_swizzle *bs, int bank_swizzle) 453 { 454 int r, src, num_src, sel, elem, cycle; 455 456 num_src = r600_bytecode_get_num_operands(bc, alu); 457 for (src = 0; src < num_src; src++) { 458 sel = alu->src[src].sel; 459 elem = alu->src[src].chan; 460 if (is_gpr(sel)) { 461 cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src]; 462 if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan) 463 /* Nothing to do; special-case optimization, 464 * second source uses first sources reservation. 
*/ 465 continue; 466 else { 467 r = reserve_gpr(bs, sel, elem, cycle); 468 if (r) 469 return r; 470 } 471 } else if (is_cfile(sel)) { 472 r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem); 473 if (r) 474 return r; 475 } 476 /* No restrictions on PV, PS, literal or special constants. */ 477 } 478 return 0; 479 } 480 481 static int check_scalar(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, 482 struct alu_bank_swizzle *bs, int bank_swizzle) 483 { 484 int r, src, num_src, const_count, sel, elem, cycle; 485 486 num_src = r600_bytecode_get_num_operands(bc, alu); 487 for (const_count = 0, src = 0; src < num_src; ++src) { 488 sel = alu->src[src].sel; 489 elem = alu->src[src].chan; 490 if (is_const(sel)) { /* Any constant, including literal and inline constants. */ 491 if (const_count >= 2) 492 /* More than two references to a constant in 493 * transcendental operation. */ 494 return -1; 495 else 496 const_count++; 497 } 498 if (is_cfile(sel)) { 499 r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem); 500 if (r) 501 return r; 502 } 503 } 504 for (src = 0; src < num_src; ++src) { 505 sel = alu->src[src].sel; 506 elem = alu->src[src].chan; 507 if (is_gpr(sel)) { 508 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src]; 509 if (cycle < const_count) 510 /* Cycle for GPR load conflicts with 511 * constant load in transcendental operation. */ 512 return -1; 513 r = reserve_gpr(bs, sel, elem, cycle); 514 if (r) 515 return r; 516 } 517 /* PV PS restrictions */ 518 if (const_count && (sel == 254 || sel == 255)) { 519 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src]; 520 if (cycle < const_count) 521 return -1; 522 } 523 } 524 return 0; 525 } 526 527 static int check_and_set_bank_swizzle(struct r600_bytecode *bc, 528 struct r600_bytecode_alu *slots[5]) 529 { 530 struct alu_bank_swizzle bs; 531 int bank_swizzle[5]; 532 int i, r = 0, forced = 1; 533 boolean scalar_only = bc->chip_class == CAYMAN ? 
false : true; 534 int max_slots = bc->chip_class == CAYMAN ? 4 : 5; 535 536 for (i = 0; i < max_slots; i++) { 537 if (slots[i]) { 538 if (slots[i]->bank_swizzle_force) { 539 slots[i]->bank_swizzle = slots[i]->bank_swizzle_force; 540 } else { 541 forced = 0; 542 } 543 } 544 545 if (i < 4 && slots[i]) 546 scalar_only = false; 547 } 548 if (forced) 549 return 0; 550 551 /* Just check every possible combination of bank swizzle. 552 * Not very efficent, but works on the first try in most of the cases. */ 553 for (i = 0; i < 4; i++) 554 if (!slots[i] || !slots[i]->bank_swizzle_force) 555 bank_swizzle[i] = SQ_ALU_VEC_012; 556 else 557 bank_swizzle[i] = slots[i]->bank_swizzle; 558 559 bank_swizzle[4] = SQ_ALU_SCL_210; 560 while(bank_swizzle[4] <= SQ_ALU_SCL_221) { 561 562 init_bank_swizzle(&bs); 563 if (scalar_only == false) { 564 for (i = 0; i < 4; i++) { 565 if (slots[i]) { 566 r = check_vector(bc, slots[i], &bs, bank_swizzle[i]); 567 if (r) 568 break; 569 } 570 } 571 } else 572 r = 0; 573 574 if (!r && max_slots == 5 && slots[4]) { 575 r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]); 576 } 577 if (!r) { 578 for (i = 0; i < max_slots; i++) { 579 if (slots[i]) 580 slots[i]->bank_swizzle = bank_swizzle[i]; 581 } 582 return 0; 583 } 584 585 if (scalar_only) { 586 bank_swizzle[4]++; 587 } else { 588 for (i = 0; i < max_slots; i++) { 589 if (!slots[i] || !slots[i]->bank_swizzle_force) { 590 bank_swizzle[i]++; 591 if (bank_swizzle[i] <= SQ_ALU_VEC_210) 592 break; 593 else if (i < max_slots - 1) 594 bank_swizzle[i] = SQ_ALU_VEC_012; 595 else 596 return -1; 597 } 598 } 599 } 600 } 601 602 /* Couldn't find a working swizzle. */ 603 return -1; 604 } 605 606 static int replace_gpr_with_pv_ps(struct r600_bytecode *bc, 607 struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev) 608 { 609 struct r600_bytecode_alu *prev[5]; 610 int gpr[5], chan[5]; 611 int i, j, r, src, num_src; 612 int max_slots = bc->chip_class == CAYMAN ? 
4 : 5; 613 614 r = assign_alu_units(bc, alu_prev, prev); 615 if (r) 616 return r; 617 618 for (i = 0; i < max_slots; ++i) { 619 if (prev[i] && alu_writes(prev[i]) && !prev[i]->dst.rel) { 620 621 if (is_alu_64bit_inst(bc, prev[i])) { 622 gpr[i] = -1; 623 continue; 624 } 625 626 gpr[i] = prev[i]->dst.sel; 627 /* cube writes more than PV.X */ 628 if (is_alu_reduction_inst(bc, prev[i])) 629 chan[i] = 0; 630 else 631 chan[i] = prev[i]->dst.chan; 632 } else 633 gpr[i] = -1; 634 } 635 636 for (i = 0; i < max_slots; ++i) { 637 struct r600_bytecode_alu *alu = slots[i]; 638 if (!alu) 639 continue; 640 641 if (is_alu_64bit_inst(bc, alu)) 642 continue; 643 num_src = r600_bytecode_get_num_operands(bc, alu); 644 for (src = 0; src < num_src; ++src) { 645 if (!is_gpr(alu->src[src].sel) || alu->src[src].rel) 646 continue; 647 648 if (bc->chip_class < CAYMAN) { 649 if (alu->src[src].sel == gpr[4] && 650 alu->src[src].chan == chan[4] && 651 alu_prev->pred_sel == alu->pred_sel) { 652 alu->src[src].sel = V_SQ_ALU_SRC_PS; 653 alu->src[src].chan = 0; 654 continue; 655 } 656 } 657 658 for (j = 0; j < 4; ++j) { 659 if (alu->src[src].sel == gpr[j] && 660 alu->src[src].chan == j && 661 alu_prev->pred_sel == alu->pred_sel) { 662 alu->src[src].sel = V_SQ_ALU_SRC_PV; 663 alu->src[src].chan = chan[j]; 664 break; 665 } 666 } 667 } 668 } 669 670 return 0; 671 } 672 673 void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *neg, unsigned abs) 674 { 675 switch(value) { 676 case 0: 677 *sel = V_SQ_ALU_SRC_0; 678 break; 679 case 1: 680 *sel = V_SQ_ALU_SRC_1_INT; 681 break; 682 case -1: 683 *sel = V_SQ_ALU_SRC_M_1_INT; 684 break; 685 case 0x3F800000: /* 1.0f */ 686 *sel = V_SQ_ALU_SRC_1; 687 break; 688 case 0x3F000000: /* 0.5f */ 689 *sel = V_SQ_ALU_SRC_0_5; 690 break; 691 case 0xBF800000: /* -1.0f */ 692 *sel = V_SQ_ALU_SRC_1; 693 *neg ^= !abs; 694 break; 695 case 0xBF000000: /* -0.5f */ 696 *sel = V_SQ_ALU_SRC_0_5; 697 *neg ^= !abs; 698 break; 699 default: 700 *sel = 
V_SQ_ALU_SRC_LITERAL; 701 break; 702 } 703 } 704 705 /* compute how many literal are needed */ 706 static int r600_bytecode_alu_nliterals(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, 707 uint32_t literal[4], unsigned *nliteral) 708 { 709 unsigned num_src = r600_bytecode_get_num_operands(bc, alu); 710 unsigned i, j; 711 712 for (i = 0; i < num_src; ++i) { 713 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 714 uint32_t value = alu->src[i].value; 715 unsigned found = 0; 716 for (j = 0; j < *nliteral; ++j) { 717 if (literal[j] == value) { 718 found = 1; 719 break; 720 } 721 } 722 if (!found) { 723 if (*nliteral >= 4) 724 return -EINVAL; 725 literal[(*nliteral)++] = value; 726 } 727 } 728 } 729 return 0; 730 } 731 732 static void r600_bytecode_alu_adjust_literals(struct r600_bytecode *bc, 733 struct r600_bytecode_alu *alu, 734 uint32_t literal[4], unsigned nliteral) 735 { 736 unsigned num_src = r600_bytecode_get_num_operands(bc, alu); 737 unsigned i, j; 738 739 for (i = 0; i < num_src; ++i) { 740 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 741 uint32_t value = alu->src[i].value; 742 for (j = 0; j < nliteral; ++j) { 743 if (literal[j] == value) { 744 alu->src[i].chan = j; 745 break; 746 } 747 } 748 } 749 } 750 } 751 752 static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5], 753 struct r600_bytecode_alu *alu_prev) 754 { 755 struct r600_bytecode_alu *prev[5]; 756 struct r600_bytecode_alu *result[5] = { NULL }; 757 758 uint32_t literal[4], prev_literal[4]; 759 unsigned nliteral = 0, prev_nliteral = 0; 760 761 int i, j, r, src, num_src; 762 int num_once_inst = 0; 763 int have_mova = 0, have_rel = 0; 764 int max_slots = bc->chip_class == CAYMAN ? 
4 : 5; 765 766 r = assign_alu_units(bc, alu_prev, prev); 767 if (r) 768 return r; 769 770 for (i = 0; i < max_slots; ++i) { 771 if (prev[i]) { 772 if (prev[i]->pred_sel) 773 return 0; 774 if (is_alu_once_inst(bc, prev[i])) 775 return 0; 776 } 777 if (slots[i]) { 778 if (slots[i]->pred_sel) 779 return 0; 780 if (is_alu_once_inst(bc, slots[i])) 781 return 0; 782 } 783 } 784 785 for (i = 0; i < max_slots; ++i) { 786 struct r600_bytecode_alu *alu; 787 788 if (num_once_inst > 0) 789 return 0; 790 791 /* check number of literals */ 792 if (prev[i]) { 793 if (r600_bytecode_alu_nliterals(bc, prev[i], literal, &nliteral)) 794 return 0; 795 if (r600_bytecode_alu_nliterals(bc, prev[i], prev_literal, &prev_nliteral)) 796 return 0; 797 if (is_alu_mova_inst(bc, prev[i])) { 798 if (have_rel) 799 return 0; 800 have_mova = 1; 801 } 802 803 if (alu_uses_rel(bc, prev[i])) { 804 if (have_mova) { 805 return 0; 806 } 807 have_rel = 1; 808 } 809 if (alu_uses_lds(bc, prev[i])) 810 return 0; 811 812 num_once_inst += is_alu_once_inst(bc, prev[i]); 813 } 814 if (slots[i] && r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral)) 815 return 0; 816 817 /* Let's check used slots. */ 818 if (prev[i] && !slots[i]) { 819 result[i] = prev[i]; 820 continue; 821 } else if (prev[i] && slots[i]) { 822 if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) { 823 /* Trans unit is still free try to use it. 
*/ 824 if (is_alu_any_unit_inst(bc, slots[i]) && !alu_uses_lds(bc, slots[i])) { 825 result[i] = prev[i]; 826 result[4] = slots[i]; 827 } else if (is_alu_any_unit_inst(bc, prev[i])) { 828 if (slots[i]->dst.sel == prev[i]->dst.sel && 829 alu_writes(slots[i]) && 830 alu_writes(prev[i])) 831 return 0; 832 833 result[i] = slots[i]; 834 result[4] = prev[i]; 835 } else 836 return 0; 837 } else 838 return 0; 839 } else if(!slots[i]) { 840 continue; 841 } else { 842 if (max_slots == 5 && slots[i] && prev[4] && 843 slots[i]->dst.sel == prev[4]->dst.sel && 844 slots[i]->dst.chan == prev[4]->dst.chan && 845 alu_writes(slots[i]) && 846 alu_writes(prev[4])) 847 return 0; 848 849 result[i] = slots[i]; 850 } 851 852 alu = slots[i]; 853 num_once_inst += is_alu_once_inst(bc, alu); 854 855 /* don't reschedule NOPs */ 856 if (is_nop_inst(bc, alu)) 857 return 0; 858 859 if (is_alu_mova_inst(bc, alu)) { 860 if (have_rel) { 861 return 0; 862 } 863 have_mova = 1; 864 } 865 866 if (alu_uses_rel(bc, alu)) { 867 if (have_mova) { 868 return 0; 869 } 870 have_rel = 1; 871 } 872 873 if (alu->op == ALU_OP0_SET_CF_IDX0 || 874 alu->op == ALU_OP0_SET_CF_IDX1) 875 return 0; /* data hazard with MOVA */ 876 877 /* Let's check source gprs */ 878 num_src = r600_bytecode_get_num_operands(bc, alu); 879 for (src = 0; src < num_src; ++src) { 880 881 /* Constants don't matter. */ 882 if (!is_gpr(alu->src[src].sel)) 883 continue; 884 885 for (j = 0; j < max_slots; ++j) { 886 if (!prev[j] || !alu_writes(prev[j])) 887 continue; 888 889 /* If it's relative then we can't determin which gpr is really used. */ 890 if (prev[j]->dst.chan == alu->src[src].chan && 891 (prev[j]->dst.sel == alu->src[src].sel || 892 prev[j]->dst.rel || alu->src[src].rel)) 893 return 0; 894 } 895 } 896 } 897 898 /* more than one PRED_ or KILL_ ? 
*/ 899 if (num_once_inst > 1) 900 return 0; 901 902 /* check if the result can still be swizzlet */ 903 r = check_and_set_bank_swizzle(bc, result); 904 if (r) 905 return 0; 906 907 /* looks like everything worked out right, apply the changes */ 908 909 /* undo adding previus literals */ 910 bc->cf_last->ndw -= align(prev_nliteral, 2); 911 912 /* sort instructions */ 913 for (i = 0; i < max_slots; ++i) { 914 slots[i] = result[i]; 915 if (result[i]) { 916 LIST_DEL(&result[i]->list); 917 result[i]->last = 0; 918 LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu); 919 } 920 } 921 922 /* determine new last instruction */ 923 LIST_ENTRY(struct r600_bytecode_alu, bc->cf_last->alu.prev, list)->last = 1; 924 925 /* determine new first instruction */ 926 for (i = 0; i < max_slots; ++i) { 927 if (result[i]) { 928 bc->cf_last->curr_bs_head = result[i]; 929 break; 930 } 931 } 932 933 bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head; 934 bc->cf_last->prev2_bs_head = NULL; 935 936 return 0; 937 } 938 939 /* we'll keep kcache sets sorted by bank & addr */ 940 static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc, 941 struct r600_bytecode_kcache *kcache, 942 unsigned bank, unsigned line, unsigned index_mode) 943 { 944 int i, kcache_banks = bc->chip_class >= EVERGREEN ? 
4 : 2; 945 946 for (i = 0; i < kcache_banks; i++) { 947 if (kcache[i].mode) { 948 int d; 949 950 if (kcache[i].bank < bank) 951 continue; 952 953 if ((kcache[i].bank == bank && kcache[i].addr > line+1) || 954 kcache[i].bank > bank) { 955 /* try to insert new line */ 956 if (kcache[kcache_banks-1].mode) { 957 /* all sets are in use */ 958 return -ENOMEM; 959 } 960 961 memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache)); 962 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1; 963 kcache[i].bank = bank; 964 kcache[i].addr = line; 965 kcache[i].index_mode = index_mode; 966 return 0; 967 } 968 969 d = line - kcache[i].addr; 970 971 if (d == -1) { 972 kcache[i].addr--; 973 if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) { 974 /* we are prepending the line to the current set, 975 * discarding the existing second line, 976 * so we'll have to insert line+2 after it */ 977 line += 2; 978 continue; 979 } else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) { 980 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2; 981 return 0; 982 } else { 983 /* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */ 984 return -ENOMEM; 985 } 986 } else if (d == 1) { 987 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2; 988 return 0; 989 } else if (d == 0) 990 return 0; 991 } else { /* free kcache set - use it */ 992 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1; 993 kcache[i].bank = bank; 994 kcache[i].addr = line; 995 kcache[i].index_mode = index_mode; 996 return 0; 997 } 998 } 999 return -ENOMEM; 1000 } 1001 1002 static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc, 1003 struct r600_bytecode_kcache *kcache, 1004 struct r600_bytecode_alu *alu) 1005 { 1006 int i, r; 1007 1008 for (i = 0; i < 3; i++) { 1009 unsigned bank, line, sel = alu->src[i].sel, index_mode; 1010 1011 if (sel < 512) 1012 continue; 1013 1014 bank = alu->src[i].kc_bank; 1015 line = (sel-512)>>4; 1016 index_mode = alu->src[i].kc_rel ? 
1 : 0; // V_SQ_CF_INDEX_0 / V_SQ_CF_INDEX_NONE 1017 1018 if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line, index_mode))) 1019 return r; 1020 } 1021 return 0; 1022 } 1023 1024 static int r600_bytecode_assign_kcache_banks(struct r600_bytecode *bc, 1025 struct r600_bytecode_alu *alu, 1026 struct r600_bytecode_kcache * kcache) 1027 { 1028 int i, j; 1029 1030 /* Alter the src operands to refer to the kcache. */ 1031 for (i = 0; i < 3; ++i) { 1032 static const unsigned int base[] = {128, 160, 256, 288}; 1033 unsigned int line, sel = alu->src[i].sel, found = 0; 1034 1035 if (sel < 512) 1036 continue; 1037 1038 sel -= 512; 1039 line = sel>>4; 1040 1041 for (j = 0; j < 4 && !found; ++j) { 1042 switch (kcache[j].mode) { 1043 case V_SQ_CF_KCACHE_NOP: 1044 case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX: 1045 R600_ERR("unexpected kcache line mode\n"); 1046 return -ENOMEM; 1047 default: 1048 if (kcache[j].bank == alu->src[i].kc_bank && 1049 kcache[j].addr <= line && 1050 line < kcache[j].addr + kcache[j].mode) { 1051 alu->src[i].sel = sel - (kcache[j].addr<<4); 1052 alu->src[i].sel += base[j]; 1053 found=1; 1054 } 1055 } 1056 } 1057 } 1058 return 0; 1059 } 1060 1061 static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, 1062 struct r600_bytecode_alu *alu, 1063 unsigned type) 1064 { 1065 struct r600_bytecode_kcache kcache_sets[4]; 1066 struct r600_bytecode_kcache *kcache = kcache_sets; 1067 int r; 1068 1069 memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache)); 1070 1071 if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) { 1072 /* can't alloc, need to start new clause */ 1073 if ((r = r600_bytecode_add_cf(bc))) { 1074 return r; 1075 } 1076 bc->cf_last->op = type; 1077 1078 /* retry with the new clause */ 1079 kcache = bc->cf_last->kcache; 1080 if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) { 1081 /* can't alloc again- should never happen */ 1082 return r; 1083 } 1084 } else { 1085 /* update kcache sets */ 
1086 memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache)); 1087 } 1088 1089 /* if we actually used more than 2 kcache sets, or have relative indexing - use ALU_EXTENDED on eg+ */ 1090 if (kcache[2].mode != V_SQ_CF_KCACHE_NOP || 1091 kcache[0].index_mode || kcache[1].index_mode || kcache[2].index_mode || kcache[3].index_mode) { 1092 if (bc->chip_class < EVERGREEN) 1093 return -ENOMEM; 1094 bc->cf_last->eg_alu_extended = 1; 1095 } 1096 1097 return 0; 1098 } 1099 1100 static int insert_nop_r6xx(struct r600_bytecode *bc) 1101 { 1102 struct r600_bytecode_alu alu; 1103 int r, i; 1104 1105 for (i = 0; i < 4; i++) { 1106 memset(&alu, 0, sizeof(alu)); 1107 alu.op = ALU_OP0_NOP; 1108 alu.src[0].chan = i; 1109 alu.dst.chan = i; 1110 alu.last = (i == 3); 1111 r = r600_bytecode_add_alu(bc, &alu); 1112 if (r) 1113 return r; 1114 } 1115 return 0; 1116 } 1117 1118 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */ 1119 static int load_ar_r6xx(struct r600_bytecode *bc) 1120 { 1121 struct r600_bytecode_alu alu; 1122 int r; 1123 1124 if (bc->ar_loaded) 1125 return 0; 1126 1127 /* hack to avoid making MOVA the last instruction in the clause */ 1128 if ((bc->cf_last->ndw>>1) >= 110) 1129 bc->force_add_cf = 1; 1130 1131 memset(&alu, 0, sizeof(alu)); 1132 alu.op = ALU_OP1_MOVA_GPR_INT; 1133 alu.src[0].sel = bc->ar_reg; 1134 alu.src[0].chan = bc->ar_chan; 1135 alu.last = 1; 1136 alu.index_mode = INDEX_MODE_LOOP; 1137 r = r600_bytecode_add_alu(bc, &alu); 1138 if (r) 1139 return r; 1140 1141 /* no requirement to set uses waterfall on MOVA_GPR_INT */ 1142 bc->ar_loaded = 1; 1143 return 0; 1144 } 1145 1146 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */ 1147 static int load_ar(struct r600_bytecode *bc) 1148 { 1149 struct r600_bytecode_alu alu; 1150 int r; 1151 1152 if (bc->ar_handling) 1153 return load_ar_r6xx(bc); 1154 1155 if (bc->ar_loaded) 1156 return 0; 1157 1158 /* hack to avoid making MOVA the last instruction in the clause */ 1159 if 
((bc->cf_last->ndw>>1) >= 110) 1160 bc->force_add_cf = 1; 1161 1162 memset(&alu, 0, sizeof(alu)); 1163 alu.op = ALU_OP1_MOVA_INT; 1164 alu.src[0].sel = bc->ar_reg; 1165 alu.src[0].chan = bc->ar_chan; 1166 alu.last = 1; 1167 r = r600_bytecode_add_alu(bc, &alu); 1168 if (r) 1169 return r; 1170 1171 bc->cf_last->r6xx_uses_waterfall = 1; 1172 bc->ar_loaded = 1; 1173 return 0; 1174 } 1175 1176 int r600_bytecode_add_alu_type(struct r600_bytecode *bc, 1177 const struct r600_bytecode_alu *alu, unsigned type) 1178 { 1179 struct r600_bytecode_alu *nalu = r600_bytecode_alu(); 1180 struct r600_bytecode_alu *lalu; 1181 int i, r; 1182 1183 if (!nalu) 1184 return -ENOMEM; 1185 memcpy(nalu, alu, sizeof(struct r600_bytecode_alu)); 1186 1187 if (alu->is_op3) { 1188 /* will fail later since alu does not support it. */ 1189 assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs); 1190 } 1191 1192 if (bc->cf_last != NULL && bc->cf_last->op != type) { 1193 /* check if we could add it anyway */ 1194 if (bc->cf_last->op == CF_OP_ALU && 1195 type == CF_OP_ALU_PUSH_BEFORE) { 1196 LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) { 1197 if (lalu->execute_mask) { 1198 bc->force_add_cf = 1; 1199 break; 1200 } 1201 } 1202 } else 1203 bc->force_add_cf = 1; 1204 } 1205 1206 /* cf can contains only alu or only vtx or only tex */ 1207 if (bc->cf_last == NULL || bc->force_add_cf) { 1208 r = r600_bytecode_add_cf(bc); 1209 if (r) { 1210 free(nalu); 1211 return r; 1212 } 1213 } 1214 bc->cf_last->op = type; 1215 1216 /* Load index register if required */ 1217 if (bc->chip_class >= EVERGREEN) { 1218 for (i = 0; i < 3; i++) 1219 if (nalu->src[i].kc_bank && nalu->src[i].kc_rel) 1220 egcm_load_index_reg(bc, 0, true); 1221 } 1222 1223 /* Check AR usage and load it if required */ 1224 for (i = 0; i < 3; i++) 1225 if (nalu->src[i].rel && !bc->ar_loaded) 1226 load_ar(bc); 1227 1228 if (nalu->dst.rel && !bc->ar_loaded) 1229 load_ar(bc); 1230 1231 /* Setup the kcache for this ALU instruction. 
This will start a new 1232 * ALU clause if needed. */ 1233 if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) { 1234 free(nalu); 1235 return r; 1236 } 1237 1238 if (!bc->cf_last->curr_bs_head) { 1239 bc->cf_last->curr_bs_head = nalu; 1240 } 1241 /* number of gpr == the last gpr used in any alu */ 1242 for (i = 0; i < 3; i++) { 1243 if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) { 1244 bc->ngpr = nalu->src[i].sel + 1; 1245 } 1246 if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL) 1247 r600_bytecode_special_constants(nalu->src[i].value, 1248 &nalu->src[i].sel, &nalu->src[i].neg, nalu->src[i].abs); 1249 } 1250 if (nalu->dst.sel >= bc->ngpr) { 1251 bc->ngpr = nalu->dst.sel + 1; 1252 } 1253 LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu); 1254 /* each alu use 2 dwords */ 1255 bc->cf_last->ndw += 2; 1256 bc->ndw += 2; 1257 1258 /* process cur ALU instructions for bank swizzle */ 1259 if (nalu->last) { 1260 uint32_t literal[4]; 1261 unsigned nliteral; 1262 struct r600_bytecode_alu *slots[5]; 1263 int max_slots = bc->chip_class == CAYMAN ? 
4 : 5; 1264 r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots); 1265 if (r) 1266 return r; 1267 1268 if (bc->cf_last->prev_bs_head) { 1269 r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head); 1270 if (r) 1271 return r; 1272 } 1273 1274 if (bc->cf_last->prev_bs_head) { 1275 r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head); 1276 if (r) 1277 return r; 1278 } 1279 1280 r = check_and_set_bank_swizzle(bc, slots); 1281 if (r) 1282 return r; 1283 1284 for (i = 0, nliteral = 0; i < max_slots; i++) { 1285 if (slots[i]) { 1286 r = r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral); 1287 if (r) 1288 return r; 1289 } 1290 } 1291 bc->cf_last->ndw += align(nliteral, 2); 1292 1293 /* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots) 1294 * worst case */ 1295 if ((bc->cf_last->ndw >> 1) >= 120) { 1296 bc->force_add_cf = 1; 1297 } 1298 1299 bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head; 1300 bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head; 1301 bc->cf_last->curr_bs_head = NULL; 1302 } 1303 1304 if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst) 1305 insert_nop_r6xx(bc); 1306 1307 return 0; 1308 } 1309 1310 int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu) 1311 { 1312 return r600_bytecode_add_alu_type(bc, alu, CF_OP_ALU); 1313 } 1314 1315 static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc) 1316 { 1317 switch (bc->chip_class) { 1318 case R600: 1319 return 8; 1320 1321 case R700: 1322 case EVERGREEN: 1323 case CAYMAN: 1324 return 16; 1325 1326 default: 1327 R600_ERR("Unknown chip class %d.\n", bc->chip_class); 1328 return 8; 1329 } 1330 } 1331 1332 static inline boolean last_inst_was_not_vtx_fetch(struct r600_bytecode *bc) 1333 { 1334 return !((r600_isa_cf(bc->cf_last->op)->flags & CF_FETCH) && 1335 (bc->chip_class == CAYMAN || 1336 bc->cf_last->op != CF_OP_TEX)); 1337 } 1338 1339 int r600_bytecode_add_vtx(struct r600_bytecode *bc, 
const struct r600_bytecode_vtx *vtx) 1340 { 1341 struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx(); 1342 int r; 1343 1344 if (!nvtx) 1345 return -ENOMEM; 1346 memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx)); 1347 1348 /* Load index register if required */ 1349 if (bc->chip_class >= EVERGREEN) { 1350 if (vtx->buffer_index_mode) 1351 egcm_load_index_reg(bc, 0, false); 1352 } 1353 1354 /* cf can contains only alu or only vtx or only tex */ 1355 if (bc->cf_last == NULL || 1356 last_inst_was_not_vtx_fetch(bc) || 1357 bc->force_add_cf) { 1358 r = r600_bytecode_add_cf(bc); 1359 if (r) { 1360 free(nvtx); 1361 return r; 1362 } 1363 switch (bc->chip_class) { 1364 case R600: 1365 case R700: 1366 case EVERGREEN: 1367 bc->cf_last->op = CF_OP_VTX; 1368 break; 1369 case CAYMAN: 1370 bc->cf_last->op = CF_OP_TEX; 1371 break; 1372 default: 1373 R600_ERR("Unknown chip class %d.\n", bc->chip_class); 1374 free(nvtx); 1375 return -EINVAL; 1376 } 1377 } 1378 LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx); 1379 /* each fetch use 4 dwords */ 1380 bc->cf_last->ndw += 4; 1381 bc->ndw += 4; 1382 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1383 bc->force_add_cf = 1; 1384 1385 bc->ngpr = MAX2(bc->ngpr, vtx->src_gpr + 1); 1386 bc->ngpr = MAX2(bc->ngpr, vtx->dst_gpr + 1); 1387 1388 return 0; 1389 } 1390 1391 int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex) 1392 { 1393 struct r600_bytecode_tex *ntex = r600_bytecode_tex(); 1394 int r; 1395 1396 if (!ntex) 1397 return -ENOMEM; 1398 memcpy(ntex, tex, sizeof(struct r600_bytecode_tex)); 1399 1400 /* Load index register if required */ 1401 if (bc->chip_class >= EVERGREEN) { 1402 if (tex->sampler_index_mode || tex->resource_index_mode) 1403 egcm_load_index_reg(bc, 1, false); 1404 } 1405 1406 /* we can't fetch data und use it as texture lookup address in the same TEX clause */ 1407 if (bc->cf_last != NULL && 1408 bc->cf_last->op == CF_OP_TEX) { 1409 struct r600_bytecode_tex 
*ttex; 1410 LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) { 1411 if (ttex->dst_gpr == ntex->src_gpr) { 1412 bc->force_add_cf = 1; 1413 break; 1414 } 1415 } 1416 /* slight hack to make gradients always go into same cf */ 1417 if (ntex->op == FETCH_OP_SET_GRADIENTS_H) 1418 bc->force_add_cf = 1; 1419 } 1420 1421 /* cf can contains only alu or only vtx or only tex */ 1422 if (bc->cf_last == NULL || 1423 bc->cf_last->op != CF_OP_TEX || 1424 bc->force_add_cf) { 1425 r = r600_bytecode_add_cf(bc); 1426 if (r) { 1427 free(ntex); 1428 return r; 1429 } 1430 bc->cf_last->op = CF_OP_TEX; 1431 } 1432 if (ntex->src_gpr >= bc->ngpr) { 1433 bc->ngpr = ntex->src_gpr + 1; 1434 } 1435 if (ntex->dst_gpr >= bc->ngpr) { 1436 bc->ngpr = ntex->dst_gpr + 1; 1437 } 1438 LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex); 1439 /* each texture fetch use 4 dwords */ 1440 bc->cf_last->ndw += 4; 1441 bc->ndw += 4; 1442 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1443 bc->force_add_cf = 1; 1444 return 0; 1445 } 1446 1447 int r600_bytecode_add_gds(struct r600_bytecode *bc, const struct r600_bytecode_gds *gds) 1448 { 1449 struct r600_bytecode_gds *ngds = r600_bytecode_gds(); 1450 int r; 1451 1452 if (ngds == NULL) 1453 return -ENOMEM; 1454 memcpy(ngds, gds, sizeof(struct r600_bytecode_gds)); 1455 1456 if (bc->cf_last == NULL || 1457 bc->cf_last->op != CF_OP_GDS || 1458 bc->force_add_cf) { 1459 r = r600_bytecode_add_cf(bc); 1460 if (r) { 1461 free(ngds); 1462 return r; 1463 } 1464 bc->cf_last->op = CF_OP_GDS; 1465 } 1466 1467 LIST_ADDTAIL(&ngds->list, &bc->cf_last->gds); 1468 bc->cf_last->ndw += 4; /* each GDS uses 4 dwords */ 1469 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1470 bc->force_add_cf = 1; 1471 return 0; 1472 } 1473 1474 int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op) 1475 { 1476 int r; 1477 r = r600_bytecode_add_cf(bc); 1478 if (r) 1479 return r; 1480 1481 bc->cf_last->cond = V_SQ_CF_COND_ACTIVE; 1482 
	bc->cf_last->op = op;
	return 0;
}

/* Append a CF_OP_CF_END control-flow instruction; returns the result of
 * r600_bytecode_add_cfinst(). */
int cm_bytecode_add_cf_end(struct r600_bytecode *bc)
{
	return r600_bytecode_add_cfinst(bc, CF_OP_CF_END);
}

/* common to all 3 families */
/*
 * Encode the vertex fetch `vtx` into the four dwords bc->bytecode[id..id+3].
 * The mega-fetch fields are only emitted for chip classes below CAYMAN, the
 * buffer index mode (bits 21-22 of word 2) only for EVERGREEN and later;
 * the fourth dword is written as 0.  Always returns 0.
 */
static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id)
{
	/* word 0 */
	bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
			S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
			S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
			S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
	if (bc->chip_class < CAYMAN)
		bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
	id++;
	/* word 1 */
	bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
				S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
				S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
				S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
				S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
				S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
				S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
				S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
				S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
				S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
	/* word 2 */
	bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
				S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
	if (bc->chip_class >= EVERGREEN)
		bc->bytecode[id] |= ((vtx->buffer_index_mode & 0x3) << 21); // S_SQ_VTX_WORD2_BIM(vtx->buffer_index_mode);
	if (bc->chip_class < CAYMAN)
		bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
	id++;
	/* word 3 */
	bc->bytecode[id++] = 0;
	return 0;
}

/* common to all 3 families */
/*
 * Encode the texture instruction `tex` into the four dwords
 * bc->bytecode[id..id+3].  The hardware opcode is looked up for
 * bc->isa->hw_class.  The sampler and resource index modes (bits 27-28 and
 * 25-26 of word 0) are only emitted for EVERGREEN and later; the fourth
 * dword is written as 0.  Always returns 0.
 */
static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id)
{
	/* word 0 */
	bc->bytecode[id] = S_SQ_TEX_WORD0_TEX_INST(
				r600_isa_fetch_opcode(bc->isa->hw_class, tex->op)) |
			    EG_S_SQ_TEX_WORD0_INST_MOD(tex->inst_mod) |
				S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
				S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
				S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
	if (bc->chip_class >= EVERGREEN)
		bc->bytecode[id] |= ((tex->sampler_index_mode & 0x3) << 27) | // S_SQ_TEX_WORD0_SIM(tex->sampler_index_mode);
				((tex->resource_index_mode & 0x3) << 25); // S_SQ_TEX_WORD0_RIM(tex->resource_index_mode)
	id++;
	/* word 1 */
	bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
				S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
				S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
				S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
				S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
				S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
				S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
				S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
				S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
				S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
				S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
	/* word 2 */
	bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
				S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
				S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
				S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
				S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
				S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
				S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
				S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
	/* word 3 */
	bc->bytecode[id++] = 0;
	return 0;
}

/* r600 only, r700/eg bits in r700_asm.c */
/*
 * Encode the ALU instruction `alu` into the two dwords
 * bc->bytecode[id..id+1].  Word 0 carries sources 0 and 1 plus index mode,
 * predicate select and the `last` bit.  Word 1 uses the OP3 layout (with
 * source 2) when alu->is_op3 is set and the OP2 layout (abs modifiers,
 * write mask, omod, execute-mask/predicate updates) otherwise.
 * Always returns 0.
 */
static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id)
{
	unsigned opcode = r600_isa_alu_opcode(bc->isa->hw_class, alu->op);

	/* don't replace gpr by pv or ps for destination register */
	bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
				S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
				S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
				S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
				S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
				S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
				S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
				S_SQ_ALU_WORD0_PRED_SEL(alu->pred_sel) |
				S_SQ_ALU_WORD0_LAST(alu->last);

	if (alu->is_op3) {
		/* the OP3 word has no abs bits, so abs modifiers cannot be encoded */
		assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs);
		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
					S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
					S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
					S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
					S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
					S_SQ_ALU_WORD1_OP3_ALU_INST(opcode) |
					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
	} else {
		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
					S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
					S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
					S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
					S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
					S_SQ_ALU_WORD1_OP2_ALU_INST(opcode) |
					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
					S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->execute_mask) |
					S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->update_pred);
	}
	return 0;
}

/*
 * Write the two CF dwords of a fetch clause to bytecode[0..1]: the clause
 * address (cf->addr / 2), the R600 opcode for cf->op, the barrier bit and
 * the instruction count minus one (cf->ndw / 4 - 1).
 */
static void r600_bytecode_cf_vtx_build(uint32_t *bytecode, const struct r600_bytecode_cf *cf)
{
	*bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
	*bytecode++ = S_SQ_CF_WORD1_CF_INST(r600_isa_cf_opcode(ISA_CC_R600, cf->op)) |
			S_SQ_CF_WORD1_BARRIER(1) |
			S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
}

/* common for r600/r700 - eg in eg_asm.c */
static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
{
	unsigned id = cf->id;
	const struct cf_op_info *cfop = r600_isa_cf(cf->op);
	unsigned opcode = r600_isa_cf_opcode(bc->isa->hw_class, cf->op);


	if (cf->op == CF_NATIVE) {
		bc->bytecode[id++] = cf->isa[0];
		bc->bytecode[id++] = cf->isa[1];
	} else if (cfop->flags & CF_ALU) {
		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
			S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);

		bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(opcode) |
			S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
			S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
			S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
					S_SQ_CF_ALU_WORD1_BARRIER(1) |
					S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chip_class == R600 ? cf->r6xx_uses_waterfall : 0) |
					S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
	} else if (cfop->flags & CF_FETCH) {
		if (bc->chip_class == R700)
			r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
		else
			r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
	} else if (cfop->flags & CF_EXP) {
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr);
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) |
S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program); 1656 } else if (cfop->flags & CF_MEM) { 1657 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) | 1658 S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) | 1659 S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) | 1660 S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) | 1661 S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr); 1662 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) | 1663 S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) | 1664 S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) | 1665 S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program) | 1666 S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) | 1667 S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask); 1668 } else { 1669 bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1); 1670 bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(opcode) | 1671 S_SQ_CF_WORD1_BARRIER(1) | 1672 S_SQ_CF_WORD1_COND(cf->cond) | 1673 S_SQ_CF_WORD1_POP_COUNT(cf->pop_count) | 1674 S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program); 1675 } 1676 return 0; 1677 } 1678 1679 int r600_bytecode_build(struct r600_bytecode *bc) 1680 { 1681 struct r600_bytecode_cf *cf; 1682 struct r600_bytecode_alu *alu; 1683 struct r600_bytecode_vtx *vtx; 1684 struct r600_bytecode_tex *tex; 1685 struct r600_bytecode_gds *gds; 1686 uint32_t literal[4]; 1687 unsigned nliteral; 1688 unsigned addr; 1689 int i, r; 1690 1691 if (!bc->nstack) // If not 0, Stack_size already provided by llvm 1692 bc->nstack = bc->stack.max_entries; 1693 1694 if ((bc->type == PIPE_SHADER_VERTEX || bc->type == PIPE_SHADER_TESS_EVAL || bc->type == PIPE_SHADER_TESS_CTRL) && !bc->nstack) { 1695 bc->nstack = 1; 1696 } 1697 1698 /* first path compute addr of each CF block */ 1699 /* addr start after all the CF instructions */ 1700 addr = bc->cf_last->id + 2; 1701 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { 1702 if 
(r600_isa_cf(cf->op)->flags & CF_FETCH) { 1703 addr += 3; 1704 addr &= 0xFFFFFFFCUL; 1705 } 1706 cf->addr = addr; 1707 addr += cf->ndw; 1708 bc->ndw = cf->addr + cf->ndw; 1709 } 1710 free(bc->bytecode); 1711 bc->bytecode = calloc(4, bc->ndw); 1712 if (bc->bytecode == NULL) 1713 return -ENOMEM; 1714 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { 1715 const struct cf_op_info *cfop = r600_isa_cf(cf->op); 1716 addr = cf->addr; 1717 if (bc->chip_class >= EVERGREEN) 1718 r = eg_bytecode_cf_build(bc, cf); 1719 else 1720 r = r600_bytecode_cf_build(bc, cf); 1721 if (r) 1722 return r; 1723 if (cfop->flags & CF_ALU) { 1724 nliteral = 0; 1725 memset(literal, 0, sizeof(literal)); 1726 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) { 1727 r = r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral); 1728 if (r) 1729 return r; 1730 r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral); 1731 r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache); 1732 1733 switch(bc->chip_class) { 1734 case R600: 1735 r = r600_bytecode_alu_build(bc, alu, addr); 1736 break; 1737 case R700: 1738 r = r700_bytecode_alu_build(bc, alu, addr); 1739 break; 1740 case EVERGREEN: 1741 case CAYMAN: 1742 r = eg_bytecode_alu_build(bc, alu, addr); 1743 break; 1744 default: 1745 R600_ERR("unknown chip class %d.\n", bc->chip_class); 1746 return -EINVAL; 1747 } 1748 if (r) 1749 return r; 1750 addr += 2; 1751 if (alu->last) { 1752 for (i = 0; i < align(nliteral, 2); ++i) { 1753 bc->bytecode[addr++] = literal[i]; 1754 } 1755 nliteral = 0; 1756 memset(literal, 0, sizeof(literal)); 1757 } 1758 } 1759 } else if (cf->op == CF_OP_VTX) { 1760 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) { 1761 r = r600_bytecode_vtx_build(bc, vtx, addr); 1762 if (r) 1763 return r; 1764 addr += 4; 1765 } 1766 } else if (cf->op == CF_OP_GDS) { 1767 assert(bc->chip_class >= EVERGREEN); 1768 LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) { 1769 r = eg_bytecode_gds_build(bc, gds, addr); 1770 if (r) 1771 return r; 1772 addr += 4; 1773 } 1774 } else if (cf->op 
== CF_OP_TEX) { 1775 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) { 1776 assert(bc->chip_class >= EVERGREEN); 1777 r = r600_bytecode_vtx_build(bc, vtx, addr); 1778 if (r) 1779 return r; 1780 addr += 4; 1781 } 1782 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) { 1783 r = r600_bytecode_tex_build(bc, tex, addr); 1784 if (r) 1785 return r; 1786 addr += 4; 1787 } 1788 } 1789 } 1790 return 0; 1791 } 1792 1793 void r600_bytecode_clear(struct r600_bytecode *bc) 1794 { 1795 struct r600_bytecode_cf *cf = NULL, *next_cf; 1796 1797 free(bc->bytecode); 1798 bc->bytecode = NULL; 1799 1800 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) { 1801 struct r600_bytecode_alu *alu = NULL, *next_alu; 1802 struct r600_bytecode_tex *tex = NULL, *next_tex; 1803 struct r600_bytecode_tex *vtx = NULL, *next_vtx; 1804 struct r600_bytecode_gds *gds = NULL, *next_gds; 1805 1806 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) { 1807 free(alu); 1808 } 1809 1810 LIST_INITHEAD(&cf->alu); 1811 1812 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) { 1813 free(tex); 1814 } 1815 1816 LIST_INITHEAD(&cf->tex); 1817 1818 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) { 1819 free(vtx); 1820 } 1821 1822 LIST_INITHEAD(&cf->vtx); 1823 1824 LIST_FOR_EACH_ENTRY_SAFE(gds, next_gds, &cf->gds, list) { 1825 free(gds); 1826 } 1827 1828 LIST_INITHEAD(&cf->gds); 1829 1830 free(cf); 1831 } 1832 1833 LIST_INITHEAD(&cf->list); 1834 } 1835 1836 static int print_swizzle(unsigned swz) 1837 { 1838 const char * swzchars = "xyzw01?_"; 1839 assert(swz<8 && swz != 6); 1840 return fprintf(stderr, "%c", swzchars[swz]); 1841 } 1842 1843 static int print_sel(unsigned sel, unsigned rel, unsigned index_mode, 1844 unsigned need_brackets) 1845 { 1846 int o = 0; 1847 if (rel && index_mode >= 5 && sel < 128) 1848 o += fprintf(stderr, "G"); 1849 if (rel || need_brackets) { 1850 o += fprintf(stderr, "["); 1851 } 1852 o += fprintf(stderr, "%d", sel); 1853 if (rel) { 1854 if (index_mode == 0 || index_mode == 6) 1855 o += 
fprintf(stderr, "+AR"); 1856 else if (index_mode == 4) 1857 o += fprintf(stderr, "+AL"); 1858 } 1859 if (rel || need_brackets) { 1860 o += fprintf(stderr, "]"); 1861 } 1862 return o; 1863 } 1864 1865 static int print_dst(struct r600_bytecode_alu *alu) 1866 { 1867 int o = 0; 1868 unsigned sel = alu->dst.sel; 1869 char reg_char = 'R'; 1870 if (sel > 128 - 4) { /* clause temporary gpr */ 1871 sel -= 128 - 4; 1872 reg_char = 'T'; 1873 } 1874 1875 if (alu_writes(alu)) { 1876 o += fprintf(stderr, "%c", reg_char); 1877 o += print_sel(alu->dst.sel, alu->dst.rel, alu->index_mode, 0); 1878 } else { 1879 o += fprintf(stderr, "__"); 1880 } 1881 o += fprintf(stderr, "."); 1882 o += print_swizzle(alu->dst.chan); 1883 return o; 1884 } 1885 1886 static int print_src(struct r600_bytecode_alu *alu, unsigned idx) 1887 { 1888 int o = 0; 1889 struct r600_bytecode_alu_src *src = &alu->src[idx]; 1890 unsigned sel = src->sel, need_sel = 1, need_chan = 1, need_brackets = 0; 1891 1892 if (src->neg) 1893 o += fprintf(stderr,"-"); 1894 if (src->abs) 1895 o += fprintf(stderr,"|"); 1896 1897 if (sel < 128 - 4) { 1898 o += fprintf(stderr, "R"); 1899 } else if (sel < 128) { 1900 o += fprintf(stderr, "T"); 1901 sel -= 128 - 4; 1902 } else if (sel < 160) { 1903 o += fprintf(stderr, "KC0"); 1904 need_brackets = 1; 1905 sel -= 128; 1906 } else if (sel < 192) { 1907 o += fprintf(stderr, "KC1"); 1908 need_brackets = 1; 1909 sel -= 160; 1910 } else if (sel >= 512) { 1911 o += fprintf(stderr, "C%d", src->kc_bank); 1912 need_brackets = 1; 1913 sel -= 512; 1914 } else if (sel >= 448) { 1915 o += fprintf(stderr, "Param"); 1916 sel -= 448; 1917 need_chan = 0; 1918 } else if (sel >= 288) { 1919 o += fprintf(stderr, "KC3"); 1920 need_brackets = 1; 1921 sel -= 288; 1922 } else if (sel >= 256) { 1923 o += fprintf(stderr, "KC2"); 1924 need_brackets = 1; 1925 sel -= 256; 1926 } else { 1927 need_sel = 0; 1928 need_chan = 0; 1929 switch (sel) { 1930 case EG_V_SQ_ALU_SRC_LDS_DIRECT_A: 1931 o += fprintf(stderr, 
"LDS_A[0x%08X]", src->value); 1932 break; 1933 case EG_V_SQ_ALU_SRC_LDS_DIRECT_B: 1934 o += fprintf(stderr, "LDS_B[0x%08X]", src->value); 1935 break; 1936 case EG_V_SQ_ALU_SRC_LDS_OQ_A: 1937 o += fprintf(stderr, "LDS_OQ_A"); 1938 need_chan = 1; 1939 break; 1940 case EG_V_SQ_ALU_SRC_LDS_OQ_B: 1941 o += fprintf(stderr, "LDS_OQ_B"); 1942 need_chan = 1; 1943 break; 1944 case EG_V_SQ_ALU_SRC_LDS_OQ_A_POP: 1945 o += fprintf(stderr, "LDS_OQ_A_POP"); 1946 need_chan = 1; 1947 break; 1948 case EG_V_SQ_ALU_SRC_LDS_OQ_B_POP: 1949 o += fprintf(stderr, "LDS_OQ_B_POP"); 1950 need_chan = 1; 1951 break; 1952 case V_SQ_ALU_SRC_PS: 1953 o += fprintf(stderr, "PS"); 1954 break; 1955 case V_SQ_ALU_SRC_PV: 1956 o += fprintf(stderr, "PV"); 1957 need_chan = 1; 1958 break; 1959 case V_SQ_ALU_SRC_LITERAL: 1960 o += fprintf(stderr, "[0x%08X %f]", src->value, u_bitcast_u2f(src->value)); 1961 break; 1962 case V_SQ_ALU_SRC_0_5: 1963 o += fprintf(stderr, "0.5"); 1964 break; 1965 case V_SQ_ALU_SRC_M_1_INT: 1966 o += fprintf(stderr, "-1"); 1967 break; 1968 case V_SQ_ALU_SRC_1_INT: 1969 o += fprintf(stderr, "1"); 1970 break; 1971 case V_SQ_ALU_SRC_1: 1972 o += fprintf(stderr, "1.0"); 1973 break; 1974 case V_SQ_ALU_SRC_0: 1975 o += fprintf(stderr, "0"); 1976 break; 1977 default: 1978 o += fprintf(stderr, "??IMM_%d", sel); 1979 break; 1980 } 1981 } 1982 1983 if (need_sel) 1984 o += print_sel(sel, src->rel, alu->index_mode, need_brackets); 1985 1986 if (need_chan) { 1987 o += fprintf(stderr, "."); 1988 o += print_swizzle(src->chan); 1989 } 1990 1991 if (src->abs) 1992 o += fprintf(stderr,"|"); 1993 1994 return o; 1995 } 1996 1997 static int print_indent(int p, int c) 1998 { 1999 int o = 0; 2000 while (p++ < c) 2001 o += fprintf(stderr, " "); 2002 return o; 2003 } 2004 2005 void r600_bytecode_disasm(struct r600_bytecode *bc) 2006 { 2007 const char *index_mode[] = {"CF_INDEX_NONE", "CF_INDEX_0", "CF_INDEX_1"}; 2008 static int index = 0; 2009 struct r600_bytecode_cf *cf = NULL; 2010 struct 
r600_bytecode_alu *alu = NULL; 2011 struct r600_bytecode_vtx *vtx = NULL; 2012 struct r600_bytecode_tex *tex = NULL; 2013 struct r600_bytecode_gds *gds = NULL; 2014 2015 unsigned i, id, ngr = 0, last; 2016 uint32_t literal[4]; 2017 unsigned nliteral; 2018 char chip = '6'; 2019 2020 switch (bc->chip_class) { 2021 case R700: 2022 chip = '7'; 2023 break; 2024 case EVERGREEN: 2025 chip = 'E'; 2026 break; 2027 case CAYMAN: 2028 chip = 'C'; 2029 break; 2030 case R600: 2031 default: 2032 chip = '6'; 2033 break; 2034 } 2035 fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n", 2036 bc->ndw, bc->ngpr, bc->nstack); 2037 fprintf(stderr, "shader %d -- %c\n", index++, chip); 2038 2039 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { 2040 id = cf->id; 2041 if (cf->op == CF_NATIVE) { 2042 fprintf(stderr, "%04d %08X %08X CF_NATIVE\n", id, bc->bytecode[id], 2043 bc->bytecode[id + 1]); 2044 } else { 2045 const struct cf_op_info *cfop = r600_isa_cf(cf->op); 2046 if (cfop->flags & CF_ALU) { 2047 if (cf->eg_alu_extended) { 2048 fprintf(stderr, "%04d %08X %08X %s\n", id, bc->bytecode[id], 2049 bc->bytecode[id + 1], "ALU_EXT"); 2050 id += 2; 2051 } 2052 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2053 bc->bytecode[id + 1], cfop->name); 2054 fprintf(stderr, "%d @%d ", cf->ndw / 2, cf->addr); 2055 for (i = 0; i < 4; ++i) { 2056 if (cf->kcache[i].mode) { 2057 int c_start = (cf->kcache[i].addr << 4); 2058 int c_end = c_start + (cf->kcache[i].mode << 4); 2059 fprintf(stderr, "KC%d[CB%d:%d-%d%s%s] ", 2060 i, cf->kcache[i].bank, c_start, c_end, 2061 cf->kcache[i].index_mode ? " " : "", 2062 cf->kcache[i].index_mode ? 
index_mode[cf->kcache[i].index_mode] : ""); 2063 } 2064 } 2065 fprintf(stderr, "\n"); 2066 } else if (cfop->flags & CF_FETCH) { 2067 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2068 bc->bytecode[id + 1], cfop->name); 2069 fprintf(stderr, "%d @%d ", cf->ndw / 4, cf->addr); 2070 fprintf(stderr, "\n"); 2071 } else if (cfop->flags & CF_EXP) { 2072 int o = 0; 2073 const char *exp_type[] = {"PIXEL", "POS ", "PARAM"}; 2074 o += fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2075 bc->bytecode[id + 1], cfop->name); 2076 o += print_indent(o, 43); 2077 o += fprintf(stderr, "%s ", exp_type[cf->output.type]); 2078 if (cf->output.burst_count > 1) { 2079 o += fprintf(stderr, "%d-%d ", cf->output.array_base, 2080 cf->output.array_base + cf->output.burst_count - 1); 2081 2082 o += print_indent(o, 55); 2083 o += fprintf(stderr, "R%d-%d.", cf->output.gpr, 2084 cf->output.gpr + cf->output.burst_count - 1); 2085 } else { 2086 o += fprintf(stderr, "%d ", cf->output.array_base); 2087 o += print_indent(o, 55); 2088 o += fprintf(stderr, "R%d.", cf->output.gpr); 2089 } 2090 2091 o += print_swizzle(cf->output.swizzle_x); 2092 o += print_swizzle(cf->output.swizzle_y); 2093 o += print_swizzle(cf->output.swizzle_z); 2094 o += print_swizzle(cf->output.swizzle_w); 2095 2096 print_indent(o, 67); 2097 2098 fprintf(stderr, " ES:%X ", cf->output.elem_size); 2099 if (!cf->barrier) 2100 fprintf(stderr, "NO_BARRIER "); 2101 if (cf->end_of_program) 2102 fprintf(stderr, "EOP "); 2103 fprintf(stderr, "\n"); 2104 } else if (r600_isa_cf(cf->op)->flags & CF_MEM) { 2105 int o = 0; 2106 const char *exp_type[] = {"WRITE", "WRITE_IND", "WRITE_ACK", 2107 "WRITE_IND_ACK"}; 2108 o += fprintf(stderr, "%04d %08X %08X %s ", id, 2109 bc->bytecode[id], bc->bytecode[id + 1], cfop->name); 2110 o += print_indent(o, 43); 2111 o += fprintf(stderr, "%s ", exp_type[cf->output.type]); 2112 if (cf->output.burst_count > 1) { 2113 o += fprintf(stderr, "%d-%d ", cf->output.array_base, 2114 
cf->output.array_base + cf->output.burst_count - 1); 2115 o += print_indent(o, 55); 2116 o += fprintf(stderr, "R%d-%d.", cf->output.gpr, 2117 cf->output.gpr + cf->output.burst_count - 1); 2118 } else { 2119 o += fprintf(stderr, "%d ", cf->output.array_base); 2120 o += print_indent(o, 55); 2121 o += fprintf(stderr, "R%d.", cf->output.gpr); 2122 } 2123 for (i = 0; i < 4; ++i) { 2124 if (cf->output.comp_mask & (1 << i)) 2125 o += print_swizzle(i); 2126 else 2127 o += print_swizzle(7); 2128 } 2129 2130 if (cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND) 2131 o += fprintf(stderr, " R%d", cf->output.index_gpr); 2132 2133 o += print_indent(o, 67); 2134 2135 fprintf(stderr, " ES:%i ", cf->output.elem_size); 2136 if (cf->output.array_size != 0xFFF) 2137 fprintf(stderr, "AS:%i ", cf->output.array_size); 2138 if (!cf->barrier) 2139 fprintf(stderr, "NO_BARRIER "); 2140 if (cf->end_of_program) 2141 fprintf(stderr, "EOP "); 2142 fprintf(stderr, "\n"); 2143 } else { 2144 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2145 bc->bytecode[id + 1], cfop->name); 2146 fprintf(stderr, "@%d ", cf->cf_addr); 2147 if (cf->cond) 2148 fprintf(stderr, "CND:%X ", cf->cond); 2149 if (cf->pop_count) 2150 fprintf(stderr, "POP:%X ", cf->pop_count); 2151 if (cf->count && (cfop->flags & CF_EMIT)) 2152 fprintf(stderr, "STREAM%d ", cf->count); 2153 if (cf->end_of_program) 2154 fprintf(stderr, "EOP "); 2155 fprintf(stderr, "\n"); 2156 } 2157 } 2158 2159 id = cf->addr; 2160 nliteral = 0; 2161 last = 1; 2162 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) { 2163 const char *omod_str[] = {"","*2","*4","/2"}; 2164 const struct alu_op_info *aop = r600_isa_alu(alu->op); 2165 int o = 0; 2166 2167 r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral); 2168 o += fprintf(stderr, " %04d %08X %08X ", id, bc->bytecode[id], bc->bytecode[id+1]); 2169 if (last) 2170 o += fprintf(stderr, "%4d ", ++ngr); 2171 else 2172 o += fprintf(stderr, " "); 2173 o += fprintf(stderr, "%c%c %c ", 
alu->execute_mask ? 'M':' ', 2174 alu->update_pred ? 'P':' ', 2175 alu->pred_sel ? alu->pred_sel==2 ? '0':'1':' '); 2176 2177 o += fprintf(stderr, "%s%s%s ", aop->name, 2178 omod_str[alu->omod], alu->dst.clamp ? "_sat":""); 2179 2180 o += print_indent(o,60); 2181 o += print_dst(alu); 2182 for (i = 0; i < aop->src_count; ++i) { 2183 o += fprintf(stderr, i == 0 ? ", ": ", "); 2184 o += print_src(alu, i); 2185 } 2186 2187 if (alu->bank_swizzle) { 2188 o += print_indent(o,75); 2189 o += fprintf(stderr, " BS:%d", alu->bank_swizzle); 2190 } 2191 2192 fprintf(stderr, "\n"); 2193 id += 2; 2194 2195 if (alu->last) { 2196 for (i = 0; i < nliteral; i++, id++) { 2197 float *f = (float*)(bc->bytecode + id); 2198 o = fprintf(stderr, " %04d %08X", id, bc->bytecode[id]); 2199 print_indent(o, 60); 2200 fprintf(stderr, " %f (%d)\n", *f, *(bc->bytecode + id)); 2201 } 2202 id += nliteral & 1; 2203 nliteral = 0; 2204 } 2205 last = alu->last; 2206 } 2207 2208 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) { 2209 int o = 0; 2210 o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2211 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2212 2213 o += fprintf(stderr, "%s ", r600_isa_fetch(tex->op)->name); 2214 2215 o += print_indent(o, 50); 2216 2217 o += fprintf(stderr, "R%d.", tex->dst_gpr); 2218 o += print_swizzle(tex->dst_sel_x); 2219 o += print_swizzle(tex->dst_sel_y); 2220 o += print_swizzle(tex->dst_sel_z); 2221 o += print_swizzle(tex->dst_sel_w); 2222 2223 o += fprintf(stderr, ", R%d.", tex->src_gpr); 2224 o += print_swizzle(tex->src_sel_x); 2225 o += print_swizzle(tex->src_sel_y); 2226 o += print_swizzle(tex->src_sel_z); 2227 o += print_swizzle(tex->src_sel_w); 2228 2229 o += fprintf(stderr, ", RID:%d", tex->resource_id); 2230 o += fprintf(stderr, ", SID:%d ", tex->sampler_id); 2231 2232 if (tex->sampler_index_mode) 2233 fprintf(stderr, "SQ_%s ", index_mode[tex->sampler_index_mode]); 2234 2235 if (tex->lod_bias) 2236 fprintf(stderr, "LB:%d ", tex->lod_bias); 2237 2238 
fprintf(stderr, "CT:%c%c%c%c ", 2239 tex->coord_type_x ? 'N' : 'U', 2240 tex->coord_type_y ? 'N' : 'U', 2241 tex->coord_type_z ? 'N' : 'U', 2242 tex->coord_type_w ? 'N' : 'U'); 2243 2244 if (tex->offset_x) 2245 fprintf(stderr, "OX:%d ", tex->offset_x); 2246 if (tex->offset_y) 2247 fprintf(stderr, "OY:%d ", tex->offset_y); 2248 if (tex->offset_z) 2249 fprintf(stderr, "OZ:%d ", tex->offset_z); 2250 2251 id += 4; 2252 fprintf(stderr, "\n"); 2253 } 2254 2255 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) { 2256 int o = 0; 2257 const char * fetch_type[] = {"VERTEX", "INSTANCE", ""}; 2258 o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2259 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2260 2261 o += fprintf(stderr, "%s ", r600_isa_fetch(vtx->op)->name); 2262 2263 o += print_indent(o, 50); 2264 2265 o += fprintf(stderr, "R%d.", vtx->dst_gpr); 2266 o += print_swizzle(vtx->dst_sel_x); 2267 o += print_swizzle(vtx->dst_sel_y); 2268 o += print_swizzle(vtx->dst_sel_z); 2269 o += print_swizzle(vtx->dst_sel_w); 2270 2271 o += fprintf(stderr, ", R%d.", vtx->src_gpr); 2272 o += print_swizzle(vtx->src_sel_x); 2273 2274 if (vtx->offset) 2275 fprintf(stderr, " +%db", vtx->offset); 2276 2277 o += print_indent(o, 55); 2278 2279 fprintf(stderr, ", RID:%d ", vtx->buffer_id); 2280 2281 fprintf(stderr, "%s ", fetch_type[vtx->fetch_type]); 2282 2283 if (bc->chip_class < CAYMAN && vtx->mega_fetch_count) 2284 fprintf(stderr, "MFC:%d ", vtx->mega_fetch_count); 2285 2286 if (bc->chip_class >= EVERGREEN && vtx->buffer_index_mode) 2287 fprintf(stderr, "SQ_%s ", index_mode[vtx->buffer_index_mode]); 2288 2289 fprintf(stderr, "UCF:%d ", vtx->use_const_fields); 2290 fprintf(stderr, "FMT(DTA:%d ", vtx->data_format); 2291 fprintf(stderr, "NUM:%d ", vtx->num_format_all); 2292 fprintf(stderr, "COMP:%d ", vtx->format_comp_all); 2293 fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all); 2294 2295 id += 4; 2296 } 2297 2298 LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) { 2299 int o = 0; 2300 o += 
fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2301 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2302 2303 o += fprintf(stderr, "%s ", r600_isa_fetch(gds->op)->name); 2304 2305 if (gds->op != FETCH_OP_TF_WRITE) { 2306 o += fprintf(stderr, "R%d.", gds->dst_gpr); 2307 o += print_swizzle(gds->dst_sel_x); 2308 o += print_swizzle(gds->dst_sel_y); 2309 o += print_swizzle(gds->dst_sel_z); 2310 o += print_swizzle(gds->dst_sel_w); 2311 } 2312 2313 o += fprintf(stderr, ", R%d.", gds->src_gpr); 2314 o += print_swizzle(gds->src_sel_x); 2315 o += print_swizzle(gds->src_sel_y); 2316 o += print_swizzle(gds->src_sel_z); 2317 2318 if (gds->op != FETCH_OP_TF_WRITE) { 2319 o += fprintf(stderr, ", R%d.", gds->src_gpr2); 2320 } 2321 fprintf(stderr, "\n"); 2322 id += 4; 2323 } 2324 } 2325 2326 fprintf(stderr, "--------------------------------------\n"); 2327 } 2328 2329 void r600_vertex_data_type(enum pipe_format pformat, 2330 unsigned *format, 2331 unsigned *num_format, unsigned *format_comp, unsigned *endian) 2332 { 2333 const struct util_format_description *desc; 2334 unsigned i; 2335 2336 *format = 0; 2337 *num_format = 0; 2338 *format_comp = 0; 2339 *endian = ENDIAN_NONE; 2340 2341 if (pformat == PIPE_FORMAT_R11G11B10_FLOAT) { 2342 *format = FMT_10_11_11_FLOAT; 2343 *endian = r600_endian_swap(32); 2344 return; 2345 } 2346 2347 if (pformat == PIPE_FORMAT_B5G6R5_UNORM) { 2348 *format = FMT_5_6_5; 2349 *endian = r600_endian_swap(16); 2350 return; 2351 } 2352 2353 desc = util_format_description(pformat); 2354 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) { 2355 goto out_unknown; 2356 } 2357 2358 /* Find the first non-VOID channel. 
*/ 2359 for (i = 0; i < 4; i++) { 2360 if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) { 2361 break; 2362 } 2363 } 2364 2365 *endian = r600_endian_swap(desc->channel[i].size); 2366 2367 switch (desc->channel[i].type) { 2368 /* Half-floats, floats, ints */ 2369 case UTIL_FORMAT_TYPE_FLOAT: 2370 switch (desc->channel[i].size) { 2371 case 16: 2372 switch (desc->nr_channels) { 2373 case 1: 2374 *format = FMT_16_FLOAT; 2375 break; 2376 case 2: 2377 *format = FMT_16_16_FLOAT; 2378 break; 2379 case 3: 2380 case 4: 2381 *format = FMT_16_16_16_16_FLOAT; 2382 break; 2383 } 2384 break; 2385 case 32: 2386 switch (desc->nr_channels) { 2387 case 1: 2388 *format = FMT_32_FLOAT; 2389 break; 2390 case 2: 2391 *format = FMT_32_32_FLOAT; 2392 break; 2393 case 3: 2394 *format = FMT_32_32_32_FLOAT; 2395 break; 2396 case 4: 2397 *format = FMT_32_32_32_32_FLOAT; 2398 break; 2399 } 2400 break; 2401 default: 2402 goto out_unknown; 2403 } 2404 break; 2405 /* Unsigned ints */ 2406 case UTIL_FORMAT_TYPE_UNSIGNED: 2407 /* Signed ints */ 2408 case UTIL_FORMAT_TYPE_SIGNED: 2409 switch (desc->channel[i].size) { 2410 case 8: 2411 switch (desc->nr_channels) { 2412 case 1: 2413 *format = FMT_8; 2414 break; 2415 case 2: 2416 *format = FMT_8_8; 2417 break; 2418 case 3: 2419 case 4: 2420 *format = FMT_8_8_8_8; 2421 break; 2422 } 2423 break; 2424 case 10: 2425 if (desc->nr_channels != 4) 2426 goto out_unknown; 2427 2428 *format = FMT_2_10_10_10; 2429 break; 2430 case 16: 2431 switch (desc->nr_channels) { 2432 case 1: 2433 *format = FMT_16; 2434 break; 2435 case 2: 2436 *format = FMT_16_16; 2437 break; 2438 case 3: 2439 case 4: 2440 *format = FMT_16_16_16_16; 2441 break; 2442 } 2443 break; 2444 case 32: 2445 switch (desc->nr_channels) { 2446 case 1: 2447 *format = FMT_32; 2448 break; 2449 case 2: 2450 *format = FMT_32_32; 2451 break; 2452 case 3: 2453 *format = FMT_32_32_32; 2454 break; 2455 case 4: 2456 *format = FMT_32_32_32_32; 2457 break; 2458 } 2459 break; 2460 default: 2461 goto out_unknown; 
2462 } 2463 break; 2464 default: 2465 goto out_unknown; 2466 } 2467 2468 if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { 2469 *format_comp = 1; 2470 } 2471 2472 *num_format = 0; 2473 if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED || 2474 desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { 2475 if (!desc->channel[i].normalized) { 2476 if (desc->channel[i].pure_integer) 2477 *num_format = 1; 2478 else 2479 *num_format = 2; 2480 } 2481 } 2482 return; 2483 out_unknown: 2484 R600_ERR("unsupported vertex format %s\n", util_format_name(pformat)); 2485 } 2486 2487 void *r600_create_vertex_fetch_shader(struct pipe_context *ctx, 2488 unsigned count, 2489 const struct pipe_vertex_element *elements) 2490 { 2491 struct r600_context *rctx = (struct r600_context *)ctx; 2492 struct r600_bytecode bc; 2493 struct r600_bytecode_vtx vtx; 2494 const struct util_format_description *desc; 2495 unsigned fetch_resource_start = rctx->b.chip_class >= EVERGREEN ? 0 : 160; 2496 unsigned format, num_format, format_comp, endian; 2497 uint32_t *bytecode; 2498 int i, j, r, fs_size; 2499 struct r600_fetch_shader *shader; 2500 unsigned no_sb = rctx->screen->b.debug_flags & DBG_NO_SB; 2501 unsigned sb_disasm = !no_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM); 2502 2503 assert(count < 32); 2504 2505 memset(&bc, 0, sizeof(bc)); 2506 r600_bytecode_init(&bc, rctx->b.chip_class, rctx->b.family, 2507 rctx->screen->has_compressed_msaa_texturing); 2508 2509 bc.isa = rctx->isa; 2510 2511 for (i = 0; i < count; i++) { 2512 if (elements[i].instance_divisor > 1) { 2513 if (rctx->b.chip_class == CAYMAN) { 2514 for (j = 0; j < 4; j++) { 2515 struct r600_bytecode_alu alu; 2516 memset(&alu, 0, sizeof(alu)); 2517 alu.op = ALU_OP2_MULHI_UINT; 2518 alu.src[0].sel = 0; 2519 alu.src[0].chan = 3; 2520 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2521 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1; 2522 alu.dst.sel = i + 1; 2523 alu.dst.chan = j; 2524 alu.dst.write = j == 3; 2525 
alu.last = j == 3; 2526 if ((r = r600_bytecode_add_alu(&bc, &alu))) { 2527 r600_bytecode_clear(&bc); 2528 return NULL; 2529 } 2530 } 2531 } else { 2532 struct r600_bytecode_alu alu; 2533 memset(&alu, 0, sizeof(alu)); 2534 alu.op = ALU_OP2_MULHI_UINT; 2535 alu.src[0].sel = 0; 2536 alu.src[0].chan = 3; 2537 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2538 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1; 2539 alu.dst.sel = i + 1; 2540 alu.dst.chan = 3; 2541 alu.dst.write = 1; 2542 alu.last = 1; 2543 if ((r = r600_bytecode_add_alu(&bc, &alu))) { 2544 r600_bytecode_clear(&bc); 2545 return NULL; 2546 } 2547 } 2548 } 2549 } 2550 2551 for (i = 0; i < count; i++) { 2552 r600_vertex_data_type(elements[i].src_format, 2553 &format, &num_format, &format_comp, &endian); 2554 2555 desc = util_format_description(elements[i].src_format); 2556 if (!desc) { 2557 r600_bytecode_clear(&bc); 2558 R600_ERR("unknown format %d\n", elements[i].src_format); 2559 return NULL; 2560 } 2561 2562 if (elements[i].src_offset > 65535) { 2563 r600_bytecode_clear(&bc); 2564 R600_ERR("too big src_offset: %u\n", elements[i].src_offset); 2565 return NULL; 2566 } 2567 2568 memset(&vtx, 0, sizeof(vtx)); 2569 vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start; 2570 vtx.fetch_type = elements[i].instance_divisor ? SQ_VTX_FETCH_INSTANCE_DATA : SQ_VTX_FETCH_VERTEX_DATA; 2571 vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0; 2572 vtx.src_sel_x = elements[i].instance_divisor ? 
3 : 0; 2573 vtx.mega_fetch_count = 0x1F; 2574 vtx.dst_gpr = i + 1; 2575 vtx.dst_sel_x = desc->swizzle[0]; 2576 vtx.dst_sel_y = desc->swizzle[1]; 2577 vtx.dst_sel_z = desc->swizzle[2]; 2578 vtx.dst_sel_w = desc->swizzle[3]; 2579 vtx.data_format = format; 2580 vtx.num_format_all = num_format; 2581 vtx.format_comp_all = format_comp; 2582 vtx.offset = elements[i].src_offset; 2583 vtx.endian = endian; 2584 2585 if ((r = r600_bytecode_add_vtx(&bc, &vtx))) { 2586 r600_bytecode_clear(&bc); 2587 return NULL; 2588 } 2589 } 2590 2591 r600_bytecode_add_cfinst(&bc, CF_OP_RET); 2592 2593 if ((r = r600_bytecode_build(&bc))) { 2594 r600_bytecode_clear(&bc); 2595 return NULL; 2596 } 2597 2598 if (rctx->screen->b.debug_flags & DBG_FS) { 2599 fprintf(stderr, "--------------------------------------------------------------\n"); 2600 fprintf(stderr, "Vertex elements state:\n"); 2601 for (i = 0; i < count; i++) { 2602 fprintf(stderr, " "); 2603 util_dump_vertex_element(stderr, elements+i); 2604 fprintf(stderr, "\n"); 2605 } 2606 2607 if (!sb_disasm) { 2608 r600_bytecode_disasm(&bc); 2609 2610 fprintf(stderr, "______________________________________________________________\n"); 2611 } else { 2612 r600_sb_bytecode_process(rctx, &bc, NULL, 1 /*dump*/, 0 /*optimize*/); 2613 } 2614 } 2615 2616 fs_size = bc.ndw*4; 2617 2618 /* Allocate the CSO. 
*/ 2619 shader = CALLOC_STRUCT(r600_fetch_shader); 2620 if (!shader) { 2621 r600_bytecode_clear(&bc); 2622 return NULL; 2623 } 2624 2625 u_suballocator_alloc(rctx->allocator_fetch_shader, fs_size, 256, 2626 &shader->offset, 2627 (struct pipe_resource**)&shader->buffer); 2628 if (!shader->buffer) { 2629 r600_bytecode_clear(&bc); 2630 FREE(shader); 2631 return NULL; 2632 } 2633 2634 bytecode = r600_buffer_map_sync_with_rings(&rctx->b, shader->buffer, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED); 2635 bytecode += shader->offset / 4; 2636 2637 if (R600_BIG_ENDIAN) { 2638 for (i = 0; i < fs_size / 4; ++i) { 2639 bytecode[i] = util_cpu_to_le32(bc.bytecode[i]); 2640 } 2641 } else { 2642 memcpy(bytecode, bc.bytecode, fs_size); 2643 } 2644 rctx->b.ws->buffer_unmap(shader->buffer->buf); 2645 2646 r600_bytecode_clear(&bc); 2647 return shader; 2648 } 2649 2650 void r600_bytecode_alu_read(struct r600_bytecode *bc, 2651 struct r600_bytecode_alu *alu, uint32_t word0, uint32_t word1) 2652 { 2653 /* WORD0 */ 2654 alu->src[0].sel = G_SQ_ALU_WORD0_SRC0_SEL(word0); 2655 alu->src[0].rel = G_SQ_ALU_WORD0_SRC0_REL(word0); 2656 alu->src[0].chan = G_SQ_ALU_WORD0_SRC0_CHAN(word0); 2657 alu->src[0].neg = G_SQ_ALU_WORD0_SRC0_NEG(word0); 2658 alu->src[1].sel = G_SQ_ALU_WORD0_SRC1_SEL(word0); 2659 alu->src[1].rel = G_SQ_ALU_WORD0_SRC1_REL(word0); 2660 alu->src[1].chan = G_SQ_ALU_WORD0_SRC1_CHAN(word0); 2661 alu->src[1].neg = G_SQ_ALU_WORD0_SRC1_NEG(word0); 2662 alu->index_mode = G_SQ_ALU_WORD0_INDEX_MODE(word0); 2663 alu->pred_sel = G_SQ_ALU_WORD0_PRED_SEL(word0); 2664 alu->last = G_SQ_ALU_WORD0_LAST(word0); 2665 2666 /* WORD1 */ 2667 alu->bank_swizzle = G_SQ_ALU_WORD1_BANK_SWIZZLE(word1); 2668 if (alu->bank_swizzle) 2669 alu->bank_swizzle_force = alu->bank_swizzle; 2670 alu->dst.sel = G_SQ_ALU_WORD1_DST_GPR(word1); 2671 alu->dst.rel = G_SQ_ALU_WORD1_DST_REL(word1); 2672 alu->dst.chan = G_SQ_ALU_WORD1_DST_CHAN(word1); 2673 alu->dst.clamp = G_SQ_ALU_WORD1_CLAMP(word1); 2674 if 
(G_SQ_ALU_WORD1_ENCODING(word1)) /*ALU_DWORD1_OP3*/ 2675 { 2676 alu->is_op3 = 1; 2677 alu->src[2].sel = G_SQ_ALU_WORD1_OP3_SRC2_SEL(word1); 2678 alu->src[2].rel = G_SQ_ALU_WORD1_OP3_SRC2_REL(word1); 2679 alu->src[2].chan = G_SQ_ALU_WORD1_OP3_SRC2_CHAN(word1); 2680 alu->src[2].neg = G_SQ_ALU_WORD1_OP3_SRC2_NEG(word1); 2681 alu->op = r600_isa_alu_by_opcode(bc->isa, 2682 G_SQ_ALU_WORD1_OP3_ALU_INST(word1), /* is_op3 = */ 1); 2683 2684 } 2685 else /*ALU_DWORD1_OP2*/ 2686 { 2687 alu->src[0].abs = G_SQ_ALU_WORD1_OP2_SRC0_ABS(word1); 2688 alu->src[1].abs = G_SQ_ALU_WORD1_OP2_SRC1_ABS(word1); 2689 alu->op = r600_isa_alu_by_opcode(bc->isa, 2690 G_SQ_ALU_WORD1_OP2_ALU_INST(word1), /* is_op3 = */ 0); 2691 alu->omod = G_SQ_ALU_WORD1_OP2_OMOD(word1); 2692 alu->dst.write = G_SQ_ALU_WORD1_OP2_WRITE_MASK(word1); 2693 alu->update_pred = G_SQ_ALU_WORD1_OP2_UPDATE_PRED(word1); 2694 alu->execute_mask = 2695 G_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(word1); 2696 } 2697 } 2698 2699 #if 0 2700 void r600_bytecode_export_read(struct r600_bytecode *bc, 2701 struct r600_bytecode_output *output, uint32_t word0, uint32_t word1) 2702 { 2703 output->array_base = G_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(word0); 2704 output->type = G_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(word0); 2705 output->gpr = G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(word0); 2706 output->elem_size = G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(word0); 2707 2708 output->swizzle_x = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(word1); 2709 output->swizzle_y = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(word1); 2710 output->swizzle_z = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(word1); 2711 output->swizzle_w = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(word1); 2712 output->burst_count = G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(word1); 2713 output->end_of_program = G_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(word1); 2714 output->op = r600_isa_cf_by_opcode(bc->isa, 2715 G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(word1), 0); 2716 output->barrier = G_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(word1); 2717 
output->array_size = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(word1); 2718 output->comp_mask = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(word1); 2719 } 2720 #endif 2721