/* Copyright 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
#include "common/gen_debug.h"

using namespace brw;

static void
generate_math1_gen4(struct brw_codegen *p,
                    vec4_instruction *inst,
                    struct brw_reg dst,
                    struct brw_reg src)
{
   gen4_math(p,
             dst,
             brw_math_function(inst->opcode),
             inst->base_mrf,
             src,
             BRW_MATH_PRECISION_FULL);
}

static void
check_gen6_math_src_arg(struct brw_reg src)
{
   /* Source swizzles are ignored. */
   assert(!src.abs);
   assert(!src.negate);
   assert(src.swizzle == BRW_SWIZZLE_XYZW);
}

static void
generate_math_gen6(struct brw_codegen *p,
                   vec4_instruction *inst,
                   struct brw_reg dst,
                   struct brw_reg src0,
                   struct brw_reg src1)
{
   /* Can't do writemask because math can't be align16. */
   assert(dst.writemask == WRITEMASK_XYZW);
   /* Source swizzles are ignored. */
   check_gen6_math_src_arg(src0);
   if (src1.file == BRW_GENERAL_REGISTER_FILE)
      check_gen6_math_src_arg(src1);

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1);
   brw_set_default_access_mode(p, BRW_ALIGN_16);
}

static void
generate_math2_gen4(struct brw_codegen *p,
                    vec4_instruction *inst,
                    struct brw_reg dst,
                    struct brw_reg src0,
                    struct brw_reg src1)
{
   /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
    * "Message Payload":
    *
    *    "Operand0[7].  For the INT DIV functions, this operand is the
    *     denominator."
    *    ...
    *    "Operand1[7].  For the INT DIV functions, this operand is the
    *     numerator."
    */
   bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
   struct brw_reg &op0 = is_int_div ? src1 : src0;
   struct brw_reg &op1 = is_int_div ? src0 : src1;
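   /* That is, the hardware wants the denominator as operand 0, while the
    * INT_QUOTIENT/INT_REMAINDER opcodes carry the numerator in src0, so the
    * two sources trade places for the integer division functions.
    */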
   brw_push_insn_state(p);
   brw_set_default_saturate(p, false);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1);
   brw_pop_insn_state(p);

   gen4_math(p,
             dst,
             brw_math_function(inst->opcode),
             inst->base_mrf,
             op0,
             BRW_MATH_PRECISION_FULL);
}

static void
generate_tex(struct brw_codegen *p,
             struct brw_vue_prog_data *prog_data,
             gl_shader_stage stage,
             vec4_instruction *inst,
             struct brw_reg dst,
             struct brw_reg src,
             struct brw_reg surface_index,
             struct brw_reg sampler_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int msg_type = -1;

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("should not get here: invalid vec4 texture opcode");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
            assert(inst->mlen == 3);
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
            assert(inst->mlen == 2);
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually. */
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
         assert(inst->mlen == 4);
         break;
      case SHADER_OPCODE_TXF:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
         assert(inst->mlen == 2);
         break;
      case SHADER_OPCODE_TXS:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
         assert(inst->mlen == 2);
         break;
      default:
         unreachable("should not get here: invalid vec4 texture opcode");
      }
   }

   assert(msg_type != -1);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
   /* Load the message header if present.  If there's a texture offset, we
    * need to set it up explicitly and load the offset bitfield.  Otherwise,
    * we can use an implied move from g0 to the first message register.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = brw_vec8_grf(0, 0);
      } else {
         struct brw_reg header =
            retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
         uint32_t dw2 = 0;

         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         if (inst->offset)
            /* Set the texel offset bits in DWord 2. */
            dw2 = inst->offset;

         if (devinfo->gen >= 9)
            /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D,
             * based on bit 22 in the header.
             */
            dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2;

         /* The VS, DS, and FS stages have the g0.2 payload delivered as 0,
          * so header0.2 is 0 when g0 is copied.  The HS and GS stages do
          * not, so we must set it to 0 to avoid setting undesirable bits
          * in the message header.
          */
         if (dw2 ||
             stage == MESA_SHADER_TESS_CTRL ||
             stage == MESA_SHADER_GEOMETRY) {
            brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2));
         }

         brw_adjust_sampler_state_pointer(p, header, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   uint32_t return_format;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   uint32_t base_binding_table_index =
      (inst->opcode == SHADER_OPCODE_TG4 ||
       inst->opcode == SHADER_OPCODE_TG4_OFFSET)
      ? prog_data->base.binding_table.gather_texture_start
      : prog_data->base.binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 dst,
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 1, /* response length */
                 inst->mlen,
                 inst->header_size != 0,
                 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                 return_format);

      brw_mark_surface_used(&prog_data->base,
                            surface + base_binding_table_index);
   } else {
      /* Non-constant sampler index. */
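      /* The sampler message descriptor carries the binding table index in
       * bits 7:0 and the sampler state index in bits 11:8, so the dynamic
       * part is assembled below as (sampler << 8) | surface.  When both
       * indices come from the same register, a single multiply by 0x101
       * replicates the byte into both fields, since
       * x * 0x101 == (x << 8) | x for x < 256.
       */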
      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      } else {
         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
         } else {
            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
            brw_OR(p, addr, addr, surface_reg);
         }
      }
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      if (inst->base_mrf != -1)
         gen6_resolve_implied_move(p, &src, inst->base_mrf);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              1 /* rlen */,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              return_format);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }
}

static void
generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
{
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 brw_vec8_grf(0, 0), /* src */
                 inst->urb_write_flags,
                 inst->mlen,
                 0, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}

static void
generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 inst->urb_write_flags,
                 inst->mlen,
                 0, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}

static void
generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);

   /* We pass the temporary passed in src0 as the writeback register */
   brw_urb_WRITE(p,
                 inst->src[0].as_brw_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 BRW_URB_WRITE_ALLOCATE_COMPLETE,
                 inst->mlen,
                 1, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   /* Now put allocated urb handle in dst.0 */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0),
           get_element_ud(inst->src[0].as_brw_reg(), 0));
   brw_pop_insn_state(p);
}

static void
generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 BRW_URB_WRITE_EOT | inst->urb_write_flags,
                 inst->mlen,
                 0, /* response len */
                 0, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}
static void
generate_gs_set_write_offset(struct brw_codegen *p,
                             struct brw_reg dst,
                             struct brw_reg src0,
                             struct brw_reg src1)
{
   /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.3):
    *
    *     Slot 0 Offset. This field, after adding to the Global Offset field
    *     in the message descriptor, specifies the offset (in 256-bit units)
    *     from the start of the URB entry, as referenced by URB Handle 0, at
    *     which the data will be accessed.
    *
    * Similar text describes DWORD M0.4, which is slot 1 offset.
    *
    * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
    * of the register for geometry shader invocations 0 and 1) by the
    * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
    *
    * We can do this with the following EU instruction:
    *
    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW { Align1 WE_all }
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   assert(p->devinfo->gen >= 7 &&
          src1.file == BRW_IMMEDIATE_VALUE &&
          src1.type == BRW_REGISTER_TYPE_UD &&
          src1.ud <= USHRT_MAX);
   if (src0.file == BRW_IMMEDIATE_VALUE) {
      brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
              brw_imm_ud(src0.ud * src1.ud));
   } else {
      brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
              retype(src1, BRW_REGISTER_TYPE_UW));
   }
   brw_pop_insn_state(p);
}

static void
generate_gs_set_vertex_count(struct brw_codegen *p,
                             struct brw_reg dst,
                             struct brw_reg src)
{
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   if (p->devinfo->gen >= 8) {
      /* Move the vertex count into the second MRF for the EOT write. */
      brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD),
              src);
   } else {
      /* If we think of the src and dst registers as composed of 8 DWORDs
       * each, we want to pick up the contents of DWORDs 0 and 4 from src,
       * truncate them to WORDs, and then pack them into DWORD 2 of dst.
       *
       * It's easier to get the EU to do this if we think of the src and dst
       * registers as composed of 16 WORDs each; then, we want to pick up the
       * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
       * of dst.
       *
       * We can do that by the following EU instruction:
       *
       *     mov (2) dst.4<1>:uw src<8;1,0>:uw { Align1, Q1, NoMask }
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_MOV(p,
              suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
              stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
   }
   brw_pop_insn_state(p);
}

static void
generate_gs_svb_write(struct brw_codegen *p,
                      struct brw_vue_prog_data *prog_data,
                      vec4_instruction *inst,
                      struct brw_reg dst,
                      struct brw_reg src0,
                      struct brw_reg src1)
{
   int binding = inst->sol_binding;
   bool final_write = inst->sol_final_write;

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_4);
   /* Copy Vertex data into M0.x */
   brw_MOV(p, stride(dst, 4, 4, 1),
           stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1));
   brw_pop_insn_state(p);

   brw_push_insn_state(p);
   /* Send SVB Write */
   brw_svb_write(p,
                 final_write ? src1 : brw_null_reg(), /* dest == src1 */
                 1, /* msg_reg_nr */
                 dst, /* src0 == previous dst */
                 BRW_GEN6_SOL_BINDING_START + binding, /* binding_table_index */
                 final_write); /* send_commit_msg */

   /* Finally, wait for the write commit to occur so that we can proceed to
    * other things safely.
    *
    * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
    *
    *     The write commit does not modify the destination register, but
    *     merely clears the dependency associated with the destination
    *     register. Thus, a simple mov instruction using the register as a
    *     source is sufficient to wait for the write commit to occur.
    */
   if (final_write) {
      brw_MOV(p, src1, src1);
   }
   brw_pop_insn_state(p);
}

static void
generate_gs_svb_set_destination_index(struct brw_codegen *p,
                                      vec4_instruction *inst,
                                      struct brw_reg dst,
                                      struct brw_reg src)
{
   int vertex = inst->sol_vertex;
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
   brw_pop_insn_state(p);
}

static void
generate_gs_set_dword_2(struct brw_codegen *p,
                        struct brw_reg dst,
                        struct brw_reg src)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
   brw_pop_insn_state(p);
}

static void
generate_gs_prepare_channel_masks(struct brw_codegen *p,
                                  struct brw_reg dst)
{
   /* We want to left shift just DWORD 4 (the x component belonging to the
    * second geometry shader invocation) by 4 bits.  So generate the
    * instruction:
    *
    *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
    */
   dst = suboffset(vec1(dst), 4);
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_SHL(p, dst, dst, brw_imm_ud(4));
   brw_pop_insn_state(p);
}

static void
generate_gs_set_channel_masks(struct brw_codegen *p,
                              struct brw_reg dst,
                              struct brw_reg src)
{
   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.5):
    *
    *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
    *
    *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex
    *        1 DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit
    *        controls Vertex 0 DATA[7].  This bit is ANDed with the
    *        corresponding channel enable to determine the final channel
    *        enable.  For the URB_READ_OWORD & URB_READ_HWORD messages, when
    *        final channel enable is 1 it indicates that Vertex 1 DATA [3]
    *        will be included in the writeback message.  For the
    *        URB_WRITE_OWORD & URB_WRITE_HWORD messages, when final channel
    *        enable is 1 it indicates that Vertex 1 DATA [3] will be written
    *        to the surface.
    *
    *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
    *        1: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel included
    *
    *     14 Vertex 1 DATA [2] Channel Mask
    *     13 Vertex 1 DATA [1] Channel Mask
    *     12 Vertex 1 DATA [0] Channel Mask
    *     11 Vertex 0 DATA [3] Channel Mask
    *     10 Vertex 0 DATA [2] Channel Mask
    *      9 Vertex 0 DATA [1] Channel Mask
    *      8 Vertex 0 DATA [0] Channel Mask
    *
    * (This is from a section of the PRM that is agnostic to the particular
    * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
    * geometry shader invocations 0 and 1, respectively).  Since we have the
    * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
    * and the enable flags for geometry shader invocation 1 in bits 7:4 of
    * DWORD 4, we just need to OR them together and store the result in bits
    * 15:8 of DWORD 5.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 32 bytes each; then, we want to pick up the
    * contents of bytes 0 and 16 from src, OR them together, and store them
    * in byte 21.
    *
    * We can do that by the following EU instruction:
    *
    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
    *
    * Note: this relies on the source register having zeros in (a) bits 7:4
    * of DWORD 0 and (b) bits 3:0 of DWORD 4.  We can rely on (b) because
    * the source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS
    * (which shifts DWORD 4 left by 4 bits), and we can rely on (a) because
    * prior to the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0
    * and 4 need to contain valid channel mask values (which are in the
    * range 0x0-0xf).
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UB);
   src = retype(src, BRW_REGISTER_TYPE_UB);
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
   brw_pop_insn_state(p);
}

static void
generate_gs_get_instance_id(struct brw_codegen *p,
                            struct brw_reg dst)
{
   /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT
    * and store into dst.0 & dst.4.
    * So generate the instruction:
    *
    *     shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q }
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   dst = retype(dst, BRW_REGISTER_TYPE_UD);
   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_SHR(p, dst, stride(r0, 1, 4, 0),
           brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
   brw_pop_insn_state(p);
}

static void
generate_gs_ff_sync_set_primitives(struct brw_codegen *p,
                                   struct brw_reg dst,
                                   struct brw_reg src0,
                                   struct brw_reg src1,
                                   struct brw_reg src2)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   /* Save src0 data in 16:31 bits of dst.0 */
   brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
           brw_imm_ud(0xffffu));
   brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16));
   /* Save src1 data in 0:15 bits of dst.0 */
   brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
           brw_imm_ud(0xffffu));
   brw_OR(p, suboffset(vec1(dst), 0),
          suboffset(vec1(dst), 0),
          suboffset(vec1(src2), 0));
   brw_pop_insn_state(p);
}

static void
generate_gs_ff_sync(struct brw_codegen *p,
                    vec4_instruction *inst,
                    struct brw_reg dst,
                    struct brw_reg src0,
                    struct brw_reg src1)
{
   /* This opcode uses an implied MRF register for:
    *  - the header of the ff_sync message.  And as such it is expected to
    *    be initialized to r0 before calling here.
    *  - the destination where we will write the allocated URB handle.
    */
   struct brw_reg header =
      retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);

   /* Overwrite dword 0 of the header (SO vertices to write) and
    * dword 1 (number of primitives written).
    */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
   brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
   brw_pop_insn_state(p);

   /* Allocate URB handle in dst */
   brw_ff_sync(p,
               dst,
               0,
               header,
               1, /* allocate */
               1, /* response length */
               0 /* eot */);

   /* Now put allocated urb handle in header.0 */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));

   /* src1 is not an immediate when we use transform feedback */
   if (src1.file != BRW_IMMEDIATE_VALUE) {
      brw_set_default_exec_size(p, BRW_EXECUTE_4);
      brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1));
   }

   brw_pop_insn_state(p);
}

static void
generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
{
   /* In gen6, PrimitiveID is delivered in R0.1 of the payload */
   struct brw_reg src = brw_vec8_grf(0, 0);
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
   brw_pop_insn_state(p);
}

static void
generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;

   /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
    *
    * Since we operate in SIMD4x2 mode, we need to run half as many threads
    * as necessary.  So we assign (2i + 1, 2i) as the thread counts.  We
    * shift right by one less to accomplish the multiplication by two.
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UD);
   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
   const int shift = ivb ? 16 : 17;

   brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask));
   brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
           brw_imm_ud(shift - 1));
   brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1));

   brw_pop_insn_state(p);
}

static void
generate_tcs_urb_write(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg urb_header)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, brw_null_reg());
   brw_set_src0(p, send, urb_header);

   brw_set_message_descriptor(p, send, BRW_SFID_URB,
                              inst->mlen /* mlen */, 0 /* rlen */,
                              true /* header */, false /* eot */);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
   if (inst->urb_write_flags & BRW_URB_WRITE_EOT) {
      brw_inst_set_eot(devinfo, send, 1);
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
      brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
   }

   /* what happens to swizzles? */
}


static void
generate_tcs_input_urb_offsets(struct brw_codegen *p,
                               struct brw_reg dst,
                               struct brw_reg vertex,
                               struct brw_reg offset)
{
   /* Generates an URB read/write message header for HS/DS operation.
    * Inputs are a vertex index, and a byte offset from the beginning of
    * the vertex. */

   /* If `vertex` is not an immediate, we clobber a0.0 */

   assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE);
   assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D);

   assert(dst.file == BRW_GENERAL_REGISTER_FILE);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, dst, brw_imm_ud(0));

   /* m0.5 bits 8-15 are channel enables */
   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));

   /* m0.0-0.1: URB handles */
   if (vertex.file == BRW_IMMEDIATE_VALUE) {
      uint32_t vertex_index = vertex.ud;
      struct brw_reg index_reg = brw_vec1_grf(
         1 + (vertex_index >> 3), vertex_index & 7);

      brw_MOV(p, vec2(get_element_ud(dst, 0)),
              retype(index_reg, BRW_REGISTER_TYPE_UD));
   } else {
      /* Use indirect addressing.  ICP Handles are DWords (single channels
       * of a register) and start at g1.0.
       *
       * In order to start our region at g1.0, we add 8 to the vertex index,
       * effectively skipping over the 8 channels in g0.0.  This gives us a
       * DWord offset to the ICP Handle.
       *
       * Indirect addressing works in terms of bytes, so we then multiply
       * the DWord offset by 4 (by shifting left by 2).
       */
      struct brw_reg addr = brw_address_reg(0);

      /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
      brw_ADD(p, addr, retype(get_element_ud(vertex, 0), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0x8));
      brw_SHL(p, addr, addr, brw_imm_uw(2));
      brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0));

      /* top half: m0.1 = g[1.0 + vertex.4]UD */
      brw_ADD(p, addr, retype(get_element_ud(vertex, 4), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0x8));
      brw_SHL(p, addr, addr, brw_imm_uw(2));
      brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0));
   }

   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
   if (offset.file != ARF)
      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   brw_pop_insn_state(p);
}


static void
generate_tcs_output_urb_offsets(struct brw_codegen *p,
                                struct brw_reg dst,
                                struct brw_reg write_mask,
                                struct brw_reg offset)
{
   /* Generates an URB read/write message header for HS/DS operation, for
    * the patch URB entry.
    */
   assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE);

   assert(write_mask.file == BRW_IMMEDIATE_VALUE);
   assert(write_mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, dst, brw_imm_ud(0));

   unsigned mask = write_mask.ud;

   /* m0.5 bits 15:12 and 11:8 are channel enables */
   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12)));

   /* HS patch URB handle is delivered in r0.0 */
   struct brw_reg urb_handle = brw_vec1_grf(0, 0);

   /* m0.0-0.1: URB handles */
   brw_MOV(p, vec2(get_element_ud(dst, 0)),
           retype(urb_handle, BRW_REGISTER_TYPE_UD));

   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
   if (offset.file != ARF)
      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   brw_pop_insn_state(p);
}

static void
generate_tes_create_input_read_header(struct brw_codegen *p,
                                      struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Initialize the register to 0 */
   brw_MOV(p, dst, brw_imm_ud(0));

   /* Enable all the channels in m0.5 bits 15:8 */
   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));

   /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1.  For safety,
    * mask out irrelevant "Reserved" bits, as they're not marked MBZ.
    */
   brw_AND(p, vec2(get_element_ud(dst, 0)),
           retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(0x1fff));
   brw_pop_insn_state(p);
}

static void
generate_tes_add_indirect_urb_offset(struct brw_codegen *p,
                                     struct brw_reg dst,
                                     struct brw_reg header,
                                     struct brw_reg offset)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, dst, header);
   /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
   brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   brw_pop_insn_state(p);
}

static void
generate_vec4_urb_read(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg dst,
                       struct brw_reg header)
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);

   brw_set_message_descriptor(p, send, BRW_SFID_URB,
                              1 /* mlen */, 1 /* rlen */,
                              true /* header */, false /* eot */);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
   brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);

   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
}

static void
generate_tcs_release_input(struct brw_codegen *p,
                           struct brw_reg header,
                           struct brw_reg vertex,
                           struct brw_reg is_unpaired)
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(vertex.file == BRW_IMMEDIATE_VALUE);
   assert(vertex.type == BRW_REGISTER_TYPE_UD);

   /* m0.0-0.1: URB handles */
   struct brw_reg urb_handles =
      retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
             BRW_REGISTER_TYPE_UD);
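   /* (As in generate_tcs_input_urb_offsets() above, the ICP handles are
    * stored one per DWord channel starting at g1.0, so vertex index v lives
    * in GRF 1 + (v >> 3), channel v & 7.)
    */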
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, header, brw_imm_ud(0));
   brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
   brw_pop_insn_state(p);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, brw_null_reg());
   brw_set_src0(p, send, header);
   brw_set_message_descriptor(p, send, BRW_SFID_URB,
                              1 /* mlen */, 0 /* rlen */,
                              true /* header */, false /* eot */);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
   brw_inst_set_urb_complete(devinfo, send, 1);
   brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
                                    BRW_URB_SWIZZLE_NONE :
                                    BRW_URB_SWIZZLE_INTERLEAVE);
}

static void
generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
{
   struct brw_reg header = brw_message_reg(inst->base_mrf);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, header, brw_imm_ud(0));
   brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8));
   brw_MOV(p, get_element_ud(header, 0),
           retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_MOV(p, brw_message_reg(inst->base_mrf + 1), brw_imm_ud(0u));
   brw_pop_insn_state(p);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 header,
                 BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD |
                 BRW_URB_WRITE_USE_CHANNEL_MASKS,
                 inst->mlen,
                 0, /* response len */
                 0, /* urb destination offset */
                 0);
}

static void
generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D));
   brw_pop_insn_state(p);
}

static void
generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
   brw_pop_insn_state(p);
}

static void
generate_tcs_create_barrier_header(struct brw_codegen *p,
                                   struct brw_vue_prog_data *prog_data,
                                   struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
   struct brw_reg m0_2 = get_element_ud(dst, 2);
   unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Zero the message header */
   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));

   /* Copy "Barrier ID" from r0.2, bits 16:13 (Gen7.5+) or 15:12 (Gen7) */
   brw_AND(p, m0_2,
           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13)));

   /* Shift it up to bits 27:24. */
   brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(ivb ? 12 : 11));
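   /* (A shift of 12 moves bits 15:12 up to 27:24; a shift of 11 does the
    * same for bits 16:13.)
    */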
   /* Set the Barrier Count and the enable bit */
   brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));

   brw_pop_insn_state(p);
}

static void
generate_oword_dual_block_offsets(struct brw_codegen *p,
                                  struct brw_reg m1,
                                  struct brw_reg index)
{
   int second_vertex_offset;

   if (p->devinfo->gen >= 6)
      second_vertex_offset = 1;
   else
      second_vertex_offset = 16;

   m1 = retype(m1, BRW_REGISTER_TYPE_D);

   /* Set up M1 (message payload).  Only the block offsets in M1.0 and
    * M1.4 are used, and the rest are ignored.
    */
   struct brw_reg m1_0 = suboffset(vec1(m1), 0);
   struct brw_reg m1_4 = suboffset(vec1(m1), 4);
   struct brw_reg index_0 = suboffset(vec1(index), 0);
   struct brw_reg index_4 = suboffset(vec1(index), 4);

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   brw_MOV(p, m1_0, index_0);

   if (index.file == BRW_IMMEDIATE_VALUE) {
      index_4.ud += second_vertex_offset;
      brw_MOV(p, m1_4, index_4);
   } else {
      brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
   }

   brw_pop_insn_state(p);
}

static void
generate_unpack_flags(struct brw_codegen *p,
                      struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   struct brw_reg flags = brw_flag_reg(0, 0);
   struct brw_reg dst_0 = suboffset(vec1(dst), 0);
   struct brw_reg dst_4 = suboffset(vec1(dst), 4);

   brw_AND(p, dst_0, flags, brw_imm_ud(0x0f));
   brw_AND(p, dst_4, flags, brw_imm_ud(0xf0));
   brw_SHR(p, dst_4, dst_4, brw_imm_ud(4));

   brw_pop_insn_state(p);
}

static void
generate_scratch_read(struct brw_codegen *p,
                      vec4_instruction *inst,
                      struct brw_reg dst,
                      struct brw_reg index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
                                     index);

   uint32_t msg_type;

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   const unsigned target_cache =
      devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
      devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
      BRW_DATAPORT_READ_TARGET_RENDER_CACHE;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
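   /* (Pre-Gen6, the SEND instruction stores its message register number in
    * the field that otherwise holds the conditional modifier, which is why
    * base_mrf is written through brw_inst_set_cond_modifier() below.)
    */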
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
   brw_set_dp_read_message(p, send,
                           brw_scratch_surface_idx(p),
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           msg_type, target_cache,
                           2, /* mlen */
                           true, /* header_present */
                           1 /* rlen */);
}

static void
generate_scratch_write(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg dst,
                       struct brw_reg src,
                       struct brw_reg index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   struct brw_reg header = brw_vec8_grf(0, 0);
   bool write_commit;

   /* If the instruction is predicated, we'll predicate the send, not
    * the header setup.
    */
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
                                     index);

   brw_MOV(p,
           retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
           retype(src, BRW_REGISTER_TYPE_D));

   uint32_t msg_type;

   if (devinfo->gen >= 7)
      msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
   else if (devinfo->gen == 6)
      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   else
      msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;

   brw_set_default_predicate_control(p, inst->predicate);

   /* Pre-gen6, we have to specify write commits to ensure ordering
    * between reads and writes within a thread.  Afterwards, that's
    * guaranteed and write commits only matter for inter-thread
    * synchronization.
    */
   if (devinfo->gen >= 6) {
      write_commit = false;
   } else {
      /* The visitor set up our destination register to be g0.  This
       * means that when the next read comes along, we will end up
       * reading from g0 and causing a block on the write commit.  For
       * write-after-read, we are relying on the value of the previous
       * read being used (and thus blocking on completion) before our
       * write is executed.  This means we have to be careful in
       * instruction scheduling to not violate this assumption.
       */
      write_commit = true;
   }

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_dp_write_message(p, send,
                            brw_scratch_surface_idx(p),
                            BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                            msg_type,
                            target_cache,
                            3, /* mlen */
                            true, /* header present */
                            false, /* not a render target write */
                            write_commit, /* rlen */
                            false, /* eot */
                            write_commit);
}

static void
generate_pull_constant_load(struct brw_codegen *p,
                            struct brw_vue_prog_data *prog_data,
                            vec4_instruction *inst,
                            struct brw_reg dst,
                            struct brw_reg index,
                            struct brw_reg offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   if (devinfo->gen >= 6) {
      if (offset.file == BRW_IMMEDIATE_VALUE) {
         brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
                           BRW_REGISTER_TYPE_D),
                 brw_imm_d(offset.ud >> 4));
      } else {
         brw_SHR(p, retype(brw_message_reg(inst->base_mrf + 1),
                           BRW_REGISTER_TYPE_D),
                 offset, brw_imm_d(4));
      }
   } else {
      brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
                        BRW_REGISTER_TYPE_D),
              offset);
   }

   uint32_t msg_type;

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_dp_read_message(p, send,
                           surf_index,
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           msg_type,
                           target_cache,
                           2, /* mlen */
                           true, /* header_present */
                           1 /* rlen */);
}

static void
generate_get_buffer_size(struct brw_codegen *p,
                         struct brw_vue_prog_data *prog_data,
                         vec4_instruction *inst,
                         struct brw_reg dst,
                         struct brw_reg src,
                         struct brw_reg surf_index)
{
   assert(p->devinfo->gen >= 7);
   assert(surf_index.type == BRW_REGISTER_TYPE_UD &&
          surf_index.file == BRW_IMMEDIATE_VALUE);

   brw_SAMPLE(p,
              dst,
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              1, /* response length */
              inst->mlen,
              inst->header_size > 0,
              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);

   brw_mark_surface_used(&prog_data->base, surf_index.ud);
}

static void
generate_pull_constant_load_gen7(struct brw_codegen *p,
                                 struct brw_vue_prog_data *prog_data,
                                 vec4_instruction *inst,
                                 struct brw_reg dst,
                                 struct brw_reg surf_index,
                                 struct brw_reg offset)
{
   assert(surf_index.type == BRW_REGISTER_TYPE_UD);

   if (surf_index.file == BRW_IMMEDIATE_VALUE) {

      brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, dst);
      brw_set_src0(p, insn, offset);
      brw_set_sampler_message(p, insn,
                              surf_index.ud,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              inst->mlen,
                              inst->header_size != 0,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_mark_surface_used(&prog_data->base, surf_index.ud);

   } else {

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, offset, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1 /* rlen */,
                              inst->mlen,
                              inst->header_size != 0,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);
   }
}

static void
generate_set_simd4x2_header_gen9(struct brw_codegen *p,
                                 vec4_instruction *inst,
                                 struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(dst, 2),
           brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));

   brw_pop_insn_state(p);
}

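/* MOV from a source located `indirect` bytes past `reg`.  An immediate
 * offset is folded directly into the register number, subregister, and
 * swizzle of an ordinary MOV; a dynamic offset is loaded into the address
 * register and resolved through a VxH indirect source region.
 */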
static void
generate_mov_indirect(struct brw_codegen *p,
                      vec4_instruction *inst,
                      struct brw_reg dst, struct brw_reg reg,
                      struct brw_reg indirect, struct brw_reg length)
{
   assert(indirect.type == BRW_REGISTER_TYPE_UD);
   assert(p->devinfo->gen >= 6);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);

   /* This instruction acts in align1 mode */
   assert(dst.writemask == WRITEMASK_XYZW);

   if (indirect.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2;
      unsigned shift = (imm_byte_offset / 4) % 4;
      reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);

      brw_MOV(p, dst, reg);
   } else {
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      struct brw_reg addr = vec8(brw_address_reg(0));

      /* We need to move the indirect value into the address register.  In
       * order to make things make some sense, we want to respect at least
       * the X component of the swizzle.  In order to do that, we need to
       * convert the subnr (probably 0) to an align1 subnr and add in the
       * swizzle.
       */
      assert(brw_is_single_value_swizzle(indirect.swizzle));
      indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0));

      /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of
       * the indirect and splat it out to all four channels of the given
       * half of a0.
       */
      indirect.subnr *= 2;
      indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
      brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));

      /* Now we need to incorporate the swizzle from the source register */
      if (reg.swizzle != BRW_SWIZZLE_XXXX) {
         uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 |
                            BRW_GET_SWZ(reg.swizzle, 1) << 6 |
                            BRW_GET_SWZ(reg.swizzle, 2) << 10 |
                            BRW_GET_SWZ(reg.swizzle, 3) << 14;
         uv_swiz |= uv_swiz << 16;

         brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz));
      }

      brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type));

      brw_pop_insn_state(p);
   }
}

static void
generate_code(struct brw_codegen *p,
              const struct brw_compiler *compiler,
              void *log_data,
              const nir_shader *nir,
              struct brw_vue_prog_data *prog_data,
              const struct cfg_t *cfg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->info.stage);
   bool debug_flag = INTEL_DEBUG &
      intel_debug_flag_for_shader_stage(nir->info.stage);
   struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
   int spill_count = 0, fill_count = 0;
   int loop_count = 0;

   foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
      struct brw_reg src[3], dst;

      if (unlikely(debug_flag))
         disasm_annotate(disasm_info, inst, p->next_insn_offset);

      for (unsigned int i = 0; i < 3; i++) {
         src[i] = inst->src[i].as_brw_reg();
      }
      dst = inst->dst.as_brw_reg();

      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
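      /* (Passing the force_writemask_all bool directly works because
       * BRW_MASK_ENABLE is 0 and BRW_MASK_DISABLE is 1, matching
       * false/true.)
       */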
      brw_set_default_acc_write_control(p, inst->writes_accumulator);

      assert(inst->group % inst->exec_size == 0);
      assert(inst->group % 4 == 0);

      /* There are some instructions where the destination is 64-bit
       * but we retype it to a smaller type.  In that case, we cannot
       * double the exec_size.
       */
      const bool is_df = (get_exec_type_size(inst) == 8 ||
                          inst->dst.type == BRW_REGISTER_TYPE_DF) &&
                         inst->opcode != VEC4_OPCODE_PICK_LOW_32BIT &&
                         inst->opcode != VEC4_OPCODE_PICK_HIGH_32BIT &&
                         inst->opcode != VEC4_OPCODE_SET_LOW_32BIT &&
                         inst->opcode != VEC4_OPCODE_SET_HIGH_32BIT;

      unsigned exec_size = inst->exec_size;
      if (devinfo->gen == 7 && !devinfo->is_haswell && is_df)
         exec_size *= 2;

      brw_set_default_exec_size(p, cvt(exec_size) - 1);

      if (!inst->force_writemask_all)
         brw_set_default_group(p, inst->group);

      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);

      unsigned pre_emit_nr_insn = p->nr_insn;

      switch (inst->opcode) {
      case VEC4_OPCODE_UNPACK_UNIFORM:
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         brw_MACH(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_MAD:
         assert(devinfo->gen >= 6);
         brw_MAD(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_CMP:
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_DPH:
         brw_DPH(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_DP4:
         brw_DP4(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_DP3:
         brw_DP3(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_DP2:
         brw_DP2(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_F32TO16:
         assert(devinfo->gen >= 7);
         brw_F32TO16(p, dst, src[0]);
         break;

      case BRW_OPCODE_F16TO32:
         assert(devinfo->gen >= 7);
         brw_F16TO32(p, dst, src[0]);
         break;

      case BRW_OPCODE_LRP:
         assert(devinfo->gen >= 6);
         brw_LRP(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_BFREV:
         assert(devinfo->gen >= 7);
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                   retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         assert(devinfo->gen >= 7);
         brw_FBH(p, retype(dst, src[0].type), src[0]);
         break;
      case BRW_OPCODE_FBL:
         assert(devinfo->gen >= 7);
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
                 retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_LZD:
         brw_LZD(p, dst, src[0]);
         break;
      case BRW_OPCODE_CBIT:
         assert(devinfo->gen >= 7);
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
                  retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_ADDC:
         assert(devinfo->gen >= 7);
         brw_ADDC(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SUBB:
         assert(devinfo->gen >= 7);
         brw_SUBB(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MAC:
         brw_MAC(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFE:
         assert(devinfo->gen >= 7);
         brw_BFE(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_BFI1:
         assert(devinfo->gen >= 7);
         brw_BFI1(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFI2:
         assert(devinfo->gen >= 7);
         brw_BFI2(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_IF:
         if (!inst->src[0].is_null()) {
            /* The instruction has an embedded compare (only allowed on gen6) */
            assert(devinfo->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8);
            brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate);
         }
         break;

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         break;

      case BRW_OPCODE_DO:
         brw_DO(p, BRW_EXECUTE_8);
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case BRW_OPCODE_CONTINUE:
         brw_CONT(p);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case BRW_OPCODE_WHILE:
         brw_WHILE(p);
         loop_count++;
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 7) {
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
                      brw_null_reg());
         } else if (devinfo->gen == 6) {
            generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
         } else {
            generate_math1_gen4(p, inst, dst, src[0]);
         }
         break;

      case SHADER_OPCODE_POW:
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 7) {
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
         } else if (devinfo->gen == 6) {
            generate_math_gen6(p, inst, dst, src[0], src[1]);
         } else {
            generate_math2_gen4(p, inst, dst, src[0], src[1]);
         }
         break;

      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXF_CMS_W:
      case SHADER_OPCODE_TXF_MCS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_TG4_OFFSET:
      case SHADER_OPCODE_SAMPLEINFO:
         generate_tex(p, prog_data, nir->info.stage,
                      inst, dst, src[0], src[1], src[2]);
         break;

      case SHADER_OPCODE_GET_BUFFER_SIZE:
         generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
         break;
      case VS_OPCODE_URB_WRITE:
         generate_vs_urb_write(p, inst);
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_READ:
         generate_scratch_read(p, inst, dst, src[0]);
         fill_count++;
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
         generate_scratch_write(p, inst, dst, src[0], src[1]);
         spill_count++;
         break;

      case VS_OPCODE_PULL_CONSTANT_LOAD:
         generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
         generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
         generate_set_simd4x2_header_gen9(p, inst, dst);
         break;

      case GS_OPCODE_URB_WRITE:
         generate_gs_urb_write(p, inst);
         break;

      case GS_OPCODE_URB_WRITE_ALLOCATE:
         generate_gs_urb_write_allocate(p, inst);
         break;

      case GS_OPCODE_SVB_WRITE:
         generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case GS_OPCODE_SVB_SET_DST_INDEX:
         generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
         break;

      case GS_OPCODE_THREAD_END:
         generate_gs_thread_end(p, inst);
         break;

      case GS_OPCODE_SET_WRITE_OFFSET:
         generate_gs_set_write_offset(p, dst, src[0], src[1]);
         break;

      case GS_OPCODE_SET_VERTEX_COUNT:
         generate_gs_set_vertex_count(p, dst, src[0]);
         break;

      case GS_OPCODE_FF_SYNC:
         generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
         break;

      case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
         generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
         break;

      case GS_OPCODE_SET_PRIMITIVE_ID:
         generate_gs_set_primitive_id(p, dst);
         break;

      case GS_OPCODE_SET_DWORD_2:
         generate_gs_set_dword_2(p, dst, src[0]);
         break;

      case GS_OPCODE_PREPARE_CHANNEL_MASKS:
         generate_gs_prepare_channel_masks(p, dst);
         break;

      case GS_OPCODE_SET_CHANNEL_MASKS:
         generate_gs_set_channel_masks(p, dst, src[0]);
         break;

      case GS_OPCODE_GET_INSTANCE_ID:
         generate_gs_get_instance_id(p, dst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         brw_shader_time_add(p, src[0],
                             prog_data->base.binding_table.shader_time_start);
         brw_mark_surface_used(&prog_data->base,
                               prog_data->base.binding_table.shader_time_start);
         break;

      case SHADER_OPCODE_UNTYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                            !inst->dst.is_null());
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
                                  src[2].ud);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
                                   src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                          !inst->dst.is_null());
         break;

      case SHADER_OPCODE_TYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen,
                                src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_write(p, src[0], src[1], inst->mlen,
                                 src[2].ud);
         break;

      case SHADER_OPCODE_MEMORY_FENCE:
         brw_memory_fence(p, dst);
         break;
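
      /* For FIND_LIVE_CHANNEL, a stage with packed dispatch is guaranteed
       * to have every dispatched channel enabled, so an all-ones immediate
       * can stand in for the hardware dispatch mask register.
       */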
      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
         const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, nir->info.stage,
                                          &prog_data->base) ? brw_imm_ud(~0u) :
            brw_dmask_reg();
         brw_find_live_channel(p, dst, mask);
         break;
      }

      case SHADER_OPCODE_BROADCAST:
         assert(inst->force_writemask_all);
         brw_broadcast(p, dst, src[0], src[1]);
         break;

      case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
         generate_unpack_flags(p, dst);
         break;

      case VEC4_OPCODE_MOV_BYTES: {
         /* Moves the low byte from each channel, using an Align1 access mode
          * and a <4,1,0> source region.
          */
         assert(src[0].type == BRW_REGISTER_TYPE_UB ||
                src[0].type == BRW_REGISTER_TYPE_B);

         brw_set_default_access_mode(p, BRW_ALIGN_1);
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_1;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
         brw_MOV(p, dst, src[0]);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_DOUBLE_TO_F32:
      case VEC4_OPCODE_DOUBLE_TO_D32:
      case VEC4_OPCODE_DOUBLE_TO_U32: {
         assert(type_sz(src[0].type) == 8);
         assert(type_sz(dst.type) == 8);

         brw_reg_type dst_type;

         switch (inst->opcode) {
         case VEC4_OPCODE_DOUBLE_TO_F32:
            dst_type = BRW_REGISTER_TYPE_F;
            break;
         case VEC4_OPCODE_DOUBLE_TO_D32:
            dst_type = BRW_REGISTER_TYPE_D;
            break;
         case VEC4_OPCODE_DOUBLE_TO_U32:
            dst_type = BRW_REGISTER_TYPE_UD;
            break;
         default:
            unreachable("Not supported conversion");
         }
         dst = retype(dst, dst_type);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         /* When converting from DF->F, we set the destination's stride to 2
          * as an alignment requirement. But on IVB/BYT, each DF implicitly
          * writes two floats, the first one being the converted value, so
          * there we need stride 1 rather than an explicit stride of 2.
          */
         struct brw_reg spread_dst;
         if (devinfo->gen == 7 && !devinfo->is_haswell)
            spread_dst = stride(dst, 8, 4, 1);
         else
            spread_dst = stride(dst, 8, 4, 2);

         brw_MOV(p, spread_dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_TO_DOUBLE: {
         assert(type_sz(src[0].type) == 4);
         assert(type_sz(dst.type) == 8);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         brw_MOV(p, dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_PICK_LOW_32BIT:
      case VEC4_OPCODE_PICK_HIGH_32BIT: {
         /* Stores the low/high 32-bit of each 64-bit element in src[0] into
          * dst using ALIGN1 mode and a <8,4,2>:UD region on the source.
          */
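         /* Illustratively (exec size and writemask handling elided), the
          * two variants boil down to:
          *
          *    mov(4) dst<1>:UD  src<8,4,2>:UD      (PICK_LOW_32BIT)
          *    mov(4) dst<1>:UD  src.1<8,4,2>:UD    (PICK_HIGH_32BIT)
          */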
         assert(type_sz(src[0].type) == 8);
         assert(type_sz(dst.type) == 4);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         dst = retype(dst, BRW_REGISTER_TYPE_UD);
         dst.hstride = BRW_HORIZONTAL_STRIDE_1;

         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
         if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
            src[0] = suboffset(src[0], 1);
         src[0] = spread(src[0], 2);
         brw_MOV(p, dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_SET_LOW_32BIT:
      case VEC4_OPCODE_SET_HIGH_32BIT: {
         /* Reads consecutive 32-bit elements from src[0] and writes
          * them to the low/high 32-bit of each 64-bit element in dst.
          */
         assert(type_sz(src[0].type) == 4);
         assert(type_sz(dst.type) == 8);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         dst = retype(dst, BRW_REGISTER_TYPE_UD);
         if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT)
            dst = suboffset(dst, 1);
         dst.hstride = BRW_HORIZONTAL_STRIDE_2;

         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
         brw_MOV(p, dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_PACK_BYTES: {
         /* Is effectively:
          *
          *    mov(8) dst<16,4,1>:UB src<4,1,0>:UB
          *
          * but the destination's only regioning control is horizontal
          * stride, so instead we have to use two instructions:
          *
          *    mov(4) dst<1>:UB     src<4,1,0>:UB
          *    mov(4) dst.16<1>:UB  src.16<4,1,0>:UB
          *
          * which pack the four bytes from the low and high four DWords.
          */
         assert(_mesa_is_pow_two(dst.writemask) &&
                dst.writemask != 0);
         unsigned offset = __builtin_ctz(dst.writemask);

         dst.type = BRW_REGISTER_TYPE_UB;

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         src[0].type = BRW_REGISTER_TYPE_UB;
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_1;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
         dst.subnr = offset * 4;
         struct brw_inst *insn = brw_MOV(p, dst, src[0]);
         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
         brw_inst_set_no_dd_clear(p->devinfo, insn, true);
         brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);

         src[0].subnr = 16;
         dst.subnr = 16 + offset * 4;
         insn = brw_MOV(p, dst, src[0]);
         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
         brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, insn, true);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case TCS_OPCODE_URB_WRITE:
         generate_tcs_urb_write(p, inst, src[0]);
         break;

      case VEC4_OPCODE_URB_READ:
         generate_vec4_urb_read(p, inst, dst, src[0]);
         break;

      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
         generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
         generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_GET_INSTANCE_ID:
         generate_tcs_get_instance_id(p, dst);
         break;

      case TCS_OPCODE_GET_PRIMITIVE_ID:
         generate_tcs_get_primitive_id(p, dst);
         break;

      case TCS_OPCODE_CREATE_BARRIER_HEADER:
         generate_tcs_create_barrier_header(p, prog_data, dst);
         break;

      case TES_OPCODE_CREATE_INPUT_READ_HEADER:
         generate_tes_create_input_read_header(p, dst);
         break;
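
      /* TES_OPCODE_ADD_INDIRECT_URB_OFFSET folds an indirect offset into a
       * URB read header such as the one built by
       * TES_OPCODE_CREATE_INPUT_READ_HEADER above, ahead of the actual
       * VEC4_OPCODE_URB_READ.
       */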
      case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
         generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
         break;

      case TES_OPCODE_GET_PRIMITIVE_ID:
         generate_tes_get_primitive_id(p, dst);
         break;

      case TCS_OPCODE_SRC0_010_IS_ZERO:
         /* If src_reg had stride like fs_reg, we wouldn't need this. */
         brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
         break;

      case TCS_OPCODE_RELEASE_INPUT:
         generate_tcs_release_input(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_THREAD_END:
         generate_tcs_thread_end(p, inst);
         break;

      case SHADER_OPCODE_BARRIER:
         brw_barrier(p, src[0]);
         brw_WAIT(p);
         break;

      case SHADER_OPCODE_MOV_INDIRECT:
         generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_DIM:
         assert(devinfo->is_haswell);
         assert(src[0].type == BRW_REGISTER_TYPE_DF);
         assert(dst.type == BRW_REGISTER_TYPE_DF);
         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
         break;

      default:
         unreachable("Unsupported opcode");
      }

      if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
         /* Handled dependency hints in the generator. */

         assert(!inst->conditional_mod);
      } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         assert(p->nr_insn == pre_emit_nr_insn + 1 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

         brw_inst *last = &p->store[pre_emit_nr_insn];

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
      }
   }

   brw_set_uip_jip(p, 0);

   /* end of program sentinel */
   disasm_new_inst_group(disasm_info, p->next_insn_offset);

#ifndef NDEBUG
   bool validated =
#else
   if (unlikely(debug_flag))
#endif
      brw_validate_instructions(devinfo, p->store,
                                0, p->next_insn_offset,
                                disasm_info);

   int before_size = p->next_insn_offset;
   brw_compact_instructions(p, 0, disasm_info);
   int after_size = p->next_insn_offset;

   if (unlikely(debug_flag)) {
      fprintf(stderr, "Native code for %s %s shader %s:\n",
              nir->info.label ? nir->info.label : "unnamed",
              _mesa_shader_stage_to_string(nir->info.stage), nir->info.name);
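
      /* Each uncompacted Gen instruction is 16 bytes, so before_size / 16
       * below is the pre-compaction instruction count.
       */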
      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
              "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
              stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
              spill_count, fill_count, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, disasm_info);
   }
   ralloc_free(disasm_info);
   assert(validated);

   compiler->shader_debug_log(log_data,
                              "%s vec4 shader: %d inst, %d loops, %u cycles, "
                              "%d:%d spills:fills, compacted %d to %d bytes.",
                              stage_abbrev, before_size / 16,
                              loop_count, cfg->cycle_count, spill_count,
                              fill_count, before_size, after_size);
}

extern "C" const unsigned *
brw_vec4_generate_assembly(const struct brw_compiler *compiler,
                           void *log_data,
                           void *mem_ctx,
                           const nir_shader *nir,
                           struct brw_vue_prog_data *prog_data,
                           const struct cfg_t *cfg,
                           unsigned *out_assembly_size)
{
   struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(compiler->devinfo, p, mem_ctx);
   brw_set_default_access_mode(p, BRW_ALIGN_16);

   generate_code(p, compiler, log_data, nir, prog_data, cfg);

   return brw_get_program(p, out_assembly_size);
}
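
/* Usage sketch (hypothetical caller, for illustration only): a stage
 * compiler such as the VS backend would invoke this entry point roughly as
 *
 *    unsigned size;
 *    const unsigned *code =
 *       brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
 *                                  nir, &prog_data->base, v.cfg, &size);
 *
 * with the returned buffer ralloc'ed against mem_ctx.
 */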