/*
 * Copyright 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_nir.h"
#include "brw_vec4.h"
#include "brw_vec4_builder.h"
#include "brw_vec4_surface_builder.h"
#include "brw_program.h"

using namespace brw;
using namespace brw::surface_access;

namespace brw {

void
vec4_visitor::emit_nir_code()
{
   if (nir->num_uniforms > 0)
      nir_setup_uniforms();

   nir_setup_system_values();

   /* get the main function and emit it */
   nir_foreach_function(function, nir) {
      assert(strcmp(function->name, "main") == 0);
      assert(function->impl);
      nir_emit_impl(function->impl);
   }
}

void
vec4_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
{
   dst_reg *reg;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_vertex_id:
      unreachable("should be lowered by lower_vertex_id().");

   case nir_intrinsic_load_vertex_id_zero_base:
      reg = &nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
      if (reg->file == BAD_FILE)
         *reg = *make_reg_for_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
      break;

   case nir_intrinsic_load_base_vertex:
      reg = &nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
      if (reg->file == BAD_FILE)
         *reg = *make_reg_for_system_value(SYSTEM_VALUE_BASE_VERTEX);
      break;

   case nir_intrinsic_load_instance_id:
      reg = &nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
      if (reg->file == BAD_FILE)
         *reg = *make_reg_for_system_value(SYSTEM_VALUE_INSTANCE_ID);
      break;

   case nir_intrinsic_load_base_instance:
      reg = &nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
      if (reg->file == BAD_FILE)
         *reg = *make_reg_for_system_value(SYSTEM_VALUE_BASE_INSTANCE);
      break;

   case nir_intrinsic_load_draw_id:
      reg = &nir_system_values[SYSTEM_VALUE_DRAW_ID];
      if (reg->file == BAD_FILE)
         *reg = *make_reg_for_system_value(SYSTEM_VALUE_DRAW_ID);
      break;

   default:
      break;
   }
}

static bool
setup_system_values_block(nir_block *block, vec4_visitor *v)
{
   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      v->nir_setup_system_value_intrinsic(intrin);
   }

   return true;
}

void
vec4_visitor::nir_setup_system_values()
{
   nir_system_values = ralloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX);
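   /* Clear every entry to a default-constructed dst_reg (BAD_FILE) first;
    * nir_setup_system_value_intrinsic() then fills in only the values the
    * shader actually uses.
    */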
   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
      nir_system_values[i] = dst_reg();
   }

   nir_foreach_function(function, nir) {
      assert(strcmp(function->name, "main") == 0);
      assert(function->impl);
      nir_foreach_block(block, function->impl) {
         setup_system_values_block(block, this);
      }
   }
}

void
vec4_visitor::nir_setup_uniforms()
{
   uniforms = nir->num_uniforms / 16;
}

void
vec4_visitor::nir_emit_impl(nir_function_impl *impl)
{
   nir_locals = ralloc_array(mem_ctx, dst_reg, impl->reg_alloc);
   for (unsigned i = 0; i < impl->reg_alloc; i++) {
      nir_locals[i] = dst_reg();
   }

   foreach_list_typed(nir_register, reg, node, &impl->registers) {
      unsigned array_elems =
         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
      const unsigned num_regs = array_elems * DIV_ROUND_UP(reg->bit_size, 32);
      nir_locals[reg->index] = dst_reg(VGRF, alloc.allocate(num_regs));

      if (reg->bit_size == 64)
         nir_locals[reg->index].type = BRW_REGISTER_TYPE_DF;
   }

   nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc);

   nir_emit_cf_list(&impl->body);
}

void
vec4_visitor::nir_emit_cf_list(exec_list *list)
{
   exec_list_validate(list);
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_if:
         nir_emit_if(nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         nir_emit_loop(nir_cf_node_as_loop(node));
         break;

      case nir_cf_node_block:
         nir_emit_block(nir_cf_node_as_block(node));
         break;

      default:
         unreachable("Invalid CFG node block");
      }
   }
}

void
vec4_visitor::nir_emit_if(nir_if *if_stmt)
{
   /* First, put the condition in f0 */
   src_reg condition = get_nir_src(if_stmt->condition, BRW_REGISTER_TYPE_D, 1);
   vec4_instruction *inst = emit(MOV(dst_null_d(), condition));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;

   /* We can just predicate based on the X channel, as the condition only
    * goes on its own line */
   emit(IF(BRW_PREDICATE_ALIGN16_REPLICATE_X));

   nir_emit_cf_list(&if_stmt->then_list);

   /* note: if the else is empty, dead CF elimination will remove it */
   emit(BRW_OPCODE_ELSE);

   nir_emit_cf_list(&if_stmt->else_list);

   emit(BRW_OPCODE_ENDIF);
}

void
vec4_visitor::nir_emit_loop(nir_loop *loop)
{
   emit(BRW_OPCODE_DO);

   nir_emit_cf_list(&loop->body);

   emit(BRW_OPCODE_WHILE);
}

void
vec4_visitor::nir_emit_block(nir_block *block)
{
   nir_foreach_instr(instr, block) {
      nir_emit_instr(instr);
   }
}

void
vec4_visitor::nir_emit_instr(nir_instr *instr)
{
   base_ir = instr;

   switch (instr->type) {
   case nir_instr_type_load_const:
      nir_emit_load_const(nir_instr_as_load_const(instr));
      break;

   case nir_instr_type_intrinsic:
      nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
      break;

   case nir_instr_type_alu:
      nir_emit_alu(nir_instr_as_alu(instr));
      break;

   case nir_instr_type_jump:
      nir_emit_jump(nir_instr_as_jump(instr));
      break;

   case nir_instr_type_tex:
      nir_emit_texture(nir_instr_as_tex(instr));
      break;

   case nir_instr_type_ssa_undef:
      nir_emit_undef(nir_instr_as_ssa_undef(instr));
      break;

   default:
      fprintf(stderr, "VS instruction not yet implemented by NIR->vec4\n");
      break;
   }
}

static dst_reg
dst_reg_for_nir_reg(vec4_visitor *v, nir_register *nir_reg,
                    unsigned base_offset, nir_src *indirect)
{
   dst_reg reg;

   reg = v->nir_locals[nir_reg->index];
   if (nir_reg->bit_size == 64)
      reg.type = BRW_REGISTER_TYPE_DF;
   reg = offset(reg, 8, base_offset);
   if (indirect) {
      reg.reladdr =
         new(v->mem_ctx) src_reg(v->get_nir_src(*indirect,
                                                BRW_REGISTER_TYPE_D,
                                                1));
   }
   return reg;
}

dst_reg
vec4_visitor::get_nir_dest(const nir_dest &dest)
{
   if (dest.is_ssa) {
      dst_reg dst =
         dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(dest.ssa.bit_size, 32)));
      if (dest.ssa.bit_size == 64)
         dst.type = BRW_REGISTER_TYPE_DF;
      nir_ssa_values[dest.ssa.index] = dst;
      return dst;
   } else {
      return dst_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset,
                                 dest.reg.indirect);
   }
}

dst_reg
vec4_visitor::get_nir_dest(const nir_dest &dest, enum brw_reg_type type)
{
   return retype(get_nir_dest(dest), type);
}

dst_reg
vec4_visitor::get_nir_dest(const nir_dest &dest, nir_alu_type type)
{
   return get_nir_dest(dest, brw_type_for_nir_type(type));
}

src_reg
vec4_visitor::get_nir_src(const nir_src &src, enum brw_reg_type type,
                          unsigned num_components)
{
   dst_reg reg;

   if (src.is_ssa) {
      assert(src.ssa != NULL);
      reg = nir_ssa_values[src.ssa->index];
   }
   else {
      reg = dst_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
                                src.reg.indirect);
   }

   reg = retype(reg, type);

   src_reg reg_as_src = src_reg(reg);
   reg_as_src.swizzle = brw_swizzle_for_size(num_components);
   return reg_as_src;
}

src_reg
vec4_visitor::get_nir_src(const nir_src &src, nir_alu_type type,
                          unsigned num_components)
{
   return get_nir_src(src, brw_type_for_nir_type(type), num_components);
}

src_reg
vec4_visitor::get_nir_src(const nir_src &src, unsigned num_components)
{
   /* if type is not specified, default to signed int */
   return get_nir_src(src, nir_type_int32, num_components);
}

src_reg
vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
{
   nir_src *offset_src = nir_get_io_offset_src(instr);
   nir_const_value *const_value = nir_src_as_const_value(*offset_src);

   if (const_value) {
      /* The only constant offset we should find is 0.  brw_nir.c's
       * add_const_offset_to_base() will fold other constant offsets
       * into instr->const_index[0].
       */
      assert(const_value->u32[0] == 0);
      return src_reg();
   }

   return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1);
}

void
vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
{
   dst_reg reg;

   if (instr->def.bit_size == 64) {
      reg = dst_reg(VGRF, alloc.allocate(2));
      reg.type = BRW_REGISTER_TYPE_DF;
   } else {
      reg = dst_reg(VGRF, alloc.allocate(1));
      reg.type = BRW_REGISTER_TYPE_D;
   }

   unsigned remaining = brw_writemask_for_size(instr->def.num_components);

   /* @FIXME: consider emitting vector operations to save some MOVs in
    * cases where the components are representable in 8 bits.
    * For now, we emit a MOV for each distinct value.
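    *
    * As an illustrative example (not part of the original note): a constant
    * like vec4(0.0, 1.0, 0.0, 1.0) becomes two MOVs, one with writemask .xz
    * for the 0.0 bits and one with writemask .yw for the 1.0 bits, because
    * the loop below coalesces equal components into a single write.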
    */
   for (unsigned i = 0; i < instr->def.num_components; i++) {
      unsigned writemask = 1 << i;

      if ((remaining & writemask) == 0)
         continue;

      for (unsigned j = i; j < instr->def.num_components; j++) {
         if ((instr->def.bit_size == 32 &&
              instr->value.u32[i] == instr->value.u32[j]) ||
             (instr->def.bit_size == 64 &&
              instr->value.f64[i] == instr->value.f64[j])) {
            writemask |= 1 << j;
         }
      }

      reg.writemask = writemask;
      if (instr->def.bit_size == 64) {
         emit(MOV(reg, setup_imm_df(instr->value.f64[i])));
      } else {
         emit(MOV(reg, brw_imm_d(instr->value.i32[i])));
      }

      remaining &= ~writemask;
   }

   /* Set final writemask */
   reg.writemask = brw_writemask_for_size(instr->def.num_components);

   nir_ssa_values[instr->def.index] = reg;
}

void
vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   dst_reg dest;
   src_reg src;

   switch (instr->intrinsic) {

   case nir_intrinsic_load_input: {
      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);

      /* We set EmitNoIndirectInput for VS */
      assert(const_offset);

      dest = get_nir_dest(instr->dest);
      dest.writemask = brw_writemask_for_size(instr->num_components);

      src = src_reg(ATTR, instr->const_index[0] + const_offset->u32[0],
                    glsl_type::uvec4_type);
      src = retype(src, dest.type);

      bool is_64bit = nir_dest_bit_size(instr->dest) == 64;
      if (is_64bit) {
         dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
         src.swizzle = BRW_SWIZZLE_XYZW;
         shuffle_64bit_data(tmp, src, false);
         emit(MOV(dest, src_reg(tmp)));
      } else {
         /* Swizzle source based on component layout qualifier */
         src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
         emit(MOV(dest, src));
      }
      break;
   }

   case nir_intrinsic_store_output: {
      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
      assert(const_offset);

      int varying = instr->const_index[0] + const_offset->u32[0];

      bool is_64bit = nir_src_bit_size(instr->src[0]) == 64;
      if (is_64bit) {
         src_reg data;
         src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_DF,
                           instr->num_components);
         data = src_reg(this, glsl_type::dvec4_type);
         shuffle_64bit_data(dst_reg(data), src, true);
         src = retype(data, BRW_REGISTER_TYPE_F);
      } else {
         src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F,
                           instr->num_components);
      }

      unsigned c = nir_intrinsic_component(instr);

      unsigned num_components = instr->num_components;
      if (is_64bit)
         num_components *= 2;

      output_reg[varying][c] = dst_reg(src);
      output_num_components[varying][c] = MIN2(4, num_components);

      if (is_64bit && num_components > 4) {
         assert(num_components <= 8);
         output_reg[varying + 1][c] = byte_offset(dst_reg(src), REG_SIZE);
         output_num_components[varying + 1][c] = num_components - 4;
      }
      break;
   }

   case nir_intrinsic_get_buffer_size: {
      nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
      unsigned ssbo_index = const_uniform_block ?
         const_uniform_block->u32[0] : 0;

      const unsigned index =
         prog_data->base.binding_table.ssbo_start + ssbo_index;
      dst_reg result_dst = get_nir_dest(instr->dest);
      vec4_instruction *inst = new(mem_ctx)
         vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst);

      inst->base_mrf = 2;
      inst->mlen = 1; /* always at least one */
      inst->src[1] = brw_imm_ud(index);

      /* MRF for the first parameter */
      src_reg lod = brw_imm_d(0);
      int param_base = inst->base_mrf;
      int writemask = WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, glsl_type::int_type, writemask), lod));

      emit(inst);

      brw_mark_surface_used(&prog_data->base, index);
      break;
   }

   case nir_intrinsic_store_ssbo: {
      assert(devinfo->gen >= 7);

      /* Block index */
      src_reg surf_index;
      nir_const_value *const_uniform_block =
         nir_src_as_const_value(instr->src[1]);
      if (const_uniform_block) {
         unsigned index = prog_data->base.binding_table.ssbo_start +
                          const_uniform_block->u32[0];
         surf_index = brw_imm_ud(index);
         brw_mark_surface_used(&prog_data->base, index);
      } else {
         surf_index = src_reg(this, glsl_type::uint_type);
         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[1], 1),
                  brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
         surf_index = emit_uniformize(surf_index);

         brw_mark_surface_used(&prog_data->base,
                               prog_data->base.binding_table.ssbo_start +
                               nir->info->num_ssbos - 1);
      }

      /* Offset */
      src_reg offset_reg;
      nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
      if (const_offset) {
         offset_reg = brw_imm_ud(const_offset->u32[0]);
      } else {
         offset_reg = get_nir_src(instr->src[2], 1);
      }

      /* Value */
      src_reg val_reg = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, 4);

      /* Writemask */
      unsigned write_mask = instr->const_index[0];

      /* IvyBridge does not have a native SIMD4x2 untyped write message, so
       * untyped writes will use SIMD8 mode. In order to hide this and keep
       * symmetry across typed and untyped messages and across hardware
       * platforms, the current implementation of the untyped messages will
       * transparently convert the SIMD4x2 payload into an equivalent SIMD8
       * payload by transposing it and enabling only channel X on the SEND
       * instruction.
       *
       * The above works well for full vector writes, but not for partial
       * writes where we want to write some channels and not others, like
       * when we have code such as v.xyw = vec3(1,2,4). Because the untyped
       * write messages are quite restrictive with regards to the channel
       * enables we can configure in the message descriptor (not all
       * combinations are allowed), we cannot simply implement these
       * scenarios with a single message while keeping the aforementioned
       * symmetry in the implementation. For now we have decided that it is
       * better to keep the symmetry to reduce complexity, so in situations
       * such as the one described we end up emitting two untyped write
       * messages (one for xy and another for w).
       *
       * The code below packs consecutive channels into a single write
       * message, detects gaps in the vector write and, if needed, sends a
       * second message with the remaining channels.
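       *
       * As a worked illustration (not part of the original comment): for a
       * 32-bit store with write mask xyw, the loop first emits one untyped
       * write covering the two consecutive X/Y dwords, then skips Z (adding
       * 4 bytes per skipped dword to the offset) and emits a second,
       * one-dword write for W.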
       * If in the future we decide that we want to emit a single message at
       * the expense of losing the symmetry in the implementation, we can:
       *
       * 1) For IvyBridge: Only use the red channel of the untyped write SIMD8
       *    message payload. In this mode we can write up to 8 offsets and
       *    dwords to the red channel only (for the two vec4s in the SIMD4x2
       *    execution) and select which of the 8 channels carry data to write
       *    by setting the appropriate writemask in the dst register of the
       *    SEND instruction. It would require writing a new generator opcode
       *    specifically for IvyBridge since we would need to prepare a SIMD8
       *    payload that could use any channel, not just X.
       *
       * 2) For Haswell+: Simply send a single write message but set the
       *    writemask on the dst of the SEND instruction to select the
       *    channels we want to write. It would require modifying the current
       *    messages to receive and honor the writemask provided.
       */
      const vec4_builder bld = vec4_builder(this).at_end()
                               .annotate(current_annotation, base_ir);

      unsigned type_slots = nir_src_bit_size(instr->src[0]) / 32;
      if (type_slots == 2) {
         dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
         shuffle_64bit_data(tmp, retype(val_reg, tmp.type), true);
         val_reg = src_reg(retype(tmp, BRW_REGISTER_TYPE_F));
      }

      uint8_t swizzle[4] = { 0, 0, 0, 0};
      int num_channels = 0;
      unsigned skipped_channels = 0;
      int num_components = instr->num_components;
      for (int i = 0; i < num_components; i++) {
         /* Read components Z/W of a dvec from the appropriate place. We will
          * also have to adjust the swizzle (we do that with the '% 4' below)
          */
         if (i == 2 && type_slots == 2)
            val_reg = byte_offset(val_reg, REG_SIZE);

         /* Check if this channel needs to be written. If so, record the
          * channel we need to take the data from in the swizzle array
          */
         int component_mask = 1 << i;
         int write_test = write_mask & component_mask;
         if (write_test) {
            /* If we are writing doubles we have to write 2 channels worth of
             * data (64 bits) for each double component.
             */
            swizzle[num_channels++] = (i * type_slots) % 4;
            if (type_slots == 2)
               swizzle[num_channels++] = (i * type_slots + 1) % 4;
         }

         /* If we don't have to write this channel it means we have a gap in
          * the vector, so write the channels we accumulated until now, if
          * any. Do the same if this was the last component in the vector, if
          * we have enough channels for a full vec4 write or if we have
          * processed components XY of a dvec (since components ZW are not in
          * the same SIMD register)
          */
         if (!write_test || i == num_components - 1 || num_channels == 4 ||
             (i == 1 && type_slots == 2)) {
            if (num_channels > 0) {
               /* We have channels to write, so update the offset we need to
                * write at to skip the channels we skipped, if any.
                */
               if (skipped_channels > 0) {
                  if (offset_reg.file == IMM) {
                     offset_reg.ud += 4 * skipped_channels;
                  } else {
                     emit(ADD(dst_reg(offset_reg), offset_reg,
                              brw_imm_ud(4 * skipped_channels)));
                  }
               }

               /* Swizzle the data register so we take the data from the
                * channels we need to write and send the write message. This
                * will write num_channels consecutive dwords starting at
                * offset.
                */
               val_reg.swizzle =
                  BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
               emit_untyped_write(bld, surf_index, offset_reg, val_reg,
                                  1 /* dims */, num_channels /* size */,
                                  BRW_PREDICATE_NONE);

               /* If we have to do a second write we will have to update the
                * offset so that we jump over the channels we have just
                * written now.
                */
               skipped_channels = num_channels;

               /* Restart the count for the next write message */
               num_channels = 0;
            }

            /* If we didn't write the channel, increase skipped count */
            if (!write_test)
               skipped_channels += type_slots;
         }
      }

      break;
   }

   case nir_intrinsic_load_ssbo: {
      assert(devinfo->gen >= 7);

      nir_const_value *const_uniform_block =
         nir_src_as_const_value(instr->src[0]);

      src_reg surf_index;
      if (const_uniform_block) {
         unsigned index = prog_data->base.binding_table.ssbo_start +
                          const_uniform_block->u32[0];
         surf_index = brw_imm_ud(index);

         brw_mark_surface_used(&prog_data->base, index);
      } else {
         surf_index = src_reg(this, glsl_type::uint_type);
         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], 1),
                  brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
         surf_index = emit_uniformize(surf_index);

         /* Assume this may touch any SSBO. It would be nice to provide
          * a tighter bound, but the array information is already lowered
          * away.
          */
         brw_mark_surface_used(&prog_data->base,
                               prog_data->base.binding_table.ssbo_start +
                               nir->info->num_ssbos - 1);
      }

      src_reg offset_reg;
      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
      if (const_offset) {
         offset_reg = brw_imm_ud(const_offset->u32[0]);
      } else {
         offset_reg = get_nir_src(instr->src[1], 1);
      }

      /* Read the vector */
      const vec4_builder bld = vec4_builder(this).at_end()
                               .annotate(current_annotation, base_ir);

      src_reg read_result;
      dst_reg dest = get_nir_dest(instr->dest);
      if (type_sz(dest.type) < 8) {
         read_result = emit_untyped_read(bld, surf_index, offset_reg,
                                         1 /* dims */, 4 /* size*/,
                                         BRW_PREDICATE_NONE);
      } else {
         src_reg shuffled = src_reg(this, glsl_type::dvec4_type);

         src_reg temp;
         temp = emit_untyped_read(bld, surf_index, offset_reg,
                                  1 /* dims */, 4 /* size*/,
                                  BRW_PREDICATE_NONE);
         emit(MOV(dst_reg(retype(shuffled, temp.type)), temp));

         if (offset_reg.file == IMM)
            offset_reg.ud += 16;
         else
            emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16)));

         temp = emit_untyped_read(bld, surf_index, offset_reg,
                                  1 /* dims */, 4 /* size*/,
                                  BRW_PREDICATE_NONE);
         emit(MOV(dst_reg(retype(byte_offset(shuffled, REG_SIZE), temp.type)),
                  temp));

         read_result = src_reg(this, glsl_type::dvec4_type);
         shuffle_64bit_data(dst_reg(read_result), shuffled, false);
      }

      read_result.type = dest.type;
      read_result.swizzle = brw_swizzle_for_size(instr->num_components);
      emit(MOV(dest, read_result));
      break;
   }

   case nir_intrinsic_ssbo_atomic_add:
      nir_emit_ssbo_atomic(BRW_AOP_ADD, instr);
      break;
   case nir_intrinsic_ssbo_atomic_imin:
      nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr);
      break;
   case nir_intrinsic_ssbo_atomic_umin:
      nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr);
      break;
   case nir_intrinsic_ssbo_atomic_imax:
      nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr);
      break;
   case nir_intrinsic_ssbo_atomic_umax:
      nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr);
      break;
   case nir_intrinsic_ssbo_atomic_and:
      nir_emit_ssbo_atomic(BRW_AOP_AND, instr);
      break;
   case nir_intrinsic_ssbo_atomic_or:
      nir_emit_ssbo_atomic(BRW_AOP_OR, instr);
      break;
   case nir_intrinsic_ssbo_atomic_xor:
      nir_emit_ssbo_atomic(BRW_AOP_XOR, instr);
      break;
   case nir_intrinsic_ssbo_atomic_exchange:
      nir_emit_ssbo_atomic(BRW_AOP_MOV, instr);
      break;
   case nir_intrinsic_ssbo_atomic_comp_swap:
      nir_emit_ssbo_atomic(BRW_AOP_CMPWR, instr);
      break;

   case nir_intrinsic_load_vertex_id:
      unreachable("should be lowered by lower_vertex_id()");

   case nir_intrinsic_load_vertex_id_zero_base:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_instance_id:
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_draw_id:
   case nir_intrinsic_load_invocation_id: {
      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
      src_reg val = src_reg(nir_system_values[sv]);
      assert(val.file != BAD_FILE);
      dest = get_nir_dest(instr->dest, val.type);
      emit(MOV(dest, val));
      break;
   }

   case nir_intrinsic_load_uniform: {
      /* Offsets are in bytes but they should always be multiples of 4 */
      assert(nir_intrinsic_base(instr) % 4 == 0);

      dest = get_nir_dest(instr->dest);

      src = src_reg(dst_reg(UNIFORM, nir_intrinsic_base(instr) / 16));
      src.type = dest.type;

      /* Uniforms don't actually have to be vec4 aligned. In the case that
       * one isn't, we have to use a swizzle to shift things around. They
       * do still have the std140 alignment requirement that vec2's have to
       * be vec2-aligned and vec3's and vec4's have to be vec4-aligned.
       *
       * The swizzle also works in the indirect case as the generator adds
       * the swizzle to the offset for us.
       */
      unsigned shift = (nir_intrinsic_base(instr) % 16) / 4;
      assert(shift + instr->num_components <= 4);

      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
      if (const_offset) {
         /* Offsets are in bytes but they should always be multiples of 4 */
         assert(const_offset->u32[0] % 4 == 0);

         unsigned offset = const_offset->u32[0] + shift * 4;
         src.offset = ROUND_DOWN_TO(offset, 16);
         shift = (offset % 16) / 4;
         src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);

         emit(MOV(dest, src));
      } else {
         src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);

         src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);

         /* MOV_INDIRECT is going to stomp the whole thing anyway */
         dest.writemask = WRITEMASK_XYZW;

         emit(SHADER_OPCODE_MOV_INDIRECT, dest, src,
              indirect, brw_imm_ud(instr->const_index[1]));
      }
      break;
   }

   case nir_intrinsic_atomic_counter_read:
   case nir_intrinsic_atomic_counter_inc:
   case nir_intrinsic_atomic_counter_dec: {
      unsigned surf_index = prog_data->base.binding_table.abo_start +
                            (unsigned) instr->const_index[0];
      const vec4_builder bld =
         vec4_builder(this).at_end().annotate(current_annotation, base_ir);

      /* Get some metadata from the image intrinsic. */
      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];

      /* Get the arguments of the atomic intrinsic. */
      src_reg offset = get_nir_src(instr->src[0], nir_type_int32,
                                   instr->num_components);
      const src_reg surface = brw_imm_ud(surf_index);
      const src_reg src0 = (info->num_srcs >= 2 ?
                            get_nir_src(instr->src[1]) : src_reg());
      const src_reg src1 = (info->num_srcs >= 3
                            ? get_nir_src(instr->src[2]) : src_reg());

      src_reg tmp;

      dest = get_nir_dest(instr->dest);

      if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
         tmp = emit_untyped_read(bld, surface, offset, 1, 1);
      } else {
         tmp = emit_untyped_atomic(bld, surface, offset,
                                   src0, src1,
                                   1, 1,
                                   get_atomic_counter_op(instr->intrinsic));
      }

      bld.MOV(retype(dest, tmp.type), tmp);
      brw_mark_surface_used(stage_prog_data, surf_index);
      break;
   }

   case nir_intrinsic_load_ubo: {
      nir_const_value *const_block_index = nir_src_as_const_value(instr->src[0]);
      src_reg surf_index;

      dest = get_nir_dest(instr->dest);

      if (const_block_index) {
         /* The block index is a constant, so just emit the binding table entry
          * as an immediate.
          */
         const unsigned index = prog_data->base.binding_table.ubo_start +
                                const_block_index->u32[0];
         surf_index = brw_imm_ud(index);
         brw_mark_surface_used(&prog_data->base, index);
      } else {
         /* The block index is not a constant. Evaluate the index expression
          * per-channel and add the base UBO index; we have to select a value
          * from any live channel.
          */
         surf_index = src_reg(this, glsl_type::uint_type);
         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int32,
                                                   instr->num_components),
                  brw_imm_ud(prog_data->base.binding_table.ubo_start)));
         surf_index = emit_uniformize(surf_index);

         /* Assume this may touch any UBO. It would be nice to provide
          * a tighter bound, but the array information is already lowered away.
          */
         brw_mark_surface_used(&prog_data->base,
                               prog_data->base.binding_table.ubo_start +
                               nir->info->num_ubos - 1);
      }

      src_reg offset_reg;
      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
      if (const_offset) {
         offset_reg = brw_imm_ud(const_offset->u32[0] & ~15);
      } else {
         offset_reg = get_nir_src(instr->src[1], nir_type_uint32, 1);
      }

      src_reg packed_consts;
      if (nir_dest_bit_size(instr->dest) == 32) {
         packed_consts = src_reg(this, glsl_type::vec4_type);
         emit_pull_constant_load_reg(dst_reg(packed_consts),
                                     surf_index,
                                     offset_reg,
                                     NULL, NULL /* before_block/inst */);
      } else {
         src_reg temp = src_reg(this, glsl_type::dvec4_type);
         src_reg temp_float = retype(temp, BRW_REGISTER_TYPE_F);

         emit_pull_constant_load_reg(dst_reg(temp_float),
                                     surf_index, offset_reg, NULL, NULL);
         if (offset_reg.file == IMM)
            offset_reg.ud += 16;
         else
            emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16u)));
         emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)),
                                     surf_index, offset_reg, NULL, NULL);

         packed_consts = src_reg(this, glsl_type::dvec4_type);
         shuffle_64bit_data(dst_reg(packed_consts), temp, false);
      }

      packed_consts.swizzle = brw_swizzle_for_size(instr->num_components);
      if (const_offset) {
         unsigned type_size = type_sz(dest.type);
         packed_consts.swizzle +=
            BRW_SWIZZLE4(const_offset->u32[0] % 16 / type_size,
                         const_offset->u32[0] % 16 / type_size,
                         const_offset->u32[0] % 16 / type_size,
                         const_offset->u32[0] % 16 / type_size);
      }

      emit(MOV(dest, retype(packed_consts, dest.type)));

      break;
   }

   case nir_intrinsic_memory_barrier: {
      const vec4_builder bld =
         vec4_builder(this).at_end().annotate(current_annotation,
                                              base_ir);
      const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
      bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
         ->size_written = 2 * REG_SIZE;
      break;
   }

   case nir_intrinsic_shader_clock: {
      /* We cannot do anything if there is an event, so ignore it for now */
      const src_reg shader_clock = get_timestamp();
      const enum brw_reg_type type = brw_type_for_base_type(glsl_type::uvec2_type);

      dest = get_nir_dest(instr->dest, type);
      emit(MOV(dest, shader_clock));
      break;
   }

   default:
      unreachable("Unknown intrinsic");
   }
}

void
vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
{
   dst_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   src_reg surface;
   nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
   if (const_surface) {
      unsigned surf_index = prog_data->base.binding_table.ssbo_start +
                            const_surface->u32[0];
      surface = brw_imm_ud(surf_index);
      brw_mark_surface_used(&prog_data->base, surf_index);
   } else {
      surface = src_reg(this, glsl_type::uint_type);
      emit(ADD(dst_reg(surface), get_nir_src(instr->src[0]),
               brw_imm_ud(prog_data->base.binding_table.ssbo_start)));

      /* Assume this may touch any SSBO. This is the same thing we do for
       * other UBO/SSBO accesses with a non-constant surface.
       */
      brw_mark_surface_used(&prog_data->base,
                            prog_data->base.binding_table.ssbo_start +
                            nir->info->num_ssbos - 1);
   }

   src_reg offset = get_nir_src(instr->src[1], 1);
   src_reg data1 = get_nir_src(instr->src[2], 1);
   src_reg data2;
   if (op == BRW_AOP_CMPWR)
      data2 = get_nir_src(instr->src[3], 1);

   /* Emit the actual atomic operation */
   const vec4_builder bld =
      vec4_builder(this).at_end().annotate(current_annotation, base_ir);

   src_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
                                               data1, data2,
                                               1 /* dims */, 1 /* rsize */,
                                               op,
                                               BRW_PREDICATE_NONE);
   dest.type = atomic_result.type;
   bld.MOV(dest, atomic_result);
}

static unsigned
brw_swizzle_for_nir_swizzle(uint8_t swizzle[4])
{
   return BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
}

static enum brw_conditional_mod
brw_conditional_for_nir_comparison(nir_op op)
{
   switch (op) {
   case nir_op_flt:
   case nir_op_ilt:
   case nir_op_ult:
      return BRW_CONDITIONAL_L;

   case nir_op_fge:
   case nir_op_ige:
   case nir_op_uge:
      return BRW_CONDITIONAL_GE;

   case nir_op_feq:
   case nir_op_ieq:
   case nir_op_ball_fequal2:
   case nir_op_ball_iequal2:
   case nir_op_ball_fequal3:
   case nir_op_ball_iequal3:
   case nir_op_ball_fequal4:
   case nir_op_ball_iequal4:
      return BRW_CONDITIONAL_Z;

   case nir_op_fne:
   case nir_op_ine:
   case nir_op_bany_fnequal2:
   case nir_op_bany_inequal2:
   case nir_op_bany_fnequal3:
   case nir_op_bany_inequal3:
   case nir_op_bany_fnequal4:
   case nir_op_bany_inequal4:
      return BRW_CONDITIONAL_NZ;

   default:
      unreachable("not reached: bad operation for comparison");
   }
}

bool
vec4_visitor::optimize_predicate(nir_alu_instr *instr,
                                 enum brw_predicate *predicate)
{
   if (!instr->src[0].src.is_ssa ||
       instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr
   *cmp_instr = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);

   switch (cmp_instr->op) {
   case nir_op_bany_fnequal2:
   case nir_op_bany_inequal2:
   case nir_op_bany_fnequal3:
   case nir_op_bany_inequal3:
   case nir_op_bany_fnequal4:
   case nir_op_bany_inequal4:
      *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      break;
   case nir_op_ball_fequal2:
   case nir_op_ball_iequal2:
   case nir_op_ball_fequal3:
   case nir_op_ball_iequal3:
   case nir_op_ball_fequal4:
   case nir_op_ball_iequal4:
      *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
      break;
   default:
      return false;
   }

   unsigned size_swizzle =
      brw_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]);

   src_reg op[2];
   assert(nir_op_infos[cmp_instr->op].num_inputs == 2);
   for (unsigned i = 0; i < 2; i++) {
      nir_alu_type type = nir_op_infos[cmp_instr->op].input_types[i];
      unsigned bit_size = nir_src_bit_size(cmp_instr->src[i].src);
      type = (nir_alu_type) (((unsigned) type) | bit_size);
      op[i] = get_nir_src(cmp_instr->src[i].src, type, 4);
      unsigned base_swizzle =
         brw_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle);
      op[i].swizzle = brw_compose_swizzle(size_swizzle, base_swizzle);
      op[i].abs = cmp_instr->src[i].abs;
      op[i].negate = cmp_instr->src[i].negate;
   }

   emit(CMP(dst_null_d(), op[0], op[1],
            brw_conditional_for_nir_comparison(cmp_instr->op)));

   return true;
}

static void
emit_find_msb_using_lzd(const vec4_builder &bld,
                        const dst_reg &dst,
                        const src_reg &src,
                        bool is_signed)
{
   vec4_instruction *inst;
   src_reg temp = src;

   if (is_signed) {
      /* LZD of an absolute value source almost always does the right
       * thing.  There are a few problem values:
       *
       * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
       *   0.  However, findMSB(int(0x80000000)) == 30.
       *
       * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
       *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
       *
       *      For a value of zero or negative one, -1 will be returned.
       *
       * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
       *   findMSB(-(1<<x)) should return x-1.
       *
       * For all negative number cases, including 0x80000000 and
       * 0xffffffff, the correct value is obtained from LZD if instead of
       * negating the (already negative) value the logical-not is used.  A
       * conditional logical-not can be achieved in two instructions.
       */
      temp = src_reg(bld.vgrf(BRW_REGISTER_TYPE_D));

      bld.ASR(dst_reg(temp), src, brw_imm_d(31));
      bld.XOR(dst_reg(temp), temp, src);
   }

   bld.LZD(retype(dst, BRW_REGISTER_TYPE_UD),
           retype(temp, BRW_REGISTER_TYPE_UD));

   /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
    * from the LSB side.  Subtract the result from 31 to convert the MSB count
    * into an LSB count.  If no bits are set, LZD will return 32.  31-32 = -1,
    * which is exactly what findMSB() is supposed to return.
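    *
    * For instance (a worked example, not from the PRM): src = 0x00000010 has
    * LZD == 27, and 31 - 27 == 4, which matches findMSB().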
    */
   inst = bld.ADD(dst, retype(src_reg(dst), BRW_REGISTER_TYPE_D),
                  brw_imm_d(31));
   inst->src[0].negate = true;
}

void
vec4_visitor::emit_conversion_from_double(dst_reg dst, src_reg src,
                                          bool saturate,
                                          brw_reg_type single_type)
{
   /* BDW PRM vol 15 - workarounds:
    * DF->f format conversion for Align16 has wrong emask calculation when
    * source is immediate.
    */
   if (devinfo->gen == 8 && single_type == BRW_REGISTER_TYPE_F &&
       src.file == BRW_IMMEDIATE_VALUE) {
      vec4_instruction *inst = emit(MOV(dst, brw_imm_f(src.df)));
      inst->saturate = saturate;
      return;
   }

   dst_reg temp = dst_reg(this, glsl_type::dvec4_type);
   emit(MOV(temp, src));

   dst_reg temp2 = dst_reg(this, glsl_type::dvec4_type);
   temp2 = retype(temp2, single_type);
   emit(VEC4_OPCODE_FROM_DOUBLE, temp2, src_reg(temp))
      ->size_written = 2 * REG_SIZE;

   vec4_instruction *inst = emit(MOV(dst, src_reg(temp2)));
   inst->saturate = saturate;
}

void
vec4_visitor::emit_conversion_to_double(dst_reg dst, src_reg src,
                                        bool saturate,
                                        brw_reg_type single_type)
{
   dst_reg tmp_dst = dst_reg(src_reg(this, glsl_type::dvec4_type));
   src_reg tmp_src = retype(src_reg(this, glsl_type::vec4_type), single_type);
   emit(MOV(dst_reg(tmp_src), retype(src, single_type)));
   emit(VEC4_OPCODE_TO_DOUBLE, tmp_dst, tmp_src);
   vec4_instruction *inst = emit(MOV(dst, src_reg(tmp_dst)));
   inst->saturate = saturate;
}

src_reg
vec4_visitor::setup_imm_df(double v)
{
   assert(devinfo->gen >= 7);

   if (devinfo->gen >= 8)
      return brw_imm_df(v);

   /* gen7.5 does not support DF immediates directly, but the DIM instruction
    * allows us to set the 64-bit immediate value.
    */
   if (devinfo->is_haswell) {
      dst_reg dst = retype(dst_reg(VGRF, alloc.allocate(2)), BRW_REGISTER_TYPE_DF);
      emit(DIM(dst, brw_imm_df(v)))->force_writemask_all = true;
      return swizzle(src_reg(retype(dst, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX);
   }

   /* gen7 does not support DF immediates */
   union {
      double d;
      struct {
         uint32_t i1;
         uint32_t i2;
      };
   } di;

   di.d = v;

   /* Write the low 32 bits of the constant to the X:UD channel and the
    * high 32 bits to the Y:UD channel to build the constant in a VGRF.
    * We have to do this twice (offset 0 and offset 1), since a DF VGRF takes
    * two SIMD8 registers in SIMD4x2 execution. Finally, return a swizzle
    * XXXX so any access to the VGRF only reads the constant data in these
    * channels.
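    *
    * As an illustration (assuming the little-endian layout used on these
    * platforms): for v = 1.0 (0x3ff0000000000000), i1 is 0x00000000 and i2 is
    * 0x3ff00000, so X:UD receives the low dword and Y:UD the high dword.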
    */
   const dst_reg tmp =
      retype(dst_reg(VGRF, alloc.allocate(2)), BRW_REGISTER_TYPE_UD);
   for (int n = 0; n < 2; n++) {
      emit(MOV(writemask(offset(tmp, 8, n), WRITEMASK_X), brw_imm_ud(di.i1)))
         ->force_writemask_all = true;
      emit(MOV(writemask(offset(tmp, 8, n), WRITEMASK_Y), brw_imm_ud(di.i2)))
         ->force_writemask_all = true;
   }

   return swizzle(src_reg(retype(tmp, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX);
}

void
vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
{
   vec4_instruction *inst;

   nir_alu_type dst_type = (nir_alu_type) (nir_op_infos[instr->op].output_type |
                                           nir_dest_bit_size(instr->dest.dest));
   dst_reg dst = get_nir_dest(instr->dest.dest, dst_type);
   dst.writemask = instr->dest.write_mask;

   src_reg op[4];
   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      nir_alu_type src_type = (nir_alu_type)
         (nir_op_infos[instr->op].input_types[i] |
          nir_src_bit_size(instr->src[i].src));
      op[i] = get_nir_src(instr->src[i].src, src_type, 4);
      op[i].swizzle = brw_swizzle_for_nir_swizzle(instr->src[i].swizzle);
      op[i].abs = instr->src[i].abs;
      op[i].negate = instr->src[i].negate;
   }

   switch (instr->op) {
   case nir_op_imov:
   case nir_op_fmov:
      inst = emit(MOV(dst, op[0]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
      unreachable("not reached: should be handled by lower_vec_to_movs()");

   case nir_op_i2f:
   case nir_op_u2f:
      inst = emit(MOV(dst, op[0]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_f2i:
   case nir_op_f2u:
      inst = emit(MOV(dst, op[0]));
      break;

   case nir_op_d2f:
      emit_conversion_from_double(dst, op[0], instr->dest.saturate,
                                  BRW_REGISTER_TYPE_F);
      break;

   case nir_op_f2d:
      emit_conversion_to_double(dst, op[0], instr->dest.saturate,
                                BRW_REGISTER_TYPE_F);
      break;

   case nir_op_d2i:
   case nir_op_d2u:
      emit_conversion_from_double(dst, op[0], instr->dest.saturate,
                                  instr->op == nir_op_d2i ? BRW_REGISTER_TYPE_D :
                                                            BRW_REGISTER_TYPE_UD);
      break;

   case nir_op_i2d:
   case nir_op_u2d:
      emit_conversion_to_double(dst, op[0], instr->dest.saturate,
                                instr->op == nir_op_i2d ? BRW_REGISTER_TYPE_D :
                                                          BRW_REGISTER_TYPE_UD);
      break;

   case nir_op_iadd:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
   case nir_op_fadd:
      inst = emit(ADD(dst, op[0], op[1]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fmul:
      inst = emit(MUL(dst, op[0], op[1]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_imul: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      if (devinfo->gen < 8) {
         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);

         /* For integer multiplication, the MUL uses the low 16 bits of one of
          * the operands (src0 through SNB, src1 on IVB and later).  The MACH
          * accumulates the contribution of the upper 16 bits of that
          * operand.  If we can determine that one of the args is in the low
          * 16 bits, though, we can just emit a single MUL.
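          *
          * For example (illustrative, not from the PRM): multiplying by an
          * immediate such as 13 fits in 16 bits, so the single MUL below is
          * enough, whereas a general 32x32 multiply needs the MUL/MACH/MOV
          * sequence through the accumulator.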
          */
         if (value0 && value0->u32[0] < (1 << 16)) {
            if (devinfo->gen < 7)
               emit(MUL(dst, op[0], op[1]));
            else
               emit(MUL(dst, op[1], op[0]));
         } else if (value1 && value1->u32[0] < (1 << 16)) {
            if (devinfo->gen < 7)
               emit(MUL(dst, op[1], op[0]));
            else
               emit(MUL(dst, op[0], op[1]));
         } else {
            struct brw_reg acc = retype(brw_acc_reg(8), dst.type);

            emit(MUL(acc, op[0], op[1]));
            emit(MACH(dst_null_d(), op[0], op[1]));
            emit(MOV(dst, src_reg(acc)));
         }
      } else {
         emit(MUL(dst, op[0], op[1]));
      }
      break;
   }

   case nir_op_imul_high:
   case nir_op_umul_high: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      struct brw_reg acc = retype(brw_acc_reg(8), dst.type);

      if (devinfo->gen >= 8)
         emit(MUL(acc, op[0], retype(op[1], BRW_REGISTER_TYPE_UW)));
      else
         emit(MUL(acc, op[0], op[1]));

      emit(MACH(dst, op[0], op[1]));
      break;
   }

   case nir_op_frcp:
      inst = emit_math(SHADER_OPCODE_RCP, dst, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fexp2:
      inst = emit_math(SHADER_OPCODE_EXP2, dst, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_flog2:
      inst = emit_math(SHADER_OPCODE_LOG2, dst, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fsin:
      inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fcos:
      inst = emit_math(SHADER_OPCODE_COS, dst, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_idiv:
   case nir_op_udiv:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]);
      break;

   case nir_op_umod:
   case nir_op_irem:
      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
       * appears that our hardware just does the right thing for signed
       * remainder.
       */
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
      break;

   case nir_op_imod: {
      /* Get a regular C-style remainder.  If a % b != 0, set the predicate. */
      inst = emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);

      /* Math instructions don't support conditional mod */
      inst = emit(MOV(dst_null_d(), src_reg(dst)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;

      /* Now, we need to determine if signs of the sources are different.
       * When we XOR the sources, the top bit is 0 if they are the same and 1
       * if they are different.  We can then use a conditional modifier to
       * turn that into a predicate.  This leads us to an XOR.l instruction.
       *
       * Technically, according to the PRM, you're not allowed to use .l on a
       * XOR instruction.  However, empirical experiments and Curro's reading
       * of the simulator source both indicate that it's safe.
       */
      src_reg tmp = src_reg(this, glsl_type::ivec4_type);
      inst = emit(XOR(dst_reg(tmp), op[0], op[1]));
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->conditional_mod = BRW_CONDITIONAL_L;

      /* If the result of the initial remainder operation is non-zero and the
       * two sources have different signs, add in a copy of op[1] to get the
       * final integer modulus value.
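       *
       * For example (a worked case, not from the original comment): for
       * imod(-7, 3) the C-style remainder is -1; it is non-zero and the signs
       * differ, so adding 3 yields 2, which matches GLSL's imod.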
       */
      inst = emit(ADD(dst, src_reg(dst), op[1]));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }

   case nir_op_ldexp:
      unreachable("not reached: should be handled by ldexp_to_arith()");

   case nir_op_fsqrt:
      inst = emit_math(SHADER_OPCODE_SQRT, dst, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_frsq:
      inst = emit_math(SHADER_OPCODE_RSQ, dst, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fpow:
      inst = emit_math(SHADER_OPCODE_POW, dst, op[0], op[1]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_uadd_carry: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);

      emit(ADDC(dst_null_ud(), op[0], op[1]));
      emit(MOV(dst, src_reg(acc)));
      break;
   }

   case nir_op_usub_borrow: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);

      emit(SUBB(dst_null_ud(), op[0], op[1]));
      emit(MOV(dst, src_reg(acc)));
      break;
   }

   case nir_op_ftrunc:
      inst = emit(RNDZ(dst, op[0]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fceil: {
      src_reg tmp = src_reg(this, glsl_type::float_type);
      tmp.swizzle =
         brw_swizzle_for_size(instr->src[0].src.is_ssa ?
                              instr->src[0].src.ssa->num_components :
                              instr->src[0].src.reg.reg->num_components);

      op[0].negate = !op[0].negate;
      emit(RNDD(dst_reg(tmp), op[0]));
      tmp.negate = true;
      inst = emit(MOV(dst, tmp));
      inst->saturate = instr->dest.saturate;
      break;
   }

   case nir_op_ffloor:
      inst = emit(RNDD(dst, op[0]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_ffract:
      inst = emit(FRC(dst, op[0]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fround_even:
      inst = emit(RNDE(dst, op[0]));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fquantize2f16: {
      /* See also vec4_visitor::emit_pack_half_2x16() */
      src_reg tmp16 = src_reg(this, glsl_type::uvec4_type);
      src_reg tmp32 = src_reg(this, glsl_type::vec4_type);
      src_reg zero = src_reg(this, glsl_type::vec4_type);

      /* Check for denormal */
      src_reg abs_src0 = op[0];
      abs_src0.abs = true;
      emit(CMP(dst_null_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
               BRW_CONDITIONAL_L));
      /* Get the appropriately signed zero */
      emit(AND(retype(dst_reg(zero), BRW_REGISTER_TYPE_UD),
               retype(op[0], BRW_REGISTER_TYPE_UD),
               brw_imm_ud(0x80000000)));
      /* Do the actual F32 -> F16 -> F32 conversion */
      emit(F32TO16(dst_reg(tmp16), op[0]));
      emit(F16TO32(dst_reg(tmp32), tmp16));
      /* Select that or zero based on normal status */
      inst = emit(BRW_OPCODE_SEL, dst, zero, tmp32);
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->saturate = instr->dest.saturate;
      break;
   }

   case nir_op_imin:
   case nir_op_umin:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
   case nir_op_fmin:
      inst = emit_minmax(BRW_CONDITIONAL_L, dst, op[0], op[1]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_imax:
   case nir_op_umax:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
   case nir_op_fmax:
      inst = emit_minmax(BRW_CONDITIONAL_GE, dst, op[0], op[1]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fddx:
   case nir_op_fddx_coarse:
   case nir_op_fddx_fine:
   case nir_op_fddy:
   case nir_op_fddy_coarse:
   case nir_op_fddy_fine:
      unreachable("derivatives are not valid in vertex shaders");

   case nir_op_ilt:
   case nir_op_ult:
   case nir_op_ige:
   case nir_op_uge:
   case nir_op_ieq:
   case nir_op_ine:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      /* Fallthrough */
   case nir_op_flt:
   case nir_op_fge:
   case nir_op_feq:
   case nir_op_fne: {
      enum brw_conditional_mod conditional_mod =
         brw_conditional_for_nir_comparison(instr->op);

      if (nir_src_bit_size(instr->src[0].src) < 64) {
         emit(CMP(dst, op[0], op[1], conditional_mod));
      } else {
         /* Produce a 32-bit boolean result from the DF comparison by
          * selecting only the low 32 bits of each DF produced. Do this in a
          * temporary so we can then move from there to the result using
          * align16 again to honor the original writemask.
          */
         dst_reg temp = dst_reg(this, glsl_type::dvec4_type);
         emit(CMP(temp, op[0], op[1], conditional_mod));
         dst_reg result = dst_reg(this, glsl_type::bvec4_type);
         emit(VEC4_OPCODE_PICK_LOW_32BIT, result, src_reg(temp));
         emit(MOV(dst, src_reg(result)));
      }
      break;
   }

   case nir_op_ball_iequal2:
   case nir_op_ball_iequal3:
   case nir_op_ball_iequal4:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      /* Fallthrough */
   case nir_op_ball_fequal2:
   case nir_op_ball_fequal3:
   case nir_op_ball_fequal4: {
      unsigned swiz =
         brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);

      emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
               brw_conditional_for_nir_comparison(instr->op)));
      emit(MOV(dst, brw_imm_d(0)));
      inst = emit(MOV(dst, brw_imm_d(~0)));
      inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
      break;
   }

   case nir_op_bany_inequal2:
   case nir_op_bany_inequal3:
   case nir_op_bany_inequal4:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      /* Fallthrough */
   case nir_op_bany_fnequal2:
   case nir_op_bany_fnequal3:
   case nir_op_bany_fnequal4: {
      unsigned swiz =
         brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);

      emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
               brw_conditional_for_nir_comparison(instr->op)));

      emit(MOV(dst, brw_imm_d(0)));
      inst = emit(MOV(dst, brw_imm_d(~0)));
      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      break;
   }

   case nir_op_inot:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      if (devinfo->gen >= 8) {
         op[0] = resolve_source_modifiers(op[0]);
      }
      emit(NOT(dst, op[0]));
      break;

   case nir_op_ixor:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      if (devinfo->gen >= 8) {
         op[0] = resolve_source_modifiers(op[0]);
         op[1] = resolve_source_modifiers(op[1]);
      }
      emit(XOR(dst, op[0], op[1]));
      break;

   case nir_op_ior:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      if (devinfo->gen >= 8) {
         op[0] = resolve_source_modifiers(op[0]);
         op[1] = resolve_source_modifiers(op[1]);
      }
      emit(OR(dst, op[0], op[1]));
      break;

   case nir_op_iand:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      if (devinfo->gen >= 8) {
         op[0] = resolve_source_modifiers(op[0]);
         op[1] = resolve_source_modifiers(op[1]);
      }
      emit(AND(dst, op[0], op[1]));
      break;

   case nir_op_b2i:
   case nir_op_b2f:
      emit(MOV(dst, negate(op[0])));
      break;

   case nir_op_f2b:
      emit(CMP(dst, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ));
      break;

   case nir_op_d2b: {
      /* We use a MOV with conditional_mod to check if the provided value is
       * 0.0. We want this to flush denormalized numbers to zero, so we set a
       * source modifier on the source operand to trigger this, as source
       * modifiers don't affect the result of the comparison against 0.0.
       */
      src_reg value = op[0];
      value.abs = true;
      vec4_instruction *inst = emit(MOV(dst_null_df(), value));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;

      src_reg one = src_reg(this, glsl_type::ivec4_type);
      emit(MOV(dst_reg(one), brw_imm_d(~0)));
      inst = emit(BRW_OPCODE_SEL, dst, one, brw_imm_d(0));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }

   case nir_op_i2b:
      emit(CMP(dst, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
      break;

   case nir_op_fnoise1_1:
   case nir_op_fnoise1_2:
   case nir_op_fnoise1_3:
   case nir_op_fnoise1_4:
   case nir_op_fnoise2_1:
   case nir_op_fnoise2_2:
   case nir_op_fnoise2_3:
   case nir_op_fnoise2_4:
   case nir_op_fnoise3_1:
   case nir_op_fnoise3_2:
   case nir_op_fnoise3_3:
   case nir_op_fnoise3_4:
   case nir_op_fnoise4_1:
   case nir_op_fnoise4_2:
   case nir_op_fnoise4_3:
   case nir_op_fnoise4_4:
      unreachable("not reached: should be handled by lower_noise");

   case nir_op_unpack_half_2x16_split_x:
   case nir_op_unpack_half_2x16_split_y:
   case nir_op_pack_half_2x16_split:
      unreachable("not reached: should not occur in vertex shader");

   case nir_op_unpack_snorm_2x16:
   case nir_op_unpack_unorm_2x16:
   case nir_op_pack_snorm_2x16:
   case nir_op_pack_unorm_2x16:
      unreachable("not reached: should be handled by lower_packing_builtins");

   case nir_op_pack_uvec4_to_uint:
      unreachable("not reached");

   case nir_op_pack_uvec2_to_uint: {
      dst_reg tmp1 = dst_reg(this, glsl_type::uint_type);
      tmp1.writemask = WRITEMASK_X;
      op[0].swizzle = BRW_SWIZZLE_YYYY;
      emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u))));

      dst_reg tmp2 = dst_reg(this, glsl_type::uint_type);
      tmp2.writemask = WRITEMASK_X;
      op[0].swizzle = BRW_SWIZZLE_XXXX;
      emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu))));

      emit(OR(dst, src_reg(tmp1), src_reg(tmp2)));
      break;
   }

   case nir_op_pack_double_2x32_split: {
      dst_reg result = dst_reg(this, glsl_type::dvec4_type);
      dst_reg tmp = dst_reg(this, glsl_type::uvec4_type);
      emit(MOV(tmp, retype(op[0], BRW_REGISTER_TYPE_UD)));
      emit(VEC4_OPCODE_SET_LOW_32BIT, result, src_reg(tmp));
      emit(MOV(tmp, retype(op[1], BRW_REGISTER_TYPE_UD)));
      emit(VEC4_OPCODE_SET_HIGH_32BIT, result, src_reg(tmp));
      emit(MOV(dst, src_reg(result)));
      break;
   }

   case nir_op_unpack_double_2x32_split_x:
   case nir_op_unpack_double_2x32_split_y: {
      enum opcode oper = (instr->op == nir_op_unpack_double_2x32_split_x) ?
1778 VEC4_OPCODE_PICK_LOW_32BIT : VEC4_OPCODE_PICK_HIGH_32BIT; 1779 dst_reg tmp = dst_reg(this, glsl_type::dvec4_type); 1780 emit(MOV(tmp, op[0])); 1781 dst_reg tmp2 = dst_reg(this, glsl_type::uvec4_type); 1782 emit(oper, tmp2, src_reg(tmp)); 1783 emit(MOV(dst, src_reg(tmp2))); 1784 break; 1785 } 1786 1787 case nir_op_unpack_half_2x16: 1788 /* As NIR does not guarantee that we have a correct swizzle outside the 1789 * boundaries of a vector, and the implementation of emit_unpack_half_2x16 1790 * uses the source operand in an operation with WRITEMASK_Y while our 1791 * source operand has only size 1, it accessed incorrect data producing 1792 * regressions in Piglit. We repeat the swizzle of the first component on the 1793 * rest of components to avoid regressions. In the vec4_visitor IR code path 1794 * this is not needed because the operand has already the correct swizzle. 1795 */ 1796 op[0].swizzle = brw_compose_swizzle(BRW_SWIZZLE_XXXX, op[0].swizzle); 1797 emit_unpack_half_2x16(dst, op[0]); 1798 break; 1799 1800 case nir_op_pack_half_2x16: 1801 emit_pack_half_2x16(dst, op[0]); 1802 break; 1803 1804 case nir_op_unpack_unorm_4x8: 1805 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1806 emit_unpack_unorm_4x8(dst, op[0]); 1807 break; 1808 1809 case nir_op_pack_unorm_4x8: 1810 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1811 emit_pack_unorm_4x8(dst, op[0]); 1812 break; 1813 1814 case nir_op_unpack_snorm_4x8: 1815 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1816 emit_unpack_snorm_4x8(dst, op[0]); 1817 break; 1818 1819 case nir_op_pack_snorm_4x8: 1820 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1821 emit_pack_snorm_4x8(dst, op[0]); 1822 break; 1823 1824 case nir_op_bitfield_reverse: 1825 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1826 emit(BFREV(dst, op[0])); 1827 break; 1828 1829 case nir_op_bit_count: 1830 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1831 emit(CBIT(dst, op[0])); 1832 break; 1833 1834 case nir_op_ufind_msb: 1835 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1836 emit_find_msb_using_lzd(vec4_builder(this).at_end(), dst, op[0], false); 1837 break; 1838 1839 case nir_op_ifind_msb: { 1840 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1841 vec4_builder bld = vec4_builder(this).at_end(); 1842 src_reg src(dst); 1843 1844 if (devinfo->gen < 7) { 1845 emit_find_msb_using_lzd(bld, dst, op[0], true); 1846 } else { 1847 emit(FBH(retype(dst, BRW_REGISTER_TYPE_UD), op[0])); 1848 1849 /* FBH counts from the MSB side, while GLSL's findMSB() wants the 1850 * count from the LSB side. If FBH didn't return an error 1851 * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB 1852 * count into an LSB count. 1853 */ 1854 bld.CMP(dst_null_d(), src, brw_imm_d(-1), BRW_CONDITIONAL_NZ); 1855 1856 inst = bld.ADD(dst, src, brw_imm_d(31)); 1857 inst->predicate = BRW_PREDICATE_NORMAL; 1858 inst->src[0].negate = true; 1859 } 1860 break; 1861 } 1862 1863 case nir_op_find_lsb: { 1864 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1865 vec4_builder bld = vec4_builder(this).at_end(); 1866 1867 if (devinfo->gen < 7) { 1868 dst_reg temp = bld.vgrf(BRW_REGISTER_TYPE_D); 1869 1870 /* (x & -x) generates a value that consists of only the LSB of x. 1871 * For all powers of 2, findMSB(y) == findLSB(y). 1872 */ 1873 src_reg src = src_reg(retype(op[0], BRW_REGISTER_TYPE_D)); 1874 src_reg negated_src = src; 1875 1876 /* One must be negated, and the other must be non-negated. It 1877 * doesn't matter which is which. 
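 * For example, x = 0b01011000 gives -x = 0b10101000, so (x & -x) = 0b00001000,
 * a power of two whose findMSB (3) is exactly findLSB(x).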
1878 */ 1879 negated_src.negate = true; 1880 src.negate = false; 1881 1882 bld.AND(temp, src, negated_src); 1883 emit_find_msb_using_lzd(bld, dst, src_reg(temp), false); 1884 } else { 1885 bld.FBL(dst, op[0]); 1886 } 1887 break; 1888 } 1889 1890 case nir_op_ubitfield_extract: 1891 case nir_op_ibitfield_extract: 1892 unreachable("should have been lowered"); 1893 case nir_op_ubfe: 1894 case nir_op_ibfe: 1895 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1896 op[0] = fix_3src_operand(op[0]); 1897 op[1] = fix_3src_operand(op[1]); 1898 op[2] = fix_3src_operand(op[2]); 1899 1900 emit(BFE(dst, op[2], op[1], op[0])); 1901 break; 1902 1903 case nir_op_bfm: 1904 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1905 emit(BFI1(dst, op[0], op[1])); 1906 break; 1907 1908 case nir_op_bfi: 1909 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1910 op[0] = fix_3src_operand(op[0]); 1911 op[1] = fix_3src_operand(op[1]); 1912 op[2] = fix_3src_operand(op[2]); 1913 1914 emit(BFI2(dst, op[0], op[1], op[2])); 1915 break; 1916 1917 case nir_op_bitfield_insert: 1918 unreachable("not reached: should have been lowered"); 1919 1920 case nir_op_fsign: 1921 if (type_sz(op[0].type) < 8) { 1922 /* AND(val, 0x80000000) gives the sign bit. 1923 * 1924 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not 1925 * zero. 1926 */ 1927 emit(CMP(dst_null_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ)); 1928 1929 op[0].type = BRW_REGISTER_TYPE_UD; 1930 dst.type = BRW_REGISTER_TYPE_UD; 1931 emit(AND(dst, op[0], brw_imm_ud(0x80000000u))); 1932 1933 inst = emit(OR(dst, src_reg(dst), brw_imm_ud(0x3f800000u))); 1934 inst->predicate = BRW_PREDICATE_NORMAL; 1935 dst.type = BRW_REGISTER_TYPE_F; 1936 1937 if (instr->dest.saturate) { 1938 inst = emit(MOV(dst, src_reg(dst))); 1939 inst->saturate = true; 1940 } 1941 } else { 1942 /* For doubles we do the same but we need to consider: 1943 * 1944 * - We use a MOV with conditional_mod instead of a CMP so that we can 1945 * skip loading a 0.0 immediate. We use a source modifier on the 1946 * source of the MOV so that we flush denormalized values to 0. 1947 * Since we want to compare against 0, this won't alter the result. 1948 * - We need to extract the high 32-bit of each DF where the sign 1949 * is stored. 1950 * - We need to produce a DF result. 1951 */ 1952 1953 /* Check for zero */ 1954 src_reg value = op[0]; 1955 value.abs = true; 1956 inst = emit(MOV(dst_null_df(), value)); 1957 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1958 1959 /* AND each high 32-bit channel with 0x80000000u */ 1960 dst_reg tmp = dst_reg(this, glsl_type::uvec4_type); 1961 emit(VEC4_OPCODE_PICK_HIGH_32BIT, tmp, op[0]); 1962 emit(AND(tmp, src_reg(tmp), brw_imm_ud(0x80000000u))); 1963 1964 /* Add 1.0 to each channel, predicated to skip the cases where the 1965 * channel's value was 0 1966 */ 1967 inst = emit(OR(tmp, src_reg(tmp), brw_imm_ud(0x3f800000u))); 1968 inst->predicate = BRW_PREDICATE_NORMAL; 1969 1970 /* Now convert the result from float to double */ 1971 emit_conversion_to_double(dst, src_reg(tmp), instr->dest.saturate, 1972 BRW_REGISTER_TYPE_F); 1973 } 1974 break; 1975 1976 case nir_op_isign: 1977 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1). 1978 * -> non-negative val generates 0x00000000. 1979 * Predicated OR sets 1 if val is positive. 
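 * For example: val = 7 -> ASR gives 0 and the predicated OR fires (7 > 0),
 * producing 1; val = -7 -> ASR gives 0xffffffff and the OR is skipped;
 * val = 0 -> ASR gives 0 and the OR is skipped.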
1980 */ 1981 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1982 emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G)); 1983 emit(ASR(dst, op[0], brw_imm_d(31))); 1984 inst = emit(OR(dst, src_reg(dst), brw_imm_d(1))); 1985 inst->predicate = BRW_PREDICATE_NORMAL; 1986 break; 1987 1988 case nir_op_ishl: 1989 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1990 emit(SHL(dst, op[0], op[1])); 1991 break; 1992 1993 case nir_op_ishr: 1994 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1995 emit(ASR(dst, op[0], op[1])); 1996 break; 1997 1998 case nir_op_ushr: 1999 assert(nir_dest_bit_size(instr->dest.dest) < 64); 2000 emit(SHR(dst, op[0], op[1])); 2001 break; 2002 2003 case nir_op_ffma: 2004 if (type_sz(dst.type) == 8) { 2005 dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type); 2006 emit(MUL(mul_dst, op[1], op[0])); 2007 inst = emit(ADD(dst, src_reg(mul_dst), op[2])); 2008 inst->saturate = instr->dest.saturate; 2009 } else { 2010 op[0] = fix_3src_operand(op[0]); 2011 op[1] = fix_3src_operand(op[1]); 2012 op[2] = fix_3src_operand(op[2]); 2013 2014 inst = emit(MAD(dst, op[2], op[1], op[0])); 2015 inst->saturate = instr->dest.saturate; 2016 } 2017 break; 2018 2019 case nir_op_flrp: 2020 inst = emit_lrp(dst, op[0], op[1], op[2]); 2021 inst->saturate = instr->dest.saturate; 2022 break; 2023 2024 case nir_op_bcsel: 2025 enum brw_predicate predicate; 2026 if (!optimize_predicate(instr, &predicate)) { 2027 emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ)); 2028 switch (dst.writemask) { 2029 case WRITEMASK_X: 2030 predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X; 2031 break; 2032 case WRITEMASK_Y: 2033 predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y; 2034 break; 2035 case WRITEMASK_Z: 2036 predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z; 2037 break; 2038 case WRITEMASK_W: 2039 predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W; 2040 break; 2041 default: 2042 predicate = BRW_PREDICATE_NORMAL; 2043 break; 2044 } 2045 } 2046 inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]); 2047 inst->predicate = predicate; 2048 break; 2049 2050 case nir_op_fdot_replicated2: 2051 inst = emit(BRW_OPCODE_DP2, dst, op[0], op[1]); 2052 inst->saturate = instr->dest.saturate; 2053 break; 2054 2055 case nir_op_fdot_replicated3: 2056 inst = emit(BRW_OPCODE_DP3, dst, op[0], op[1]); 2057 inst->saturate = instr->dest.saturate; 2058 break; 2059 2060 case nir_op_fdot_replicated4: 2061 inst = emit(BRW_OPCODE_DP4, dst, op[0], op[1]); 2062 inst->saturate = instr->dest.saturate; 2063 break; 2064 2065 case nir_op_fdph_replicated: 2066 inst = emit(BRW_OPCODE_DPH, dst, op[0], op[1]); 2067 inst->saturate = instr->dest.saturate; 2068 break; 2069 2070 case nir_op_iabs: 2071 case nir_op_ineg: 2072 assert(nir_dest_bit_size(instr->dest.dest) < 64); 2073 case nir_op_fabs: 2074 case nir_op_fneg: 2075 case nir_op_fsat: 2076 unreachable("not reached: should be lowered by lower_source mods"); 2077 2078 case nir_op_fdiv: 2079 unreachable("not reached: should be lowered by DIV_TO_MUL_RCP in the compiler"); 2080 2081 case nir_op_fmod: 2082 unreachable("not reached: should be lowered by MOD_TO_FLOOR in the compiler"); 2083 2084 case nir_op_fsub: 2085 case nir_op_isub: 2086 unreachable("not reached: should be handled by ir_sub_to_add_neg"); 2087 2088 default: 2089 unreachable("Unimplemented ALU operation"); 2090 } 2091 2092 /* If we need to do a boolean resolve, replace the result with -(x & 1) 2093 * to sign extend the low bit to 0/~0 2094 */ 2095 if (devinfo->gen <= 5 && 2096 (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == 2097 
BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { 2098 dst_reg masked = dst_reg(this, glsl_type::int_type); 2099 masked.writemask = dst.writemask; 2100 emit(AND(masked, src_reg(dst), brw_imm_d(1))); 2101 src_reg masked_neg = src_reg(masked); 2102 masked_neg.negate = true; 2103 emit(MOV(retype(dst, BRW_REGISTER_TYPE_D), masked_neg)); 2104 } 2105 } 2106 2107 void 2108 vec4_visitor::nir_emit_jump(nir_jump_instr *instr) 2109 { 2110 switch (instr->type) { 2111 case nir_jump_break: 2112 emit(BRW_OPCODE_BREAK); 2113 break; 2114 2115 case nir_jump_continue: 2116 emit(BRW_OPCODE_CONTINUE); 2117 break; 2118 2119 case nir_jump_return: 2120 /* fall through */ 2121 default: 2122 unreachable("unknown jump"); 2123 } 2124 } 2125 2126 enum ir_texture_opcode 2127 ir_texture_opcode_for_nir_texop(nir_texop texop) 2128 { 2129 enum ir_texture_opcode op; 2130 2131 switch (texop) { 2132 case nir_texop_lod: op = ir_lod; break; 2133 case nir_texop_query_levels: op = ir_query_levels; break; 2134 case nir_texop_texture_samples: op = ir_texture_samples; break; 2135 case nir_texop_tex: op = ir_tex; break; 2136 case nir_texop_tg4: op = ir_tg4; break; 2137 case nir_texop_txb: op = ir_txb; break; 2138 case nir_texop_txd: op = ir_txd; break; 2139 case nir_texop_txf: op = ir_txf; break; 2140 case nir_texop_txf_ms: op = ir_txf_ms; break; 2141 case nir_texop_txl: op = ir_txl; break; 2142 case nir_texop_txs: op = ir_txs; break; 2143 case nir_texop_samples_identical: op = ir_samples_identical; break; 2144 default: 2145 unreachable("unknown texture opcode"); 2146 } 2147 2148 return op; 2149 } 2150 const glsl_type * 2151 glsl_type_for_nir_alu_type(nir_alu_type alu_type, 2152 unsigned components) 2153 { 2154 return glsl_type::get_instance(brw_glsl_base_type_for_nir_type(alu_type), 2155 components, 1); 2156 } 2157 2158 void 2159 vec4_visitor::nir_emit_texture(nir_tex_instr *instr) 2160 { 2161 unsigned texture = instr->texture_index; 2162 unsigned sampler = instr->sampler_index; 2163 src_reg texture_reg = brw_imm_ud(texture); 2164 src_reg sampler_reg = brw_imm_ud(sampler); 2165 src_reg coordinate; 2166 const glsl_type *coord_type = NULL; 2167 src_reg shadow_comparator; 2168 src_reg offset_value; 2169 src_reg lod, lod2; 2170 src_reg sample_index; 2171 src_reg mcs; 2172 2173 const glsl_type *dest_type = 2174 glsl_type_for_nir_alu_type(instr->dest_type, 2175 nir_tex_instr_dest_size(instr)); 2176 dst_reg dest = get_nir_dest(instr->dest, instr->dest_type); 2177 2178 /* The hardware requires a LOD for buffer textures */ 2179 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) 2180 lod = brw_imm_d(0); 2181 2182 /* Load the texture operation sources */ 2183 uint32_t constant_offset = 0; 2184 for (unsigned i = 0; i < instr->num_srcs; i++) { 2185 switch (instr->src[i].src_type) { 2186 case nir_tex_src_comparator: 2187 shadow_comparator = get_nir_src(instr->src[i].src, 2188 BRW_REGISTER_TYPE_F, 1); 2189 break; 2190 2191 case nir_tex_src_coord: { 2192 unsigned src_size = nir_tex_instr_src_size(instr, i); 2193 2194 switch (instr->op) { 2195 case nir_texop_txf: 2196 case nir_texop_txf_ms: 2197 case nir_texop_samples_identical: 2198 coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2199 src_size); 2200 coord_type = glsl_type::ivec(src_size); 2201 break; 2202 2203 default: 2204 coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 2205 src_size); 2206 coord_type = glsl_type::vec(src_size); 2207 break; 2208 } 2209 break; 2210 } 2211 2212 case nir_tex_src_ddx: 2213 lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 2214 
nir_tex_instr_src_size(instr, i)); 2215 break; 2216 2217 case nir_tex_src_ddy: 2218 lod2 = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 2219 nir_tex_instr_src_size(instr, i)); 2220 break; 2221 2222 case nir_tex_src_lod: 2223 switch (instr->op) { 2224 case nir_texop_txs: 2225 case nir_texop_txf: 2226 lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1); 2227 break; 2228 2229 default: 2230 lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 1); 2231 break; 2232 } 2233 break; 2234 2235 case nir_tex_src_ms_index: { 2236 sample_index = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1); 2237 break; 2238 } 2239 2240 case nir_tex_src_offset: { 2241 nir_const_value *const_offset = 2242 nir_src_as_const_value(instr->src[i].src); 2243 if (!const_offset || 2244 !brw_texture_offset(const_offset->i32, 2245 nir_tex_instr_src_size(instr, i), 2246 &constant_offset)) { 2247 offset_value = 2248 get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2); 2249 } 2250 break; 2251 } 2252 2253 case nir_tex_src_texture_offset: { 2254 /* The highest texture which may be used by this operation is 2255 * the last element of the array. Mark it here, because the generator 2256 * doesn't have enough information to determine the bound. 2257 */ 2258 uint32_t array_size = instr->texture_array_size; 2259 uint32_t max_used = texture + array_size - 1; 2260 if (instr->op == nir_texop_tg4) { 2261 max_used += prog_data->base.binding_table.gather_texture_start; 2262 } else { 2263 max_used += prog_data->base.binding_table.texture_start; 2264 } 2265 2266 brw_mark_surface_used(&prog_data->base, max_used); 2267 2268 /* Emit code to evaluate the actual indexing expression */ 2269 src_reg src = get_nir_src(instr->src[i].src, 1); 2270 src_reg temp(this, glsl_type::uint_type); 2271 emit(ADD(dst_reg(temp), src, brw_imm_ud(texture))); 2272 texture_reg = emit_uniformize(temp); 2273 break; 2274 } 2275 2276 case nir_tex_src_sampler_offset: { 2277 /* Emit code to evaluate the actual indexing expression */ 2278 src_reg src = get_nir_src(instr->src[i].src, 1); 2279 src_reg temp(this, glsl_type::uint_type); 2280 emit(ADD(dst_reg(temp), src, brw_imm_ud(sampler))); 2281 sampler_reg = emit_uniformize(temp); 2282 break; 2283 } 2284 2285 case nir_tex_src_projector: 2286 unreachable("Should be lowered by do_lower_texture_projection"); 2287 2288 case nir_tex_src_bias: 2289 unreachable("LOD bias is not valid for vertex shaders.\n"); 2290 2291 default: 2292 unreachable("unknown texture source"); 2293 } 2294 } 2295 2296 if (instr->op == nir_texop_txf_ms || 2297 instr->op == nir_texop_samples_identical) { 2298 assert(coord_type != NULL); 2299 if (devinfo->gen >= 7 && 2300 key_tex->compressed_multisample_layout_mask & (1 << texture)) { 2301 mcs = emit_mcs_fetch(coord_type, coordinate, texture_reg); 2302 } else { 2303 mcs = brw_imm_ud(0u); 2304 } 2305 } 2306 2307 /* Stuff the channel select bits in the top of the texture offset */ 2308 if (instr->op == nir_texop_tg4) { 2309 if (instr->component == 1 && 2310 (key_tex->gather_channel_quirk_mask & (1 << texture))) { 2311 /* gather4 sampler is broken for green channel on RG32F -- 2312 * we must ask for blue instead. 
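 * Channel selects are r=0, g=1, b=2, a=3 and live in bits 17:16 of the
 * constant offset (see the channel select note above), so the 2 << 16 below
 * requests the blue channel.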
2313 */ 2314 constant_offset |= 2 << 16; 2315 } else { 2316 constant_offset |= instr->component << 16; 2317 } 2318 } 2319 2320 ir_texture_opcode op = ir_texture_opcode_for_nir_texop(instr->op); 2321 2322 emit_texture(op, dest, dest_type, coordinate, instr->coord_components, 2323 shadow_comparator, 2324 lod, lod2, sample_index, 2325 constant_offset, offset_value, mcs, 2326 texture, texture_reg, sampler_reg); 2327 } 2328 2329 void 2330 vec4_visitor::nir_emit_undef(nir_ssa_undef_instr *instr) 2331 { 2332 nir_ssa_values[instr->def.index] = 2333 dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(instr->def.bit_size, 32))); 2334 } 2335 2336 /* SIMD4x2 64-bit data is stored in register space like this: 2337 * 2338 * r0.0:DF x0 y0 z0 w0 2339 * r1.0:DF x1 y1 z1 w1 2340 * 2341 * When we need to write data such as this to memory using 32-bit write 2342 * messages we need to shuffle it in this fashion: 2343 * 2344 * r0.0:DF x0 y0 x1 y1 (to be written at base offset) 2345 * r1.0:DF z0 w0 z1 w1 (to be written at base offset + 16) 2346 * 2347 * We need to do the inverse operation when we read using 32-bit messages, 2348 * which we can do by applying the exact same shuffling to the 64-bit data we 2349 * read, except that, because the data for each vertex is positioned 2350 * differently, we need to apply different channel enables. 2351 * 2352 * This function takes 64-bit data and shuffles it as explained above. 2353 * 2354 * The @for_write parameter specifies whether the shuffling is applied to 2355 * proper SIMD4x2 64-bit data that needs to be shuffled prior to a 32-bit 2356 * write message (for_write = true), or whether we are doing the inverse 2357 * operation on 64-bit data that has just been read with 32-bit messages and 2358 * needs to be shuffled into valid SIMD4x2 64-bit data (for_write = false). 2359 * 2360 * If @block and @ref are non-NULL, then the shuffling is done after @ref, 2361 * otherwise the instructions are emitted normally at the end. The function 2362 * returns the last instruction inserted. 2363 * 2364 * Notice that @src and @dst cannot be the same register. 2365 */ 2366 vec4_instruction * 2367 vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write, 2368 bblock_t *block, vec4_instruction *ref) 2369 { 2370 assert(type_sz(src.type) == 8); 2371 assert(type_sz(dst.type) == 8); 2372 assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE)); 2373 assert(!ref == !block); 2374 2375 const vec4_builder bld = !ref ? vec4_builder(this).at_end() : 2376 vec4_builder(this).at(block, ref->next); 2377 2378 /* Resolve swizzle in src */ 2379 vec4_instruction *inst; 2380 if (src.swizzle != BRW_SWIZZLE_XYZW) { 2381 dst_reg data = dst_reg(this, glsl_type::dvec4_type); 2382 inst = bld.MOV(data, src); 2383 src = src_reg(data); 2384 } 2385 2386 /* dst+0.XY = src+0.XY */ 2387 inst = bld.group(4, 0).MOV(writemask(dst, WRITEMASK_XY), src); 2388 2389 /* dst+0.ZW = src+1.XY */ 2390 inst = bld.group(4, for_write ? 1 : 0) 2391 .MOV(writemask(dst, WRITEMASK_ZW), 2392 swizzle(byte_offset(src, REG_SIZE), BRW_SWIZZLE_XYXY)); 2393 2394 /* dst+1.XY = src+0.ZW */ 2395 inst = bld.group(4, for_write ? 0 : 1) 2396 .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY), 2397 swizzle(src, BRW_SWIZZLE_ZWZW)); 2398 2399 /* dst+1.ZW = src+1.ZW */ 2400 inst = bld.group(4, 1) 2401 .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW), 2402 byte_offset(src, REG_SIZE)); 2403 2404 return inst; 2405 } 2406 2407 } 2408
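/* A minimal usage sketch for shuffle_64bit_data(), assuming a dvec4 temporary
 * "value" that has to be stored through 32-bit write messages (the value and
 * the write step are hypothetical; only the shuffle call reflects the helper
 * defined above):
 *
 *    dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
 *    shuffle_64bit_data(shuffled, src_reg(value), true, NULL, NULL);
 *    // ...emit two 16-byte 32-bit writes: shuffled at the base offset and
 *    //    byte_offset(shuffled, REG_SIZE) at base offset + 16...
 *
 * Reads go the other way: fetch both 16-byte halves into a DF-typed temporary
 * with 32-bit messages first, then call shuffle_64bit_data(dest, temp, false,
 * NULL, NULL) to rebuild valid SIMD4x2 64-bit data.
 */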