1 /* 2 * Copyright 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric (at) anholt.net> 25 * 26 */ 27 28 #include "main/macros.h" 29 #include "program/program.h" 30 #include "program/prog_print.h" 31 #include "brw_context.h" 32 #include "brw_defines.h" 33 #include "brw_eu.h" 34 35 const struct brw_instruction_info brw_opcodes[128] = { 36 [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1, .is_arith = 1 }, 37 [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1, .is_arith = 1 }, 38 [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1, .is_arith = 1 }, 39 [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1, .is_arith = 1 }, 40 [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1, .is_arith = 1 }, 41 [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1, .is_arith = 1 }, 42 [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1, .is_arith = 1 }, 43 [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 }, 44 45 [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 46 [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 47 [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 48 [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 49 [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 }, 50 [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 }, 51 [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 }, 52 [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 }, 53 [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 }, 54 [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 }, 55 [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 }, 56 [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 }, 57 58 [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 59 [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 60 [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 61 [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 62 [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 63 [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 64 [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 65 [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1, .is_arith = 1 }, 66 [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 }, 67 [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 }, 68 [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 }, 69 70 [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 }, 71 [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 }, 72 [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 }, 73 [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 }, 74 [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 }, 75 [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 }, 76 [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 }, 77 [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 }, 78 [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 }, 79 [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 }, 80 [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 }, 81 [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 }, 82 [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 }, 83 [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 }, 84 [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 }, 85 [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 }, 86 [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 }, 87 }; 88 89 static INLINE 90 bool brw_is_arithmetic_inst(const struct brw_instruction *inst) 91 { 92 return brw_opcodes[inst->header.opcode].is_arith; 93 } 94 95 static const GLuint inst_stride[7] = { 96 [0] = 0, 97 [1] = 1, 98 [2] = 2, 99 [3] = 4, 100 [4] = 8, 101 [5] = 16, 102 [6] = 32 103 }; 104 105 static const GLuint inst_type_size[8] = { 106 [BRW_REGISTER_TYPE_UD] = 4, 107 [BRW_REGISTER_TYPE_D] = 4, 108 [BRW_REGISTER_TYPE_UW] = 2, 109 [BRW_REGISTER_TYPE_W] = 2, 110 [BRW_REGISTER_TYPE_UB] = 1, 111 [BRW_REGISTER_TYPE_B] = 1, 112 [BRW_REGISTER_TYPE_F] = 4 113 }; 114 115 static INLINE bool 116 brw_is_grf_written(const struct brw_instruction *inst, 117 int reg_index, int size, 118 int gen) 119 { 120 if (brw_opcodes[inst->header.opcode].ndst == 0) 121 return false; 122 123 if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT) 124 if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE) 125 return true; 126 127 if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE) 128 return false; 129 130 const int reg_start = reg_index * REG_SIZE; 131 const int reg_end = reg_start + size; 132 133 const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type]; 134 const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE 135 + inst->bits1.da1.dest_subreg_nr; 136 int length, write_end; 137 138 /* SEND is specific */ 139 if (inst->header.opcode == BRW_OPCODE_SEND) { 140 if (gen >= 5) 141 length = inst->bits3.generic_gen5.response_length*REG_SIZE; 142 else 143 length = inst->bits3.generic.response_length*REG_SIZE; 144 } 145 else { 146 length = 1 << inst->header.execution_size; 147 length *= type_size; 148 length *= inst->bits1.da1.dest_horiz_stride; 149 } 150 151 /* If the two intervals intersect, we overwrite the register */ 152 write_end = write_start + length; 153 const int left = MAX2(write_start, reg_start); 154 const int right = MIN2(write_end, reg_end); 155 156 return left < right; 157 } 158 159 static bool 160 brw_is_mrf_written_alu(const struct brw_instruction *inst, 161 int reg_index, int size) 162 { 163 if (brw_opcodes[inst->header.opcode].ndst == 0) 164 return false; 165 166 if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE) 167 return false; 168 169 if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT) 170 return true; 171 172 const int reg_start = reg_index * REG_SIZE; 173 const int reg_end = reg_start + size; 174 175 const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f; 176 const int is_compr4 = inst->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4; 177 const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type]; 178 179 /* We use compr4 with a size != 16 elements. Strange, we conservatively 180 * consider that we are writing the register. 181 */ 182 if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16) 183 return true; 184 185 /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */ 186 if (is_compr4) { 187 const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride; 188 189 /* First 8-way register */ 190 const int write_start0 = mrf_index*REG_SIZE 191 + inst->bits1.da1.dest_subreg_nr; 192 const int write_end0 = write_start0 + length; 193 194 /* Second 8-way register */ 195 const int write_start1 = (mrf_index+4)*REG_SIZE 196 + inst->bits1.da1.dest_subreg_nr; 197 const int write_end1 = write_start1 + length; 198 199 /* If the two intervals intersect, we overwrite the register */ 200 const int left0 = MAX2(write_start0, reg_start); 201 const int right0 = MIN2(write_end0, reg_end); 202 const int left1 = MAX2(write_start1, reg_start); 203 const int right1 = MIN2(write_end1, reg_end); 204 205 if (left0 < right0 || left1 < right1) 206 return true; 207 } 208 else { 209 int length; 210 length = 1 << inst->header.execution_size; 211 length *= type_size; 212 length *= inst->bits1.da1.dest_horiz_stride; 213 214 /* If the two intervals intersect, we write into the register */ 215 const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE 216 + inst->bits1.da1.dest_subreg_nr; 217 const int write_end = write_start + length; 218 const int left = MAX2(write_start, reg_start); 219 const int right = MIN2(write_end, reg_end); 220 221 if (left < right) 222 return true; 223 } 224 225 return false; 226 } 227 228 /* SEND may perform an implicit mov to a mrf register */ 229 static bool 230 brw_is_mrf_written_send(const struct brw_instruction *inst, 231 int reg_index, int size) 232 { 233 234 const int reg_start = reg_index * REG_SIZE; 235 const int reg_end = reg_start + size; 236 const int mrf_start = inst->header.destreg__conditionalmod; 237 const int write_start = mrf_start * REG_SIZE; 238 const int write_end = write_start + REG_SIZE; 239 const int left = MAX2(write_start, reg_start); 240 const int right = MIN2(write_end, reg_end); 241 242 if (inst->header.opcode != BRW_OPCODE_SEND || 243 inst->bits1.da1.src0_reg_file == 0) 244 return false; 245 246 return left < right; 247 } 248 249 /* Specific path for message register since we need to handle the compr4 case */ 250 static INLINE bool 251 brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size) 252 { 253 return (brw_is_mrf_written_alu(inst, reg_index, size) || 254 brw_is_mrf_written_send(inst, reg_index, size)); 255 } 256 257 static INLINE bool 258 brw_is_mrf_read(const struct brw_instruction *inst, 259 int reg_index, int size, int gen) 260 { 261 if (inst->header.opcode != BRW_OPCODE_SEND) 262 return false; 263 if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT) 264 return true; 265 266 const int reg_start = reg_index*REG_SIZE; 267 const int reg_end = reg_start + size; 268 269 int length, read_start, read_end; 270 if (gen >= 5) 271 length = inst->bits3.generic_gen5.msg_length*REG_SIZE; 272 else 273 length = inst->bits3.generic.msg_length*REG_SIZE; 274 275 /* Look if SEND uses an implicit mov. In that case, we read one less register 276 * (but we write it) 277 */ 278 if (inst->bits1.da1.src0_reg_file != 0) 279 read_start = inst->header.destreg__conditionalmod; 280 else { 281 length--; 282 read_start = inst->header.destreg__conditionalmod + 1; 283 } 284 read_start *= REG_SIZE; 285 read_end = read_start + length; 286 287 const int left = MAX2(read_start, reg_start); 288 const int right = MIN2(read_end, reg_end); 289 290 return left < right; 291 } 292 293 static INLINE bool 294 brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size) 295 { 296 int i, j; 297 if (brw_opcodes[inst->header.opcode].nsrc == 0) 298 return false; 299 300 /* Look at first source. We must take into account register regions to 301 * monitor carefully the read. Note that we are a bit too conservative here 302 * since we do not take into account the fact that some complete registers 303 * may be skipped 304 */ 305 if (brw_opcodes[inst->header.opcode].nsrc >= 1) { 306 307 if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT) 308 if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE) 309 return true; 310 if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE) 311 return false; 312 313 const int reg_start = reg_index*REG_SIZE; 314 const int reg_end = reg_start + size; 315 316 /* See if at least one of this element intersects the interval */ 317 const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type]; 318 const int elem_num = 1 << inst->header.execution_size; 319 const int width = 1 << inst->bits2.da1.src0_width; 320 const int row_num = elem_num >> inst->bits2.da1.src0_width; 321 const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride]; 322 const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride]; 323 int row_start = inst->bits2.da1.src0_reg_nr*REG_SIZE 324 + inst->bits2.da1.src0_subreg_nr; 325 for (j = 0; j < row_num; ++j) { 326 int write_start = row_start; 327 for (i = 0; i < width; ++i) { 328 const int write_end = write_start + type_size; 329 const int left = write_start > reg_start ? write_start : reg_start; 330 const int right = write_end < reg_end ? write_end : reg_end; 331 if (left < right) 332 return true; 333 write_start += hs; 334 } 335 row_start += vs; 336 } 337 } 338 339 /* Second src register */ 340 if (brw_opcodes[inst->header.opcode].nsrc >= 2) { 341 342 if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT) 343 if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE) 344 return true; 345 if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE) 346 return false; 347 348 const int reg_start = reg_index*REG_SIZE; 349 const int reg_end = reg_start + size; 350 351 /* See if at least one of this element intersects the interval */ 352 const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type]; 353 const int elem_num = 1 << inst->header.execution_size; 354 const int width = 1 << inst->bits3.da1.src1_width; 355 const int row_num = elem_num >> inst->bits3.da1.src1_width; 356 const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride]; 357 const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride]; 358 int row_start = inst->bits3.da1.src1_reg_nr*REG_SIZE 359 + inst->bits3.da1.src1_subreg_nr; 360 for (j = 0; j < row_num; ++j) { 361 int write_start = row_start; 362 for (i = 0; i < width; ++i) { 363 const int write_end = write_start + type_size; 364 const int left = write_start > reg_start ? write_start : reg_start; 365 const int right = write_end < reg_end ? write_end : reg_end; 366 if (left < right) 367 return true; 368 write_start += hs; 369 } 370 row_start += vs; 371 } 372 } 373 374 return false; 375 } 376 377 static INLINE bool 378 brw_is_control_done(const struct brw_instruction *mov) { 379 return 380 mov->header.dependency_control != 0 || 381 mov->header.thread_control != 0 || 382 mov->header.mask_control != 0 || 383 mov->header.saturate != 0 || 384 mov->header.debug_control != 0; 385 } 386 387 static INLINE bool 388 brw_is_predicated(const struct brw_instruction *mov) { 389 return mov->header.predicate_control != 0; 390 } 391 392 static INLINE bool 393 brw_is_grf_to_mrf_mov(const struct brw_instruction *mov, 394 int *mrf_index, 395 int *grf_index, 396 bool *is_compr4) 397 { 398 if (brw_is_predicated(mov) || 399 brw_is_control_done(mov) || 400 mov->header.debug_control != 0) 401 return false; 402 403 if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT || 404 mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE || 405 mov->bits1.da1.dest_reg_type != BRW_REGISTER_TYPE_F || 406 mov->bits1.da1.dest_horiz_stride != BRW_HORIZONTAL_STRIDE_1 || 407 mov->bits1.da1.dest_subreg_nr != 0) 408 return false; 409 410 if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT || 411 mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE || 412 mov->bits1.da1.src0_reg_type != BRW_REGISTER_TYPE_F || 413 mov->bits2.da1.src0_width != BRW_WIDTH_8 || 414 mov->bits2.da1.src0_horiz_stride != BRW_HORIZONTAL_STRIDE_1 || 415 mov->bits2.da1.src0_vert_stride != BRW_VERTICAL_STRIDE_8 || 416 mov->bits2.da1.src0_subreg_nr != 0 || 417 mov->bits2.da1.src0_abs != 0 || 418 mov->bits2.da1.src0_negate != 0) 419 return false; 420 421 *grf_index = mov->bits2.da1.src0_reg_nr; 422 *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f; 423 *is_compr4 = (mov->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4) != 0; 424 return true; 425 } 426 427 static INLINE bool 428 brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index) 429 { 430 /* remark: no problem to predicate a SEL instruction */ 431 if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) && 432 brw_is_control_done(inst) == false && 433 inst->header.execution_size == 4 && 434 inst->header.access_mode == BRW_ALIGN_1 && 435 inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT && 436 inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE && 437 inst->bits1.da1.dest_reg_type == BRW_REGISTER_TYPE_F && 438 inst->bits1.da1.dest_horiz_stride == BRW_HORIZONTAL_STRIDE_1 && 439 inst->bits1.da1.dest_reg_nr == grf_index && 440 inst->bits1.da1.dest_subreg_nr == 0 && 441 brw_is_arithmetic_inst(inst)) 442 return true; 443 444 return false; 445 } 446 447 static INLINE bool 448 brw_inst_are_equal(const struct brw_instruction *src0, 449 const struct brw_instruction *src1) 450 { 451 const GLuint *field0 = (GLuint *) src0; 452 const GLuint *field1 = (GLuint *) src1; 453 return field0[0] == field1[0] && 454 field0[1] == field1[1] && 455 field0[2] == field1[2] && 456 field0[3] == field1[3]; 457 } 458 459 static INLINE void 460 brw_inst_copy(struct brw_instruction *dst, 461 const struct brw_instruction *src) 462 { 463 GLuint *field_dst = (GLuint *) dst; 464 const GLuint *field_src = (GLuint *) src; 465 field_dst[0] = field_src[0]; 466 field_dst[1] = field_src[1]; 467 field_dst[2] = field_src[2]; 468 field_dst[3] = field_src[3]; 469 } 470 471 static void brw_remove_inst(struct brw_compile *p, const bool *removeInst) 472 { 473 int i, nr_insn = 0, to = 0, from = 0; 474 475 for (from = 0; from < p->nr_insn; ++from) { 476 if (removeInst[from]) 477 continue; 478 if(to != from) 479 brw_inst_copy(p->store + to, p->store + from); 480 to++; 481 } 482 483 for (i = 0; i < p->nr_insn; ++i) 484 if (removeInst[i] == false) 485 nr_insn++; 486 p->nr_insn = nr_insn; 487 } 488 489 /* The gen code emitter generates a lot of duplications in the 490 * grf-to-mrf moves, for example when texture sampling with the same 491 * coordinates from multiple textures.. Here, we monitor same mov 492 * grf-to-mrf instrutions and remove repeated ones where the operands 493 * and dst ahven't changed in between. 494 */ 495 void brw_remove_duplicate_mrf_moves(struct brw_compile *p) 496 { 497 const int gen = p->brw->intel.gen; 498 int i, j; 499 500 bool *removeInst = calloc(sizeof(bool), p->nr_insn); 501 for (i = 0; i < p->nr_insn; i++) { 502 if (removeInst[i]) 503 continue; 504 505 const struct brw_instruction *mov = p->store + i; 506 int mrf_index, grf_index; 507 bool is_compr4; 508 509 /* Only consider _straight_ grf-to-mrf moves */ 510 if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4)) 511 continue; 512 513 const int mrf_index0 = mrf_index; 514 const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1; 515 const int simd16_size = 2 * REG_SIZE; 516 517 for (j = i + 1; j < p->nr_insn; j++) { 518 const struct brw_instruction *inst = p->store + j; 519 520 if (brw_inst_are_equal(mov, inst)) { 521 removeInst[j] = true; 522 continue; 523 } 524 525 if (brw_is_grf_written(inst, grf_index, simd16_size, gen) || 526 brw_is_mrf_written(inst, mrf_index0, REG_SIZE) || 527 brw_is_mrf_written(inst, mrf_index1, REG_SIZE)) 528 break; 529 } 530 } 531 532 brw_remove_inst(p, removeInst); 533 free(removeInst); 534 } 535 536 /* Replace moves to MRFs where the value moved is the result of a 537 * normal arithmetic operation with computation right into the MRF. 538 */ 539 void brw_remove_grf_to_mrf_moves(struct brw_compile *p) 540 { 541 int i, j, prev; 542 struct brw_context *brw = p->brw; 543 const int gen = brw->intel.gen; 544 const int simd16_size = 2*REG_SIZE; 545 546 bool *removeInst = calloc(sizeof(bool), p->nr_insn); 547 assert(removeInst); 548 549 for (i = 0; i < p->nr_insn; i++) { 550 if (removeInst[i]) 551 continue; 552 553 struct brw_instruction *grf_inst = NULL; 554 const struct brw_instruction *mov = p->store + i; 555 int mrf_index, grf_index; 556 bool is_compr4; 557 558 /* Only consider _straight_ grf-to-mrf moves */ 559 if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4)) 560 continue; 561 562 /* Using comp4 enables a stride of 4 for this instruction */ 563 const int mrf_index0 = mrf_index; 564 const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1; 565 566 /* Look where the register has been set */ 567 prev = i; 568 bool potential_remove = false; 569 while (prev--) { 570 571 /* If _one_ instruction writes the grf, we try to remove the mov */ 572 struct brw_instruction *inst = p->store + prev; 573 if (brw_is_grf_straight_write(inst, grf_index)) { 574 potential_remove = true; 575 grf_inst = inst; 576 break; 577 } 578 579 } 580 581 if (potential_remove == false) 582 continue; 583 removeInst[i] = true; 584 585 /* Monitor first the section of code between the grf computation and the 586 * mov. Here we cannot read or write both mrf and grf register 587 */ 588 for (j = prev + 1; j < i; ++j) { 589 struct brw_instruction *inst = p->store + j; 590 if (removeInst[j]) 591 continue; 592 if (brw_is_grf_written(inst, grf_index, simd16_size, gen) || 593 brw_is_grf_read(inst, grf_index, simd16_size) || 594 brw_is_mrf_written(inst, mrf_index0, REG_SIZE) || 595 brw_is_mrf_written(inst, mrf_index1, REG_SIZE) || 596 brw_is_mrf_read(inst, mrf_index0, REG_SIZE, gen) || 597 brw_is_mrf_read(inst, mrf_index1, REG_SIZE, gen)) { 598 removeInst[i] = false; 599 break; 600 } 601 } 602 603 /* After the mov, we can read or write the mrf. If the grf is overwritten, 604 * we are done 605 */ 606 for (j = i + 1; j < p->nr_insn; ++j) { 607 struct brw_instruction *inst = p->store + j; 608 if (removeInst[j]) 609 continue; 610 611 if (brw_is_grf_read(inst, grf_index, simd16_size)) { 612 removeInst[i] = false; 613 break; 614 } 615 616 if (brw_is_grf_straight_write(inst, grf_index)) 617 break; 618 } 619 620 /* Note that with the top down traversal, we can safely pacth the mov 621 * instruction 622 */ 623 if (removeInst[i]) { 624 grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file; 625 grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr; 626 } 627 } 628 629 brw_remove_inst(p, removeInst); 630 free(removeInst); 631 } 632 633 static bool 634 is_single_channel_dp4(struct brw_instruction *insn) 635 { 636 if (insn->header.opcode != BRW_OPCODE_DP4 || 637 insn->header.execution_size != BRW_EXECUTE_8 || 638 insn->header.access_mode != BRW_ALIGN_16 || 639 insn->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE) 640 return false; 641 642 if (!is_power_of_two(insn->bits1.da16.dest_writemask)) 643 return false; 644 645 return true; 646 } 647 648 /** 649 * Sets the dependency control fields on DP4 instructions. 650 * 651 * The hardware only tracks dependencies on a register basis, so when 652 * you do: 653 * 654 * DP4 dst.x src1 src2 655 * DP4 dst.y src1 src3 656 * DP4 dst.z src1 src4 657 * DP4 dst.w src1 src5 658 * 659 * It will wait to do the DP4 dst.y until the dst.x is resolved, etc. 660 * We can examine our instruction stream and set the dependency 661 * control fields to tell the hardware when to do it. 662 * 663 * We may want to extend this to other instructions that are used to 664 * fill in a channel at a time of the destination register. 665 */ 666 static void 667 brw_set_dp4_dependency_control(struct brw_compile *p) 668 { 669 int i; 670 671 for (i = 1; i < p->nr_insn; i++) { 672 struct brw_instruction *insn = &p->store[i]; 673 struct brw_instruction *prev = &p->store[i - 1]; 674 675 if (!is_single_channel_dp4(prev)) 676 continue; 677 678 if (!is_single_channel_dp4(insn)) { 679 i++; 680 continue; 681 } 682 683 /* Only avoid hw dep control if the write masks are different 684 * channels of one reg. 685 */ 686 if (insn->bits1.da16.dest_writemask == prev->bits1.da16.dest_writemask) 687 continue; 688 if (insn->bits1.da16.dest_reg_nr != prev->bits1.da16.dest_reg_nr) 689 continue; 690 691 /* Check if the second instruction depends on the previous one 692 * for a src. 693 */ 694 if (insn->bits1.da1.src0_reg_file == BRW_GENERAL_REGISTER_FILE && 695 (insn->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT || 696 insn->bits2.da1.src0_reg_nr == insn->bits1.da16.dest_reg_nr)) 697 continue; 698 if (insn->bits1.da1.src1_reg_file == BRW_GENERAL_REGISTER_FILE && 699 (insn->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT || 700 insn->bits3.da1.src1_reg_nr == insn->bits1.da16.dest_reg_nr)) 701 continue; 702 703 prev->header.dependency_control |= BRW_DEPENDENCY_NOTCLEARED; 704 insn->header.dependency_control |= BRW_DEPENDENCY_NOTCHECKED; 705 } 706 } 707 708 void 709 brw_optimize(struct brw_compile *p) 710 { 711 brw_set_dp4_dependency_control(p); 712 } 713