/*
 * Copyright 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

extern "C" {
#include "main/macros.h"
#include "program/register_allocate.h"
} /* extern "C" */

#include "brw_vec4.h"
#include "glsl/ir_print_visitor.h"

using namespace brw;

namespace brw {

/**
 * Rewrite a single register reference from a virtual GRF number to the
 * hardware GRF number chosen for it.  Non-GRF files (immediates, uniforms,
 * etc.) are left untouched.
 */
static void
assign(unsigned int *reg_hw_locations, reg *reg)
{
   if (reg->file == GRF) {
      reg->reg = reg_hw_locations[reg->reg];
   }
}

/**
 * Assign hardware registers without any graph coloring: each used virtual
 * GRF simply gets the next free contiguous range after the payload.
 *
 * No two virtual GRFs ever share a hardware register, so this "allocator"
 * cannot spill and is only useful for debugging (see the `if (0)` gate in
 * reg_allocate() below).
 *
 * Returns false (after calling fail()) if the program needs more than
 * max_grf registers; returns true on success.
 */
bool
vec4_visitor::reg_allocate_trivial()
{
   unsigned int hw_reg_mapping[this->virtual_grf_count];
   bool virtual_grf_used[this->virtual_grf_count];
   int i;
   int next;

   /* Calculate which virtual GRFs are actually in use after whatever
    * optimization passes have occurred.
    */
   /* NOTE(review): this loop's `int i` shadows the outer `i` declared
    * above; the outer `i` is only used by the assignment loop further down.
    */
   for (int i = 0; i < this->virtual_grf_count; i++) {
      virtual_grf_used[i] = false;
   }

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)iter.get();

      if (inst->dst.file == GRF)
	 virtual_grf_used[inst->dst.reg] = true;

      /* vec4 instructions have at most 3 sources. */
      for (int i = 0; i < 3; i++) {
	 if (inst->src[i].file == GRF)
	    virtual_grf_used[inst->src[i].reg] = true;
      }
   }

   /* NOTE(review): virtual GRF 0 is assigned space unconditionally — its
    * virtual_grf_used[] entry is never consulted, so it always consumes
    * virtual_grf_sizes[0] hardware registers even if dead.
    */
   hw_reg_mapping[0] = this->first_non_payload_grf;
   next = hw_reg_mapping[0] + this->virtual_grf_sizes[0];
   for (i = 1; i < this->virtual_grf_count; i++) {
      if (virtual_grf_used[i]) {
	 hw_reg_mapping[i] = next;
	 next += this->virtual_grf_sizes[i];
      }
   }
   prog_data->total_grf = next;

   /* Patch every instruction's dst and sources over to hardware numbers. */
   foreach_iter(exec_list_iterator, iter, this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)iter.get();

      assign(hw_reg_mapping, &inst->dst);
      assign(hw_reg_mapping, &inst->src[0]);
      assign(hw_reg_mapping, &inst->src[1]);
      assign(hw_reg_mapping, &inst->src[2]);
   }

   if (prog_data->total_grf > max_grf) {
      fail("Ran out of regs on trivial allocator (%d/%d)\n",
	   prog_data->total_grf, max_grf);
      return false;
   }

   return true;
}

/**
 * Build (or rebuild) the ra_set describing the register classes used by the
 * VS back-end and cache it on the context (brw->vs.regs / classes /
 * ra_reg_to_grf).
 *
 * One class is created per distinct allocation size in class_sizes[].  A
 * "register" of class size N is a run of N contiguous base GRFs, so a class
 * of size N has (base_reg_count - (N - 1)) possible placements, and each
 * placement conflicts with every base register it covers.
 *
 * NOTE(review): the previous set is freed and rebuilt on every call; there
 * is no check for whether the requested classes match the cached ones —
 * presumably callers tolerate the rebuild cost.  Verify against callers.
 */
static void
brw_alloc_reg_set_for_classes(struct brw_context *brw,
			      int *class_sizes,
			      int class_count,
			      int base_reg_count)
{
   /* Compute the total number of registers across all classes.
    */
   int ra_reg_count = 0;
   for (int i = 0; i < class_count; i++) {
      ra_reg_count += base_reg_count - (class_sizes[i] - 1);
   }

   /* ralloc_free() on the stale pointers is safe even when they are NULL. */
   ralloc_free(brw->vs.ra_reg_to_grf);
   brw->vs.ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
   ralloc_free(brw->vs.regs);
   brw->vs.regs = ra_alloc_reg_set(brw, ra_reg_count);
   ralloc_free(brw->vs.classes);
   brw->vs.classes = ralloc_array(brw, int, class_count + 1);

   /* Now, add the registers to their classes, and add the conflicts
    * between them and the base GRF registers (and also each other).
    */
   int reg = 0;
   for (int i = 0; i < class_count; i++) {
      int class_reg_count = base_reg_count - (class_sizes[i] - 1);
      brw->vs.classes[i] = ra_alloc_reg_class(brw->vs.regs);

      for (int j = 0; j < class_reg_count; j++) {
	 ra_class_add_reg(brw->vs.regs, brw->vs.classes[i], reg);

	 /* Remember which base GRF this ra register starts at, so the
	  * allocation result can be mapped back to a hardware number.
	  */
	 brw->vs.ra_reg_to_grf[reg] = j;

	 /* A size-N register at base j conflicts with base registers
	  * j .. j+N-1 (ra regs 0..base_reg_count-1 belong to the size-1
	  * class built in the first iteration).
	  */
	 for (int base_reg = j;
	      base_reg < j + class_sizes[i];
	      base_reg++) {
	    ra_add_transitive_reg_conflict(brw->vs.regs, base_reg, reg);
	 }

	 reg++;
      }
   }
   assert(reg == ra_reg_count);

   ra_set_finalize(brw->vs.regs);
}

/**
 * Graph-coloring register allocation for the vec4 back-end.
 *
 * Builds an interference graph from the live intervals, colors it, and
 * rewrites all instructions to hardware GRF numbers.  On failure, chooses
 * one virtual GRF to spill (or fail()s if none can be spilled) and returns
 * false so the caller can re-run allocation after spill code is inserted.
 */
bool
vec4_visitor::reg_allocate()
{
   unsigned int hw_reg_mapping[virtual_grf_count];
   int first_assigned_grf = this->first_non_payload_grf;
   int base_reg_count = max_grf - first_assigned_grf;
   int class_sizes[base_reg_count];
   int class_count = 0;

   /* Using the trivial allocator can be useful in debugging undefined
    * register access as a result of broken optimization passes.
    */
   if (0)
      return reg_allocate_trivial();

   calculate_live_intervals();

   /* Set up the register classes.
    *
    * The base registers store a vec4. However, we'll need larger
    * storage for arrays, structures, and matrices, which will be sets
    * of contiguous registers.
    */
   class_sizes[class_count++] = 1;

   /* Collect the set of distinct virtual GRF sizes; one class per size. */
   for (int r = 0; r < virtual_grf_count; r++) {
      int i;

      for (i = 0; i < class_count; i++) {
	 if (class_sizes[i] == this->virtual_grf_sizes[r])
	    break;
      }
      if (i == class_count) {
	 if (this->virtual_grf_sizes[r] >= base_reg_count) {
	    /* fail() records the error but does not return; the oversized
	     * class is still appended below and allocation proceeds on the
	     * (now-failed) compile.
	     */
	    fail("Object too large to register allocate.\n");
	 }

	 class_sizes[class_count++] = this->virtual_grf_sizes[r];
      }
   }

   brw_alloc_reg_set_for_classes(brw, class_sizes, class_count, base_reg_count);

   struct ra_graph *g = ra_alloc_interference_graph(brw->vs.regs,
						    virtual_grf_count);

   for (int i = 0; i < virtual_grf_count; i++) {
      /* Pick the class whose size matches this virtual GRF's size. */
      for (int c = 0; c < class_count; c++) {
	 if (class_sizes[c] == this->virtual_grf_sizes[i]) {
	    ra_set_node_class(g, i, brw->vs.classes[c]);
	    break;
	 }
      }

      /* Add an edge for every pair of simultaneously-live virtual GRFs.
       * Only j < i is needed since interference is symmetric.
       */
      for (int j = 0; j < i; j++) {
	 if (virtual_grf_interferes(i, j)) {
	    ra_add_node_interference(g, i, j);
	 }
      }
   }

   if (!ra_allocate_no_spills(g)) {
      /* Failed to allocate registers.  Spill a reg, and the caller will
       * loop back into here to try again.
       */
      int reg = choose_spill_reg(g);
      if (reg == -1) {
	 fail("no register to spill\n");
      } else {
	 spill_reg(reg);
      }
      ralloc_free(g);
      return false;
   }

   /* Get the chosen virtual registers for each node, and map virtual
    * regs in the register classes back down to real hardware reg
    * numbers.
    */
   prog_data->total_grf = first_assigned_grf;
   for (int i = 0; i < virtual_grf_count; i++) {
      int reg = ra_get_node_reg(g, i);

      hw_reg_mapping[i] = first_assigned_grf + brw->vs.ra_reg_to_grf[reg];
      prog_data->total_grf = MAX2(prog_data->total_grf,
				  hw_reg_mapping[i] + virtual_grf_sizes[i]);
   }

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      assign(hw_reg_mapping, &inst->dst);
      assign(hw_reg_mapping, &inst->src[0]);
      assign(hw_reg_mapping, &inst->src[1]);
      assign(hw_reg_mapping, &inst->src[2]);
   }

   ralloc_free(g);

   return true;
}

/**
 * Estimate, for every virtual GRF, how expensive spilling it would be
 * (spill_costs[]) and whether spilling it is allowed at all (no_spill[]).
 *
 * Both output arrays must have virtual_grf_count elements.  A register is
 * marked unspillable when it is larger than one GRF, is accessed through
 * reladdr (indirect addressing), or already appears in scratch read/write
 * instructions (re-spilling those would loop forever).
 */
void
vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
{
   float loop_scale = 1.0;

   for (int i = 0; i < this->virtual_grf_count; i++) {
      spill_costs[i] = 0.0;
      /* Only single-GRF virtual registers are candidates for spilling
       * (spill_reg() asserts this precondition).
       */
      no_spill[i] = virtual_grf_sizes[i] != 1;
   }

   /* Calculate costs for spilling nodes.  Call it a cost of 1 per
    * spill/unspill we'll have to do, and guess that the insides of
    * loops run 10 times.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *) node;

      for (unsigned int i = 0; i < 3; i++) {
	 if (inst->src[i].file == GRF) {
	    spill_costs[inst->src[i].reg] += loop_scale;
	    if (inst->src[i].reladdr)
	       no_spill[inst->src[i].reg] = true;
	 }
      }

      if (inst->dst.file == GRF) {
	 spill_costs[inst->dst.reg] += loop_scale;
	 if (inst->dst.reladdr)
	    no_spill[inst->dst.reg] = true;
      }

      switch (inst->opcode) {

      /* DO/WHILE bracket a loop body: weight accesses inside by 10x per
       * nesting level.
       */
      case BRW_OPCODE_DO:
	 loop_scale *= 10;
	 break;

      case BRW_OPCODE_WHILE:
	 loop_scale /= 10;
	 break;

      case VS_OPCODE_SCRATCH_READ:
      case VS_OPCODE_SCRATCH_WRITE:
	 /* Never spill a register that already participates in scratch
	  * traffic — that would generate spill code for the spill code.
	  */
	 for (int i = 0; i < 3; i++) {
	    if (inst->src[i].file == GRF)
	       no_spill[inst->src[i].reg] = true;
	 }
	 if (inst->dst.file == GRF)
	    no_spill[inst->dst.reg] = true;
	 break;

      default:
	 break;
      }
   }
}

/**
 * Ask the ra library for the best node to spill, given the cost estimates
 * from evaluate_spill_costs().  Unspillable nodes get no cost set, which
 * excludes them from consideration.  Returns -1 when nothing can be spilled.
 */
int
vec4_visitor::choose_spill_reg(struct ra_graph *g)
{
   float spill_costs[this->virtual_grf_count];
   bool no_spill[this->virtual_grf_count];

   evaluate_spill_costs(spill_costs, no_spill);

   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (!no_spill[i])
	 ra_set_node_spill_cost(g, i, spill_costs[i]);
   }

   return ra_get_best_spill_node(g);
}

/**
 * Rewrite every access to spill_reg_nr to go through a scratch slot:
 * each read gets a fresh temporary filled by a scratch read, and each
 * write goes to a fresh temporary flushed by a scratch write.
 *
 * Only size-1 virtual GRFs may be spilled (asserted).  Invalidates the
 * live interval data since new virtual GRFs and instructions are created.
 */
void
vec4_visitor::spill_reg(int spill_reg_nr)
{
   assert(virtual_grf_sizes[spill_reg_nr] == 1);
   /* Claim the next scratch slot; c->last_scratch is the running count of
    * slots used by this compile.
    */
   unsigned int spill_offset = c->last_scratch++;

   /* Generate spill/unspill instructions for the objects being spilled.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *) node;

      for (unsigned int i = 0; i < 3; i++) {
	 if (inst->src[i].file == GRF && inst->src[i].reg == spill_reg_nr) {
	    src_reg spill_reg = inst->src[i];
	    inst->src[i].reg = virtual_grf_alloc(1);
	    dst_reg temp = dst_reg(inst->src[i]);

	    /* Only read the necessary channels, to avoid overwriting the rest
	     * with data that may not have been written to scratch.
	     *
	     * NOTE(review): the loop variable `c` below shadows the member
	     * `c` used for last_scratch above; harmless here since the
	     * member is not referenced again in this scope.
	     */
	    temp.writemask = 0;
	    for (int c = 0; c < 4; c++)
	       temp.writemask |= (1 << BRW_GET_SWZ(inst->src[i].swizzle, c));
	    assert(temp.writemask != 0);

	    emit_scratch_read(inst, temp, spill_reg, spill_offset);
	 }
      }

      if (inst->dst.file == GRF && inst->dst.reg == spill_reg_nr) {
	 dst_reg spill_reg = inst->dst;
	 inst->dst.reg = virtual_grf_alloc(1);

	 /* We don't want a swizzle when reading from the source; read the
	  * whole register and use spill_reg's writemask to select which
	  * channels to write.
	  */
	 src_reg temp = src_reg(inst->dst);
	 temp.swizzle = BRW_SWIZZLE_XYZW;
	 emit_scratch_write(inst, temp, spill_reg, spill_offset);
      }
   }

   /* New virtual GRFs and instructions exist now; force live intervals to
    * be recomputed before the next allocation attempt.
    */
   this->live_intervals_valid = false;
}

} /* namespace brw */