1 /* 2 * Copyright 2014 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Jason Ekstrand (jason (at) jlekstrand.net) 25 * 26 */ 27 28 #include "nir.h" 29 30 /* 31 * Implements a simple pass that lowers vecN instructions to a series of 32 * moves with partial writes. 33 */ 34 35 static bool 36 src_matches_dest_reg(nir_dest *dest, nir_src *src) 37 { 38 if (dest->is_ssa || src->is_ssa) 39 return false; 40 41 return (dest->reg.reg == src->reg.reg && 42 dest->reg.base_offset == src->reg.base_offset && 43 !dest->reg.indirect && 44 !src->reg.indirect); 45 } 46 47 /** 48 * For a given starting writemask channel and corresponding source index in 49 * the vec instruction, insert a MOV to the vec instruction's dest of all the 50 * writemask channels that get read from the same src reg. 51 * 52 * Returns the writemask of our MOV, so the parent loop calling this knows 53 * which ones have been processed. 54 */ 55 static unsigned 56 insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader) 57 { 58 assert(start_idx < nir_op_infos[vec->op].num_inputs); 59 60 nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_imov); 61 nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov); 62 nir_alu_dest_copy(&mov->dest, &vec->dest, mov); 63 64 mov->dest.write_mask = (1u << start_idx); 65 mov->src[0].swizzle[start_idx] = vec->src[start_idx].swizzle[0]; 66 mov->src[0].negate = vec->src[start_idx].negate; 67 mov->src[0].abs = vec->src[start_idx].abs; 68 69 for (unsigned i = start_idx + 1; i < 4; i++) { 70 if (!(vec->dest.write_mask & (1 << i))) 71 continue; 72 73 if (nir_srcs_equal(vec->src[i].src, vec->src[start_idx].src) && 74 vec->src[i].negate == vec->src[start_idx].negate && 75 vec->src[i].abs == vec->src[start_idx].abs) { 76 mov->dest.write_mask |= (1 << i); 77 mov->src[0].swizzle[i] = vec->src[i].swizzle[0]; 78 } 79 } 80 81 unsigned channels_handled = mov->dest.write_mask; 82 83 /* In some situations (if the vecN is involved in a phi-web), we can end 84 * up with a mov from a register to itself. Some of those channels may end 85 * up doing nothing and there's no reason to have them as part of the mov. 86 */ 87 if (src_matches_dest_reg(&mov->dest.dest, &mov->src[0].src) && 88 !mov->src[0].abs && !mov->src[0].negate) { 89 for (unsigned i = 0; i < 4; i++) { 90 if (mov->src[0].swizzle[i] == i) { 91 mov->dest.write_mask &= ~(1 << i); 92 } 93 } 94 } 95 96 /* Only emit the instruction if it actually does something */ 97 if (mov->dest.write_mask) { 98 nir_instr_insert_before(&vec->instr, &mov->instr); 99 } else { 100 ralloc_free(mov); 101 } 102 103 return channels_handled; 104 } 105 106 static bool 107 has_replicated_dest(nir_alu_instr *alu) 108 { 109 return alu->op == nir_op_fdot_replicated2 || 110 alu->op == nir_op_fdot_replicated3 || 111 alu->op == nir_op_fdot_replicated4 || 112 alu->op == nir_op_fdph_replicated; 113 } 114 115 /* Attempts to coalesce the "move" from the given source of the vec to the 116 * destination of the instruction generating the value. If, for whatever 117 * reason, we cannot coalesce the mmove, it does nothing and returns 0. We 118 * can then call insert_mov as normal. 119 */ 120 static unsigned 121 try_coalesce(nir_alu_instr *vec, unsigned start_idx) 122 { 123 assert(start_idx < nir_op_infos[vec->op].num_inputs); 124 125 /* We will only even try if the source is SSA */ 126 if (!vec->src[start_idx].src.is_ssa) 127 return 0; 128 129 assert(vec->src[start_idx].src.ssa); 130 131 /* If we are going to do a reswizzle, then the vecN operation must be the 132 * only use of the source value. We also can't have any source modifiers. 133 */ 134 nir_foreach_use(src, vec->src[start_idx].src.ssa) { 135 if (src->parent_instr != &vec->instr) 136 return 0; 137 138 nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src); 139 if (alu_src->abs || alu_src->negate) 140 return 0; 141 } 142 143 if (!list_empty(&vec->src[start_idx].src.ssa->if_uses)) 144 return 0; 145 146 if (vec->src[start_idx].src.ssa->parent_instr->type != nir_instr_type_alu) 147 return 0; 148 149 nir_alu_instr *src_alu = 150 nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr); 151 152 if (has_replicated_dest(src_alu)) { 153 /* The fdot instruction is special: It replicates its result to all 154 * components. This means that we can always rewrite its destination 155 * and we don't need to swizzle anything. 156 */ 157 } else { 158 /* We only care about being able to re-swizzle the instruction if it is 159 * something that we can reswizzle. It must be per-component. The one 160 * exception to this is the fdotN instructions which implicitly splat 161 * their result out to all channels. 162 */ 163 if (nir_op_infos[src_alu->op].output_size != 0) 164 return 0; 165 166 /* If we are going to reswizzle the instruction, we can't have any 167 * non-per-component sources either. 168 */ 169 for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++) 170 if (nir_op_infos[src_alu->op].input_sizes[j] != 0) 171 return 0; 172 } 173 174 /* Stash off all of the ALU instruction's swizzles. */ 175 uint8_t swizzles[4][4]; 176 for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++) 177 for (unsigned i = 0; i < 4; i++) 178 swizzles[j][i] = src_alu->src[j].swizzle[i]; 179 180 unsigned write_mask = 0; 181 for (unsigned i = start_idx; i < 4; i++) { 182 if (!(vec->dest.write_mask & (1 << i))) 183 continue; 184 185 if (!vec->src[i].src.is_ssa || 186 vec->src[i].src.ssa != &src_alu->dest.dest.ssa) 187 continue; 188 189 /* At this point, the give vec source matchese up with the ALU 190 * instruction so we can re-swizzle that component to match. 191 */ 192 write_mask |= 1 << i; 193 if (has_replicated_dest(src_alu)) { 194 /* Since the destination is a single replicated value, we don't need 195 * to do any reswizzling 196 */ 197 } else { 198 for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++) 199 src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]]; 200 } 201 202 /* Clear the no longer needed vec source */ 203 nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT); 204 } 205 206 nir_instr_rewrite_dest(&src_alu->instr, &src_alu->dest.dest, vec->dest.dest); 207 src_alu->dest.write_mask = write_mask; 208 209 return write_mask; 210 } 211 212 static bool 213 lower_vec_to_movs_block(nir_block *block, nir_function_impl *impl) 214 { 215 bool progress = false; 216 nir_shader *shader = impl->function->shader; 217 218 nir_foreach_instr_safe(instr, block) { 219 if (instr->type != nir_instr_type_alu) 220 continue; 221 222 nir_alu_instr *vec = nir_instr_as_alu(instr); 223 224 switch (vec->op) { 225 case nir_op_vec2: 226 case nir_op_vec3: 227 case nir_op_vec4: 228 break; 229 default: 230 continue; /* The loop */ 231 } 232 233 if (vec->dest.dest.is_ssa) { 234 /* Since we insert multiple MOVs, we have a register destination. */ 235 nir_register *reg = nir_local_reg_create(impl); 236 reg->num_components = vec->dest.dest.ssa.num_components; 237 reg->bit_size = vec->dest.dest.ssa.bit_size; 238 239 nir_ssa_def_rewrite_uses(&vec->dest.dest.ssa, nir_src_for_reg(reg)); 240 241 nir_instr_rewrite_dest(&vec->instr, &vec->dest.dest, 242 nir_dest_for_reg(reg)); 243 } 244 245 unsigned finished_write_mask = 0; 246 247 /* First, emit a MOV for all the src channels that are in the 248 * destination reg, in case other values we're populating in the dest 249 * might overwrite them. 250 */ 251 for (unsigned i = 0; i < 4; i++) { 252 if (!(vec->dest.write_mask & (1 << i))) 253 continue; 254 255 if (src_matches_dest_reg(&vec->dest.dest, &vec->src[i].src)) { 256 finished_write_mask |= insert_mov(vec, i, shader); 257 break; 258 } 259 } 260 261 /* Now, emit MOVs for all the other src channels. */ 262 for (unsigned i = 0; i < 4; i++) { 263 if (!(vec->dest.write_mask & (1 << i))) 264 continue; 265 266 if (!(finished_write_mask & (1 << i))) 267 finished_write_mask |= try_coalesce(vec, i); 268 269 if (!(finished_write_mask & (1 << i))) 270 finished_write_mask |= insert_mov(vec, i, shader); 271 } 272 273 nir_instr_remove(&vec->instr); 274 ralloc_free(vec); 275 progress = true; 276 } 277 278 return progress; 279 } 280 281 static bool 282 nir_lower_vec_to_movs_impl(nir_function_impl *impl) 283 { 284 bool progress = false; 285 286 nir_foreach_block(block, impl) { 287 progress |= lower_vec_to_movs_block(block, impl); 288 } 289 290 if (progress) { 291 nir_metadata_preserve(impl, nir_metadata_block_index | 292 nir_metadata_dominance); 293 } 294 295 return progress; 296 } 297 298 bool 299 nir_lower_vec_to_movs(nir_shader *shader) 300 { 301 bool progress = false; 302 303 nir_foreach_function(function, shader) { 304 if (function->impl) 305 progress = nir_lower_vec_to_movs_impl(function->impl) || progress; 306 } 307 308 return progress; 309 } 310