1 /* 2 * Copyright 2012 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24 #include "blorp_nir_builder.h" 25 26 #include "blorp_priv.h" 27 28 /* header-only include needed for _mesa_unorm_to_float and friends. 
*/ 29 #include "mesa/main/format_utils.h" 30 31 #define FILE_DEBUG_FLAG DEBUG_BLORP 32 33 static const bool split_blorp_blit_debug = false; 34 35 /** 36 * Enum to specify the order of arguments in a sampler message 37 */ 38 enum sampler_message_arg 39 { 40 SAMPLER_MESSAGE_ARG_U_FLOAT, 41 SAMPLER_MESSAGE_ARG_V_FLOAT, 42 SAMPLER_MESSAGE_ARG_U_INT, 43 SAMPLER_MESSAGE_ARG_V_INT, 44 SAMPLER_MESSAGE_ARG_R_INT, 45 SAMPLER_MESSAGE_ARG_SI_INT, 46 SAMPLER_MESSAGE_ARG_MCS_INT, 47 SAMPLER_MESSAGE_ARG_ZERO_INT, 48 }; 49 50 struct brw_blorp_blit_vars { 51 /* Input values from brw_blorp_wm_inputs */ 52 nir_variable *v_discard_rect; 53 nir_variable *v_rect_grid; 54 nir_variable *v_coord_transform; 55 nir_variable *v_src_z; 56 nir_variable *v_src_offset; 57 nir_variable *v_dst_offset; 58 nir_variable *v_src_inv_size; 59 60 /* gl_FragCoord */ 61 nir_variable *frag_coord; 62 63 /* gl_FragColor */ 64 nir_variable *color_out; 65 }; 66 67 static void 68 brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v, 69 const struct brw_blorp_blit_prog_key *key) 70 { 71 /* Blended and scaled blits never use pixel discard. 
*/ 72 assert(!key->use_kill || !(key->blend && key->blit_scaled)); 73 74 #define LOAD_INPUT(name, type)\ 75 v->v_##name = BLORP_CREATE_NIR_INPUT(b->shader, name, type); 76 77 LOAD_INPUT(discard_rect, glsl_vec4_type()) 78 LOAD_INPUT(rect_grid, glsl_vec4_type()) 79 LOAD_INPUT(coord_transform, glsl_vec4_type()) 80 LOAD_INPUT(src_z, glsl_uint_type()) 81 LOAD_INPUT(src_offset, glsl_vector_type(GLSL_TYPE_UINT, 2)) 82 LOAD_INPUT(dst_offset, glsl_vector_type(GLSL_TYPE_UINT, 2)) 83 LOAD_INPUT(src_inv_size, glsl_vector_type(GLSL_TYPE_FLOAT, 2)) 84 85 #undef LOAD_INPUT 86 87 v->frag_coord = nir_variable_create(b->shader, nir_var_shader_in, 88 glsl_vec4_type(), "gl_FragCoord"); 89 v->frag_coord->data.location = VARYING_SLOT_POS; 90 v->frag_coord->data.origin_upper_left = true; 91 92 v->color_out = nir_variable_create(b->shader, nir_var_shader_out, 93 glsl_vec4_type(), "gl_FragColor"); 94 v->color_out->data.location = FRAG_RESULT_COLOR; 95 } 96 97 static nir_ssa_def * 98 blorp_blit_get_frag_coords(nir_builder *b, 99 const struct brw_blorp_blit_prog_key *key, 100 struct brw_blorp_blit_vars *v) 101 { 102 nir_ssa_def *coord = nir_f2i32(b, nir_load_var(b, v->frag_coord)); 103 104 /* Account for destination surface intratile offset 105 * 106 * Transformation parameters giving translation from destination to source 107 * coordinates don't take into account possible intra-tile destination 108 * offset. Therefore it has to be first subtracted from the incoming 109 * coordinates. Vertices are set up based on coordinates containing the 110 * intra-tile offset. 
111 */ 112 if (key->need_dst_offset) 113 coord = nir_isub(b, coord, nir_load_var(b, v->v_dst_offset)); 114 115 if (key->persample_msaa_dispatch) { 116 return nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1), 117 nir_load_sample_id(b)); 118 } else { 119 return nir_vec2(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1)); 120 } 121 } 122 123 /** 124 * Emit code to translate from destination (X, Y) coordinates to source (X, Y) 125 * coordinates. 126 */ 127 static nir_ssa_def * 128 blorp_blit_apply_transform(nir_builder *b, nir_ssa_def *src_pos, 129 struct brw_blorp_blit_vars *v) 130 { 131 nir_ssa_def *coord_transform = nir_load_var(b, v->v_coord_transform); 132 133 nir_ssa_def *offset = nir_vec2(b, nir_channel(b, coord_transform, 1), 134 nir_channel(b, coord_transform, 3)); 135 nir_ssa_def *mul = nir_vec2(b, nir_channel(b, coord_transform, 0), 136 nir_channel(b, coord_transform, 2)); 137 138 return nir_fadd(b, nir_fmul(b, src_pos, mul), offset); 139 } 140 141 static inline void 142 blorp_nir_discard_if_outside_rect(nir_builder *b, nir_ssa_def *pos, 143 struct brw_blorp_blit_vars *v) 144 { 145 nir_ssa_def *c0, *c1, *c2, *c3; 146 nir_ssa_def *discard_rect = nir_load_var(b, v->v_discard_rect); 147 nir_ssa_def *dst_x0 = nir_channel(b, discard_rect, 0); 148 nir_ssa_def *dst_x1 = nir_channel(b, discard_rect, 1); 149 nir_ssa_def *dst_y0 = nir_channel(b, discard_rect, 2); 150 nir_ssa_def *dst_y1 = nir_channel(b, discard_rect, 3); 151 152 c0 = nir_ult(b, nir_channel(b, pos, 0), dst_x0); 153 c1 = nir_uge(b, nir_channel(b, pos, 0), dst_x1); 154 c2 = nir_ult(b, nir_channel(b, pos, 1), dst_y0); 155 c3 = nir_uge(b, nir_channel(b, pos, 1), dst_y1); 156 157 nir_ssa_def *oob = nir_ior(b, nir_ior(b, c0, c1), nir_ior(b, c2, c3)); 158 159 nir_intrinsic_instr *discard = 160 nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if); 161 discard->src[0] = nir_src_for_ssa(oob); 162 nir_builder_instr_insert(b, &discard->instr); 163 } 164 165 static nir_tex_instr * 166 
blorp_create_nir_tex_instr(nir_builder *b, struct brw_blorp_blit_vars *v, 167 nir_texop op, nir_ssa_def *pos, unsigned num_srcs, 168 nir_alu_type dst_type) 169 { 170 nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs); 171 172 tex->op = op; 173 174 tex->dest_type = dst_type; 175 tex->is_array = false; 176 tex->is_shadow = false; 177 178 /* Blorp only has one texture and it's bound at unit 0 */ 179 tex->texture = NULL; 180 tex->sampler = NULL; 181 tex->texture_index = 0; 182 tex->sampler_index = 0; 183 184 /* To properly handle 3-D and 2-D array textures, we pull the Z component 185 * from an input. TODO: This is a bit magic; we should probably make this 186 * more explicit in the future. 187 */ 188 assert(pos->num_components >= 2); 189 pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1), 190 nir_load_var(b, v->v_src_z)); 191 192 tex->src[0].src_type = nir_tex_src_coord; 193 tex->src[0].src = nir_src_for_ssa(pos); 194 tex->coord_components = 3; 195 196 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); 197 198 return tex; 199 } 200 201 static nir_ssa_def * 202 blorp_nir_tex(nir_builder *b, struct brw_blorp_blit_vars *v, 203 const struct brw_blorp_blit_prog_key *key, nir_ssa_def *pos) 204 { 205 if (key->need_src_offset) 206 pos = nir_fadd(b, pos, nir_i2f32(b, nir_load_var(b, v->v_src_offset))); 207 208 /* If the sampler requires normalized coordinates, we need to compensate. 
*/ 209 if (key->src_coords_normalized) 210 pos = nir_fmul(b, pos, nir_load_var(b, v->v_src_inv_size)); 211 212 nir_tex_instr *tex = 213 blorp_create_nir_tex_instr(b, v, nir_texop_tex, pos, 2, 214 key->texture_data_type); 215 216 assert(pos->num_components == 2); 217 tex->sampler_dim = GLSL_SAMPLER_DIM_2D; 218 tex->src[1].src_type = nir_tex_src_lod; 219 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); 220 221 nir_builder_instr_insert(b, &tex->instr); 222 223 return &tex->dest.ssa; 224 } 225 226 static nir_ssa_def * 227 blorp_nir_txf(nir_builder *b, struct brw_blorp_blit_vars *v, 228 nir_ssa_def *pos, nir_alu_type dst_type) 229 { 230 nir_tex_instr *tex = 231 blorp_create_nir_tex_instr(b, v, nir_texop_txf, pos, 2, dst_type); 232 233 tex->sampler_dim = GLSL_SAMPLER_DIM_3D; 234 tex->src[1].src_type = nir_tex_src_lod; 235 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); 236 237 nir_builder_instr_insert(b, &tex->instr); 238 239 return &tex->dest.ssa; 240 } 241 242 static nir_ssa_def * 243 blorp_nir_txf_ms(nir_builder *b, struct brw_blorp_blit_vars *v, 244 nir_ssa_def *pos, nir_ssa_def *mcs, nir_alu_type dst_type) 245 { 246 nir_tex_instr *tex = 247 blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms, pos, 248 mcs != NULL ? 
3 : 2, dst_type); 249 250 tex->sampler_dim = GLSL_SAMPLER_DIM_MS; 251 252 tex->src[1].src_type = nir_tex_src_ms_index; 253 if (pos->num_components == 2) { 254 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); 255 } else { 256 assert(pos->num_components == 3); 257 tex->src[1].src = nir_src_for_ssa(nir_channel(b, pos, 2)); 258 } 259 260 if (mcs) { 261 tex->src[2].src_type = nir_tex_src_ms_mcs; 262 tex->src[2].src = nir_src_for_ssa(mcs); 263 } 264 265 nir_builder_instr_insert(b, &tex->instr); 266 267 return &tex->dest.ssa; 268 } 269 270 static nir_ssa_def * 271 blorp_blit_txf_ms_mcs(nir_builder *b, struct brw_blorp_blit_vars *v, 272 nir_ssa_def *pos) 273 { 274 nir_tex_instr *tex = 275 blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms_mcs, 276 pos, 1, nir_type_int); 277 278 tex->sampler_dim = GLSL_SAMPLER_DIM_MS; 279 280 nir_builder_instr_insert(b, &tex->instr); 281 282 return &tex->dest.ssa; 283 } 284 285 static nir_ssa_def * 286 nir_mask_shift_or(struct nir_builder *b, nir_ssa_def *dst, nir_ssa_def *src, 287 uint32_t src_mask, int src_left_shift) 288 { 289 nir_ssa_def *masked = nir_iand(b, src, nir_imm_int(b, src_mask)); 290 291 nir_ssa_def *shifted; 292 if (src_left_shift > 0) { 293 shifted = nir_ishl(b, masked, nir_imm_int(b, src_left_shift)); 294 } else if (src_left_shift < 0) { 295 shifted = nir_ushr(b, masked, nir_imm_int(b, -src_left_shift)); 296 } else { 297 assert(src_left_shift == 0); 298 shifted = masked; 299 } 300 301 return nir_ior(b, dst, shifted); 302 } 303 304 /** 305 * Emit code to compensate for the difference between Y and W tiling. 306 * 307 * This code modifies the X and Y coordinates according to the formula: 308 * 309 * (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S)) 310 * 311 * (See brw_blorp_build_nir_shader). 
312 */ 313 static inline nir_ssa_def * 314 blorp_nir_retile_y_to_w(nir_builder *b, nir_ssa_def *pos) 315 { 316 assert(pos->num_components == 2); 317 nir_ssa_def *x_Y = nir_channel(b, pos, 0); 318 nir_ssa_def *y_Y = nir_channel(b, pos, 1); 319 320 /* Given X and Y coordinates that describe an address using Y tiling, 321 * translate to the X and Y coordinates that describe the same address 322 * using W tiling. 323 * 324 * If we break down the low order bits of X and Y, using a 325 * single letter to represent each low-order bit: 326 * 327 * X = A << 7 | 0bBCDEFGH 328 * Y = J << 5 | 0bKLMNP (1) 329 * 330 * Then we can apply the Y tiling formula to see the memory offset being 331 * addressed: 332 * 333 * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2) 334 * 335 * If we apply the W detiling formula to this memory location, that the 336 * corresponding X' and Y' coordinates are: 337 * 338 * X' = A << 6 | 0bBCDPFH (3) 339 * Y' = J << 6 | 0bKLMNEG 340 * 341 * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'), 342 * we need to make the following computation: 343 * 344 * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4) 345 * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1 346 */ 347 nir_ssa_def *x_W = nir_imm_int(b, 0); 348 x_W = nir_mask_shift_or(b, x_W, x_Y, 0xfffffff4, -1); 349 x_W = nir_mask_shift_or(b, x_W, y_Y, 0x1, 2); 350 x_W = nir_mask_shift_or(b, x_W, x_Y, 0x1, 0); 351 352 nir_ssa_def *y_W = nir_imm_int(b, 0); 353 y_W = nir_mask_shift_or(b, y_W, y_Y, 0xfffffffe, 1); 354 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x8, -2); 355 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x2, -1); 356 357 return nir_vec2(b, x_W, y_W); 358 } 359 360 /** 361 * Emit code to compensate for the difference between Y and W tiling. 362 * 363 * This code modifies the X and Y coordinates according to the formula: 364 * 365 * (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S)) 366 * 367 * (See brw_blorp_build_nir_shader). 
368 */ 369 static inline nir_ssa_def * 370 blorp_nir_retile_w_to_y(nir_builder *b, nir_ssa_def *pos) 371 { 372 assert(pos->num_components == 2); 373 nir_ssa_def *x_W = nir_channel(b, pos, 0); 374 nir_ssa_def *y_W = nir_channel(b, pos, 1); 375 376 /* Applying the same logic as above, but in reverse, we obtain the 377 * formulas: 378 * 379 * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1 380 * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2 381 */ 382 nir_ssa_def *x_Y = nir_imm_int(b, 0); 383 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0xfffffffa, 1); 384 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x2, 2); 385 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x1, 1); 386 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0x1, 0); 387 388 nir_ssa_def *y_Y = nir_imm_int(b, 0); 389 y_Y = nir_mask_shift_or(b, y_Y, y_W, 0xfffffffc, -1); 390 y_Y = nir_mask_shift_or(b, y_Y, x_W, 0x4, -2); 391 392 return nir_vec2(b, x_Y, y_Y); 393 } 394 395 /** 396 * Emit code to compensate for the difference between MSAA and non-MSAA 397 * surfaces. 398 * 399 * This code modifies the X and Y coordinates according to the formula: 400 * 401 * (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S) 402 * 403 * (See brw_blorp_blit_program). 404 */ 405 static inline nir_ssa_def * 406 blorp_nir_encode_msaa(nir_builder *b, nir_ssa_def *pos, 407 unsigned num_samples, enum isl_msaa_layout layout) 408 { 409 assert(pos->num_components == 2 || pos->num_components == 3); 410 411 switch (layout) { 412 case ISL_MSAA_LAYOUT_NONE: 413 assert(pos->num_components == 2); 414 return pos; 415 case ISL_MSAA_LAYOUT_ARRAY: 416 /* No translation needed */ 417 return pos; 418 case ISL_MSAA_LAYOUT_INTERLEAVED: { 419 nir_ssa_def *x_in = nir_channel(b, pos, 0); 420 nir_ssa_def *y_in = nir_channel(b, pos, 1); 421 nir_ssa_def *s_in = pos->num_components == 2 ? 
nir_imm_int(b, 0) : 422 nir_channel(b, pos, 2); 423 424 nir_ssa_def *x_out = nir_imm_int(b, 0); 425 nir_ssa_def *y_out = nir_imm_int(b, 0); 426 switch (num_samples) { 427 case 2: 428 case 4: 429 /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0) 430 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) 431 * Y' = Y 432 * 433 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0) 434 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) 435 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1) 436 */ 437 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 1); 438 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); 439 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 440 if (num_samples == 2) { 441 y_out = y_in; 442 } else { 443 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1); 444 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); 445 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 446 } 447 break; 448 449 case 8: 450 /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0) 451 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 452 * | (X & 0b1) 453 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1) 454 */ 455 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2); 456 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0); 457 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); 458 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 459 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1); 460 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); 461 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 462 break; 463 464 case 16: 465 /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0) 466 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 467 * | (X & 0b1) 468 * Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 (S & 0b10) 469 * | (Y & 0b1) 470 */ 471 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2); 472 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0); 473 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); 474 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 475 y_out 
= nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 2); 476 y_out = nir_mask_shift_or(b, y_out, s_in, 0x8, -1); 477 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); 478 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 479 break; 480 481 default: 482 unreachable("Invalid number of samples for IMS layout"); 483 } 484 485 return nir_vec2(b, x_out, y_out); 486 } 487 488 default: 489 unreachable("Invalid MSAA layout"); 490 } 491 } 492 493 /** 494 * Emit code to compensate for the difference between MSAA and non-MSAA 495 * surfaces. 496 * 497 * This code modifies the X and Y coordinates according to the formula: 498 * 499 * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S) 500 * 501 * (See brw_blorp_blit_program). 502 */ 503 static inline nir_ssa_def * 504 blorp_nir_decode_msaa(nir_builder *b, nir_ssa_def *pos, 505 unsigned num_samples, enum isl_msaa_layout layout) 506 { 507 assert(pos->num_components == 2 || pos->num_components == 3); 508 509 switch (layout) { 510 case ISL_MSAA_LAYOUT_NONE: 511 /* No translation necessary, and S should already be zero. */ 512 assert(pos->num_components == 2); 513 return pos; 514 case ISL_MSAA_LAYOUT_ARRAY: 515 /* No translation necessary. 
*/ 516 return pos; 517 case ISL_MSAA_LAYOUT_INTERLEAVED: { 518 assert(pos->num_components == 2); 519 520 nir_ssa_def *x_in = nir_channel(b, pos, 0); 521 nir_ssa_def *y_in = nir_channel(b, pos, 1); 522 523 nir_ssa_def *x_out = nir_imm_int(b, 0); 524 nir_ssa_def *y_out = nir_imm_int(b, 0); 525 nir_ssa_def *s_out = nir_imm_int(b, 0); 526 switch (num_samples) { 527 case 2: 528 case 4: 529 /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S) 530 * where X' = (X & ~0b11) >> 1 | (X & 0b1) 531 * S = (X & 0b10) >> 1 532 * 533 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S) 534 * where X' = (X & ~0b11) >> 1 | (X & 0b1) 535 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) 536 * S = (Y & 0b10) | (X & 0b10) >> 1 537 */ 538 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffc, -1); 539 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 540 if (num_samples == 2) { 541 y_out = y_in; 542 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 543 } else { 544 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1); 545 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 546 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 547 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); 548 } 549 break; 550 551 case 8: 552 /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S) 553 * where X' = (X & ~0b111) >> 2 | (X & 0b1) 554 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) 555 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1 556 */ 557 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2); 558 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 559 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1); 560 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 561 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0); 562 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); 563 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 564 break; 565 566 case 16: 567 /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S) 568 * where X' = (X & ~0b111) >> 2 | (X & 0b1) 569 * Y' = (Y & ~0b111) >> 2 | (Y & 0b1) 570 * S = (Y & 0b100) << 1 | (X & 0b100) | 
571 * (Y & 0b10) | (X & 0b10) >> 1 572 */ 573 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2); 574 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 575 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffff8, -2); 576 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 577 s_out = nir_mask_shift_or(b, s_out, y_in, 0x4, 1); 578 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0); 579 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); 580 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 581 break; 582 583 default: 584 unreachable("Invalid number of samples for IMS layout"); 585 } 586 587 return nir_vec3(b, x_out, y_out, s_out); 588 } 589 590 default: 591 unreachable("Invalid MSAA layout"); 592 } 593 } 594 595 /** 596 * Count the number of trailing 1 bits in the given value. For example: 597 * 598 * count_trailing_one_bits(0) == 0 599 * count_trailing_one_bits(7) == 3 600 * count_trailing_one_bits(11) == 2 601 */ 602 static inline int count_trailing_one_bits(unsigned value) 603 { 604 #ifdef HAVE___BUILTIN_CTZ 605 return __builtin_ctz(~value); 606 #else 607 return _mesa_bitcount(value & ~(value + 1)); 608 #endif 609 } 610 611 static nir_ssa_def * 612 blorp_nir_manual_blend_average(nir_builder *b, struct brw_blorp_blit_vars *v, 613 nir_ssa_def *pos, unsigned tex_samples, 614 enum isl_aux_usage tex_aux_usage, 615 nir_alu_type dst_type) 616 { 617 /* If non-null, this is the outer-most if statement */ 618 nir_if *outer_if = NULL; 619 620 nir_variable *color = 621 nir_local_variable_create(b->impl, glsl_vec4_type(), "color"); 622 623 nir_ssa_def *mcs = NULL; 624 if (tex_aux_usage == ISL_AUX_USAGE_MCS) 625 mcs = blorp_blit_txf_ms_mcs(b, v, pos); 626 627 /* We add together samples using a binary tree structure, e.g. 
for 4x MSAA: 628 * 629 * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4 630 * 631 * This ensures that when all samples have the same value, no numerical 632 * precision is lost, since each addition operation always adds two equal 633 * values, and summing two equal floating point values does not lose 634 * precision. 635 * 636 * We perform this computation by treating the texture_data array as a 637 * stack and performing the following operations: 638 * 639 * - push sample 0 onto stack 640 * - push sample 1 onto stack 641 * - add top two stack entries 642 * - push sample 2 onto stack 643 * - push sample 3 onto stack 644 * - add top two stack entries 645 * - add top two stack entries 646 * - divide top stack entry by 4 647 * 648 * Note that after pushing sample i onto the stack, the number of add 649 * operations we do is equal to the number of trailing 1 bits in i. This 650 * works provided the total number of samples is a power of two, which it 651 * always is for i965. 652 * 653 * For integer formats, we replace the add operations with average 654 * operations and skip the final division. 
655 */ 656 nir_ssa_def *texture_data[5]; 657 unsigned stack_depth = 0; 658 for (unsigned i = 0; i < tex_samples; ++i) { 659 assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */ 660 661 /* Push sample i onto the stack */ 662 assert(stack_depth < ARRAY_SIZE(texture_data)); 663 664 nir_ssa_def *ms_pos = nir_vec3(b, nir_channel(b, pos, 0), 665 nir_channel(b, pos, 1), 666 nir_imm_int(b, i)); 667 texture_data[stack_depth++] = blorp_nir_txf_ms(b, v, ms_pos, mcs, dst_type); 668 669 if (i == 0 && tex_aux_usage == ISL_AUX_USAGE_MCS) { 670 /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface) 671 * suggests an optimization: 672 * 673 * "A simple optimization with probable large return in 674 * performance is to compare the MCS value to zero (indicating 675 * all samples are on sample slice 0), and sample only from 676 * sample slice 0 using ld2dss if MCS is zero." 677 * 678 * Note that in the case where the MCS value is zero, sampling from 679 * sample slice 0 using ld2dss and sampling from sample 0 using 680 * ld2dms are equivalent (since all samples are on sample slice 0). 681 * Since we have already sampled from sample 0, all we need to do is 682 * skip the remaining fetches and averaging if MCS is zero. 683 * 684 * It's also trivial to detect when the MCS has the magic clear color 685 * value. In this case, the txf we did on sample 0 will return the 686 * clear color and we can skip the remaining fetches just like we do 687 * when MCS == 0. 
688 */ 689 nir_ssa_def *mcs_zero = 690 nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0)); 691 if (tex_samples == 16) { 692 mcs_zero = nir_iand(b, mcs_zero, 693 nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b, 0))); 694 } 695 nir_ssa_def *mcs_clear = 696 blorp_nir_mcs_is_clear_color(b, mcs, tex_samples); 697 698 nir_if *if_stmt = nir_if_create(b->shader); 699 if_stmt->condition = nir_src_for_ssa(nir_ior(b, mcs_zero, mcs_clear)); 700 nir_cf_node_insert(b->cursor, &if_stmt->cf_node); 701 702 b->cursor = nir_after_cf_list(&if_stmt->then_list); 703 nir_store_var(b, color, texture_data[0], 0xf); 704 705 b->cursor = nir_after_cf_list(&if_stmt->else_list); 706 outer_if = if_stmt; 707 } 708 709 for (int j = 0; j < count_trailing_one_bits(i); j++) { 710 assert(stack_depth >= 2); 711 --stack_depth; 712 713 assert(dst_type == nir_type_float); 714 texture_data[stack_depth - 1] = 715 nir_fadd(b, texture_data[stack_depth - 1], 716 texture_data[stack_depth]); 717 } 718 } 719 720 /* We should have just 1 sample on the stack now. 
*/ 721 assert(stack_depth == 1); 722 723 texture_data[0] = nir_fmul(b, texture_data[0], 724 nir_imm_float(b, 1.0 / tex_samples)); 725 726 nir_store_var(b, color, texture_data[0], 0xf); 727 728 if (outer_if) 729 b->cursor = nir_after_cf_node(&outer_if->cf_node); 730 731 return nir_load_var(b, color); 732 } 733 734 static inline nir_ssa_def * 735 nir_imm_vec2(nir_builder *build, float x, float y) 736 { 737 nir_const_value v; 738 739 memset(&v, 0, sizeof(v)); 740 v.f32[0] = x; 741 v.f32[1] = y; 742 743 return nir_build_imm(build, 4, 32, v); 744 } 745 746 static nir_ssa_def * 747 blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos, 748 unsigned tex_samples, 749 const struct brw_blorp_blit_prog_key *key, 750 struct brw_blorp_blit_vars *v) 751 { 752 nir_ssa_def *pos_xy = nir_channels(b, pos, 0x3); 753 nir_ssa_def *rect_grid = nir_load_var(b, v->v_rect_grid); 754 nir_ssa_def *scale = nir_imm_vec2(b, key->x_scale, key->y_scale); 755 756 /* Translate coordinates to lay out the samples in a rectangular grid 757 * roughly corresponding to sample locations. 758 */ 759 pos_xy = nir_fmul(b, pos_xy, scale); 760 /* Adjust coordinates so that integers represent pixel centers rather 761 * than pixel edges. 762 */ 763 pos_xy = nir_fadd(b, pos_xy, nir_imm_float(b, -0.5)); 764 /* Clamp the X, Y texture coordinates to properly handle the sampling of 765 * texels on texture edges. 766 */ 767 pos_xy = nir_fmin(b, nir_fmax(b, pos_xy, nir_imm_float(b, 0.0)), 768 nir_vec2(b, nir_channel(b, rect_grid, 0), 769 nir_channel(b, rect_grid, 1))); 770 771 /* Store the fractional parts to be used as bilinear interpolation 772 * coefficients. 
773 */ 774 nir_ssa_def *frac_xy = nir_ffract(b, pos_xy); 775 /* Round the float coordinates down to nearest integer */ 776 pos_xy = nir_fdiv(b, nir_ftrunc(b, pos_xy), scale); 777 778 nir_ssa_def *tex_data[4]; 779 for (unsigned i = 0; i < 4; ++i) { 780 float sample_off_x = (float)(i & 0x1) / key->x_scale; 781 float sample_off_y = (float)((i >> 1) & 0x1) / key->y_scale; 782 nir_ssa_def *sample_off = nir_imm_vec2(b, sample_off_x, sample_off_y); 783 784 nir_ssa_def *sample_coords = nir_fadd(b, pos_xy, sample_off); 785 nir_ssa_def *sample_coords_int = nir_f2i32(b, sample_coords); 786 787 /* The MCS value we fetch has to match up with the pixel that we're 788 * sampling from. Since we sample from different pixels in each 789 * iteration of this "for" loop, the call to mcs_fetch() should be 790 * here inside the loop after computing the pixel coordinates. 791 */ 792 nir_ssa_def *mcs = NULL; 793 if (key->tex_aux_usage == ISL_AUX_USAGE_MCS) 794 mcs = blorp_blit_txf_ms_mcs(b, v, sample_coords_int); 795 796 /* Compute sample index and map the sample index to a sample number. 797 * Sample index layout shows the numbering of slots in a rectangular 798 * grid of samples with in a pixel. Sample number layout shows the 799 * rectangular grid of samples roughly corresponding to the real sample 800 * locations with in a pixel. 801 * In case of 4x MSAA, layout of sample indices matches the layout of 802 * sample numbers: 803 * --------- 804 * | 0 | 1 | 805 * --------- 806 * | 2 | 3 | 807 * --------- 808 * 809 * In case of 8x MSAA the two layouts don't match. 
810 * sample index layout : --------- sample number layout : --------- 811 * | 0 | 1 | | 3 | 7 | 812 * --------- --------- 813 * | 2 | 3 | | 5 | 0 | 814 * --------- --------- 815 * | 4 | 5 | | 1 | 2 | 816 * --------- --------- 817 * | 6 | 7 | | 4 | 6 | 818 * --------- --------- 819 * 820 * Fortunately, this can be done fairly easily as: 821 * S' = (0x17306425 >> (S * 4)) & 0xf 822 * 823 * In the case of 16x MSAA the two layouts don't match. 824 * Sample index layout: Sample number layout: 825 * --------------------- --------------------- 826 * | 0 | 1 | 2 | 3 | | 15 | 10 | 9 | 7 | 827 * --------------------- --------------------- 828 * | 4 | 5 | 6 | 7 | | 4 | 1 | 3 | 13 | 829 * --------------------- --------------------- 830 * | 8 | 9 | 10 | 11 | | 12 | 2 | 0 | 6 | 831 * --------------------- --------------------- 832 * | 12 | 13 | 14 | 15 | | 11 | 8 | 5 | 14 | 833 * --------------------- --------------------- 834 * 835 * This is equivalent to 836 * S' = (0xe58b602cd31479af >> (S * 4)) & 0xf 837 */ 838 nir_ssa_def *frac = nir_ffract(b, sample_coords); 839 nir_ssa_def *sample = 840 nir_fdot2(b, frac, nir_imm_vec2(b, key->x_scale, 841 key->x_scale * key->y_scale)); 842 sample = nir_f2i32(b, sample); 843 844 if (tex_samples == 8) { 845 sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x64210573), 846 nir_ishl(b, sample, nir_imm_int(b, 2))), 847 nir_imm_int(b, 0xf)); 848 } else if (tex_samples == 16) { 849 nir_ssa_def *sample_low = 850 nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xd31479af), 851 nir_ishl(b, sample, nir_imm_int(b, 2))), 852 nir_imm_int(b, 0xf)); 853 nir_ssa_def *sample_high = 854 nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xe58b602c), 855 nir_ishl(b, nir_iadd(b, sample, 856 nir_imm_int(b, -8)), 857 nir_imm_int(b, 2))), 858 nir_imm_int(b, 0xf)); 859 860 sample = nir_bcsel(b, nir_ilt(b, sample, nir_imm_int(b, 8)), 861 sample_low, sample_high); 862 } 863 nir_ssa_def *pos_ms = nir_vec3(b, nir_channel(b, sample_coords_int, 0), 864 nir_channel(b, sample_coords_int, 1), 
865 sample); 866 tex_data[i] = blorp_nir_txf_ms(b, v, pos_ms, mcs, key->texture_data_type); 867 } 868 869 nir_ssa_def *frac_x = nir_channel(b, frac_xy, 0); 870 nir_ssa_def *frac_y = nir_channel(b, frac_xy, 1); 871 return nir_flrp(b, nir_flrp(b, tex_data[0], tex_data[1], frac_x), 872 nir_flrp(b, tex_data[2], tex_data[3], frac_x), 873 frac_y); 874 } 875 876 /** Perform a color bit-cast operation 877 * 878 * For copy operations involving CCS, we may need to use different formats for 879 * the source and destination surfaces. The two formats must both be UINT 880 * formats and must have the same size but may have different bit layouts. 881 * For instance, we may be copying from R8G8B8A8_UINT to R32_UINT or R32_UINT 882 * to R16G16_UINT. This function generates code to shuffle bits around to get 883 * us from one to the other. 884 */ 885 static nir_ssa_def * 886 bit_cast_color(struct nir_builder *b, nir_ssa_def *color, 887 const struct brw_blorp_blit_prog_key *key) 888 { 889 assert(key->texture_data_type == nir_type_uint); 890 891 if (key->dst_bpc > key->src_bpc) { 892 nir_ssa_def *u = nir_ssa_undef(b, 1, 32); 893 nir_ssa_def *dst_chan[2] = { u, u }; 894 unsigned shift = 0; 895 unsigned dst_idx = 0; 896 for (unsigned i = 0; i < 4; i++) { 897 nir_ssa_def *shifted = nir_ishl(b, nir_channel(b, color, i), 898 nir_imm_int(b, shift)); 899 if (shift == 0) { 900 dst_chan[dst_idx] = shifted; 901 } else { 902 dst_chan[dst_idx] = nir_ior(b, dst_chan[dst_idx], shifted); 903 } 904 905 shift += key->src_bpc; 906 if (shift >= key->dst_bpc) { 907 dst_idx++; 908 shift = 0; 909 } 910 } 911 912 return nir_vec4(b, dst_chan[0], dst_chan[1], u, u); 913 } else { 914 assert(key->dst_bpc < key->src_bpc); 915 916 nir_ssa_def *mask = nir_imm_int(b, ~0u >> (32 - key->dst_bpc)); 917 918 nir_ssa_def *dst_chan[4]; 919 unsigned src_idx = 0; 920 unsigned shift = 0; 921 for (unsigned i = 0; i < 4; i++) { 922 dst_chan[i] = nir_iand(b, nir_ushr(b, nir_channel(b, color, src_idx), 923 nir_imm_int(b, 
shift)), 924 mask); 925 shift += key->dst_bpc; 926 if (shift >= key->src_bpc) { 927 src_idx++; 928 shift = 0; 929 } 930 } 931 932 return nir_vec4(b, dst_chan[0], dst_chan[1], dst_chan[2], dst_chan[3]); 933 } 934 } 935 936 /** 937 * Generator for WM programs used in BLORP blits. 938 * 939 * The bulk of the work done by the WM program is to wrap and unwrap the 940 * coordinate transformations used by the hardware to store surfaces in 941 * memory. The hardware transforms a pixel location (X, Y, S) (where S is the 942 * sample index for a multisampled surface) to a memory offset by the 943 * following formulas: 944 * 945 * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S)) 946 * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset)) 947 * 948 * For a single-sampled surface, or for a multisampled surface using 949 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity 950 * function: 951 * 952 * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0) 953 * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0) 954 * encode_msaa(n, UMS, X, Y, S) = (X, Y, S) 955 * decode_msaa(n, UMS, X, Y, S) = (X, Y, S) 956 * 957 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa() 958 * embeds the sample number into bit 1 of the X and Y coordinates: 959 * 960 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0) 961 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) 962 * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1) 963 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S) 964 * where X' = (X & ~0b11) >> 1 | (X & 0b1) 965 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) 966 * S = (Y & 0b10) | (X & 0b10) >> 1 967 * 968 * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa() 969 * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of 970 * the Y coordinate: 971 * 972 * encode_msaa(8, IMS, X, Y, S) = (X', Y', 0) 973 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1) 974 * Y' = (Y & ~0b1) << 1 | 
(S & 0b10) | (Y & 0b1)
 *   decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
 *     where X' = (X & ~0b111) >> 2 | (X & 0b1)
 *           Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
 *           S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
 *
 * For X tiling, tile() combines together the low-order bits of the X and Y
 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
 * bytes wide and 8 rows high:
 *
 *   tile(x_tiled, X, Y, S) = A
 *     where A = tile_num << 12 | offset
 *           tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
 *           offset = (Y' & 0b111) << 9
 *                    | (X' & 0b111111111)
 *           X' = X * cpp
 *           Y' = Y + S * qpitch
 *   detile(x_tiled, A) = (X, Y, S)
 *     where X = X' / cpp
 *           Y = Y' % qpitch
 *           S = Y' / qpitch
 *           Y' = (tile_num / tile_pitch) << 3
 *                | (A & 0b111000000000) >> 9
 *           X' = (tile_num % tile_pitch) << 9
 *                | (A & 0b111111111)
 *
 * (In all tiling formulas, cpp is the number of bytes occupied by a single
 * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
 * to fill the width of the surface, and qpitch is the spacing (in rows)
 * between array slices).
 *
 * For Y tiling, tile() combines together the low-order bits of the X and Y
 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
 * bytes wide and 32 rows high:
 *
 *   tile(y_tiled, X, Y, S) = A
 *     where A = tile_num << 12 | offset
 *           tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
 *           offset = (X' & 0b1110000) << 5
 *                    | (Y' & 0b11111) << 4
 *                    | (X' & 0b1111)
 *           X' = X * cpp
 *           Y' = Y + S * qpitch
 *   detile(y_tiled, A) = (X, Y, S)
 *     where X = X' / cpp
 *           Y = Y' % qpitch
 *           S = Y' / qpitch
 *           Y' = (tile_num / tile_pitch) << 5
 *                | (A & 0b111110000) >> 4
 *           X' = (tile_num % tile_pitch) << 7
 *                | (A & 0b111000000000) >> 5
 *                | (A & 0b1111)
 *
 * For W tiling, tile() combines together the low-order bits of the X and Y
 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
 * bytes wide and 64 rows high (note that W tiling is only used for stencil
 * buffers, which always have cpp = 1 and S=0):
 *
 *   tile(w_tiled, X, Y, S) = A
 *     where A = tile_num << 12 | offset
 *           tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
 *           offset = (X' & 0b111000) << 6
 *                    | (Y' & 0b111100) << 3
 *                    | (X' & 0b100) << 2
 *                    | (Y' & 0b10) << 2
 *                    | (X' & 0b10) << 1
 *                    | (Y' & 0b1) << 1
 *                    | (X' & 0b1)
 *           X' = X * cpp = X
 *           Y' = Y + S * qpitch
 *   detile(w_tiled, A) = (X, Y, S)
 *     where X = X' / cpp = X'
 *           Y = Y' % qpitch = Y'
 *           S = Y' / qpitch = 0
 *           Y' = (tile_num / tile_pitch) << 6
 *                | (A & 0b111100000) >> 3
 *                | (A & 0b1000) >> 2
 *                | (A & 0b10) >> 1
 *           X' = (tile_num % tile_pitch) << 6
 *                | (A & 0b111000000000) >> 6
 *                | (A & 0b10000) >> 2
 *                | (A & 0b100) >> 1
 *                | (A & 0b1)
 *
 * Finally, for a non-tiled surface, tile() simply combines together the X and
 * Y coordinates in the natural way:
 *
 *   tile(untiled, X, Y, S) =
A 1062 * where A = Y * pitch + X' 1063 * X' = X * cpp 1064 * Y' = Y + S * qpitch 1065 * detile(untiled, A) = (X, Y, S) 1066 * where X = X' / cpp 1067 * Y = Y' % qpitch 1068 * S = Y' / qpitch 1069 * X' = A % pitch 1070 * Y' = A / pitch 1071 * 1072 * (In these formulas, pitch is the number of bytes occupied by a single row 1073 * of samples). 1074 */ 1075 static nir_shader * 1076 brw_blorp_build_nir_shader(struct blorp_context *blorp, void *mem_ctx, 1077 const struct brw_blorp_blit_prog_key *key) 1078 { 1079 const struct gen_device_info *devinfo = blorp->isl_dev->info; 1080 nir_ssa_def *src_pos, *dst_pos, *color; 1081 1082 /* Sanity checks */ 1083 if (key->dst_tiled_w && key->rt_samples > 1) { 1084 /* If the destination image is W tiled and multisampled, then the thread 1085 * must be dispatched once per sample, not once per pixel. This is 1086 * necessary because after conversion between W and Y tiling, there's no 1087 * guarantee that all samples corresponding to a single pixel will still 1088 * be together. 1089 */ 1090 assert(key->persample_msaa_dispatch); 1091 } 1092 1093 if (key->blend) { 1094 /* We are blending, which means we won't have an opportunity to 1095 * translate the tiling and sample count for the texture surface. So 1096 * the surface state for the texture must be configured with the correct 1097 * tiling and sample count. 1098 */ 1099 assert(!key->src_tiled_w); 1100 assert(key->tex_samples == key->src_samples); 1101 assert(key->tex_layout == key->src_layout); 1102 assert(key->tex_samples > 0); 1103 } 1104 1105 if (key->persample_msaa_dispatch) { 1106 /* It only makes sense to do persample dispatch if the render target is 1107 * configured as multisampled. 
1108 */ 1109 assert(key->rt_samples > 0); 1110 } 1111 1112 /* Make sure layout is consistent with sample count */ 1113 assert((key->tex_layout == ISL_MSAA_LAYOUT_NONE) == 1114 (key->tex_samples <= 1)); 1115 assert((key->rt_layout == ISL_MSAA_LAYOUT_NONE) == 1116 (key->rt_samples <= 1)); 1117 assert((key->src_layout == ISL_MSAA_LAYOUT_NONE) == 1118 (key->src_samples <= 1)); 1119 assert((key->dst_layout == ISL_MSAA_LAYOUT_NONE) == 1120 (key->dst_samples <= 1)); 1121 1122 nir_builder b; 1123 nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL); 1124 1125 struct brw_blorp_blit_vars v; 1126 brw_blorp_blit_vars_init(&b, &v, key); 1127 1128 dst_pos = blorp_blit_get_frag_coords(&b, key, &v); 1129 1130 /* Render target and texture hardware don't support W tiling until Gen8. */ 1131 const bool rt_tiled_w = false; 1132 const bool tex_tiled_w = devinfo->gen >= 8 && key->src_tiled_w; 1133 1134 /* The address that data will be written to is determined by the 1135 * coordinates supplied to the WM thread and the tiling and sample count of 1136 * the render target, according to the formula: 1137 * 1138 * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset)) 1139 * 1140 * If the actual tiling and sample count of the destination surface are not 1141 * the same as the configuration of the render target, then these 1142 * coordinates are wrong and we have to adjust them to compensate for the 1143 * difference. 
1144 */ 1145 if (rt_tiled_w != key->dst_tiled_w || 1146 key->rt_samples != key->dst_samples || 1147 key->rt_layout != key->dst_layout) { 1148 dst_pos = blorp_nir_encode_msaa(&b, dst_pos, key->rt_samples, 1149 key->rt_layout); 1150 /* Now (X, Y, S) = detile(rt_tiling, offset) */ 1151 if (rt_tiled_w != key->dst_tiled_w) 1152 dst_pos = blorp_nir_retile_y_to_w(&b, dst_pos); 1153 /* Now (X, Y, S) = detile(rt_tiling, offset) */ 1154 dst_pos = blorp_nir_decode_msaa(&b, dst_pos, key->dst_samples, 1155 key->dst_layout); 1156 } 1157 1158 /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)). 1159 * 1160 * That is: X, Y and S now contain the true coordinates and sample index of 1161 * the data that the WM thread should output. 1162 * 1163 * If we need to kill pixels that are outside the destination rectangle, 1164 * now is the time to do it. 1165 */ 1166 if (key->use_kill) { 1167 assert(!(key->blend && key->blit_scaled)); 1168 blorp_nir_discard_if_outside_rect(&b, dst_pos, &v); 1169 } 1170 1171 src_pos = blorp_blit_apply_transform(&b, nir_i2f32(&b, dst_pos), &v); 1172 if (dst_pos->num_components == 3) { 1173 /* The sample coordinate is an integer that we want left alone but 1174 * blorp_blit_apply_transform() blindly applies the transform to all 1175 * three coordinates. Grab the original sample index. 1176 */ 1177 src_pos = nir_vec3(&b, nir_channel(&b, src_pos, 0), 1178 nir_channel(&b, src_pos, 1), 1179 nir_channel(&b, dst_pos, 2)); 1180 } 1181 1182 /* If the source image is not multisampled, then we want to fetch sample 1183 * number 0, because that's the only sample there is. 1184 */ 1185 if (key->src_samples == 1) 1186 src_pos = nir_channels(&b, src_pos, 0x3); 1187 1188 /* X, Y, and S are now the coordinates of the pixel in the source image 1189 * that we want to texture from. Exception: if we are blending, then S is 1190 * irrelevant, because we are going to fetch all samples. 
1191 */ 1192 if (key->blend && !key->blit_scaled) { 1193 /* Resolves (effecively) use texelFetch, so we need integers and we 1194 * don't care about the sample index if we got one. 1195 */ 1196 src_pos = nir_f2i32(&b, nir_channels(&b, src_pos, 0x3)); 1197 1198 if (devinfo->gen == 6) { 1199 /* Because gen6 only supports 4x interleved MSAA, we can do all the 1200 * blending we need with a single linear-interpolated texture lookup 1201 * at the center of the sample. The texture coordinates to be odd 1202 * integers so that they correspond to the center of a 2x2 block 1203 * representing the four samples that maxe up a pixel. So we need 1204 * to multiply our X and Y coordinates each by 2 and then add 1. 1205 */ 1206 assert(key->src_coords_normalized); 1207 src_pos = nir_fadd(&b, 1208 nir_i2f32(&b, src_pos), 1209 nir_imm_float(&b, 0.5f)); 1210 color = blorp_nir_tex(&b, &v, key, src_pos); 1211 } else { 1212 /* Gen7+ hardware doesn't automaticaly blend. */ 1213 color = blorp_nir_manual_blend_average(&b, &v, src_pos, key->src_samples, 1214 key->tex_aux_usage, 1215 key->texture_data_type); 1216 } 1217 } else if (key->blend && key->blit_scaled) { 1218 assert(!key->use_kill); 1219 color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples, key, &v); 1220 } else { 1221 if (key->bilinear_filter) { 1222 color = blorp_nir_tex(&b, &v, key, src_pos); 1223 } else { 1224 /* We're going to use texelFetch, so we need integers */ 1225 if (src_pos->num_components == 2) { 1226 src_pos = nir_f2i32(&b, src_pos); 1227 } else { 1228 assert(src_pos->num_components == 3); 1229 src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i32(&b, src_pos), 0), 1230 nir_channel(&b, nir_f2i32(&b, src_pos), 1), 1231 nir_channel(&b, src_pos, 2)); 1232 } 1233 1234 /* We aren't blending, which means we just want to fetch a single 1235 * sample from the source surface. 
The address that we want to fetch 1236 * from is related to the X, Y and S values according to the formula: 1237 * 1238 * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)). 1239 * 1240 * If the actual tiling and sample count of the source surface are 1241 * not the same as the configuration of the texture, then we need to 1242 * adjust the coordinates to compensate for the difference. 1243 */ 1244 if (tex_tiled_w != key->src_tiled_w || 1245 key->tex_samples != key->src_samples || 1246 key->tex_layout != key->src_layout) { 1247 src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples, 1248 key->src_layout); 1249 /* Now (X, Y, S) = detile(src_tiling, offset) */ 1250 if (tex_tiled_w != key->src_tiled_w) 1251 src_pos = blorp_nir_retile_w_to_y(&b, src_pos); 1252 /* Now (X, Y, S) = detile(tex_tiling, offset) */ 1253 src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples, 1254 key->tex_layout); 1255 } 1256 1257 if (key->need_src_offset) 1258 src_pos = nir_iadd(&b, src_pos, nir_load_var(&b, v.v_src_offset)); 1259 1260 /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)). 1261 * 1262 * In other words: X, Y, and S now contain values which, when passed to 1263 * the texturing unit, will cause data to be read from the correct 1264 * memory location. So we can fetch the texel now. 1265 */ 1266 if (key->src_samples == 1) { 1267 color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type); 1268 } else { 1269 nir_ssa_def *mcs = NULL; 1270 if (key->tex_aux_usage == ISL_AUX_USAGE_MCS) 1271 mcs = blorp_blit_txf_ms_mcs(&b, &v, src_pos); 1272 1273 color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type); 1274 } 1275 } 1276 } 1277 1278 if (key->dst_bpc != key->src_bpc) 1279 color = bit_cast_color(&b, color, key); 1280 1281 if (key->dst_rgb) { 1282 /* The destination image is bound as a red texture three times as wide 1283 * as the actual image. Our shader is effectively running one color 1284 * component at a time. 
We need to pick off the appropriate component 1285 * from the source color and write that to destination red. 1286 */ 1287 assert(dst_pos->num_components == 2); 1288 nir_ssa_def *comp = 1289 nir_umod(&b, nir_channel(&b, dst_pos, 0), nir_imm_int(&b, 3)); 1290 1291 nir_ssa_def *color_component = 1292 nir_bcsel(&b, nir_ieq(&b, comp, nir_imm_int(&b, 0)), 1293 nir_channel(&b, color, 0), 1294 nir_bcsel(&b, nir_ieq(&b, comp, nir_imm_int(&b, 1)), 1295 nir_channel(&b, color, 1), 1296 nir_channel(&b, color, 2))); 1297 1298 nir_ssa_def *u = nir_ssa_undef(&b, 1, 32); 1299 color = nir_vec4(&b, color_component, u, u, u); 1300 } 1301 1302 nir_store_var(&b, v.color_out, color, 0xf); 1303 1304 return b.shader; 1305 } 1306 1307 static bool 1308 brw_blorp_get_blit_kernel(struct blorp_context *blorp, 1309 struct blorp_params *params, 1310 const struct brw_blorp_blit_prog_key *prog_key) 1311 { 1312 if (blorp->lookup_shader(blorp, prog_key, sizeof(*prog_key), 1313 ¶ms->wm_prog_kernel, ¶ms->wm_prog_data)) 1314 return true; 1315 1316 void *mem_ctx = ralloc_context(NULL); 1317 1318 const unsigned *program; 1319 struct brw_wm_prog_data prog_data; 1320 1321 nir_shader *nir = brw_blorp_build_nir_shader(blorp, mem_ctx, prog_key); 1322 nir->info.name = ralloc_strdup(nir, "BLORP-blit"); 1323 1324 struct brw_wm_prog_key wm_key; 1325 brw_blorp_init_wm_prog_key(&wm_key); 1326 wm_key.tex.compressed_multisample_layout_mask = 1327 prog_key->tex_aux_usage == ISL_AUX_USAGE_MCS; 1328 wm_key.tex.msaa_16 = prog_key->tex_samples == 16; 1329 wm_key.multisample_fbo = prog_key->rt_samples > 1; 1330 1331 program = blorp_compile_fs(blorp, mem_ctx, nir, &wm_key, false, 1332 &prog_data); 1333 1334 bool result = 1335 blorp->upload_shader(blorp, prog_key, sizeof(*prog_key), 1336 program, prog_data.base.program_size, 1337 &prog_data.base, sizeof(prog_data), 1338 ¶ms->wm_prog_kernel, ¶ms->wm_prog_data); 1339 1340 ralloc_free(mem_ctx); 1341 return result; 1342 } 1343 1344 static void 1345 
brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform *xform, 1346 GLfloat src0, GLfloat src1, 1347 GLfloat dst0, GLfloat dst1, 1348 bool mirror) 1349 { 1350 double scale = (double)(src1 - src0) / (double)(dst1 - dst0); 1351 if (!mirror) { 1352 /* When not mirroring a coordinate (say, X), we need: 1353 * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale 1354 * Therefore: 1355 * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale 1356 * 1357 * blorp program uses "round toward zero" to convert the 1358 * transformed floating point coordinates to integer coordinates, 1359 * whereas the behaviour we actually want is "round to nearest", 1360 * so 0.5 provides the necessary correction. 1361 */ 1362 xform->multiplier = scale; 1363 xform->offset = src0 + (-(double)dst0 + 0.5) * scale; 1364 } else { 1365 /* When mirroring X we need: 1366 * src_x - src_x0 = dst_x1 - dst_x - 0.5 1367 * Therefore: 1368 * src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale 1369 */ 1370 xform->multiplier = -scale; 1371 xform->offset = src0 + ((double)dst1 - 0.5) * scale; 1372 } 1373 } 1374 1375 static inline void 1376 surf_get_intratile_offset_px(struct brw_blorp_surface_info *info, 1377 uint32_t *tile_x_px, uint32_t *tile_y_px) 1378 { 1379 if (info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { 1380 struct isl_extent2d px_size_sa = 1381 isl_get_interleaved_msaa_px_size_sa(info->surf.samples); 1382 assert(info->tile_x_sa % px_size_sa.width == 0); 1383 assert(info->tile_y_sa % px_size_sa.height == 0); 1384 *tile_x_px = info->tile_x_sa / px_size_sa.width; 1385 *tile_y_px = info->tile_y_sa / px_size_sa.height; 1386 } else { 1387 *tile_x_px = info->tile_x_sa; 1388 *tile_y_px = info->tile_y_sa; 1389 } 1390 } 1391 1392 void 1393 blorp_surf_convert_to_single_slice(const struct isl_device *isl_dev, 1394 struct brw_blorp_surface_info *info) 1395 { 1396 bool ok UNUSED; 1397 1398 /* Just bail if we have nothing to do. 
*/ 1399 if (info->surf.dim == ISL_SURF_DIM_2D && 1400 info->view.base_level == 0 && info->view.base_array_layer == 0 && 1401 info->surf.levels == 1 && info->surf.logical_level0_px.array_len == 1) 1402 return; 1403 1404 /* If this gets triggered then we've gotten here twice which. This 1405 * shouldn't happen thanks to the above early return. 1406 */ 1407 assert(info->tile_x_sa == 0 && info->tile_y_sa == 0); 1408 1409 uint32_t layer = 0, z = 0; 1410 if (info->surf.dim == ISL_SURF_DIM_3D) 1411 z = info->view.base_array_layer + info->z_offset; 1412 else 1413 layer = info->view.base_array_layer; 1414 1415 uint32_t byte_offset; 1416 isl_surf_get_image_surf(isl_dev, &info->surf, 1417 info->view.base_level, layer, z, 1418 &info->surf, 1419 &byte_offset, &info->tile_x_sa, &info->tile_y_sa); 1420 info->addr.offset += byte_offset; 1421 1422 uint32_t tile_x_px, tile_y_px; 1423 surf_get_intratile_offset_px(info, &tile_x_px, &tile_y_px); 1424 1425 /* Instead of using the X/Y Offset fields in RENDER_SURFACE_STATE, we place 1426 * the image at the tile boundary and offset our sampling or rendering. 1427 * For this reason, we need to grow the image by the offset to ensure that 1428 * the hardware doesn't think we've gone past the edge. 1429 */ 1430 info->surf.logical_level0_px.w += tile_x_px; 1431 info->surf.logical_level0_px.h += tile_y_px; 1432 info->surf.phys_level0_sa.w += info->tile_x_sa; 1433 info->surf.phys_level0_sa.h += info->tile_y_sa; 1434 1435 /* The view is also different now. 
*/ 1436 info->view.base_level = 0; 1437 info->view.levels = 1; 1438 info->view.base_array_layer = 0; 1439 info->view.array_len = 1; 1440 info->z_offset = 0; 1441 } 1442 1443 static void 1444 surf_fake_interleaved_msaa(const struct isl_device *isl_dev, 1445 struct brw_blorp_surface_info *info) 1446 { 1447 assert(info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED); 1448 1449 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */ 1450 blorp_surf_convert_to_single_slice(isl_dev, info); 1451 1452 info->surf.logical_level0_px = info->surf.phys_level0_sa; 1453 info->surf.samples = 1; 1454 info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE; 1455 } 1456 1457 static void 1458 surf_retile_w_to_y(const struct isl_device *isl_dev, 1459 struct brw_blorp_surface_info *info) 1460 { 1461 assert(info->surf.tiling == ISL_TILING_W); 1462 1463 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */ 1464 blorp_surf_convert_to_single_slice(isl_dev, info); 1465 1466 /* On gen7+, we don't have interleaved multisampling for color render 1467 * targets so we have to fake it. 1468 * 1469 * TODO: Are we sure we don't also need to fake it on gen6? 1470 */ 1471 if (isl_dev->info->gen > 6 && 1472 info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { 1473 surf_fake_interleaved_msaa(isl_dev, info); 1474 } 1475 1476 if (isl_dev->info->gen == 6) { 1477 /* Gen6 stencil buffers have a very large alignment coming in from the 1478 * miptree. It's out-of-bounds for what the surface state can handle. 1479 * Since we have a single layer and level, it doesn't really matter as 1480 * long as we don't pass a bogus value into isl_surf_fill_state(). 1481 */ 1482 info->surf.image_alignment_el = isl_extent3d(4, 2, 1); 1483 } 1484 1485 /* Now that we've converted everything to a simple 2-D surface with only 1486 * one miplevel, we can go about retiling it. 1487 */ 1488 const unsigned x_align = 8, y_align = info->surf.samples != 0 ? 
8 : 4; 1489 info->surf.tiling = ISL_TILING_Y0; 1490 info->surf.logical_level0_px.width = 1491 ALIGN(info->surf.logical_level0_px.width, x_align) * 2; 1492 info->surf.logical_level0_px.height = 1493 ALIGN(info->surf.logical_level0_px.height, y_align) / 2; 1494 info->tile_x_sa *= 2; 1495 info->tile_y_sa /= 2; 1496 } 1497 1498 static bool 1499 can_shrink_surface(const struct brw_blorp_surface_info *surf) 1500 { 1501 /* The current code doesn't support offsets into the aux buffers. This 1502 * should be possible, but we need to make sure the offset is page 1503 * aligned for both the surface and the aux buffer surface. Generally 1504 * this mean using the page aligned offset for the aux buffer. 1505 * 1506 * Currently the cases where we must split the blit are limited to cases 1507 * where we don't have a aux buffer. 1508 */ 1509 if (surf->aux_addr.buffer != NULL) 1510 return false; 1511 1512 /* We can't support splitting the blit for gen <= 7, because the qpitch 1513 * size is calculated by the hardware based on the surface height for 1514 * gen <= 7. In gen >= 8, the qpitch is controlled by the driver. 1515 */ 1516 if (surf->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY) 1517 return false; 1518 1519 return true; 1520 } 1521 1522 static bool 1523 can_shrink_surfaces(const struct blorp_params *params) 1524 { 1525 return 1526 can_shrink_surface(¶ms->src) && 1527 can_shrink_surface(¶ms->dst); 1528 } 1529 1530 static unsigned 1531 get_max_surface_size(const struct gen_device_info *devinfo, 1532 const struct blorp_params *params) 1533 { 1534 const unsigned max = devinfo->gen >= 7 ? 
16384 : 8192; 1535 if (split_blorp_blit_debug && can_shrink_surfaces(params)) 1536 return max >> 4; /* A smaller restriction when debug is enabled */ 1537 else 1538 return max; 1539 } 1540 1541 struct blt_axis { 1542 double src0, src1, dst0, dst1; 1543 bool mirror; 1544 }; 1545 1546 struct blt_coords { 1547 struct blt_axis x, y; 1548 }; 1549 1550 static void 1551 surf_fake_rgb_with_red(const struct isl_device *isl_dev, 1552 struct brw_blorp_surface_info *info, 1553 uint32_t *x, uint32_t *width) 1554 { 1555 blorp_surf_convert_to_single_slice(isl_dev, info); 1556 1557 info->surf.logical_level0_px.width *= 3; 1558 info->surf.phys_level0_sa.width *= 3; 1559 info->tile_x_sa *= 3; 1560 *x *= 3; 1561 *width *= 3; 1562 1563 enum isl_format red_format; 1564 switch (info->view.format) { 1565 case ISL_FORMAT_R8G8B8_UNORM: 1566 red_format = ISL_FORMAT_R8_UNORM; 1567 break; 1568 case ISL_FORMAT_R8G8B8_UINT: 1569 red_format = ISL_FORMAT_R8_UINT; 1570 break; 1571 case ISL_FORMAT_R16G16B16_UNORM: 1572 red_format = ISL_FORMAT_R16_UNORM; 1573 break; 1574 case ISL_FORMAT_R16G16B16_UINT: 1575 red_format = ISL_FORMAT_R16_UINT; 1576 break; 1577 case ISL_FORMAT_R32G32B32_UINT: 1578 red_format = ISL_FORMAT_R32_UINT; 1579 break; 1580 default: 1581 unreachable("Invalid RGB copy destination format"); 1582 } 1583 assert(isl_format_get_layout(red_format)->channels.r.type == 1584 isl_format_get_layout(info->view.format)->channels.r.type); 1585 assert(isl_format_get_layout(red_format)->channels.r.bits == 1586 isl_format_get_layout(info->view.format)->channels.r.bits); 1587 1588 info->surf.format = info->view.format = red_format; 1589 } 1590 1591 static void 1592 fake_dest_rgb_with_red(const struct isl_device *dev, 1593 struct blorp_params *params, 1594 struct brw_blorp_blit_prog_key *wm_prog_key, 1595 struct blt_coords *coords) 1596 { 1597 /* Handle RGB destinations for blorp_copy */ 1598 const struct isl_format_layout *dst_fmtl = 1599 isl_format_get_layout(params->dst.surf.format); 1600 1601 if 
(dst_fmtl->bpb % 3 == 0) { 1602 uint32_t dst_x = coords->x.dst0; 1603 uint32_t dst_width = coords->x.dst1 - dst_x; 1604 surf_fake_rgb_with_red(dev, ¶ms->dst, 1605 &dst_x, &dst_width); 1606 coords->x.dst0 = dst_x; 1607 coords->x.dst1 = dst_x + dst_width; 1608 wm_prog_key->dst_rgb = true; 1609 wm_prog_key->need_dst_offset = true; 1610 } 1611 } 1612 1613 enum blit_shrink_status { 1614 BLIT_NO_SHRINK = 0, 1615 BLIT_WIDTH_SHRINK = 1, 1616 BLIT_HEIGHT_SHRINK = 2, 1617 }; 1618 1619 /* Try to blit. If the surface parameters exceed the size allowed by hardware, 1620 * then enum blit_shrink_status will be returned. If BLIT_NO_SHRINK is 1621 * returned, then the blit was successful. 1622 */ 1623 static enum blit_shrink_status 1624 try_blorp_blit(struct blorp_batch *batch, 1625 struct blorp_params *params, 1626 struct brw_blorp_blit_prog_key *wm_prog_key, 1627 struct blt_coords *coords) 1628 { 1629 const struct gen_device_info *devinfo = batch->blorp->isl_dev->info; 1630 1631 fake_dest_rgb_with_red(batch->blorp->isl_dev, params, wm_prog_key, coords); 1632 1633 if (isl_format_has_sint_channel(params->src.view.format)) { 1634 wm_prog_key->texture_data_type = nir_type_int; 1635 } else if (isl_format_has_uint_channel(params->src.view.format)) { 1636 wm_prog_key->texture_data_type = nir_type_uint; 1637 } else { 1638 wm_prog_key->texture_data_type = nir_type_float; 1639 } 1640 1641 /* src_samples and dst_samples are the true sample counts */ 1642 wm_prog_key->src_samples = params->src.surf.samples; 1643 wm_prog_key->dst_samples = params->dst.surf.samples; 1644 1645 wm_prog_key->tex_aux_usage = params->src.aux_usage; 1646 1647 /* src_layout and dst_layout indicate the true MSAA layout used by src and 1648 * dst. 1649 */ 1650 wm_prog_key->src_layout = params->src.surf.msaa_layout; 1651 wm_prog_key->dst_layout = params->dst.surf.msaa_layout; 1652 1653 /* Round floating point values to nearest integer to avoid "off by one texel" 1654 * kind of errors when blitting. 
1655 */ 1656 params->x0 = params->wm_inputs.discard_rect.x0 = round(coords->x.dst0); 1657 params->y0 = params->wm_inputs.discard_rect.y0 = round(coords->y.dst0); 1658 params->x1 = params->wm_inputs.discard_rect.x1 = round(coords->x.dst1); 1659 params->y1 = params->wm_inputs.discard_rect.y1 = round(coords->y.dst1); 1660 1661 brw_blorp_setup_coord_transform(¶ms->wm_inputs.coord_transform[0], 1662 coords->x.src0, coords->x.src1, 1663 coords->x.dst0, coords->x.dst1, 1664 coords->x.mirror); 1665 brw_blorp_setup_coord_transform(¶ms->wm_inputs.coord_transform[1], 1666 coords->y.src0, coords->y.src1, 1667 coords->y.dst0, coords->y.dst1, 1668 coords->y.mirror); 1669 1670 1671 if (devinfo->gen == 4) { 1672 /* The MinLOD and MinimumArrayElement don't work properly for cube maps. 1673 * Convert them to a single slice on gen4. 1674 */ 1675 if (params->dst.surf.usage & ISL_SURF_USAGE_CUBE_BIT) { 1676 blorp_surf_convert_to_single_slice(batch->blorp->isl_dev, ¶ms->dst); 1677 wm_prog_key->need_dst_offset = true; 1678 } 1679 1680 if (params->src.surf.usage & ISL_SURF_USAGE_CUBE_BIT) { 1681 blorp_surf_convert_to_single_slice(batch->blorp->isl_dev, ¶ms->src); 1682 wm_prog_key->need_src_offset = true; 1683 } 1684 } 1685 1686 if (devinfo->gen > 6 && 1687 params->dst.surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { 1688 assert(params->dst.surf.samples > 1); 1689 1690 /* We must expand the rectangle we send through the rendering pipeline, 1691 * to account for the fact that we are mapping the destination region as 1692 * single-sampled when it is in fact multisampled. We must also align 1693 * it to a multiple of the multisampling pattern, because the 1694 * differences between multisampled and single-sampled surface formats 1695 * will mean that pixels are scrambled within the multisampling pattern. 1696 * TODO: what if this makes the coordinates too large? 1697 * 1698 * Note: this only works if the destination surface uses the IMS layout. 
1699 * If it's UMS, then we have no choice but to set up the rendering 1700 * pipeline as multisampled. 1701 */ 1702 struct isl_extent2d px_size_sa = 1703 isl_get_interleaved_msaa_px_size_sa(params->dst.surf.samples); 1704 params->x0 = ROUND_DOWN_TO(params->x0, 2) * px_size_sa.width; 1705 params->y0 = ROUND_DOWN_TO(params->y0, 2) * px_size_sa.height; 1706 params->x1 = ALIGN(params->x1, 2) * px_size_sa.width; 1707 params->y1 = ALIGN(params->y1, 2) * px_size_sa.height; 1708 1709 surf_fake_interleaved_msaa(batch->blorp->isl_dev, ¶ms->dst); 1710 1711 wm_prog_key->use_kill = true; 1712 wm_prog_key->need_dst_offset = true; 1713 } 1714 1715 if (params->dst.surf.tiling == ISL_TILING_W) { 1716 /* We must modify the rectangle we send through the rendering pipeline 1717 * (and the size and x/y offset of the destination surface), to account 1718 * for the fact that we are mapping it as Y-tiled when it is in fact 1719 * W-tiled. 1720 * 1721 * Both Y tiling and W tiling can be understood as organizations of 1722 * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels 1723 * is different, but the layout of the 32-byte sub-tiles within the 4k 1724 * tile is the same (8 sub-tiles across by 16 sub-tiles down, in 1725 * column-major order). In Y tiling, the sub-tiles are 16 bytes wide 1726 * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high. 1727 * 1728 * Therefore, to account for the layout differences within the 32-byte 1729 * sub-tiles, we must expand the rectangle so the X coordinates of its 1730 * edges are multiples of 8 (the W sub-tile width), and its Y 1731 * coordinates of its edges are multiples of 4 (the W sub-tile height). 1732 * Then we need to scale the X and Y coordinates of the rectangle to 1733 * account for the differences in aspect ratio between the Y and W 1734 * sub-tiles. We need to modify the layer width and height similarly. 
1735 * 1736 * A correction needs to be applied when MSAA is in use: since 1737 * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4, 1738 * we need to align the Y coordinates to multiples of 8, so that when 1739 * they are divided by two they are still multiples of 4. 1740 * 1741 * Note: Since the x/y offset of the surface will be applied using the 1742 * SURFACE_STATE command packet, it will be invisible to the swizzling 1743 * code in the shader; therefore it needs to be in a multiple of the 1744 * 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8 1745 * pixels wide and 4 pixels high (when viewed as a W-tiled stencil 1746 * buffer), and the miplevel alignment used for stencil buffers is 8 1747 * pixels horizontally and either 4 or 8 pixels vertically (see 1748 * intel_horizontal_texture_alignment_unit() and 1749 * intel_vertical_texture_alignment_unit()). 1750 * 1751 * Note: Also, since the SURFACE_STATE command packet can only apply 1752 * offsets that are multiples of 4 pixels horizontally and 2 pixels 1753 * vertically, it is important that the offsets will be multiples of 1754 * these sizes after they are converted into Y-tiled coordinates. 1755 * Fortunately they will be, since we know from above that the offsets 1756 * are a multiple of the 32-byte sub-tile size, and in Y-tiled 1757 * coordinates the sub-tile is 16 pixels wide and 2 pixels high. 1758 * 1759 * TODO: what if this makes the coordinates (or the texture size) too 1760 * large? 1761 */ 1762 const unsigned x_align = 8; 1763 const unsigned y_align = params->dst.surf.samples != 0 ? 
8 : 4; 1764 params->x0 = ROUND_DOWN_TO(params->x0, x_align) * 2; 1765 params->y0 = ROUND_DOWN_TO(params->y0, y_align) / 2; 1766 params->x1 = ALIGN(params->x1, x_align) * 2; 1767 params->y1 = ALIGN(params->y1, y_align) / 2; 1768 1769 /* Retile the surface to Y-tiled */ 1770 surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->dst); 1771 1772 wm_prog_key->dst_tiled_w = true; 1773 wm_prog_key->use_kill = true; 1774 wm_prog_key->need_dst_offset = true; 1775 1776 if (params->dst.surf.samples > 1) { 1777 /* If the destination surface is a W-tiled multisampled stencil 1778 * buffer that we're mapping as Y tiled, then we need to arrange for 1779 * the WM program to run once per sample rather than once per pixel, 1780 * because the memory layout of related samples doesn't match between 1781 * W and Y tiling. 1782 */ 1783 wm_prog_key->persample_msaa_dispatch = true; 1784 } 1785 } 1786 1787 if (devinfo->gen < 8 && params->src.surf.tiling == ISL_TILING_W) { 1788 /* On Haswell and earlier, we have to fake W-tiled sources as Y-tiled. 1789 * Broadwell adds support for sampling from stencil. 1790 * 1791 * See the comments above concerning x/y offset alignment for the 1792 * destination surface. 1793 * 1794 * TODO: what if this makes the texture size too large? 1795 */ 1796 surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->src); 1797 1798 wm_prog_key->src_tiled_w = true; 1799 wm_prog_key->need_src_offset = true; 1800 } 1801 1802 /* tex_samples and rt_samples are the sample counts that are set up in 1803 * SURFACE_STATE. 1804 */ 1805 wm_prog_key->tex_samples = params->src.surf.samples; 1806 wm_prog_key->rt_samples = params->dst.surf.samples; 1807 1808 /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will 1809 * use to access the source and destination surfaces. 
1810 */ 1811 wm_prog_key->tex_layout = params->src.surf.msaa_layout; 1812 wm_prog_key->rt_layout = params->dst.surf.msaa_layout; 1813 1814 if (params->src.surf.samples > 0 && params->dst.surf.samples > 1) { 1815 /* We are blitting from a multisample buffer to a multisample buffer, so 1816 * we must preserve samples within a pixel. This means we have to 1817 * arrange for the WM program to run once per sample rather than once 1818 * per pixel. 1819 */ 1820 wm_prog_key->persample_msaa_dispatch = true; 1821 } 1822 1823 params->num_samples = params->dst.surf.samples; 1824 1825 if ((wm_prog_key->bilinear_filter || 1826 (wm_prog_key->blend && !wm_prog_key->blit_scaled)) && 1827 batch->blorp->isl_dev->info->gen <= 6) { 1828 /* Gen4-5 don't support non-normalized texture coordinates */ 1829 wm_prog_key->src_coords_normalized = true; 1830 params->wm_inputs.src_inv_size[0] = 1831 1.0f / minify(params->src.surf.logical_level0_px.width, 1832 params->src.view.base_level); 1833 params->wm_inputs.src_inv_size[1] = 1834 1.0f / minify(params->src.surf.logical_level0_px.height, 1835 params->src.view.base_level); 1836 } 1837 1838 if (params->src.tile_x_sa || params->src.tile_y_sa) { 1839 assert(wm_prog_key->need_src_offset); 1840 surf_get_intratile_offset_px(¶ms->src, 1841 ¶ms->wm_inputs.src_offset.x, 1842 ¶ms->wm_inputs.src_offset.y); 1843 } 1844 1845 if (params->dst.tile_x_sa || params->dst.tile_y_sa) { 1846 assert(wm_prog_key->need_dst_offset); 1847 surf_get_intratile_offset_px(¶ms->dst, 1848 ¶ms->wm_inputs.dst_offset.x, 1849 ¶ms->wm_inputs.dst_offset.y); 1850 params->x0 += params->wm_inputs.dst_offset.x; 1851 params->y0 += params->wm_inputs.dst_offset.y; 1852 params->x1 += params->wm_inputs.dst_offset.x; 1853 params->y1 += params->wm_inputs.dst_offset.y; 1854 } 1855 1856 /* For some texture types, we need to pass the layer through the sampler. 
*/ 1857 params->wm_inputs.src_z = params->src.z_offset; 1858 1859 if (!brw_blorp_get_blit_kernel(batch->blorp, params, wm_prog_key)) 1860 return 0; 1861 1862 if (!blorp_ensure_sf_program(batch->blorp, params)) 1863 return 0; 1864 1865 unsigned result = 0; 1866 unsigned max_surface_size = get_max_surface_size(devinfo, params); 1867 if (params->src.surf.logical_level0_px.width > max_surface_size || 1868 params->dst.surf.logical_level0_px.width > max_surface_size) 1869 result |= BLIT_WIDTH_SHRINK; 1870 if (params->src.surf.logical_level0_px.height > max_surface_size || 1871 params->dst.surf.logical_level0_px.height > max_surface_size) 1872 result |= BLIT_HEIGHT_SHRINK; 1873 1874 if (result == 0) { 1875 batch->blorp->exec(batch, params); 1876 } 1877 1878 return result; 1879 } 1880 1881 /* Adjust split blit source coordinates for the current destination 1882 * coordinates. 1883 */ 1884 static void 1885 adjust_split_source_coords(const struct blt_axis *orig, 1886 struct blt_axis *split_coords, 1887 double scale) 1888 { 1889 /* When scale is greater than 0, then we are growing from the start, so 1890 * src0 uses delta0, and src1 uses delta1. When scale is less than 0, the 1891 * source range shrinks from the end. In that case src0 is adjusted by 1892 * delta1, and src1 is adjusted by delta0. 1893 */ 1894 double delta0 = scale * (split_coords->dst0 - orig->dst0); 1895 double delta1 = scale * (split_coords->dst1 - orig->dst1); 1896 split_coords->src0 = orig->src0 + (scale >= 0.0 ? delta0 : delta1); 1897 split_coords->src1 = orig->src1 + (scale >= 0.0 ? 
delta1 : delta0); 1898 } 1899 1900 static struct isl_extent2d 1901 get_px_size_sa(const struct isl_surf *surf) 1902 { 1903 static const struct isl_extent2d one_to_one = { .w = 1, .h = 1 }; 1904 1905 if (surf->msaa_layout != ISL_MSAA_LAYOUT_INTERLEAVED) 1906 return one_to_one; 1907 else 1908 return isl_get_interleaved_msaa_px_size_sa(surf->samples); 1909 } 1910 1911 static void 1912 shrink_surface_params(const struct isl_device *dev, 1913 struct brw_blorp_surface_info *info, 1914 double *x0, double *x1, double *y0, double *y1) 1915 { 1916 uint32_t byte_offset, x_offset_sa, y_offset_sa, size; 1917 struct isl_extent2d px_size_sa; 1918 int adjust; 1919 1920 blorp_surf_convert_to_single_slice(dev, info); 1921 1922 px_size_sa = get_px_size_sa(&info->surf); 1923 1924 /* Because this gets called after we lower compressed images, the tile 1925 * offsets may be non-zero and we need to incorporate them in our 1926 * calculations. 1927 */ 1928 x_offset_sa = (uint32_t)*x0 * px_size_sa.w + info->tile_x_sa; 1929 y_offset_sa = (uint32_t)*y0 * px_size_sa.h + info->tile_y_sa; 1930 isl_tiling_get_intratile_offset_sa(info->surf.tiling, 1931 info->surf.format, info->surf.row_pitch, 1932 x_offset_sa, y_offset_sa, 1933 &byte_offset, 1934 &info->tile_x_sa, &info->tile_y_sa); 1935 1936 info->addr.offset += byte_offset; 1937 1938 adjust = (int)info->tile_x_sa / px_size_sa.w - (int)*x0; 1939 *x0 += adjust; 1940 *x1 += adjust; 1941 info->tile_x_sa = 0; 1942 1943 adjust = (int)info->tile_y_sa / px_size_sa.h - (int)*y0; 1944 *y0 += adjust; 1945 *y1 += adjust; 1946 info->tile_y_sa = 0; 1947 1948 size = MIN2((uint32_t)ceil(*x1), info->surf.logical_level0_px.width); 1949 info->surf.logical_level0_px.width = size; 1950 info->surf.phys_level0_sa.width = size * px_size_sa.w; 1951 1952 size = MIN2((uint32_t)ceil(*y1), info->surf.logical_level0_px.height); 1953 info->surf.logical_level0_px.height = size; 1954 info->surf.phys_level0_sa.height = size * px_size_sa.h; 1955 } 1956 1957 static void 1958 
shrink_surfaces(const struct isl_device *dev, 1959 struct blorp_params *params, 1960 struct brw_blorp_blit_prog_key *wm_prog_key, 1961 struct blt_coords *coords) 1962 { 1963 /* Shrink source surface */ 1964 shrink_surface_params(dev, ¶ms->src, &coords->x.src0, &coords->x.src1, 1965 &coords->y.src0, &coords->y.src1); 1966 wm_prog_key->need_src_offset = false; 1967 1968 /* Shrink destination surface */ 1969 shrink_surface_params(dev, ¶ms->dst, &coords->x.dst0, &coords->x.dst1, 1970 &coords->y.dst0, &coords->y.dst1); 1971 wm_prog_key->need_dst_offset = false; 1972 } 1973 1974 static void 1975 do_blorp_blit(struct blorp_batch *batch, 1976 const struct blorp_params *orig_params, 1977 struct brw_blorp_blit_prog_key *wm_prog_key, 1978 const struct blt_coords *orig) 1979 { 1980 struct blorp_params params; 1981 struct blt_coords blit_coords; 1982 struct blt_coords split_coords = *orig; 1983 double w = orig->x.dst1 - orig->x.dst0; 1984 double h = orig->y.dst1 - orig->y.dst0; 1985 double x_scale = (orig->x.src1 - orig->x.src0) / w; 1986 double y_scale = (orig->y.src1 - orig->y.src0) / h; 1987 if (orig->x.mirror) 1988 x_scale = -x_scale; 1989 if (orig->y.mirror) 1990 y_scale = -y_scale; 1991 1992 bool x_done, y_done; 1993 bool shrink = split_blorp_blit_debug && can_shrink_surfaces(orig_params); 1994 do { 1995 params = *orig_params; 1996 blit_coords = split_coords; 1997 if (shrink) 1998 shrink_surfaces(batch->blorp->isl_dev, ¶ms, wm_prog_key, 1999 &blit_coords); 2000 enum blit_shrink_status result = 2001 try_blorp_blit(batch, ¶ms, wm_prog_key, &blit_coords); 2002 2003 if (result & BLIT_WIDTH_SHRINK) { 2004 w /= 2.0; 2005 assert(w >= 1.0); 2006 split_coords.x.dst1 = MIN2(split_coords.x.dst0 + w, orig->x.dst1); 2007 adjust_split_source_coords(&orig->x, &split_coords.x, x_scale); 2008 } 2009 if (result & BLIT_HEIGHT_SHRINK) { 2010 h /= 2.0; 2011 assert(h >= 1.0); 2012 split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1); 2013 adjust_split_source_coords(&orig->y, 
&split_coords.y, y_scale); 2014 } 2015 2016 if (result != 0) { 2017 assert(can_shrink_surfaces(orig_params)); 2018 shrink = true; 2019 continue; 2020 } 2021 2022 y_done = (orig->y.dst1 - split_coords.y.dst1 < 0.5); 2023 x_done = y_done && (orig->x.dst1 - split_coords.x.dst1 < 0.5); 2024 if (x_done) { 2025 break; 2026 } else if (y_done) { 2027 split_coords.x.dst0 += w; 2028 split_coords.x.dst1 = MIN2(split_coords.x.dst0 + w, orig->x.dst1); 2029 split_coords.y.dst0 = orig->y.dst0; 2030 split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1); 2031 adjust_split_source_coords(&orig->x, &split_coords.x, x_scale); 2032 } else { 2033 split_coords.y.dst0 += h; 2034 split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1); 2035 adjust_split_source_coords(&orig->y, &split_coords.y, y_scale); 2036 } 2037 } while (true); 2038 } 2039 2040 void 2041 blorp_blit(struct blorp_batch *batch, 2042 const struct blorp_surf *src_surf, 2043 unsigned src_level, unsigned src_layer, 2044 enum isl_format src_format, struct isl_swizzle src_swizzle, 2045 const struct blorp_surf *dst_surf, 2046 unsigned dst_level, unsigned dst_layer, 2047 enum isl_format dst_format, struct isl_swizzle dst_swizzle, 2048 float src_x0, float src_y0, 2049 float src_x1, float src_y1, 2050 float dst_x0, float dst_y0, 2051 float dst_x1, float dst_y1, 2052 GLenum filter, bool mirror_x, bool mirror_y) 2053 { 2054 struct blorp_params params; 2055 blorp_params_init(¶ms); 2056 2057 /* We cannot handle combined depth and stencil. 
*/ 2058 if (src_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT) 2059 assert(src_surf->surf->format == ISL_FORMAT_R8_UINT); 2060 if (dst_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT) 2061 assert(dst_surf->surf->format == ISL_FORMAT_R8_UINT); 2062 2063 if (dst_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT) { 2064 assert(src_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT); 2065 /* Prior to Broadwell, we can't render to R8_UINT */ 2066 if (batch->blorp->isl_dev->info->gen < 8) { 2067 src_format = ISL_FORMAT_R8_UNORM; 2068 dst_format = ISL_FORMAT_R8_UNORM; 2069 } 2070 } 2071 2072 brw_blorp_surface_info_init(batch->blorp, ¶ms.src, src_surf, src_level, 2073 src_layer, src_format, false); 2074 brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, dst_surf, dst_level, 2075 dst_layer, dst_format, true); 2076 2077 params.src.view.swizzle = src_swizzle; 2078 params.dst.view.swizzle = dst_swizzle; 2079 2080 struct brw_blorp_blit_prog_key wm_prog_key = { 2081 .shader_type = BLORP_SHADER_TYPE_BLIT 2082 }; 2083 2084 /* Scaled blitting or not. */ 2085 wm_prog_key.blit_scaled = 2086 ((dst_x1 - dst_x0) == (src_x1 - src_x0) && 2087 (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true; 2088 2089 /* Scaling factors used for bilinear filtering in multisample scaled 2090 * blits. 2091 */ 2092 if (params.src.surf.samples == 16) 2093 wm_prog_key.x_scale = 4.0f; 2094 else 2095 wm_prog_key.x_scale = 2.0f; 2096 wm_prog_key.y_scale = params.src.surf.samples / wm_prog_key.x_scale; 2097 2098 if (filter == GL_LINEAR && 2099 params.src.surf.samples <= 1 && params.dst.surf.samples <= 1) { 2100 wm_prog_key.bilinear_filter = true; 2101 } 2102 2103 if ((params.src.surf.usage & ISL_SURF_USAGE_DEPTH_BIT) == 0 && 2104 (params.src.surf.usage & ISL_SURF_USAGE_STENCIL_BIT) == 0 && 2105 !isl_format_has_int_channel(params.src.surf.format) && 2106 params.src.surf.samples > 1 && params.dst.surf.samples <= 1) { 2107 /* We are downsampling a non-integer color buffer, so blend. 
2108 * 2109 * Regarding integer color buffers, the OpenGL ES 3.2 spec says: 2110 * 2111 * "If the source formats are integer types or stencil values, a 2112 * single sample's value is selected for each pixel." 2113 * 2114 * This implies we should not blend in that case. 2115 */ 2116 wm_prog_key.blend = true; 2117 } 2118 2119 params.wm_inputs.rect_grid.x1 = 2120 minify(params.src.surf.logical_level0_px.width, src_level) * 2121 wm_prog_key.x_scale - 1.0f; 2122 params.wm_inputs.rect_grid.y1 = 2123 minify(params.src.surf.logical_level0_px.height, src_level) * 2124 wm_prog_key.y_scale - 1.0f; 2125 2126 struct blt_coords coords = { 2127 .x = { 2128 .src0 = src_x0, 2129 .src1 = src_x1, 2130 .dst0 = dst_x0, 2131 .dst1 = dst_x1, 2132 .mirror = mirror_x 2133 }, 2134 .y = { 2135 .src0 = src_y0, 2136 .src1 = src_y1, 2137 .dst0 = dst_y0, 2138 .dst1 = dst_y1, 2139 .mirror = mirror_y 2140 } 2141 }; 2142 2143 do_blorp_blit(batch, ¶ms, &wm_prog_key, &coords); 2144 } 2145 2146 static enum isl_format 2147 get_copy_format_for_bpb(const struct isl_device *isl_dev, unsigned bpb) 2148 { 2149 /* The choice of UNORM and UINT formats is very intentional here. Most 2150 * of the time, we want to use a UINT format to avoid any rounding error 2151 * in the blit. For stencil blits, R8_UINT is required by the hardware. 2152 * (It's the only format allowed in conjunction with W-tiling.) Also we 2153 * intentionally use the 4-channel formats whenever we can. This is so 2154 * that, when we do a RGB <-> RGBX copy, the two formats will line up 2155 * even though one of them is 3/4 the size of the other. The choice of 2156 * UNORM vs. UINT is also very intentional because we don't have 8 or 2157 * 16-bit RGB UINT formats until Sky Lake so we have to use UNORM there. 2158 * Fortunately, the only time we should ever use two different formats in 2159 * the table below is for RGB -> RGBA blits and so we will never have any 2160 * UNORM/UINT mismatch. 
2161 */ 2162 if (ISL_DEV_GEN(isl_dev) >= 9) { 2163 switch (bpb) { 2164 case 8: return ISL_FORMAT_R8_UINT; 2165 case 16: return ISL_FORMAT_R8G8_UINT; 2166 case 24: return ISL_FORMAT_R8G8B8_UINT; 2167 case 32: return ISL_FORMAT_R8G8B8A8_UINT; 2168 case 48: return ISL_FORMAT_R16G16B16_UINT; 2169 case 64: return ISL_FORMAT_R16G16B16A16_UINT; 2170 case 96: return ISL_FORMAT_R32G32B32_UINT; 2171 case 128:return ISL_FORMAT_R32G32B32A32_UINT; 2172 default: 2173 unreachable("Unknown format bpb"); 2174 } 2175 } else { 2176 switch (bpb) { 2177 case 8: return ISL_FORMAT_R8_UINT; 2178 case 16: return ISL_FORMAT_R8G8_UINT; 2179 case 24: return ISL_FORMAT_R8G8B8_UNORM; 2180 case 32: return ISL_FORMAT_R8G8B8A8_UNORM; 2181 case 48: return ISL_FORMAT_R16G16B16_UNORM; 2182 case 64: return ISL_FORMAT_R16G16B16A16_UNORM; 2183 case 96: return ISL_FORMAT_R32G32B32_UINT; 2184 case 128:return ISL_FORMAT_R32G32B32A32_UINT; 2185 default: 2186 unreachable("Unknown format bpb"); 2187 } 2188 } 2189 } 2190 2191 /** Returns a UINT format that is CCS-compatible with the given format 2192 * 2193 * The PRM's say absolutely nothing about how render compression works. The 2194 * only thing they provide is a list of formats on which it is and is not 2195 * supported. Empirical testing indicates that the compression is only based 2196 * on the bit-layout of the format and the channel encoding doesn't matter. 2197 * So, while texture views don't work in general, you can create a view as 2198 * long as the bit-layout of the formats are the same. 2199 * 2200 * Fortunately, for every render compression capable format, the UINT format 2201 * with the same bit layout also supports render compression. This means that 2202 * we only need to handle UINT formats for copy operations. In order to do 2203 * copies between formats with different bit layouts, we attach both with a 2204 * UINT format and use bit_cast_color() to generate code to do the bit-cast 2205 * operation between the two bit layouts. 
2206 */ 2207 static enum isl_format 2208 get_ccs_compatible_uint_format(const struct isl_format_layout *fmtl) 2209 { 2210 switch (fmtl->format) { 2211 case ISL_FORMAT_R32G32B32A32_FLOAT: 2212 case ISL_FORMAT_R32G32B32A32_SINT: 2213 case ISL_FORMAT_R32G32B32A32_UINT: 2214 case ISL_FORMAT_R32G32B32A32_UNORM: 2215 case ISL_FORMAT_R32G32B32A32_SNORM: 2216 case ISL_FORMAT_R32G32B32X32_FLOAT: 2217 return ISL_FORMAT_R32G32B32A32_UINT; 2218 2219 case ISL_FORMAT_R16G16B16A16_UNORM: 2220 case ISL_FORMAT_R16G16B16A16_SNORM: 2221 case ISL_FORMAT_R16G16B16A16_SINT: 2222 case ISL_FORMAT_R16G16B16A16_UINT: 2223 case ISL_FORMAT_R16G16B16A16_FLOAT: 2224 case ISL_FORMAT_R16G16B16X16_UNORM: 2225 case ISL_FORMAT_R16G16B16X16_FLOAT: 2226 return ISL_FORMAT_R16G16B16A16_UINT; 2227 2228 case ISL_FORMAT_R32G32_FLOAT: 2229 case ISL_FORMAT_R32G32_SINT: 2230 case ISL_FORMAT_R32G32_UINT: 2231 case ISL_FORMAT_R32G32_UNORM: 2232 case ISL_FORMAT_R32G32_SNORM: 2233 return ISL_FORMAT_R32G32_UINT; 2234 2235 case ISL_FORMAT_B8G8R8A8_UNORM: 2236 case ISL_FORMAT_B8G8R8A8_UNORM_SRGB: 2237 case ISL_FORMAT_R8G8B8A8_UNORM: 2238 case ISL_FORMAT_R8G8B8A8_UNORM_SRGB: 2239 case ISL_FORMAT_R8G8B8A8_SNORM: 2240 case ISL_FORMAT_R8G8B8A8_SINT: 2241 case ISL_FORMAT_R8G8B8A8_UINT: 2242 case ISL_FORMAT_B8G8R8X8_UNORM: 2243 case ISL_FORMAT_B8G8R8X8_UNORM_SRGB: 2244 case ISL_FORMAT_R8G8B8X8_UNORM: 2245 case ISL_FORMAT_R8G8B8X8_UNORM_SRGB: 2246 return ISL_FORMAT_R8G8B8A8_UINT; 2247 2248 case ISL_FORMAT_R16G16_UNORM: 2249 case ISL_FORMAT_R16G16_SNORM: 2250 case ISL_FORMAT_R16G16_SINT: 2251 case ISL_FORMAT_R16G16_UINT: 2252 case ISL_FORMAT_R16G16_FLOAT: 2253 return ISL_FORMAT_R16G16_UINT; 2254 2255 case ISL_FORMAT_R32_SINT: 2256 case ISL_FORMAT_R32_UINT: 2257 case ISL_FORMAT_R32_FLOAT: 2258 case ISL_FORMAT_R32_UNORM: 2259 case ISL_FORMAT_R32_SNORM: 2260 return ISL_FORMAT_R32_UINT; 2261 2262 default: 2263 unreachable("Not a compressible format"); 2264 } 2265 } 2266 2267 /* Takes an isl_color_value and returns a color value 
that is the original 2268 * color value only bit-casted to a UINT format. This value, together with 2269 * the format from get_ccs_compatible_uint_format, will yield the same bit 2270 * value as the original color and format. 2271 */ 2272 static union isl_color_value 2273 bitcast_color_value_to_uint(union isl_color_value color, 2274 const struct isl_format_layout *fmtl) 2275 { 2276 /* All CCS formats have the same number of bits in each channel */ 2277 const struct isl_channel_layout *chan = &fmtl->channels.r; 2278 2279 union isl_color_value bits; 2280 switch (chan->type) { 2281 case ISL_UINT: 2282 case ISL_SINT: 2283 /* Hardware will ignore the high bits so there's no need to cast */ 2284 bits = color; 2285 break; 2286 2287 case ISL_UNORM: 2288 for (unsigned i = 0; i < 4; i++) 2289 bits.u32[i] = _mesa_float_to_unorm(color.f32[i], chan->bits); 2290 break; 2291 2292 case ISL_SNORM: 2293 for (unsigned i = 0; i < 4; i++) 2294 bits.i32[i] = _mesa_float_to_snorm(color.f32[i], chan->bits); 2295 break; 2296 2297 case ISL_SFLOAT: 2298 switch (chan->bits) { 2299 case 16: 2300 for (unsigned i = 0; i < 4; i++) 2301 bits.u32[i] = _mesa_float_to_half(color.f32[i]); 2302 break; 2303 2304 case 32: 2305 bits = color; 2306 break; 2307 2308 default: 2309 unreachable("Invalid float format size"); 2310 } 2311 break; 2312 2313 default: 2314 unreachable("Invalid channel type"); 2315 } 2316 2317 switch (fmtl->format) { 2318 case ISL_FORMAT_B8G8R8A8_UNORM: 2319 case ISL_FORMAT_B8G8R8A8_UNORM_SRGB: 2320 case ISL_FORMAT_B8G8R8X8_UNORM: 2321 case ISL_FORMAT_B8G8R8X8_UNORM_SRGB: { 2322 /* If it's a BGRA format, we need to swap blue and red */ 2323 uint32_t tmp = bits.u32[0]; 2324 bits.u32[0] = bits.u32[2]; 2325 bits.u32[2] = tmp; 2326 break; 2327 } 2328 2329 default: 2330 break; /* Nothing to do */ 2331 } 2332 2333 return bits; 2334 } 2335 2336 void 2337 blorp_surf_convert_to_uncompressed(const struct isl_device *isl_dev, 2338 struct brw_blorp_surface_info *info, 2339 uint32_t *x, uint32_t 
*y, 2340 uint32_t *width, uint32_t *height) 2341 { 2342 const struct isl_format_layout *fmtl = 2343 isl_format_get_layout(info->surf.format); 2344 2345 assert(fmtl->bw > 1 || fmtl->bh > 1); 2346 2347 /* This is a compressed surface. We need to convert it to a single 2348 * slice (because compressed layouts don't perfectly match uncompressed 2349 * ones with the same bpb) and divide x, y, width, and height by the 2350 * block size. 2351 */ 2352 blorp_surf_convert_to_single_slice(isl_dev, info); 2353 2354 if (width && height) { 2355 #ifndef NDEBUG 2356 uint32_t right_edge_px = info->tile_x_sa + *x + *width; 2357 uint32_t bottom_edge_px = info->tile_y_sa + *y + *height; 2358 assert(*width % fmtl->bw == 0 || 2359 right_edge_px == info->surf.logical_level0_px.width); 2360 assert(*height % fmtl->bh == 0 || 2361 bottom_edge_px == info->surf.logical_level0_px.height); 2362 #endif 2363 *width = DIV_ROUND_UP(*width, fmtl->bw); 2364 *height = DIV_ROUND_UP(*height, fmtl->bh); 2365 } 2366 2367 if (x && y) { 2368 assert(*x % fmtl->bw == 0); 2369 assert(*y % fmtl->bh == 0); 2370 *x /= fmtl->bw; 2371 *y /= fmtl->bh; 2372 } 2373 2374 info->surf.logical_level0_px.width = 2375 DIV_ROUND_UP(info->surf.logical_level0_px.width, fmtl->bw); 2376 info->surf.logical_level0_px.height = 2377 DIV_ROUND_UP(info->surf.logical_level0_px.height, fmtl->bh); 2378 2379 assert(info->surf.phys_level0_sa.width % fmtl->bw == 0); 2380 assert(info->surf.phys_level0_sa.height % fmtl->bh == 0); 2381 info->surf.phys_level0_sa.width /= fmtl->bw; 2382 info->surf.phys_level0_sa.height /= fmtl->bh; 2383 2384 assert(info->tile_x_sa % fmtl->bw == 0); 2385 assert(info->tile_y_sa % fmtl->bh == 0); 2386 info->tile_x_sa /= fmtl->bw; 2387 info->tile_y_sa /= fmtl->bh; 2388 2389 /* It's now an uncompressed surface so we need an uncompressed format */ 2390 info->surf.format = get_copy_format_for_bpb(isl_dev, fmtl->bpb); 2391 } 2392 2393 void 2394 blorp_copy(struct blorp_batch *batch, 2395 const struct blorp_surf *src_surf, 
2396 unsigned src_level, unsigned src_layer, 2397 const struct blorp_surf *dst_surf, 2398 unsigned dst_level, unsigned dst_layer, 2399 uint32_t src_x, uint32_t src_y, 2400 uint32_t dst_x, uint32_t dst_y, 2401 uint32_t src_width, uint32_t src_height) 2402 { 2403 const struct isl_device *isl_dev = batch->blorp->isl_dev; 2404 struct blorp_params params; 2405 2406 if (src_width == 0 || src_height == 0) 2407 return; 2408 2409 blorp_params_init(¶ms); 2410 brw_blorp_surface_info_init(batch->blorp, ¶ms.src, src_surf, src_level, 2411 src_layer, ISL_FORMAT_UNSUPPORTED, false); 2412 brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, dst_surf, dst_level, 2413 dst_layer, ISL_FORMAT_UNSUPPORTED, true); 2414 2415 struct brw_blorp_blit_prog_key wm_prog_key = { 2416 .shader_type = BLORP_SHADER_TYPE_BLIT 2417 }; 2418 2419 const struct isl_format_layout *src_fmtl = 2420 isl_format_get_layout(params.src.surf.format); 2421 const struct isl_format_layout *dst_fmtl = 2422 isl_format_get_layout(params.dst.surf.format); 2423 2424 assert(params.src.aux_usage == ISL_AUX_USAGE_NONE || 2425 params.src.aux_usage == ISL_AUX_USAGE_MCS || 2426 params.src.aux_usage == ISL_AUX_USAGE_CCS_E); 2427 assert(params.dst.aux_usage == ISL_AUX_USAGE_NONE || 2428 params.dst.aux_usage == ISL_AUX_USAGE_MCS || 2429 params.dst.aux_usage == ISL_AUX_USAGE_CCS_E); 2430 2431 if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E) { 2432 params.dst.view.format = get_ccs_compatible_uint_format(dst_fmtl); 2433 if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E) { 2434 params.src.view.format = get_ccs_compatible_uint_format(src_fmtl); 2435 } else if (src_fmtl->bpb == dst_fmtl->bpb) { 2436 params.src.view.format = params.dst.view.format; 2437 } else { 2438 params.src.view.format = 2439 get_copy_format_for_bpb(isl_dev, src_fmtl->bpb); 2440 } 2441 } else if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E) { 2442 params.src.view.format = get_ccs_compatible_uint_format(src_fmtl); 2443 if (src_fmtl->bpb == dst_fmtl->bpb) { 2444 
params.dst.view.format = params.src.view.format; 2445 } else { 2446 params.dst.view.format = 2447 get_copy_format_for_bpb(isl_dev, dst_fmtl->bpb); 2448 } 2449 } else { 2450 params.dst.view.format = get_copy_format_for_bpb(isl_dev, dst_fmtl->bpb); 2451 params.src.view.format = get_copy_format_for_bpb(isl_dev, src_fmtl->bpb); 2452 } 2453 2454 if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E) { 2455 /* It's safe to do a blorp_copy between things which are sRGB with CCS_E 2456 * enabled even though CCS_E doesn't technically do sRGB on SKL because 2457 * we stomp everything to UINT anyway. The one thing we have to be 2458 * careful of is clear colors. Because fast clear colors for sRGB on 2459 * gen9 are encoded as the float values between format conversion and 2460 * sRGB curve application, a given clear color float will convert to the 2461 * same bits regardless of whether the format is UNORM or sRGB. 2462 * Therefore, we can handle sRGB without any special cases. 2463 */ 2464 UNUSED enum isl_format linear_src_format = 2465 isl_format_srgb_to_linear(src_surf->surf->format); 2466 assert(isl_formats_are_ccs_e_compatible(batch->blorp->isl_dev->info, 2467 linear_src_format, 2468 params.src.view.format)); 2469 params.src.clear_color = 2470 bitcast_color_value_to_uint(params.src.clear_color, src_fmtl); 2471 } 2472 2473 if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E) { 2474 /* See above where we handle linear_src_format */ 2475 UNUSED enum isl_format linear_dst_format = 2476 isl_format_srgb_to_linear(dst_surf->surf->format); 2477 assert(isl_formats_are_ccs_e_compatible(batch->blorp->isl_dev->info, 2478 linear_dst_format, 2479 params.dst.view.format)); 2480 params.dst.clear_color = 2481 bitcast_color_value_to_uint(params.dst.clear_color, dst_fmtl); 2482 } 2483 2484 wm_prog_key.src_bpc = 2485 isl_format_get_layout(params.src.view.format)->channels.r.bits; 2486 wm_prog_key.dst_bpc = 2487 isl_format_get_layout(params.dst.view.format)->channels.r.bits; 2488 2489 if (src_fmtl->bw 
> 1 || src_fmtl->bh > 1) { 2490 blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev, ¶ms.src, 2491 &src_x, &src_y, 2492 &src_width, &src_height); 2493 wm_prog_key.need_src_offset = true; 2494 } 2495 2496 if (dst_fmtl->bw > 1 || dst_fmtl->bh > 1) { 2497 blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev, ¶ms.dst, 2498 &dst_x, &dst_y, NULL, NULL); 2499 wm_prog_key.need_dst_offset = true; 2500 } 2501 2502 /* Once both surfaces are stompped to uncompressed as needed, the 2503 * destination size is the same as the source size. 2504 */ 2505 uint32_t dst_width = src_width; 2506 uint32_t dst_height = src_height; 2507 2508 struct blt_coords coords = { 2509 .x = { 2510 .src0 = src_x, 2511 .src1 = src_x + src_width, 2512 .dst0 = dst_x, 2513 .dst1 = dst_x + dst_width, 2514 .mirror = false 2515 }, 2516 .y = { 2517 .src0 = src_y, 2518 .src1 = src_y + src_height, 2519 .dst0 = dst_y, 2520 .dst1 = dst_y + dst_height, 2521 .mirror = false 2522 } 2523 }; 2524 2525 do_blorp_blit(batch, ¶ms, &wm_prog_key, &coords); 2526 } 2527 2528 static enum isl_format 2529 isl_format_for_size(unsigned size_B) 2530 { 2531 switch (size_B) { 2532 case 1: return ISL_FORMAT_R8_UINT; 2533 case 2: return ISL_FORMAT_R8G8_UINT; 2534 case 4: return ISL_FORMAT_R8G8B8A8_UINT; 2535 case 8: return ISL_FORMAT_R16G16B16A16_UINT; 2536 case 16: return ISL_FORMAT_R32G32B32A32_UINT; 2537 default: 2538 unreachable("Not a power-of-two format size"); 2539 } 2540 } 2541 2542 /** 2543 * Returns the greatest common divisor of a and b that is a power of two. 2544 */ 2545 static uint64_t 2546 gcd_pow2_u64(uint64_t a, uint64_t b) 2547 { 2548 assert(a > 0 || b > 0); 2549 2550 unsigned a_log2 = ffsll(a) - 1; 2551 unsigned b_log2 = ffsll(b) - 1; 2552 2553 /* If either a or b is 0, then a_log2 or b_log2 till be UINT_MAX in which 2554 * case, the MIN2() will take the other one. If both are 0 then we will 2555 * hit the assert above. 
2556 */ 2557 return 1 << MIN2(a_log2, b_log2); 2558 } 2559 2560 static void 2561 do_buffer_copy(struct blorp_batch *batch, 2562 struct blorp_address *src, 2563 struct blorp_address *dst, 2564 int width, int height, int block_size) 2565 { 2566 /* The actual format we pick doesn't matter as blorp will throw it away. 2567 * The only thing that actually matters is the size. 2568 */ 2569 enum isl_format format = isl_format_for_size(block_size); 2570 2571 UNUSED bool ok; 2572 struct isl_surf surf; 2573 ok = isl_surf_init(batch->blorp->isl_dev, &surf, 2574 .dim = ISL_SURF_DIM_2D, 2575 .format = format, 2576 .width = width, 2577 .height = height, 2578 .depth = 1, 2579 .levels = 1, 2580 .array_len = 1, 2581 .samples = 1, 2582 .row_pitch = width * block_size, 2583 .usage = ISL_SURF_USAGE_TEXTURE_BIT | 2584 ISL_SURF_USAGE_RENDER_TARGET_BIT, 2585 .tiling_flags = ISL_TILING_LINEAR_BIT); 2586 assert(ok); 2587 2588 struct blorp_surf src_blorp_surf = { 2589 .surf = &surf, 2590 .addr = *src, 2591 }; 2592 2593 struct blorp_surf dst_blorp_surf = { 2594 .surf = &surf, 2595 .addr = *dst, 2596 }; 2597 2598 blorp_copy(batch, &src_blorp_surf, 0, 0, &dst_blorp_surf, 0, 0, 2599 0, 0, 0, 0, width, height); 2600 } 2601 2602 void 2603 blorp_buffer_copy(struct blorp_batch *batch, 2604 struct blorp_address src, 2605 struct blorp_address dst, 2606 uint64_t size) 2607 { 2608 const struct gen_device_info *devinfo = batch->blorp->isl_dev->info; 2609 uint64_t copy_size = size; 2610 2611 /* This is maximum possible width/height our HW can handle */ 2612 uint64_t max_surface_dim = 1 << (devinfo->gen >= 7 ? 14 : 13); 2613 2614 /* First, we compute the biggest format that can be used with the 2615 * given offsets and size. 
2616 */ 2617 int bs = 16; 2618 bs = gcd_pow2_u64(bs, src.offset); 2619 bs = gcd_pow2_u64(bs, dst.offset); 2620 bs = gcd_pow2_u64(bs, size); 2621 2622 /* First, we make a bunch of max-sized copies */ 2623 uint64_t max_copy_size = max_surface_dim * max_surface_dim * bs; 2624 while (copy_size >= max_copy_size) { 2625 do_buffer_copy(batch, &src, &dst, max_surface_dim, max_surface_dim, bs); 2626 copy_size -= max_copy_size; 2627 src.offset += max_copy_size; 2628 dst.offset += max_copy_size; 2629 } 2630 2631 /* Now make a max-width copy */ 2632 uint64_t height = copy_size / (max_surface_dim * bs); 2633 assert(height < max_surface_dim); 2634 if (height != 0) { 2635 uint64_t rect_copy_size = height * max_surface_dim * bs; 2636 do_buffer_copy(batch, &src, &dst, max_surface_dim, height, bs); 2637 copy_size -= rect_copy_size; 2638 src.offset += rect_copy_size; 2639 dst.offset += rect_copy_size; 2640 } 2641 2642 /* Finally, make a small copy to finish it off */ 2643 if (copy_size != 0) { 2644 do_buffer_copy(batch, &src, &dst, copy_size / bs, 1, bs); 2645 } 2646 } 2647