1 /* 2 * Copyright 2012 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24 #include "compiler/nir/nir_builder.h" 25 26 #include "blorp_priv.h" 27 #include "brw_meta_util.h" 28 29 /* header-only include needed for _mesa_unorm_to_float and friends. */ 30 #include "mesa/main/format_utils.h" 31 32 #define FILE_DEBUG_FLAG DEBUG_BLORP 33 34 static const bool split_blorp_blit_debug = false; 35 36 /** 37 * Enum to specify the order of arguments in a sampler message 38 */ 39 enum sampler_message_arg 40 { 41 SAMPLER_MESSAGE_ARG_U_FLOAT, 42 SAMPLER_MESSAGE_ARG_V_FLOAT, 43 SAMPLER_MESSAGE_ARG_U_INT, 44 SAMPLER_MESSAGE_ARG_V_INT, 45 SAMPLER_MESSAGE_ARG_R_INT, 46 SAMPLER_MESSAGE_ARG_SI_INT, 47 SAMPLER_MESSAGE_ARG_MCS_INT, 48 SAMPLER_MESSAGE_ARG_ZERO_INT, 49 }; 50 51 struct brw_blorp_blit_vars { 52 /* Input values from brw_blorp_wm_inputs */ 53 nir_variable *v_discard_rect; 54 nir_variable *v_rect_grid; 55 nir_variable *v_coord_transform; 56 nir_variable *v_src_z; 57 nir_variable *v_src_offset; 58 nir_variable *v_dst_offset; 59 60 /* gl_FragCoord */ 61 nir_variable *frag_coord; 62 63 /* gl_FragColor */ 64 nir_variable *color_out; 65 }; 66 67 static void 68 brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v, 69 const struct brw_blorp_blit_prog_key *key) 70 { 71 /* Blended and scaled blits never use pixel discard. */ 72 assert(!key->use_kill || !(key->blend && key->blit_scaled)); 73 74 #define LOAD_INPUT(name, type)\ 75 v->v_##name = BLORP_CREATE_NIR_INPUT(b->shader, name, type); 76 77 LOAD_INPUT(discard_rect, glsl_vec4_type()) 78 LOAD_INPUT(rect_grid, glsl_vec4_type()) 79 LOAD_INPUT(coord_transform, glsl_vec4_type()) 80 LOAD_INPUT(src_z, glsl_uint_type()) 81 LOAD_INPUT(src_offset, glsl_vector_type(GLSL_TYPE_UINT, 2)) 82 LOAD_INPUT(dst_offset, glsl_vector_type(GLSL_TYPE_UINT, 2)) 83 84 #undef LOAD_INPUT 85 86 v->frag_coord = nir_variable_create(b->shader, nir_var_shader_in, 87 glsl_vec4_type(), "gl_FragCoord"); 88 v->frag_coord->data.location = VARYING_SLOT_POS; 89 v->frag_coord->data.origin_upper_left = true; 90 91 v->color_out = nir_variable_create(b->shader, nir_var_shader_out, 92 glsl_vec4_type(), "gl_FragColor"); 93 v->color_out->data.location = FRAG_RESULT_COLOR; 94 } 95 96 static nir_ssa_def * 97 blorp_blit_get_frag_coords(nir_builder *b, 98 const struct brw_blorp_blit_prog_key *key, 99 struct brw_blorp_blit_vars *v) 100 { 101 nir_ssa_def *coord = nir_f2i(b, nir_load_var(b, v->frag_coord)); 102 103 /* Account for destination surface intratile offset 104 * 105 * Transformation parameters giving translation from destination to source 106 * coordinates don't take into account possible intra-tile destination 107 * offset. Therefore it has to be first subtracted from the incoming 108 * coordinates. Vertices are set up based on coordinates containing the 109 * intra-tile offset. 110 */ 111 if (key->need_dst_offset) 112 coord = nir_isub(b, coord, nir_load_var(b, v->v_dst_offset)); 113 114 if (key->persample_msaa_dispatch) { 115 return nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1), 116 nir_load_sample_id(b)); 117 } else { 118 return nir_vec2(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1)); 119 } 120 } 121 122 /** 123 * Emit code to translate from destination (X, Y) coordinates to source (X, Y) 124 * coordinates. 125 */ 126 static nir_ssa_def * 127 blorp_blit_apply_transform(nir_builder *b, nir_ssa_def *src_pos, 128 struct brw_blorp_blit_vars *v) 129 { 130 nir_ssa_def *coord_transform = nir_load_var(b, v->v_coord_transform); 131 132 nir_ssa_def *offset = nir_vec2(b, nir_channel(b, coord_transform, 1), 133 nir_channel(b, coord_transform, 3)); 134 nir_ssa_def *mul = nir_vec2(b, nir_channel(b, coord_transform, 0), 135 nir_channel(b, coord_transform, 2)); 136 137 return nir_ffma(b, src_pos, mul, offset); 138 } 139 140 static inline void 141 blorp_nir_discard_if_outside_rect(nir_builder *b, nir_ssa_def *pos, 142 struct brw_blorp_blit_vars *v) 143 { 144 nir_ssa_def *c0, *c1, *c2, *c3; 145 nir_ssa_def *discard_rect = nir_load_var(b, v->v_discard_rect); 146 nir_ssa_def *dst_x0 = nir_channel(b, discard_rect, 0); 147 nir_ssa_def *dst_x1 = nir_channel(b, discard_rect, 1); 148 nir_ssa_def *dst_y0 = nir_channel(b, discard_rect, 2); 149 nir_ssa_def *dst_y1 = nir_channel(b, discard_rect, 3); 150 151 c0 = nir_ult(b, nir_channel(b, pos, 0), dst_x0); 152 c1 = nir_uge(b, nir_channel(b, pos, 0), dst_x1); 153 c2 = nir_ult(b, nir_channel(b, pos, 1), dst_y0); 154 c3 = nir_uge(b, nir_channel(b, pos, 1), dst_y1); 155 156 nir_ssa_def *oob = nir_ior(b, nir_ior(b, c0, c1), nir_ior(b, c2, c3)); 157 158 nir_intrinsic_instr *discard = 159 nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if); 160 discard->src[0] = nir_src_for_ssa(oob); 161 nir_builder_instr_insert(b, &discard->instr); 162 } 163 164 static nir_tex_instr * 165 blorp_create_nir_tex_instr(nir_builder *b, struct brw_blorp_blit_vars *v, 166 nir_texop op, nir_ssa_def *pos, unsigned num_srcs, 167 nir_alu_type dst_type) 168 { 169 nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs); 170 171 tex->op = op; 172 173 tex->dest_type = dst_type; 174 tex->is_array = false; 175 tex->is_shadow = false; 176 177 /* Blorp only has one texture and it's bound at unit 0 */ 178 tex->texture = NULL; 179 tex->sampler = NULL; 180 tex->texture_index = 0; 181 tex->sampler_index = 0; 182 183 /* To properly handle 3-D and 2-D array textures, we pull the Z component 184 * from an input. TODO: This is a bit magic; we should probably make this 185 * more explicit in the future. 186 */ 187 assert(pos->num_components >= 2); 188 pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1), 189 nir_load_var(b, v->v_src_z)); 190 191 tex->src[0].src_type = nir_tex_src_coord; 192 tex->src[0].src = nir_src_for_ssa(pos); 193 tex->coord_components = 3; 194 195 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); 196 197 return tex; 198 } 199 200 static nir_ssa_def * 201 blorp_nir_tex(nir_builder *b, struct brw_blorp_blit_vars *v, 202 nir_ssa_def *pos, nir_alu_type dst_type) 203 { 204 nir_tex_instr *tex = 205 blorp_create_nir_tex_instr(b, v, nir_texop_tex, pos, 2, dst_type); 206 207 assert(pos->num_components == 2); 208 tex->sampler_dim = GLSL_SAMPLER_DIM_2D; 209 tex->src[1].src_type = nir_tex_src_lod; 210 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); 211 212 nir_builder_instr_insert(b, &tex->instr); 213 214 return &tex->dest.ssa; 215 } 216 217 static nir_ssa_def * 218 blorp_nir_txf(nir_builder *b, struct brw_blorp_blit_vars *v, 219 nir_ssa_def *pos, nir_alu_type dst_type) 220 { 221 nir_tex_instr *tex = 222 blorp_create_nir_tex_instr(b, v, nir_texop_txf, pos, 2, dst_type); 223 224 tex->sampler_dim = GLSL_SAMPLER_DIM_3D; 225 tex->src[1].src_type = nir_tex_src_lod; 226 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); 227 228 nir_builder_instr_insert(b, &tex->instr); 229 230 return &tex->dest.ssa; 231 } 232 233 static nir_ssa_def * 234 blorp_nir_txf_ms(nir_builder *b, struct brw_blorp_blit_vars *v, 235 nir_ssa_def *pos, nir_ssa_def *mcs, nir_alu_type dst_type) 236 { 237 nir_tex_instr *tex = 238 blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms, pos, 239 mcs != NULL ? 3 : 2, dst_type); 240 241 tex->sampler_dim = GLSL_SAMPLER_DIM_MS; 242 243 tex->src[1].src_type = nir_tex_src_ms_index; 244 if (pos->num_components == 2) { 245 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); 246 } else { 247 assert(pos->num_components == 3); 248 tex->src[1].src = nir_src_for_ssa(nir_channel(b, pos, 2)); 249 } 250 251 if (mcs) { 252 tex->src[2].src_type = nir_tex_src_ms_mcs; 253 tex->src[2].src = nir_src_for_ssa(mcs); 254 } 255 256 nir_builder_instr_insert(b, &tex->instr); 257 258 return &tex->dest.ssa; 259 } 260 261 static nir_ssa_def * 262 blorp_nir_txf_ms_mcs(nir_builder *b, struct brw_blorp_blit_vars *v, nir_ssa_def *pos) 263 { 264 nir_tex_instr *tex = 265 blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms_mcs, 266 pos, 1, nir_type_int); 267 268 tex->sampler_dim = GLSL_SAMPLER_DIM_MS; 269 270 nir_builder_instr_insert(b, &tex->instr); 271 272 return &tex->dest.ssa; 273 } 274 275 static nir_ssa_def * 276 nir_mask_shift_or(struct nir_builder *b, nir_ssa_def *dst, nir_ssa_def *src, 277 uint32_t src_mask, int src_left_shift) 278 { 279 nir_ssa_def *masked = nir_iand(b, src, nir_imm_int(b, src_mask)); 280 281 nir_ssa_def *shifted; 282 if (src_left_shift > 0) { 283 shifted = nir_ishl(b, masked, nir_imm_int(b, src_left_shift)); 284 } else if (src_left_shift < 0) { 285 shifted = nir_ushr(b, masked, nir_imm_int(b, -src_left_shift)); 286 } else { 287 assert(src_left_shift == 0); 288 shifted = masked; 289 } 290 291 return nir_ior(b, dst, shifted); 292 } 293 294 /** 295 * Emit code to compensate for the difference between Y and W tiling. 296 * 297 * This code modifies the X and Y coordinates according to the formula: 298 * 299 * (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S)) 300 * 301 * (See brw_blorp_build_nir_shader). 302 */ 303 static inline nir_ssa_def * 304 blorp_nir_retile_y_to_w(nir_builder *b, nir_ssa_def *pos) 305 { 306 assert(pos->num_components == 2); 307 nir_ssa_def *x_Y = nir_channel(b, pos, 0); 308 nir_ssa_def *y_Y = nir_channel(b, pos, 1); 309 310 /* Given X and Y coordinates that describe an address using Y tiling, 311 * translate to the X and Y coordinates that describe the same address 312 * using W tiling. 313 * 314 * If we break down the low order bits of X and Y, using a 315 * single letter to represent each low-order bit: 316 * 317 * X = A << 7 | 0bBCDEFGH 318 * Y = J << 5 | 0bKLMNP (1) 319 * 320 * Then we can apply the Y tiling formula to see the memory offset being 321 * addressed: 322 * 323 * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2) 324 * 325 * If we apply the W detiling formula to this memory location, that the 326 * corresponding X' and Y' coordinates are: 327 * 328 * X' = A << 6 | 0bBCDPFH (3) 329 * Y' = J << 6 | 0bKLMNEG 330 * 331 * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'), 332 * we need to make the following computation: 333 * 334 * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4) 335 * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1 336 */ 337 nir_ssa_def *x_W = nir_imm_int(b, 0); 338 x_W = nir_mask_shift_or(b, x_W, x_Y, 0xfffffff4, -1); 339 x_W = nir_mask_shift_or(b, x_W, y_Y, 0x1, 2); 340 x_W = nir_mask_shift_or(b, x_W, x_Y, 0x1, 0); 341 342 nir_ssa_def *y_W = nir_imm_int(b, 0); 343 y_W = nir_mask_shift_or(b, y_W, y_Y, 0xfffffffe, 1); 344 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x8, -2); 345 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x2, -1); 346 347 return nir_vec2(b, x_W, y_W); 348 } 349 350 /** 351 * Emit code to compensate for the difference between Y and W tiling. 352 * 353 * This code modifies the X and Y coordinates according to the formula: 354 * 355 * (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S)) 356 * 357 * (See brw_blorp_build_nir_shader). 358 */ 359 static inline nir_ssa_def * 360 blorp_nir_retile_w_to_y(nir_builder *b, nir_ssa_def *pos) 361 { 362 assert(pos->num_components == 2); 363 nir_ssa_def *x_W = nir_channel(b, pos, 0); 364 nir_ssa_def *y_W = nir_channel(b, pos, 1); 365 366 /* Applying the same logic as above, but in reverse, we obtain the 367 * formulas: 368 * 369 * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1 370 * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2 371 */ 372 nir_ssa_def *x_Y = nir_imm_int(b, 0); 373 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0xfffffffa, 1); 374 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x2, 2); 375 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x1, 1); 376 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0x1, 0); 377 378 nir_ssa_def *y_Y = nir_imm_int(b, 0); 379 y_Y = nir_mask_shift_or(b, y_Y, y_W, 0xfffffffc, -1); 380 y_Y = nir_mask_shift_or(b, y_Y, x_W, 0x4, -2); 381 382 return nir_vec2(b, x_Y, y_Y); 383 } 384 385 /** 386 * Emit code to compensate for the difference between MSAA and non-MSAA 387 * surfaces. 388 * 389 * This code modifies the X and Y coordinates according to the formula: 390 * 391 * (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S) 392 * 393 * (See brw_blorp_blit_program). 394 */ 395 static inline nir_ssa_def * 396 blorp_nir_encode_msaa(nir_builder *b, nir_ssa_def *pos, 397 unsigned num_samples, enum isl_msaa_layout layout) 398 { 399 assert(pos->num_components == 2 || pos->num_components == 3); 400 401 switch (layout) { 402 case ISL_MSAA_LAYOUT_NONE: 403 assert(pos->num_components == 2); 404 return pos; 405 case ISL_MSAA_LAYOUT_ARRAY: 406 /* No translation needed */ 407 return pos; 408 case ISL_MSAA_LAYOUT_INTERLEAVED: { 409 nir_ssa_def *x_in = nir_channel(b, pos, 0); 410 nir_ssa_def *y_in = nir_channel(b, pos, 1); 411 nir_ssa_def *s_in = pos->num_components == 2 ? nir_imm_int(b, 0) : 412 nir_channel(b, pos, 2); 413 414 nir_ssa_def *x_out = nir_imm_int(b, 0); 415 nir_ssa_def *y_out = nir_imm_int(b, 0); 416 switch (num_samples) { 417 case 2: 418 case 4: 419 /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0) 420 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) 421 * Y' = Y 422 * 423 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0) 424 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) 425 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1) 426 */ 427 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 1); 428 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); 429 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 430 if (num_samples == 2) { 431 y_out = y_in; 432 } else { 433 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1); 434 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); 435 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 436 } 437 break; 438 439 case 8: 440 /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0) 441 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 442 * | (X & 0b1) 443 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1) 444 */ 445 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2); 446 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0); 447 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); 448 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 449 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1); 450 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); 451 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 452 break; 453 454 case 16: 455 /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0) 456 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 457 * | (X & 0b1) 458 * Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 (S & 0b10) 459 * | (Y & 0b1) 460 */ 461 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2); 462 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0); 463 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); 464 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 465 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 2); 466 y_out = nir_mask_shift_or(b, y_out, s_in, 0x8, -1); 467 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); 468 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 469 break; 470 471 default: 472 unreachable("Invalid number of samples for IMS layout"); 473 } 474 475 return nir_vec2(b, x_out, y_out); 476 } 477 478 default: 479 unreachable("Invalid MSAA layout"); 480 } 481 } 482 483 /** 484 * Emit code to compensate for the difference between MSAA and non-MSAA 485 * surfaces. 486 * 487 * This code modifies the X and Y coordinates according to the formula: 488 * 489 * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S) 490 * 491 * (See brw_blorp_blit_program). 492 */ 493 static inline nir_ssa_def * 494 blorp_nir_decode_msaa(nir_builder *b, nir_ssa_def *pos, 495 unsigned num_samples, enum isl_msaa_layout layout) 496 { 497 assert(pos->num_components == 2 || pos->num_components == 3); 498 499 switch (layout) { 500 case ISL_MSAA_LAYOUT_NONE: 501 /* No translation necessary, and S should already be zero. */ 502 assert(pos->num_components == 2); 503 return pos; 504 case ISL_MSAA_LAYOUT_ARRAY: 505 /* No translation necessary. */ 506 return pos; 507 case ISL_MSAA_LAYOUT_INTERLEAVED: { 508 assert(pos->num_components == 2); 509 510 nir_ssa_def *x_in = nir_channel(b, pos, 0); 511 nir_ssa_def *y_in = nir_channel(b, pos, 1); 512 513 nir_ssa_def *x_out = nir_imm_int(b, 0); 514 nir_ssa_def *y_out = nir_imm_int(b, 0); 515 nir_ssa_def *s_out = nir_imm_int(b, 0); 516 switch (num_samples) { 517 case 2: 518 case 4: 519 /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S) 520 * where X' = (X & ~0b11) >> 1 | (X & 0b1) 521 * S = (X & 0b10) >> 1 522 * 523 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S) 524 * where X' = (X & ~0b11) >> 1 | (X & 0b1) 525 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) 526 * S = (Y & 0b10) | (X & 0b10) >> 1 527 */ 528 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffc, -1); 529 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 530 if (num_samples == 2) { 531 y_out = y_in; 532 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 533 } else { 534 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1); 535 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 536 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 537 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); 538 } 539 break; 540 541 case 8: 542 /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S) 543 * where X' = (X & ~0b111) >> 2 | (X & 0b1) 544 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) 545 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1 546 */ 547 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2); 548 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 549 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1); 550 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 551 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0); 552 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); 553 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 554 break; 555 556 case 16: 557 /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S) 558 * where X' = (X & ~0b111) >> 2 | (X & 0b1) 559 * Y' = (Y & ~0b111) >> 2 | (Y & 0b1) 560 * S = (Y & 0b100) << 1 | (X & 0b100) | 561 * (Y & 0b10) | (X & 0b10) >> 1 562 */ 563 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2); 564 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 565 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffff8, -2); 566 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 567 s_out = nir_mask_shift_or(b, s_out, y_in, 0x4, 1); 568 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0); 569 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); 570 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 571 break; 572 573 default: 574 unreachable("Invalid number of samples for IMS layout"); 575 } 576 577 return nir_vec3(b, x_out, y_out, s_out); 578 } 579 580 default: 581 unreachable("Invalid MSAA layout"); 582 } 583 } 584 585 /** 586 * Count the number of trailing 1 bits in the given value. For example: 587 * 588 * count_trailing_one_bits(0) == 0 589 * count_trailing_one_bits(7) == 3 590 * count_trailing_one_bits(11) == 2 591 */ 592 static inline int count_trailing_one_bits(unsigned value) 593 { 594 #ifdef HAVE___BUILTIN_CTZ 595 return __builtin_ctz(~value); 596 #else 597 return _mesa_bitcount(value & ~(value + 1)); 598 #endif 599 } 600 601 static nir_ssa_def * 602 blorp_nir_manual_blend_average(nir_builder *b, struct brw_blorp_blit_vars *v, 603 nir_ssa_def *pos, unsigned tex_samples, 604 enum isl_aux_usage tex_aux_usage, 605 nir_alu_type dst_type) 606 { 607 /* If non-null, this is the outer-most if statement */ 608 nir_if *outer_if = NULL; 609 610 nir_variable *color = 611 nir_local_variable_create(b->impl, glsl_vec4_type(), "color"); 612 613 nir_ssa_def *mcs = NULL; 614 if (tex_aux_usage == ISL_AUX_USAGE_MCS) 615 mcs = blorp_nir_txf_ms_mcs(b, v, pos); 616 617 /* We add together samples using a binary tree structure, e.g. for 4x MSAA: 618 * 619 * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4 620 * 621 * This ensures that when all samples have the same value, no numerical 622 * precision is lost, since each addition operation always adds two equal 623 * values, and summing two equal floating point values does not lose 624 * precision. 625 * 626 * We perform this computation by treating the texture_data array as a 627 * stack and performing the following operations: 628 * 629 * - push sample 0 onto stack 630 * - push sample 1 onto stack 631 * - add top two stack entries 632 * - push sample 2 onto stack 633 * - push sample 3 onto stack 634 * - add top two stack entries 635 * - add top two stack entries 636 * - divide top stack entry by 4 637 * 638 * Note that after pushing sample i onto the stack, the number of add 639 * operations we do is equal to the number of trailing 1 bits in i. This 640 * works provided the total number of samples is a power of two, which it 641 * always is for i965. 642 * 643 * For integer formats, we replace the add operations with average 644 * operations and skip the final division. 645 */ 646 nir_ssa_def *texture_data[5]; 647 unsigned stack_depth = 0; 648 for (unsigned i = 0; i < tex_samples; ++i) { 649 assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */ 650 651 /* Push sample i onto the stack */ 652 assert(stack_depth < ARRAY_SIZE(texture_data)); 653 654 nir_ssa_def *ms_pos = nir_vec3(b, nir_channel(b, pos, 0), 655 nir_channel(b, pos, 1), 656 nir_imm_int(b, i)); 657 texture_data[stack_depth++] = blorp_nir_txf_ms(b, v, ms_pos, mcs, dst_type); 658 659 if (i == 0 && tex_aux_usage == ISL_AUX_USAGE_MCS) { 660 /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface) 661 * suggests an optimization: 662 * 663 * "A simple optimization with probable large return in 664 * performance is to compare the MCS value to zero (indicating 665 * all samples are on sample slice 0), and sample only from 666 * sample slice 0 using ld2dss if MCS is zero." 667 * 668 * Note that in the case where the MCS value is zero, sampling from 669 * sample slice 0 using ld2dss and sampling from sample 0 using 670 * ld2dms are equivalent (since all samples are on sample slice 0). 671 * Since we have already sampled from sample 0, all we need to do is 672 * skip the remaining fetches and averaging if MCS is zero. 673 */ 674 nir_ssa_def *mcs_zero = 675 nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0)); 676 if (tex_samples == 16) { 677 mcs_zero = nir_iand(b, mcs_zero, 678 nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b, 0))); 679 } 680 681 nir_if *if_stmt = nir_if_create(b->shader); 682 if_stmt->condition = nir_src_for_ssa(mcs_zero); 683 nir_cf_node_insert(b->cursor, &if_stmt->cf_node); 684 685 b->cursor = nir_after_cf_list(&if_stmt->then_list); 686 nir_store_var(b, color, texture_data[0], 0xf); 687 688 b->cursor = nir_after_cf_list(&if_stmt->else_list); 689 outer_if = if_stmt; 690 } 691 692 for (int j = 0; j < count_trailing_one_bits(i); j++) { 693 assert(stack_depth >= 2); 694 --stack_depth; 695 696 assert(dst_type == nir_type_float); 697 texture_data[stack_depth - 1] = 698 nir_fadd(b, texture_data[stack_depth - 1], 699 texture_data[stack_depth]); 700 } 701 } 702 703 /* We should have just 1 sample on the stack now. */ 704 assert(stack_depth == 1); 705 706 texture_data[0] = nir_fmul(b, texture_data[0], 707 nir_imm_float(b, 1.0 / tex_samples)); 708 709 nir_store_var(b, color, texture_data[0], 0xf); 710 711 if (outer_if) 712 b->cursor = nir_after_cf_node(&outer_if->cf_node); 713 714 return nir_load_var(b, color); 715 } 716 717 static inline nir_ssa_def * 718 nir_imm_vec2(nir_builder *build, float x, float y) 719 { 720 nir_const_value v; 721 722 memset(&v, 0, sizeof(v)); 723 v.f32[0] = x; 724 v.f32[1] = y; 725 726 return nir_build_imm(build, 4, 32, v); 727 } 728 729 static nir_ssa_def * 730 blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos, 731 unsigned tex_samples, 732 const struct brw_blorp_blit_prog_key *key, 733 struct brw_blorp_blit_vars *v) 734 { 735 nir_ssa_def *pos_xy = nir_channels(b, pos, 0x3); 736 nir_ssa_def *rect_grid = nir_load_var(b, v->v_rect_grid); 737 nir_ssa_def *scale = nir_imm_vec2(b, key->x_scale, key->y_scale); 738 739 /* Translate coordinates to lay out the samples in a rectangular grid 740 * roughly corresponding to sample locations. 741 */ 742 pos_xy = nir_fmul(b, pos_xy, scale); 743 /* Adjust coordinates so that integers represent pixel centers rather 744 * than pixel edges. 745 */ 746 pos_xy = nir_fadd(b, pos_xy, nir_imm_float(b, -0.5)); 747 /* Clamp the X, Y texture coordinates to properly handle the sampling of 748 * texels on texture edges. 749 */ 750 pos_xy = nir_fmin(b, nir_fmax(b, pos_xy, nir_imm_float(b, 0.0)), 751 nir_vec2(b, nir_channel(b, rect_grid, 0), 752 nir_channel(b, rect_grid, 1))); 753 754 /* Store the fractional parts to be used as bilinear interpolation 755 * coefficients. 756 */ 757 nir_ssa_def *frac_xy = nir_ffract(b, pos_xy); 758 /* Round the float coordinates down to nearest integer */ 759 pos_xy = nir_fdiv(b, nir_ftrunc(b, pos_xy), scale); 760 761 nir_ssa_def *tex_data[4]; 762 for (unsigned i = 0; i < 4; ++i) { 763 float sample_off_x = (float)(i & 0x1) / key->x_scale; 764 float sample_off_y = (float)((i >> 1) & 0x1) / key->y_scale; 765 nir_ssa_def *sample_off = nir_imm_vec2(b, sample_off_x, sample_off_y); 766 767 nir_ssa_def *sample_coords = nir_fadd(b, pos_xy, sample_off); 768 nir_ssa_def *sample_coords_int = nir_f2i(b, sample_coords); 769 770 /* The MCS value we fetch has to match up with the pixel that we're 771 * sampling from. Since we sample from different pixels in each 772 * iteration of this "for" loop, the call to mcs_fetch() should be 773 * here inside the loop after computing the pixel coordinates. 774 */ 775 nir_ssa_def *mcs = NULL; 776 if (key->tex_aux_usage == ISL_AUX_USAGE_MCS) 777 mcs = blorp_nir_txf_ms_mcs(b, v, sample_coords_int); 778 779 /* Compute sample index and map the sample index to a sample number. 780 * Sample index layout shows the numbering of slots in a rectangular 781 * grid of samples with in a pixel. Sample number layout shows the 782 * rectangular grid of samples roughly corresponding to the real sample 783 * locations with in a pixel. 784 * In case of 4x MSAA, layout of sample indices matches the layout of 785 * sample numbers: 786 * --------- 787 * | 0 | 1 | 788 * --------- 789 * | 2 | 3 | 790 * --------- 791 * 792 * In case of 8x MSAA the two layouts don't match. 793 * sample index layout : --------- sample number layout : --------- 794 * | 0 | 1 | | 3 | 7 | 795 * --------- --------- 796 * | 2 | 3 | | 5 | 0 | 797 * --------- --------- 798 * | 4 | 5 | | 1 | 2 | 799 * --------- --------- 800 * | 6 | 7 | | 4 | 6 | 801 * --------- --------- 802 * 803 * Fortunately, this can be done fairly easily as: 804 * S' = (0x17306425 >> (S * 4)) & 0xf 805 * 806 * In the case of 16x MSAA the two layouts don't match. 807 * Sample index layout: Sample number layout: 808 * --------------------- --------------------- 809 * | 0 | 1 | 2 | 3 | | 15 | 10 | 9 | 7 | 810 * --------------------- --------------------- 811 * | 4 | 5 | 6 | 7 | | 4 | 1 | 3 | 13 | 812 * --------------------- --------------------- 813 * | 8 | 9 | 10 | 11 | | 12 | 2 | 0 | 6 | 814 * --------------------- --------------------- 815 * | 12 | 13 | 14 | 15 | | 11 | 8 | 5 | 14 | 816 * --------------------- --------------------- 817 * 818 * This is equivalent to 819 * S' = (0xe58b602cd31479af >> (S * 4)) & 0xf 820 */ 821 nir_ssa_def *frac = nir_ffract(b, sample_coords); 822 nir_ssa_def *sample = 823 nir_fdot2(b, frac, nir_imm_vec2(b, key->x_scale, 824 key->x_scale * key->y_scale)); 825 sample = nir_f2i(b, sample); 826 827 if (tex_samples == 8) { 828 sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x64210573), 829 nir_ishl(b, sample, nir_imm_int(b, 2))), 830 nir_imm_int(b, 0xf)); 831 } else if (tex_samples == 16) { 832 nir_ssa_def *sample_low = 833 nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xd31479af), 834 nir_ishl(b, sample, nir_imm_int(b, 2))), 835 nir_imm_int(b, 0xf)); 836 nir_ssa_def *sample_high = 837 nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xe58b602c), 838 nir_ishl(b, nir_iadd(b, sample, 839 nir_imm_int(b, -8)), 840 nir_imm_int(b, 2))), 841 nir_imm_int(b, 0xf)); 842 843 sample = nir_bcsel(b, nir_ilt(b, sample, nir_imm_int(b, 8)), 844 sample_low, sample_high); 845 } 846 nir_ssa_def *pos_ms = nir_vec3(b, nir_channel(b, sample_coords_int, 0), 847 nir_channel(b, sample_coords_int, 1), 848 sample); 849 tex_data[i] = blorp_nir_txf_ms(b, v, pos_ms, mcs, key->texture_data_type); 850 } 851 852 nir_ssa_def *frac_x = nir_channel(b, frac_xy, 0); 853 nir_ssa_def *frac_y = nir_channel(b, frac_xy, 1); 854 return nir_flrp(b, nir_flrp(b, tex_data[0], tex_data[1], frac_x), 855 nir_flrp(b, tex_data[2], tex_data[3], frac_x), 856 frac_y); 857 } 858 859 /** Perform a color bit-cast operation 860 * 861 * For copy operations involving CCS, we may need to use different formats for 862 * the source and destination surfaces. The two formats must both be UINT 863 * formats and must have the same size but may have different bit layouts. 864 * For instance, we may be copying from R8G8B8A8_UINT to R32_UINT or R32_UINT 865 * to R16G16_UINT. This function generates code to shuffle bits around to get 866 * us from one to the other. 867 */ 868 static nir_ssa_def * 869 bit_cast_color(struct nir_builder *b, nir_ssa_def *color, 870 const struct brw_blorp_blit_prog_key *key) 871 { 872 assert(key->texture_data_type == nir_type_uint); 873 874 if (key->dst_bpc > key->src_bpc) { 875 nir_ssa_def *u = nir_ssa_undef(b, 1, 32); 876 nir_ssa_def *dst_chan[2] = { u, u }; 877 unsigned shift = 0; 878 unsigned dst_idx = 0; 879 for (unsigned i = 0; i < 4; i++) { 880 nir_ssa_def *shifted = nir_ishl(b, nir_channel(b, color, i), 881 nir_imm_int(b, shift)); 882 if (shift == 0) { 883 dst_chan[dst_idx] = shifted; 884 } else { 885 dst_chan[dst_idx] = nir_ior(b, dst_chan[dst_idx], shifted); 886 } 887 888 shift += key->src_bpc; 889 if (shift >= key->dst_bpc) { 890 dst_idx++; 891 shift = 0; 892 } 893 } 894 895 return nir_vec4(b, dst_chan[0], dst_chan[1], u, u); 896 } else { 897 assert(key->dst_bpc < key->src_bpc); 898 899 nir_ssa_def *mask = nir_imm_int(b, ~0u >> (32 - key->dst_bpc)); 900 901 nir_ssa_def *dst_chan[4]; 902 unsigned src_idx = 0; 903 unsigned shift = 0; 904 for (unsigned i = 0; i < 4; i++) { 905 dst_chan[i] = nir_iand(b, nir_ushr(b, nir_channel(b, color, src_idx), 906 nir_imm_int(b, shift)), 907 mask); 908 shift += key->dst_bpc; 909 if (shift >= key->src_bpc) { 910 src_idx++; 911 shift = 0; 912 } 913 } 914 915 return nir_vec4(b, dst_chan[0], dst_chan[1], dst_chan[2], dst_chan[3]); 916 } 917 } 918 919 /** 920 * Generator for WM programs used in BLORP blits. 921 * 922 * The bulk of the work done by the WM program is to wrap and unwrap the 923 * coordinate transformations used by the hardware to store surfaces in 924 * memory. The hardware transforms a pixel location (X, Y, S) (where S is the 925 * sample index for a multisampled surface) to a memory offset by the 926 * following formulas: 927 * 928 * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S)) 929 * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset)) 930 * 931 * For a single-sampled surface, or for a multisampled surface using 932 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity 933 * function: 934 * 935 * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0) 936 * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0) 937 * encode_msaa(n, UMS, X, Y, S) = (X, Y, S) 938 * decode_msaa(n, UMS, X, Y, S) = (X, Y, S) 939 * 940 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa() 941 * embeds the sample number into bit 1 of the X and Y coordinates: 942 * 943 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0) 944 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) 945 * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1) 946 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S) 947 * where X' = (X & ~0b11) >> 1 | (X & 0b1) 948 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) 949 * S = (Y & 0b10) | (X & 0b10) >> 1 950 * 951 * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa() 952 * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of 953 * the Y coordinate: 954 * 955 * encode_msaa(8, IMS, X, Y, S) = (X', Y', 0) 956 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1) 957 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1) 958 * decode_msaa(8, IMS, X, Y, 0) = (X', Y', S) 959 * where X' = (X & ~0b111) >> 2 | (X & 0b1) 960 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) 961 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1 962 * 963 * For X tiling, tile() combines together the low-order bits of the X and Y 964 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512 965 * bytes wide and 8 rows high: 966 * 967 * tile(x_tiled, X, Y, S) = A 968 * where A = tile_num << 12 | offset 969 * tile_num = (Y' >> 3) * tile_pitch + (X' >> 9) 970 * offset = (Y' & 0b111) << 9 971 * | (X & 0b111111111) 972 * X' = X * cpp 973 * Y' = Y + S * qpitch 974 * detile(x_tiled, A) = (X, Y, S) 975 * where X = X' / cpp 976 * Y = Y' % qpitch 977 * S = Y' / qpitch 978 * Y' = (tile_num / tile_pitch) << 3 979 * | (A & 0b111000000000) >> 9 980 * X' = (tile_num % tile_pitch) << 9 981 * | (A & 0b111111111) 982 * 983 * (In all tiling formulas, cpp is the number of bytes occupied by a single 984 * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required 985 * to fill the width of the surface, and qpitch is the spacing (in rows) 986 * between array slices). 987 * 988 * For Y tiling, tile() combines together the low-order bits of the X and Y 989 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128 990 * bytes wide and 32 rows high: 991 * 992 * tile(y_tiled, X, Y, S) = A 993 * where A = tile_num << 12 | offset 994 * tile_num = (Y' >> 5) * tile_pitch + (X' >> 7) 995 * offset = (X' & 0b1110000) << 5 996 * | (Y' & 0b11111) << 4 997 * | (X' & 0b1111) 998 * X' = X * cpp 999 * Y' = Y + S * qpitch 1000 * detile(y_tiled, A) = (X, Y, S) 1001 * where X = X' / cpp 1002 * Y = Y' % qpitch 1003 * S = Y' / qpitch 1004 * Y' = (tile_num / tile_pitch) << 5 1005 * | (A & 0b111110000) >> 4 1006 * X' = (tile_num % tile_pitch) << 7 1007 * | (A & 0b111000000000) >> 5 1008 * | (A & 0b1111) 1009 * 1010 * For W tiling, tile() combines together the low-order bits of the X and Y 1011 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64 1012 * bytes wide and 64 rows high (note that W tiling is only used for stencil 1013 * buffers, which always have cpp = 1 and S=0): 1014 * 1015 * tile(w_tiled, X, Y, S) = A 1016 * where A = tile_num << 12 | offset 1017 * tile_num = (Y' >> 6) * tile_pitch + (X' >> 6) 1018 * offset = (X' & 0b111000) << 6 1019 * | (Y' & 0b111100) << 3 1020 * | (X' & 0b100) << 2 1021 * | (Y' & 0b10) << 2 1022 * | (X' & 0b10) << 1 1023 * | (Y' & 0b1) << 1 1024 * | (X' & 0b1) 1025 * X' = X * cpp = X 1026 * Y' = Y + S * qpitch 1027 * detile(w_tiled, A) = (X, Y, S) 1028 * where X = X' / cpp = X' 1029 * Y = Y' % qpitch = Y' 1030 * S = Y / qpitch = 0 1031 * Y' = (tile_num / tile_pitch) << 6 1032 * | (A & 0b111100000) >> 3 1033 * | (A & 0b1000) >> 2 1034 * | (A & 0b10) >> 1 1035 * X' = (tile_num % tile_pitch) << 6 1036 * | (A & 0b111000000000) >> 6 1037 * | (A & 0b10000) >> 2 1038 * | (A & 0b100) >> 1 1039 * | (A & 0b1) 1040 * 1041 * Finally, for a non-tiled surface, tile() simply combines together the X and 1042 * Y coordinates in the natural way: 1043 * 1044 * tile(untiled, X, Y, S) = A 1045 * where A = Y * pitch + X' 1046 * X' = X * cpp 1047 * Y' = Y + S * qpitch 1048 * detile(untiled, A) = (X, Y, S) 1049 * where X = X' / cpp 1050 * Y = Y' % qpitch 1051 * S = Y' / qpitch 1052 * X' = A % pitch 1053 * Y' = A / pitch 1054 * 1055 * (In these formulas, pitch is the number of bytes occupied by a single row 1056 * of samples). 1057 */ 1058 static nir_shader * 1059 brw_blorp_build_nir_shader(struct blorp_context *blorp, void *mem_ctx, 1060 const struct brw_blorp_blit_prog_key *key) 1061 { 1062 const struct gen_device_info *devinfo = blorp->isl_dev->info; 1063 nir_ssa_def *src_pos, *dst_pos, *color; 1064 1065 /* Sanity checks */ 1066 if (key->dst_tiled_w && key->rt_samples > 1) { 1067 /* If the destination image is W tiled and multisampled, then the thread 1068 * must be dispatched once per sample, not once per pixel. This is 1069 * necessary because after conversion between W and Y tiling, there's no 1070 * guarantee that all samples corresponding to a single pixel will still 1071 * be together. 1072 */ 1073 assert(key->persample_msaa_dispatch); 1074 } 1075 1076 if (key->blend) { 1077 /* We are blending, which means we won't have an opportunity to 1078 * translate the tiling and sample count for the texture surface. So 1079 * the surface state for the texture must be configured with the correct 1080 * tiling and sample count. 1081 */ 1082 assert(!key->src_tiled_w); 1083 assert(key->tex_samples == key->src_samples); 1084 assert(key->tex_layout == key->src_layout); 1085 assert(key->tex_samples > 0); 1086 } 1087 1088 if (key->persample_msaa_dispatch) { 1089 /* It only makes sense to do persample dispatch if the render target is 1090 * configured as multisampled. 1091 */ 1092 assert(key->rt_samples > 0); 1093 } 1094 1095 /* Make sure layout is consistent with sample count */ 1096 assert((key->tex_layout == ISL_MSAA_LAYOUT_NONE) == 1097 (key->tex_samples <= 1)); 1098 assert((key->rt_layout == ISL_MSAA_LAYOUT_NONE) == 1099 (key->rt_samples <= 1)); 1100 assert((key->src_layout == ISL_MSAA_LAYOUT_NONE) == 1101 (key->src_samples <= 1)); 1102 assert((key->dst_layout == ISL_MSAA_LAYOUT_NONE) == 1103 (key->dst_samples <= 1)); 1104 1105 nir_builder b; 1106 nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL); 1107 1108 struct brw_blorp_blit_vars v; 1109 brw_blorp_blit_vars_init(&b, &v, key); 1110 1111 dst_pos = blorp_blit_get_frag_coords(&b, key, &v); 1112 1113 /* Render target and texture hardware don't support W tiling until Gen8. */ 1114 const bool rt_tiled_w = false; 1115 const bool tex_tiled_w = devinfo->gen >= 8 && key->src_tiled_w; 1116 1117 /* The address that data will be written to is determined by the 1118 * coordinates supplied to the WM thread and the tiling and sample count of 1119 * the render target, according to the formula: 1120 * 1121 * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset)) 1122 * 1123 * If the actual tiling and sample count of the destination surface are not 1124 * the same as the configuration of the render target, then these 1125 * coordinates are wrong and we have to adjust them to compensate for the 1126 * difference. 1127 */ 1128 if (rt_tiled_w != key->dst_tiled_w || 1129 key->rt_samples != key->dst_samples || 1130 key->rt_layout != key->dst_layout) { 1131 dst_pos = blorp_nir_encode_msaa(&b, dst_pos, key->rt_samples, 1132 key->rt_layout); 1133 /* Now (X, Y, S) = detile(rt_tiling, offset) */ 1134 if (rt_tiled_w != key->dst_tiled_w) 1135 dst_pos = blorp_nir_retile_y_to_w(&b, dst_pos); 1136 /* Now (X, Y, S) = detile(rt_tiling, offset) */ 1137 dst_pos = blorp_nir_decode_msaa(&b, dst_pos, key->dst_samples, 1138 key->dst_layout); 1139 } 1140 1141 /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)). 1142 * 1143 * That is: X, Y and S now contain the true coordinates and sample index of 1144 * the data that the WM thread should output. 1145 * 1146 * If we need to kill pixels that are outside the destination rectangle, 1147 * now is the time to do it. 1148 */ 1149 if (key->use_kill) { 1150 assert(!(key->blend && key->blit_scaled)); 1151 blorp_nir_discard_if_outside_rect(&b, dst_pos, &v); 1152 } 1153 1154 src_pos = blorp_blit_apply_transform(&b, nir_i2f(&b, dst_pos), &v); 1155 if (dst_pos->num_components == 3) { 1156 /* The sample coordinate is an integer that we want left alone but 1157 * blorp_blit_apply_transform() blindly applies the transform to all 1158 * three coordinates. Grab the original sample index. 1159 */ 1160 src_pos = nir_vec3(&b, nir_channel(&b, src_pos, 0), 1161 nir_channel(&b, src_pos, 1), 1162 nir_channel(&b, dst_pos, 2)); 1163 } 1164 1165 /* If the source image is not multisampled, then we want to fetch sample 1166 * number 0, because that's the only sample there is. 1167 */ 1168 if (key->src_samples == 1) 1169 src_pos = nir_channels(&b, src_pos, 0x3); 1170 1171 /* X, Y, and S are now the coordinates of the pixel in the source image 1172 * that we want to texture from. Exception: if we are blending, then S is 1173 * irrelevant, because we are going to fetch all samples. 1174 */ 1175 if (key->blend && !key->blit_scaled) { 1176 /* Resolves (effecively) use texelFetch, so we need integers and we 1177 * don't care about the sample index if we got one. 1178 */ 1179 src_pos = nir_f2i(&b, nir_channels(&b, src_pos, 0x3)); 1180 1181 if (devinfo->gen == 6) { 1182 /* Because gen6 only supports 4x interleved MSAA, we can do all the 1183 * blending we need with a single linear-interpolated texture lookup 1184 * at the center of the sample. The texture coordinates to be odd 1185 * integers so that they correspond to the center of a 2x2 block 1186 * representing the four samples that maxe up a pixel. So we need 1187 * to multiply our X and Y coordinates each by 2 and then add 1. 1188 */ 1189 src_pos = nir_ishl(&b, src_pos, nir_imm_int(&b, 1)); 1190 src_pos = nir_iadd(&b, src_pos, nir_imm_int(&b, 1)); 1191 src_pos = nir_i2f(&b, src_pos); 1192 color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type); 1193 } else { 1194 /* Gen7+ hardware doesn't automaticaly blend. */ 1195 color = blorp_nir_manual_blend_average(&b, &v, src_pos, key->src_samples, 1196 key->tex_aux_usage, 1197 key->texture_data_type); 1198 } 1199 } else if (key->blend && key->blit_scaled) { 1200 assert(!key->use_kill); 1201 color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples, key, &v); 1202 } else { 1203 if (key->bilinear_filter) { 1204 color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type); 1205 } else { 1206 /* We're going to use texelFetch, so we need integers */ 1207 if (src_pos->num_components == 2) { 1208 src_pos = nir_f2i(&b, src_pos); 1209 } else { 1210 assert(src_pos->num_components == 3); 1211 src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i(&b, src_pos), 0), 1212 nir_channel(&b, nir_f2i(&b, src_pos), 1), 1213 nir_channel(&b, src_pos, 2)); 1214 } 1215 1216 /* We aren't blending, which means we just want to fetch a single 1217 * sample from the source surface. The address that we want to fetch 1218 * from is related to the X, Y and S values according to the formula: 1219 * 1220 * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)). 1221 * 1222 * If the actual tiling and sample count of the source surface are 1223 * not the same as the configuration of the texture, then we need to 1224 * adjust the coordinates to compensate for the difference. 1225 */ 1226 if (tex_tiled_w != key->src_tiled_w || 1227 key->tex_samples != key->src_samples || 1228 key->tex_layout != key->src_layout) { 1229 src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples, 1230 key->src_layout); 1231 /* Now (X, Y, S) = detile(src_tiling, offset) */ 1232 if (tex_tiled_w != key->src_tiled_w) 1233 src_pos = blorp_nir_retile_w_to_y(&b, src_pos); 1234 /* Now (X, Y, S) = detile(tex_tiling, offset) */ 1235 src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples, 1236 key->tex_layout); 1237 } 1238 1239 if (key->need_src_offset) 1240 src_pos = nir_iadd(&b, src_pos, nir_load_var(&b, v.v_src_offset)); 1241 1242 /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)). 1243 * 1244 * In other words: X, Y, and S now contain values which, when passed to 1245 * the texturing unit, will cause data to be read from the correct 1246 * memory location. So we can fetch the texel now. 1247 */ 1248 if (key->src_samples == 1) { 1249 color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type); 1250 } else { 1251 nir_ssa_def *mcs = NULL; 1252 if (key->tex_aux_usage == ISL_AUX_USAGE_MCS) 1253 mcs = blorp_nir_txf_ms_mcs(&b, &v, src_pos); 1254 1255 color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type); 1256 } 1257 } 1258 } 1259 1260 if (key->dst_bpc != key->src_bpc) 1261 color = bit_cast_color(&b, color, key); 1262 1263 if (key->dst_rgb) { 1264 /* The destination image is bound as a red texture three times as wide 1265 * as the actual image. Our shader is effectively running one color 1266 * component at a time. We need to pick off the appropriate component 1267 * from the source color and write that to destination red. 1268 */ 1269 assert(dst_pos->num_components == 2); 1270 nir_ssa_def *comp = 1271 nir_umod(&b, nir_channel(&b, dst_pos, 0), nir_imm_int(&b, 3)); 1272 1273 nir_ssa_def *color_component = 1274 nir_bcsel(&b, nir_ieq(&b, comp, nir_imm_int(&b, 0)), 1275 nir_channel(&b, color, 0), 1276 nir_bcsel(&b, nir_ieq(&b, comp, nir_imm_int(&b, 1)), 1277 nir_channel(&b, color, 1), 1278 nir_channel(&b, color, 2))); 1279 1280 nir_ssa_def *u = nir_ssa_undef(&b, 1, 32); 1281 color = nir_vec4(&b, color_component, u, u, u); 1282 } 1283 1284 nir_store_var(&b, v.color_out, color, 0xf); 1285 1286 return b.shader; 1287 } 1288 1289 static void 1290 brw_blorp_get_blit_kernel(struct blorp_context *blorp, 1291 struct blorp_params *params, 1292 const struct brw_blorp_blit_prog_key *prog_key) 1293 { 1294 if (blorp->lookup_shader(blorp, prog_key, sizeof(*prog_key), 1295 ¶ms->wm_prog_kernel, ¶ms->wm_prog_data)) 1296 return; 1297 1298 void *mem_ctx = ralloc_context(NULL); 1299 1300 const unsigned *program; 1301 unsigned program_size; 1302 struct brw_wm_prog_data prog_data; 1303 1304 nir_shader *nir = brw_blorp_build_nir_shader(blorp, mem_ctx, prog_key); 1305 struct brw_wm_prog_key wm_key; 1306 brw_blorp_init_wm_prog_key(&wm_key); 1307 wm_key.tex.compressed_multisample_layout_mask = 1308 prog_key->tex_aux_usage == ISL_AUX_USAGE_MCS; 1309 wm_key.tex.msaa_16 = prog_key->tex_samples == 16; 1310 wm_key.multisample_fbo = prog_key->rt_samples > 1; 1311 1312 program = blorp_compile_fs(blorp, mem_ctx, nir, &wm_key, false, 1313 &prog_data, &program_size); 1314 1315 blorp->upload_shader(blorp, prog_key, sizeof(*prog_key), 1316 program, program_size, 1317 &prog_data.base, sizeof(prog_data), 1318 ¶ms->wm_prog_kernel, ¶ms->wm_prog_data); 1319 1320 ralloc_free(mem_ctx); 1321 } 1322 1323 static void 1324 brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform *xform, 1325 GLfloat src0, GLfloat src1, 1326 GLfloat dst0, GLfloat dst1, 1327 bool mirror) 1328 { 1329 double scale = (double)(src1 - src0) / (double)(dst1 - dst0); 1330 if (!mirror) { 1331 /* When not mirroring a coordinate (say, X), we need: 1332 * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale 1333 * Therefore: 1334 * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale 1335 * 1336 * blorp program uses "round toward zero" to convert the 1337 * transformed floating point coordinates to integer coordinates, 1338 * whereas the behaviour we actually want is "round to nearest", 1339 * so 0.5 provides the necessary correction. 1340 */ 1341 xform->multiplier = scale; 1342 xform->offset = src0 + (-(double)dst0 + 0.5) * scale; 1343 } else { 1344 /* When mirroring X we need: 1345 * src_x - src_x0 = dst_x1 - dst_x - 0.5 1346 * Therefore: 1347 * src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale 1348 */ 1349 xform->multiplier = -scale; 1350 xform->offset = src0 + ((double)dst1 - 0.5) * scale; 1351 } 1352 } 1353 1354 static inline void 1355 surf_get_intratile_offset_px(struct brw_blorp_surface_info *info, 1356 uint32_t *tile_x_px, uint32_t *tile_y_px) 1357 { 1358 if (info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { 1359 struct isl_extent2d px_size_sa = 1360 isl_get_interleaved_msaa_px_size_sa(info->surf.samples); 1361 assert(info->tile_x_sa % px_size_sa.width == 0); 1362 assert(info->tile_y_sa % px_size_sa.height == 0); 1363 *tile_x_px = info->tile_x_sa / px_size_sa.width; 1364 *tile_y_px = info->tile_y_sa / px_size_sa.height; 1365 } else { 1366 *tile_x_px = info->tile_x_sa; 1367 *tile_y_px = info->tile_y_sa; 1368 } 1369 } 1370 1371 static void 1372 surf_convert_to_single_slice(const struct isl_device *isl_dev, 1373 struct brw_blorp_surface_info *info) 1374 { 1375 /* Just bail if we have nothing to do. */ 1376 if (info->surf.dim == ISL_SURF_DIM_2D && 1377 info->view.base_level == 0 && info->view.base_array_layer == 0 && 1378 info->surf.levels == 1 && info->surf.logical_level0_px.array_len == 1) 1379 return; 1380 1381 /* If this gets triggered then we've gotten here twice which. This 1382 * shouldn't happen thanks to the above early return. 1383 */ 1384 assert(info->tile_x_sa == 0 && info->tile_y_sa == 0); 1385 1386 uint32_t layer = 0, z = 0; 1387 if (info->surf.dim == ISL_SURF_DIM_3D) 1388 z = info->view.base_array_layer + info->z_offset; 1389 else 1390 layer = info->view.base_array_layer; 1391 1392 uint32_t x_offset_sa, y_offset_sa; 1393 isl_surf_get_image_offset_sa(&info->surf, info->view.base_level, 1394 layer, z, &x_offset_sa, &y_offset_sa); 1395 1396 uint32_t byte_offset; 1397 isl_tiling_get_intratile_offset_sa(isl_dev, info->surf.tiling, 1398 info->surf.format, info->surf.row_pitch, 1399 x_offset_sa, y_offset_sa, 1400 &byte_offset, 1401 &info->tile_x_sa, &info->tile_y_sa); 1402 info->addr.offset += byte_offset; 1403 1404 const uint32_t slice_width_px = 1405 minify(info->surf.logical_level0_px.width, info->view.base_level); 1406 const uint32_t slice_height_px = 1407 minify(info->surf.logical_level0_px.height, info->view.base_level); 1408 1409 uint32_t tile_x_px, tile_y_px; 1410 surf_get_intratile_offset_px(info, &tile_x_px, &tile_y_px); 1411 1412 struct isl_surf_init_info init_info = { 1413 .dim = ISL_SURF_DIM_2D, 1414 .format = info->surf.format, 1415 .width = slice_width_px + tile_x_px, 1416 .height = slice_height_px + tile_y_px, 1417 .depth = 1, 1418 .levels = 1, 1419 .array_len = 1, 1420 .samples = info->surf.samples, 1421 .min_pitch = info->surf.row_pitch, 1422 .usage = info->surf.usage, 1423 .tiling_flags = 1 << info->surf.tiling, 1424 }; 1425 1426 isl_surf_init_s(isl_dev, &info->surf, &init_info); 1427 assert(info->surf.row_pitch == init_info.min_pitch); 1428 1429 /* The view is also different now. */ 1430 info->view.base_level = 0; 1431 info->view.levels = 1; 1432 info->view.base_array_layer = 0; 1433 info->view.array_len = 1; 1434 info->z_offset = 0; 1435 } 1436 1437 static void 1438 surf_fake_interleaved_msaa(const struct isl_device *isl_dev, 1439 struct brw_blorp_surface_info *info) 1440 { 1441 assert(info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED); 1442 1443 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */ 1444 surf_convert_to_single_slice(isl_dev, info); 1445 1446 info->surf.logical_level0_px = info->surf.phys_level0_sa; 1447 info->surf.samples = 1; 1448 info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE; 1449 } 1450 1451 static void 1452 surf_retile_w_to_y(const struct isl_device *isl_dev, 1453 struct brw_blorp_surface_info *info) 1454 { 1455 assert(info->surf.tiling == ISL_TILING_W); 1456 1457 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */ 1458 surf_convert_to_single_slice(isl_dev, info); 1459 1460 /* On gen7+, we don't have interleaved multisampling for color render 1461 * targets so we have to fake it. 1462 * 1463 * TODO: Are we sure we don't also need to fake it on gen6? 1464 */ 1465 if (isl_dev->info->gen > 6 && 1466 info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { 1467 surf_fake_interleaved_msaa(isl_dev, info); 1468 } 1469 1470 if (isl_dev->info->gen == 6) { 1471 /* Gen6 stencil buffers have a very large alignment coming in from the 1472 * miptree. It's out-of-bounds for what the surface state can handle. 1473 * Since we have a single layer and level, it doesn't really matter as 1474 * long as we don't pass a bogus value into isl_surf_fill_state(). 1475 */ 1476 info->surf.image_alignment_el = isl_extent3d(4, 2, 1); 1477 } 1478 1479 /* Now that we've converted everything to a simple 2-D surface with only 1480 * one miplevel, we can go about retiling it. 1481 */ 1482 const unsigned x_align = 8, y_align = info->surf.samples != 0 ? 8 : 4; 1483 info->surf.tiling = ISL_TILING_Y0; 1484 info->surf.logical_level0_px.width = 1485 ALIGN(info->surf.logical_level0_px.width, x_align) * 2; 1486 info->surf.logical_level0_px.height = 1487 ALIGN(info->surf.logical_level0_px.height, y_align) / 2; 1488 info->tile_x_sa *= 2; 1489 info->tile_y_sa /= 2; 1490 } 1491 1492 static bool 1493 can_shrink_surface(const struct brw_blorp_surface_info *surf) 1494 { 1495 /* The current code doesn't support offsets into the aux buffers. This 1496 * should be possible, but we need to make sure the offset is page 1497 * aligned for both the surface and the aux buffer surface. Generally 1498 * this mean using the page aligned offset for the aux buffer. 1499 * 1500 * Currently the cases where we must split the blit are limited to cases 1501 * where we don't have a aux buffer. 1502 */ 1503 if (surf->aux_addr.buffer != NULL) 1504 return false; 1505 1506 /* We can't support splitting the blit for gen <= 7, because the qpitch 1507 * size is calculated by the hardware based on the surface height for 1508 * gen <= 7. In gen >= 8, the qpitch is controlled by the driver. 1509 */ 1510 if (surf->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY) 1511 return false; 1512 1513 return true; 1514 } 1515 1516 static bool 1517 can_shrink_surfaces(const struct blorp_params *params) 1518 { 1519 return 1520 can_shrink_surface(¶ms->src) && 1521 can_shrink_surface(¶ms->dst); 1522 } 1523 1524 static unsigned 1525 get_max_surface_size(const struct gen_device_info *devinfo, 1526 const struct blorp_params *params) 1527 { 1528 const unsigned max = devinfo->gen >= 7 ? 16384 : 8192; 1529 if (split_blorp_blit_debug && can_shrink_surfaces(params)) 1530 return max >> 4; /* A smaller restriction when debug is enabled */ 1531 else 1532 return max; 1533 } 1534 1535 struct blt_axis { 1536 double src0, src1, dst0, dst1; 1537 bool mirror; 1538 }; 1539 1540 struct blt_coords { 1541 struct blt_axis x, y; 1542 }; 1543 1544 static void 1545 surf_fake_rgb_with_red(const struct isl_device *isl_dev, 1546 struct brw_blorp_surface_info *info, 1547 uint32_t *x, uint32_t *width) 1548 { 1549 surf_convert_to_single_slice(isl_dev, info); 1550 1551 info->surf.logical_level0_px.width *= 3; 1552 info->surf.phys_level0_sa.width *= 3; 1553 *x *= 3; 1554 *width *= 3; 1555 1556 enum isl_format red_format; 1557 switch (info->view.format) { 1558 case ISL_FORMAT_R8G8B8_UNORM: 1559 red_format = ISL_FORMAT_R8_UNORM; 1560 break; 1561 case ISL_FORMAT_R8G8B8_UINT: 1562 red_format = ISL_FORMAT_R8_UINT; 1563 break; 1564 case ISL_FORMAT_R16G16B16_UNORM: 1565 red_format = ISL_FORMAT_R16_UNORM; 1566 break; 1567 case ISL_FORMAT_R16G16B16_UINT: 1568 red_format = ISL_FORMAT_R16_UINT; 1569 break; 1570 case ISL_FORMAT_R32G32B32_UINT: 1571 red_format = ISL_FORMAT_R32_UINT; 1572 break; 1573 default: 1574 unreachable("Invalid RGB copy destination format"); 1575 } 1576 assert(isl_format_get_layout(red_format)->channels.r.type == 1577 isl_format_get_layout(info->view.format)->channels.r.type); 1578 assert(isl_format_get_layout(red_format)->channels.r.bits == 1579 isl_format_get_layout(info->view.format)->channels.r.bits); 1580 1581 info->surf.format = info->view.format = red_format; 1582 } 1583 1584 static void 1585 fake_dest_rgb_with_red(const struct isl_device *dev, 1586 struct blorp_params *params, 1587 struct brw_blorp_blit_prog_key *wm_prog_key, 1588 struct blt_coords *coords) 1589 { 1590 /* Handle RGB destinations for blorp_copy */ 1591 const struct isl_format_layout *dst_fmtl = 1592 isl_format_get_layout(params->dst.surf.format); 1593 1594 if (dst_fmtl->bpb % 3 == 0) { 1595 uint32_t dst_x = coords->x.dst0; 1596 uint32_t dst_width = coords->x.dst1 - dst_x; 1597 surf_fake_rgb_with_red(dev, ¶ms->dst, 1598 &dst_x, &dst_width); 1599 coords->x.dst0 = dst_x; 1600 coords->x.dst1 = dst_x + dst_width; 1601 wm_prog_key->dst_rgb = true; 1602 wm_prog_key->need_dst_offset = true; 1603 } 1604 } 1605 1606 enum blit_shrink_status { 1607 BLIT_NO_SHRINK = 0, 1608 BLIT_WIDTH_SHRINK = 1, 1609 BLIT_HEIGHT_SHRINK = 2, 1610 }; 1611 1612 /* Try to blit. If the surface parameters exceed the size allowed by hardware, 1613 * then enum blit_shrink_status will be returned. If BLIT_NO_SHRINK is 1614 * returned, then the blit was successful. 1615 */ 1616 static enum blit_shrink_status 1617 try_blorp_blit(struct blorp_batch *batch, 1618 struct blorp_params *params, 1619 struct brw_blorp_blit_prog_key *wm_prog_key, 1620 struct blt_coords *coords) 1621 { 1622 const struct gen_device_info *devinfo = batch->blorp->isl_dev->info; 1623 1624 fake_dest_rgb_with_red(batch->blorp->isl_dev, params, wm_prog_key, coords); 1625 1626 if (isl_format_has_sint_channel(params->src.view.format)) { 1627 wm_prog_key->texture_data_type = nir_type_int; 1628 } else if (isl_format_has_uint_channel(params->src.view.format)) { 1629 wm_prog_key->texture_data_type = nir_type_uint; 1630 } else { 1631 wm_prog_key->texture_data_type = nir_type_float; 1632 } 1633 1634 /* src_samples and dst_samples are the true sample counts */ 1635 wm_prog_key->src_samples = params->src.surf.samples; 1636 wm_prog_key->dst_samples = params->dst.surf.samples; 1637 1638 wm_prog_key->tex_aux_usage = params->src.aux_usage; 1639 1640 /* src_layout and dst_layout indicate the true MSAA layout used by src and 1641 * dst. 1642 */ 1643 wm_prog_key->src_layout = params->src.surf.msaa_layout; 1644 wm_prog_key->dst_layout = params->dst.surf.msaa_layout; 1645 1646 /* Round floating point values to nearest integer to avoid "off by one texel" 1647 * kind of errors when blitting. 1648 */ 1649 params->x0 = params->wm_inputs.discard_rect.x0 = round(coords->x.dst0); 1650 params->y0 = params->wm_inputs.discard_rect.y0 = round(coords->y.dst0); 1651 params->x1 = params->wm_inputs.discard_rect.x1 = round(coords->x.dst1); 1652 params->y1 = params->wm_inputs.discard_rect.y1 = round(coords->y.dst1); 1653 1654 brw_blorp_setup_coord_transform(¶ms->wm_inputs.coord_transform[0], 1655 coords->x.src0, coords->x.src1, 1656 coords->x.dst0, coords->x.dst1, 1657 coords->x.mirror); 1658 brw_blorp_setup_coord_transform(¶ms->wm_inputs.coord_transform[1], 1659 coords->y.src0, coords->y.src1, 1660 coords->y.dst0, coords->y.dst1, 1661 coords->y.mirror); 1662 1663 if (devinfo->gen > 6 && 1664 params->dst.surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { 1665 assert(params->dst.surf.samples > 1); 1666 1667 /* We must expand the rectangle we send through the rendering pipeline, 1668 * to account for the fact that we are mapping the destination region as 1669 * single-sampled when it is in fact multisampled. We must also align 1670 * it to a multiple of the multisampling pattern, because the 1671 * differences between multisampled and single-sampled surface formats 1672 * will mean that pixels are scrambled within the multisampling pattern. 1673 * TODO: what if this makes the coordinates too large? 1674 * 1675 * Note: this only works if the destination surface uses the IMS layout. 1676 * If it's UMS, then we have no choice but to set up the rendering 1677 * pipeline as multisampled. 1678 */ 1679 struct isl_extent2d px_size_sa = 1680 isl_get_interleaved_msaa_px_size_sa(params->dst.surf.samples); 1681 params->x0 = ROUND_DOWN_TO(params->x0, 2) * px_size_sa.width; 1682 params->y0 = ROUND_DOWN_TO(params->y0, 2) * px_size_sa.height; 1683 params->x1 = ALIGN(params->x1, 2) * px_size_sa.width; 1684 params->y1 = ALIGN(params->y1, 2) * px_size_sa.height; 1685 1686 surf_fake_interleaved_msaa(batch->blorp->isl_dev, ¶ms->dst); 1687 1688 wm_prog_key->use_kill = true; 1689 wm_prog_key->need_dst_offset = true; 1690 } 1691 1692 if (params->dst.surf.tiling == ISL_TILING_W) { 1693 /* We must modify the rectangle we send through the rendering pipeline 1694 * (and the size and x/y offset of the destination surface), to account 1695 * for the fact that we are mapping it as Y-tiled when it is in fact 1696 * W-tiled. 1697 * 1698 * Both Y tiling and W tiling can be understood as organizations of 1699 * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels 1700 * is different, but the layout of the 32-byte sub-tiles within the 4k 1701 * tile is the same (8 sub-tiles across by 16 sub-tiles down, in 1702 * column-major order). In Y tiling, the sub-tiles are 16 bytes wide 1703 * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high. 1704 * 1705 * Therefore, to account for the layout differences within the 32-byte 1706 * sub-tiles, we must expand the rectangle so the X coordinates of its 1707 * edges are multiples of 8 (the W sub-tile width), and its Y 1708 * coordinates of its edges are multiples of 4 (the W sub-tile height). 1709 * Then we need to scale the X and Y coordinates of the rectangle to 1710 * account for the differences in aspect ratio between the Y and W 1711 * sub-tiles. We need to modify the layer width and height similarly. 1712 * 1713 * A correction needs to be applied when MSAA is in use: since 1714 * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4, 1715 * we need to align the Y coordinates to multiples of 8, so that when 1716 * they are divided by two they are still multiples of 4. 1717 * 1718 * Note: Since the x/y offset of the surface will be applied using the 1719 * SURFACE_STATE command packet, it will be invisible to the swizzling 1720 * code in the shader; therefore it needs to be in a multiple of the 1721 * 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8 1722 * pixels wide and 4 pixels high (when viewed as a W-tiled stencil 1723 * buffer), and the miplevel alignment used for stencil buffers is 8 1724 * pixels horizontally and either 4 or 8 pixels vertically (see 1725 * intel_horizontal_texture_alignment_unit() and 1726 * intel_vertical_texture_alignment_unit()). 1727 * 1728 * Note: Also, since the SURFACE_STATE command packet can only apply 1729 * offsets that are multiples of 4 pixels horizontally and 2 pixels 1730 * vertically, it is important that the offsets will be multiples of 1731 * these sizes after they are converted into Y-tiled coordinates. 1732 * Fortunately they will be, since we know from above that the offsets 1733 * are a multiple of the 32-byte sub-tile size, and in Y-tiled 1734 * coordinates the sub-tile is 16 pixels wide and 2 pixels high. 1735 * 1736 * TODO: what if this makes the coordinates (or the texture size) too 1737 * large? 1738 */ 1739 const unsigned x_align = 8; 1740 const unsigned y_align = params->dst.surf.samples != 0 ? 8 : 4; 1741 params->x0 = ROUND_DOWN_TO(params->x0, x_align) * 2; 1742 params->y0 = ROUND_DOWN_TO(params->y0, y_align) / 2; 1743 params->x1 = ALIGN(params->x1, x_align) * 2; 1744 params->y1 = ALIGN(params->y1, y_align) / 2; 1745 1746 /* Retile the surface to Y-tiled */ 1747 surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->dst); 1748 1749 wm_prog_key->dst_tiled_w = true; 1750 wm_prog_key->use_kill = true; 1751 wm_prog_key->need_dst_offset = true; 1752 1753 if (params->dst.surf.samples > 1) { 1754 /* If the destination surface is a W-tiled multisampled stencil 1755 * buffer that we're mapping as Y tiled, then we need to arrange for 1756 * the WM program to run once per sample rather than once per pixel, 1757 * because the memory layout of related samples doesn't match between 1758 * W and Y tiling. 1759 */ 1760 wm_prog_key->persample_msaa_dispatch = true; 1761 } 1762 } 1763 1764 if (devinfo->gen < 8 && params->src.surf.tiling == ISL_TILING_W) { 1765 /* On Haswell and earlier, we have to fake W-tiled sources as Y-tiled. 1766 * Broadwell adds support for sampling from stencil. 1767 * 1768 * See the comments above concerning x/y offset alignment for the 1769 * destination surface. 1770 * 1771 * TODO: what if this makes the texture size too large? 1772 */ 1773 surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->src); 1774 1775 wm_prog_key->src_tiled_w = true; 1776 wm_prog_key->need_src_offset = true; 1777 } 1778 1779 /* tex_samples and rt_samples are the sample counts that are set up in 1780 * SURFACE_STATE. 1781 */ 1782 wm_prog_key->tex_samples = params->src.surf.samples; 1783 wm_prog_key->rt_samples = params->dst.surf.samples; 1784 1785 /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will 1786 * use to access the source and destination surfaces. 1787 */ 1788 wm_prog_key->tex_layout = params->src.surf.msaa_layout; 1789 wm_prog_key->rt_layout = params->dst.surf.msaa_layout; 1790 1791 if (params->src.surf.samples > 0 && params->dst.surf.samples > 1) { 1792 /* We are blitting from a multisample buffer to a multisample buffer, so 1793 * we must preserve samples within a pixel. This means we have to 1794 * arrange for the WM program to run once per sample rather than once 1795 * per pixel. 1796 */ 1797 wm_prog_key->persample_msaa_dispatch = true; 1798 } 1799 1800 params->num_samples = params->dst.surf.samples; 1801 1802 if (params->src.tile_x_sa || params->src.tile_y_sa) { 1803 assert(wm_prog_key->need_src_offset); 1804 surf_get_intratile_offset_px(¶ms->src, 1805 ¶ms->wm_inputs.src_offset.x, 1806 ¶ms->wm_inputs.src_offset.y); 1807 } 1808 1809 if (params->dst.tile_x_sa || params->dst.tile_y_sa) { 1810 assert(wm_prog_key->need_dst_offset); 1811 surf_get_intratile_offset_px(¶ms->dst, 1812 ¶ms->wm_inputs.dst_offset.x, 1813 ¶ms->wm_inputs.dst_offset.y); 1814 params->x0 += params->wm_inputs.dst_offset.x; 1815 params->y0 += params->wm_inputs.dst_offset.y; 1816 params->x1 += params->wm_inputs.dst_offset.x; 1817 params->y1 += params->wm_inputs.dst_offset.y; 1818 } 1819 1820 /* For some texture types, we need to pass the layer through the sampler. */ 1821 params->wm_inputs.src_z = params->src.z_offset; 1822 1823 brw_blorp_get_blit_kernel(batch->blorp, params, wm_prog_key); 1824 1825 unsigned result = 0; 1826 unsigned max_surface_size = get_max_surface_size(devinfo, params); 1827 if (params->src.surf.logical_level0_px.width > max_surface_size || 1828 params->dst.surf.logical_level0_px.width > max_surface_size) 1829 result |= BLIT_WIDTH_SHRINK; 1830 if (params->src.surf.logical_level0_px.height > max_surface_size || 1831 params->dst.surf.logical_level0_px.height > max_surface_size) 1832 result |= BLIT_HEIGHT_SHRINK; 1833 1834 if (result == 0) { 1835 batch->blorp->exec(batch, params); 1836 } 1837 1838 return result; 1839 } 1840 1841 /* Adjust split blit source coordinates for the current destination 1842 * coordinates. 1843 */ 1844 static void 1845 adjust_split_source_coords(const struct blt_axis *orig, 1846 struct blt_axis *split_coords, 1847 double scale) 1848 { 1849 /* When scale is greater than 0, then we are growing from the start, so 1850 * src0 uses delta0, and src1 uses delta1. When scale is less than 0, the 1851 * source range shrinks from the end. In that case src0 is adjusted by 1852 * delta1, and src1 is adjusted by delta0. 1853 */ 1854 double delta0 = scale * (split_coords->dst0 - orig->dst0); 1855 double delta1 = scale * (split_coords->dst1 - orig->dst1); 1856 split_coords->src0 = orig->src0 + (scale >= 0.0 ? delta0 : delta1); 1857 split_coords->src1 = orig->src1 + (scale >= 0.0 ? delta1 : delta0); 1858 } 1859 1860 static const struct isl_extent2d 1861 get_px_size_sa(const struct isl_surf *surf) 1862 { 1863 static const struct isl_extent2d one_to_one = { .w = 1, .h = 1 }; 1864 1865 if (surf->msaa_layout != ISL_MSAA_LAYOUT_INTERLEAVED) 1866 return one_to_one; 1867 else 1868 return isl_get_interleaved_msaa_px_size_sa(surf->samples); 1869 } 1870 1871 static void 1872 shrink_surface_params(const struct isl_device *dev, 1873 struct brw_blorp_surface_info *info, 1874 double *x0, double *x1, double *y0, double *y1) 1875 { 1876 uint32_t byte_offset, x_offset_sa, y_offset_sa, size; 1877 struct isl_extent2d px_size_sa; 1878 int adjust; 1879 1880 surf_convert_to_single_slice(dev, info); 1881 1882 px_size_sa = get_px_size_sa(&info->surf); 1883 1884 /* Because this gets called after we lower compressed images, the tile 1885 * offsets may be non-zero and we need to incorporate them in our 1886 * calculations. 1887 */ 1888 x_offset_sa = (uint32_t)*x0 * px_size_sa.w + info->tile_x_sa; 1889 y_offset_sa = (uint32_t)*y0 * px_size_sa.h + info->tile_y_sa; 1890 isl_tiling_get_intratile_offset_sa(dev, info->surf.tiling, 1891 info->surf.format, info->surf.row_pitch, 1892 x_offset_sa, y_offset_sa, 1893 &byte_offset, 1894 &info->tile_x_sa, &info->tile_y_sa); 1895 1896 info->addr.offset += byte_offset; 1897 1898 adjust = (int)info->tile_x_sa / px_size_sa.w - (int)*x0; 1899 *x0 += adjust; 1900 *x1 += adjust; 1901 info->tile_x_sa = 0; 1902 1903 adjust = (int)info->tile_y_sa / px_size_sa.h - (int)*y0; 1904 *y0 += adjust; 1905 *y1 += adjust; 1906 info->tile_y_sa = 0; 1907 1908 size = MIN2((uint32_t)ceil(*x1), info->surf.logical_level0_px.width); 1909 info->surf.logical_level0_px.width = size; 1910 info->surf.phys_level0_sa.width = size * px_size_sa.w; 1911 1912 size = MIN2((uint32_t)ceil(*y1), info->surf.logical_level0_px.height); 1913 info->surf.logical_level0_px.height = size; 1914 info->surf.phys_level0_sa.height = size * px_size_sa.h; 1915 } 1916 1917 static void 1918 shrink_surfaces(const struct isl_device *dev, 1919 struct blorp_params *params, 1920 struct brw_blorp_blit_prog_key *wm_prog_key, 1921 struct blt_coords *coords) 1922 { 1923 /* Shrink source surface */ 1924 shrink_surface_params(dev, ¶ms->src, &coords->x.src0, &coords->x.src1, 1925 &coords->y.src0, &coords->y.src1); 1926 wm_prog_key->need_src_offset = false; 1927 1928 /* Shrink destination surface */ 1929 shrink_surface_params(dev, ¶ms->dst, &coords->x.dst0, &coords->x.dst1, 1930 &coords->y.dst0, &coords->y.dst1); 1931 wm_prog_key->need_dst_offset = false; 1932 } 1933 1934 static void 1935 do_blorp_blit(struct blorp_batch *batch, 1936 const struct blorp_params *orig_params, 1937 struct brw_blorp_blit_prog_key *wm_prog_key, 1938 const struct blt_coords *orig) 1939 { 1940 struct blorp_params params; 1941 struct blt_coords blit_coords; 1942 struct blt_coords split_coords = *orig; 1943 double w = orig->x.dst1 - orig->x.dst0; 1944 double h = orig->y.dst1 - orig->y.dst0; 1945 double x_scale = (orig->x.src1 - orig->x.src0) / w; 1946 double y_scale = (orig->y.src1 - orig->y.src0) / h; 1947 if (orig->x.mirror) 1948 x_scale = -x_scale; 1949 if (orig->y.mirror) 1950 y_scale = -y_scale; 1951 1952 bool x_done, y_done; 1953 bool shrink = split_blorp_blit_debug && can_shrink_surfaces(orig_params); 1954 do { 1955 params = *orig_params; 1956 blit_coords = split_coords; 1957 if (shrink) 1958 shrink_surfaces(batch->blorp->isl_dev, ¶ms, wm_prog_key, 1959 &blit_coords); 1960 enum blit_shrink_status result = 1961 try_blorp_blit(batch, ¶ms, wm_prog_key, &blit_coords); 1962 1963 if (result & BLIT_WIDTH_SHRINK) { 1964 w /= 2.0; 1965 assert(w >= 1.0); 1966 split_coords.x.dst1 = MIN2(split_coords.x.dst0 + w, orig->x.dst1); 1967 adjust_split_source_coords(&orig->x, &split_coords.x, x_scale); 1968 } 1969 if (result & BLIT_HEIGHT_SHRINK) { 1970 h /= 2.0; 1971 assert(h >= 1.0); 1972 split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1); 1973 adjust_split_source_coords(&orig->y, &split_coords.y, y_scale); 1974 } 1975 1976 if (result != 0) { 1977 assert(can_shrink_surfaces(orig_params)); 1978 shrink = true; 1979 continue; 1980 } 1981 1982 y_done = (orig->y.dst1 - split_coords.y.dst1 < 0.5); 1983 x_done = y_done && (orig->x.dst1 - split_coords.x.dst1 < 0.5); 1984 if (x_done) { 1985 break; 1986 } else if (y_done) { 1987 split_coords.x.dst0 += w; 1988 split_coords.x.dst1 = MIN2(split_coords.x.dst0 + w, orig->x.dst1); 1989 split_coords.y.dst0 = orig->y.dst0; 1990 split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1); 1991 adjust_split_source_coords(&orig->x, &split_coords.x, x_scale); 1992 } else { 1993 split_coords.y.dst0 += h; 1994 split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1); 1995 adjust_split_source_coords(&orig->y, &split_coords.y, y_scale); 1996 } 1997 } while (true); 1998 } 1999 2000 void 2001 blorp_blit(struct blorp_batch *batch, 2002 const struct blorp_surf *src_surf, 2003 unsigned src_level, unsigned src_layer, 2004 enum isl_format src_format, struct isl_swizzle src_swizzle, 2005 const struct blorp_surf *dst_surf, 2006 unsigned dst_level, unsigned dst_layer, 2007 enum isl_format dst_format, struct isl_swizzle dst_swizzle, 2008 float src_x0, float src_y0, 2009 float src_x1, float src_y1, 2010 float dst_x0, float dst_y0, 2011 float dst_x1, float dst_y1, 2012 GLenum filter, bool mirror_x, bool mirror_y) 2013 { 2014 struct blorp_params params; 2015 blorp_params_init(¶ms); 2016 2017 brw_blorp_surface_info_init(batch->blorp, ¶ms.src, src_surf, src_level, 2018 src_layer, src_format, false); 2019 brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, dst_surf, dst_level, 2020 dst_layer, dst_format, true); 2021 2022 params.src.view.swizzle = src_swizzle; 2023 params.dst.view.swizzle = dst_swizzle; 2024 2025 struct brw_blorp_blit_prog_key wm_prog_key = { 2026 .shader_type = BLORP_SHADER_TYPE_BLIT 2027 }; 2028 2029 /* Scaled blitting or not. */ 2030 wm_prog_key.blit_scaled = 2031 ((dst_x1 - dst_x0) == (src_x1 - src_x0) && 2032 (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true; 2033 2034 /* Scaling factors used for bilinear filtering in multisample scaled 2035 * blits. 2036 */ 2037 if (params.src.surf.samples == 16) 2038 wm_prog_key.x_scale = 4.0f; 2039 else 2040 wm_prog_key.x_scale = 2.0f; 2041 wm_prog_key.y_scale = params.src.surf.samples / wm_prog_key.x_scale; 2042 2043 if (filter == GL_LINEAR && 2044 params.src.surf.samples <= 1 && params.dst.surf.samples <= 1) 2045 wm_prog_key.bilinear_filter = true; 2046 2047 if ((params.src.surf.usage & ISL_SURF_USAGE_DEPTH_BIT) == 0 && 2048 (params.src.surf.usage & ISL_SURF_USAGE_STENCIL_BIT) == 0 && 2049 !isl_format_has_int_channel(params.src.surf.format) && 2050 params.src.surf.samples > 1 && params.dst.surf.samples <= 1) { 2051 /* We are downsampling a non-integer color buffer, so blend. 2052 * 2053 * Regarding integer color buffers, the OpenGL ES 3.2 spec says: 2054 * 2055 * "If the source formats are integer types or stencil values, a 2056 * single sample's value is selected for each pixel." 2057 * 2058 * This implies we should not blend in that case. 2059 */ 2060 wm_prog_key.blend = true; 2061 } 2062 2063 params.wm_inputs.rect_grid.x1 = 2064 minify(params.src.surf.logical_level0_px.width, src_level) * 2065 wm_prog_key.x_scale - 1.0f; 2066 params.wm_inputs.rect_grid.y1 = 2067 minify(params.src.surf.logical_level0_px.height, src_level) * 2068 wm_prog_key.y_scale - 1.0f; 2069 2070 struct blt_coords coords = { 2071 .x = { 2072 .src0 = src_x0, 2073 .src1 = src_x1, 2074 .dst0 = dst_x0, 2075 .dst1 = dst_x1, 2076 .mirror = mirror_x 2077 }, 2078 .y = { 2079 .src0 = src_y0, 2080 .src1 = src_y1, 2081 .dst0 = dst_y0, 2082 .dst1 = dst_y1, 2083 .mirror = mirror_y 2084 } 2085 }; 2086 2087 do_blorp_blit(batch, ¶ms, &wm_prog_key, &coords); 2088 } 2089 2090 static enum isl_format 2091 get_copy_format_for_bpb(const struct isl_device *isl_dev, unsigned bpb) 2092 { 2093 /* The choice of UNORM and UINT formats is very intentional here. Most 2094 * of the time, we want to use a UINT format to avoid any rounding error 2095 * in the blit. For stencil blits, R8_UINT is required by the hardware. 2096 * (It's the only format allowed in conjunction with W-tiling.) Also we 2097 * intentionally use the 4-channel formats whenever we can. This is so 2098 * that, when we do a RGB <-> RGBX copy, the two formats will line up 2099 * even though one of them is 3/4 the size of the other. The choice of 2100 * UNORM vs. UINT is also very intentional because we don't have 8 or 2101 * 16-bit RGB UINT formats until Sky Lake so we have to use UNORM there. 2102 * Fortunately, the only time we should ever use two different formats in 2103 * the table below is for RGB -> RGBA blits and so we will never have any 2104 * UNORM/UINT mismatch. 2105 */ 2106 if (ISL_DEV_GEN(isl_dev) >= 9) { 2107 switch (bpb) { 2108 case 8: return ISL_FORMAT_R8_UINT; 2109 case 16: return ISL_FORMAT_R8G8_UINT; 2110 case 24: return ISL_FORMAT_R8G8B8_UINT; 2111 case 32: return ISL_FORMAT_R8G8B8A8_UINT; 2112 case 48: return ISL_FORMAT_R16G16B16_UINT; 2113 case 64: return ISL_FORMAT_R16G16B16A16_UINT; 2114 case 96: return ISL_FORMAT_R32G32B32_UINT; 2115 case 128:return ISL_FORMAT_R32G32B32A32_UINT; 2116 default: 2117 unreachable("Unknown format bpb"); 2118 } 2119 } else { 2120 switch (bpb) { 2121 case 8: return ISL_FORMAT_R8_UINT; 2122 case 16: return ISL_FORMAT_R8G8_UINT; 2123 case 24: return ISL_FORMAT_R8G8B8_UNORM; 2124 case 32: return ISL_FORMAT_R8G8B8A8_UNORM; 2125 case 48: return ISL_FORMAT_R16G16B16_UNORM; 2126 case 64: return ISL_FORMAT_R16G16B16A16_UNORM; 2127 case 96: return ISL_FORMAT_R32G32B32_UINT; 2128 case 128:return ISL_FORMAT_R32G32B32A32_UINT; 2129 default: 2130 unreachable("Unknown format bpb"); 2131 } 2132 } 2133 } 2134 2135 /** Returns a UINT format that is CCS-compatible with the given format 2136 * 2137 * The PRM's say absolutely nothing about how render compression works. The 2138 * only thing they provide is a list of formats on which it is and is not 2139 * supported. Empirical testing indicates that the compression is only based 2140 * on the bit-layout of the format and the channel encoding doesn't matter. 2141 * So, while texture views don't work in general, you can create a view as 2142 * long as the bit-layout of the formats are the same. 2143 * 2144 * Fortunately, for every render compression capable format, the UINT format 2145 * with the same bit layout also supports render compression. This means that 2146 * we only need to handle UINT formats for copy operations. In order to do 2147 * copies between formats with different bit layouts, we attach both with a 2148 * UINT format and use bit_cast_color() to generate code to do the bit-cast 2149 * operation between the two bit layouts. 2150 */ 2151 static enum isl_format 2152 get_ccs_compatible_uint_format(const struct isl_format_layout *fmtl) 2153 { 2154 switch (fmtl->format) { 2155 case ISL_FORMAT_R32G32B32A32_FLOAT: 2156 case ISL_FORMAT_R32G32B32A32_SINT: 2157 case ISL_FORMAT_R32G32B32A32_UINT: 2158 case ISL_FORMAT_R32G32B32A32_UNORM: 2159 case ISL_FORMAT_R32G32B32A32_SNORM: 2160 return ISL_FORMAT_R32G32B32A32_UINT; 2161 2162 case ISL_FORMAT_R16G16B16A16_UNORM: 2163 case ISL_FORMAT_R16G16B16A16_SNORM: 2164 case ISL_FORMAT_R16G16B16A16_SINT: 2165 case ISL_FORMAT_R16G16B16A16_UINT: 2166 case ISL_FORMAT_R16G16B16A16_FLOAT: 2167 case ISL_FORMAT_R16G16B16X16_UNORM: 2168 case ISL_FORMAT_R16G16B16X16_FLOAT: 2169 return ISL_FORMAT_R16G16B16A16_UINT; 2170 2171 case ISL_FORMAT_R32G32_FLOAT: 2172 case ISL_FORMAT_R32G32_SINT: 2173 case ISL_FORMAT_R32G32_UINT: 2174 case ISL_FORMAT_R32G32_UNORM: 2175 case ISL_FORMAT_R32G32_SNORM: 2176 return ISL_FORMAT_R32G32_UINT; 2177 2178 case ISL_FORMAT_B8G8R8A8_UNORM: 2179 case ISL_FORMAT_B8G8R8A8_UNORM_SRGB: 2180 case ISL_FORMAT_R8G8B8A8_UNORM: 2181 case ISL_FORMAT_R8G8B8A8_UNORM_SRGB: 2182 case ISL_FORMAT_R8G8B8A8_SNORM: 2183 case ISL_FORMAT_R8G8B8A8_SINT: 2184 case ISL_FORMAT_R8G8B8A8_UINT: 2185 case ISL_FORMAT_B8G8R8X8_UNORM: 2186 case ISL_FORMAT_B8G8R8X8_UNORM_SRGB: 2187 case ISL_FORMAT_R8G8B8X8_UNORM: 2188 case ISL_FORMAT_R8G8B8X8_UNORM_SRGB: 2189 return ISL_FORMAT_R8G8B8A8_UINT; 2190 2191 case ISL_FORMAT_R16G16_UNORM: 2192 case ISL_FORMAT_R16G16_SNORM: 2193 case ISL_FORMAT_R16G16_SINT: 2194 case ISL_FORMAT_R16G16_UINT: 2195 case ISL_FORMAT_R16G16_FLOAT: 2196 return ISL_FORMAT_R16G16_UINT; 2197 2198 case ISL_FORMAT_R32_SINT: 2199 case ISL_FORMAT_R32_UINT: 2200 case ISL_FORMAT_R32_FLOAT: 2201 case ISL_FORMAT_R32_UNORM: 2202 case ISL_FORMAT_R32_SNORM: 2203 return ISL_FORMAT_R32_UINT; 2204 2205 default: 2206 unreachable("Not a compressible format"); 2207 } 2208 } 2209 2210 /* Takes an isl_color_value and returns a color value that is the original 2211 * color value only bit-casted to a UINT format. This value, together with 2212 * the format from get_ccs_compatible_uint_format, will yield the same bit 2213 * value as the original color and format. 2214 */ 2215 static union isl_color_value 2216 bitcast_color_value_to_uint(union isl_color_value color, 2217 const struct isl_format_layout *fmtl) 2218 { 2219 /* All CCS formats have the same number of bits in each channel */ 2220 const struct isl_channel_layout *chan = &fmtl->channels.r; 2221 2222 union isl_color_value bits; 2223 switch (chan->type) { 2224 case ISL_UINT: 2225 case ISL_SINT: 2226 /* Hardware will ignore the high bits so there's no need to cast */ 2227 bits = color; 2228 break; 2229 2230 case ISL_UNORM: 2231 for (unsigned i = 0; i < 4; i++) 2232 bits.u32[i] = _mesa_float_to_unorm(color.f32[i], chan->bits); 2233 break; 2234 2235 case ISL_SNORM: 2236 for (unsigned i = 0; i < 4; i++) 2237 bits.i32[i] = _mesa_float_to_snorm(color.f32[i], chan->bits); 2238 break; 2239 2240 case ISL_SFLOAT: 2241 switch (chan->bits) { 2242 case 16: 2243 for (unsigned i = 0; i < 4; i++) 2244 bits.u32[i] = _mesa_float_to_half(color.f32[i]); 2245 break; 2246 2247 case 32: 2248 bits = color; 2249 break; 2250 2251 default: 2252 unreachable("Invalid float format size"); 2253 } 2254 break; 2255 2256 default: 2257 unreachable("Invalid channel type"); 2258 } 2259 2260 switch (fmtl->format) { 2261 case ISL_FORMAT_B8G8R8A8_UNORM: 2262 case ISL_FORMAT_B8G8R8A8_UNORM_SRGB: 2263 case ISL_FORMAT_B8G8R8X8_UNORM: 2264 case ISL_FORMAT_B8G8R8X8_UNORM_SRGB: { 2265 /* If it's a BGRA format, we need to swap blue and red */ 2266 uint32_t tmp = bits.u32[0]; 2267 bits.u32[0] = bits.u32[2]; 2268 bits.u32[2] = tmp; 2269 break; 2270 } 2271 2272 default: 2273 break; /* Nothing to do */ 2274 } 2275 2276 return bits; 2277 } 2278 2279 static void 2280 surf_convert_to_uncompressed(const struct isl_device *isl_dev, 2281 struct brw_blorp_surface_info *info, 2282 uint32_t *x, uint32_t *y, 2283 uint32_t *width, uint32_t *height) 2284 { 2285 const struct isl_format_layout *fmtl = 2286 isl_format_get_layout(info->surf.format); 2287 2288 assert(fmtl->bw > 1 || fmtl->bh > 1); 2289 2290 /* This is a compressed surface. We need to convert it to a single 2291 * slice (because compressed layouts don't perfectly match uncompressed 2292 * ones with the same bpb) and divide x, y, width, and height by the 2293 * block size. 2294 */ 2295 surf_convert_to_single_slice(isl_dev, info); 2296 2297 if (width || height) { 2298 #ifndef NDEBUG 2299 uint32_t right_edge_px = info->tile_x_sa + *x + *width; 2300 uint32_t bottom_edge_px = info->tile_y_sa + *y + *height; 2301 assert(*width % fmtl->bw == 0 || 2302 right_edge_px == info->surf.logical_level0_px.width); 2303 assert(*height % fmtl->bh == 0 || 2304 bottom_edge_px == info->surf.logical_level0_px.height); 2305 #endif 2306 *width = DIV_ROUND_UP(*width, fmtl->bw); 2307 *height = DIV_ROUND_UP(*height, fmtl->bh); 2308 } 2309 2310 assert(*x % fmtl->bw == 0); 2311 assert(*y % fmtl->bh == 0); 2312 *x /= fmtl->bw; 2313 *y /= fmtl->bh; 2314 2315 info->surf.logical_level0_px.width = 2316 DIV_ROUND_UP(info->surf.logical_level0_px.width, fmtl->bw); 2317 info->surf.logical_level0_px.height = 2318 DIV_ROUND_UP(info->surf.logical_level0_px.height, fmtl->bh); 2319 2320 assert(info->surf.phys_level0_sa.width % fmtl->bw == 0); 2321 assert(info->surf.phys_level0_sa.height % fmtl->bh == 0); 2322 info->surf.phys_level0_sa.width /= fmtl->bw; 2323 info->surf.phys_level0_sa.height /= fmtl->bh; 2324 2325 assert(info->tile_x_sa % fmtl->bw == 0); 2326 assert(info->tile_y_sa % fmtl->bh == 0); 2327 info->tile_x_sa /= fmtl->bw; 2328 info->tile_y_sa /= fmtl->bh; 2329 2330 /* It's now an uncompressed surface so we need an uncompressed format */ 2331 info->surf.format = get_copy_format_for_bpb(isl_dev, fmtl->bpb); 2332 } 2333 2334 void 2335 blorp_copy(struct blorp_batch *batch, 2336 const struct blorp_surf *src_surf, 2337 unsigned src_level, unsigned src_layer, 2338 const struct blorp_surf *dst_surf, 2339 unsigned dst_level, unsigned dst_layer, 2340 uint32_t src_x, uint32_t src_y, 2341 uint32_t dst_x, uint32_t dst_y, 2342 uint32_t src_width, uint32_t src_height) 2343 { 2344 const struct isl_device *isl_dev = batch->blorp->isl_dev; 2345 struct blorp_params params; 2346 2347 if (src_width == 0 || src_height == 0) 2348 return; 2349 2350 blorp_params_init(¶ms); 2351 brw_blorp_surface_info_init(batch->blorp, ¶ms.src, src_surf, src_level, 2352 src_layer, ISL_FORMAT_UNSUPPORTED, false); 2353 brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, dst_surf, dst_level, 2354 dst_layer, ISL_FORMAT_UNSUPPORTED, true); 2355 2356 struct brw_blorp_blit_prog_key wm_prog_key = { 2357 .shader_type = BLORP_SHADER_TYPE_BLIT 2358 }; 2359 2360 const struct isl_format_layout *src_fmtl = 2361 isl_format_get_layout(params.src.surf.format); 2362 const struct isl_format_layout *dst_fmtl = 2363 isl_format_get_layout(params.dst.surf.format); 2364 2365 assert(params.src.aux_usage == ISL_AUX_USAGE_NONE || 2366 params.src.aux_usage == ISL_AUX_USAGE_MCS || 2367 params.src.aux_usage == ISL_AUX_USAGE_CCS_E); 2368 assert(params.dst.aux_usage == ISL_AUX_USAGE_NONE || 2369 params.dst.aux_usage == ISL_AUX_USAGE_MCS || 2370 params.dst.aux_usage == ISL_AUX_USAGE_CCS_E); 2371 2372 if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E) { 2373 params.dst.view.format = get_ccs_compatible_uint_format(dst_fmtl); 2374 if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E) { 2375 params.src.view.format = get_ccs_compatible_uint_format(src_fmtl); 2376 } else if (src_fmtl->bpb == dst_fmtl->bpb) { 2377 params.src.view.format = params.dst.view.format; 2378 } else { 2379 params.src.view.format = 2380 get_copy_format_for_bpb(isl_dev, src_fmtl->bpb); 2381 } 2382 } else if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E) { 2383 params.src.view.format = get_ccs_compatible_uint_format(src_fmtl); 2384 if (src_fmtl->bpb == dst_fmtl->bpb) { 2385 params.dst.view.format = params.src.view.format; 2386 } else { 2387 params.dst.view.format = 2388 get_copy_format_for_bpb(isl_dev, dst_fmtl->bpb); 2389 } 2390 } else { 2391 params.dst.view.format = get_copy_format_for_bpb(isl_dev, dst_fmtl->bpb); 2392 params.src.view.format = get_copy_format_for_bpb(isl_dev, src_fmtl->bpb); 2393 } 2394 2395 if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E) { 2396 params.src.clear_color = 2397 bitcast_color_value_to_uint(params.src.clear_color, src_fmtl); 2398 } 2399 2400 if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E) { 2401 params.dst.clear_color = 2402 bitcast_color_value_to_uint(params.dst.clear_color, dst_fmtl); 2403 } 2404 2405 wm_prog_key.src_bpc = 2406 isl_format_get_layout(params.src.view.format)->channels.r.bits; 2407 wm_prog_key.dst_bpc = 2408 isl_format_get_layout(params.dst.view.format)->channels.r.bits; 2409 2410 if (src_fmtl->bw > 1 || src_fmtl->bh > 1) { 2411 surf_convert_to_uncompressed(batch->blorp->isl_dev, ¶ms.src, 2412 &src_x, &src_y, &src_width, &src_height); 2413 wm_prog_key.need_src_offset = true; 2414 } 2415 2416 if (dst_fmtl->bw > 1 || dst_fmtl->bh > 1) { 2417 surf_convert_to_uncompressed(batch->blorp->isl_dev, ¶ms.dst, 2418 &dst_x, &dst_y, NULL, NULL); 2419 wm_prog_key.need_dst_offset = true; 2420 } 2421 2422 /* Once both surfaces are stompped to uncompressed as needed, the 2423 * destination size is the same as the source size. 2424 */ 2425 uint32_t dst_width = src_width; 2426 uint32_t dst_height = src_height; 2427 2428 struct blt_coords coords = { 2429 .x = { 2430 .src0 = src_x, 2431 .src1 = src_x + src_width, 2432 .dst0 = dst_x, 2433 .dst1 = dst_x + dst_width, 2434 .mirror = false 2435 }, 2436 .y = { 2437 .src0 = src_y, 2438 .src1 = src_y + src_height, 2439 .dst0 = dst_y, 2440 .dst1 = dst_y + dst_height, 2441 .mirror = false 2442 } 2443 }; 2444 2445 do_blorp_blit(batch, ¶ms, &wm_prog_key, &coords); 2446 } 2447