Home | History | Annotate | Download | only in blorp
      1 /*
      2  * Copyright  2013 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 #include "util/ralloc.h"
     25 
     26 #include "main/macros.h" /* Needed for MAX3 and MAX2 for format_rgb9e5 */
     27 #include "util/format_rgb9e5.h"
     28 #include "util/format_srgb.h"
     29 
     30 #include "blorp_priv.h"
     31 #include "compiler/brw_eu_defines.h"
     32 
     33 #include "blorp_nir_builder.h"
     34 
     35 #define FILE_DEBUG_FLAG DEBUG_BLORP
     36 
     37 struct brw_blorp_const_color_prog_key
     38 {
     39    enum blorp_shader_type shader_type; /* Must be BLORP_SHADER_TYPE_CLEAR */
     40    bool use_simd16_replicated_data;
     41    bool pad[3];
     42 };
     43 
     44 static bool
     45 blorp_params_get_clear_kernel(struct blorp_context *blorp,
     46                               struct blorp_params *params,
     47                               bool use_replicated_data)
     48 {
     49    const struct brw_blorp_const_color_prog_key blorp_key = {
     50       .shader_type = BLORP_SHADER_TYPE_CLEAR,
     51       .use_simd16_replicated_data = use_replicated_data,
     52    };
     53 
     54    if (blorp->lookup_shader(blorp, &blorp_key, sizeof(blorp_key),
     55                             &params->wm_prog_kernel, &params->wm_prog_data))
     56       return true;
     57 
     58    void *mem_ctx = ralloc_context(NULL);
     59 
     60    nir_builder b;
     61    nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL);
     62    b.shader->info.name = ralloc_strdup(b.shader, "BLORP-clear");
     63 
     64    nir_variable *v_color =
     65       BLORP_CREATE_NIR_INPUT(b.shader, clear_color, glsl_vec4_type());
     66 
     67    nir_variable *frag_color = nir_variable_create(b.shader, nir_var_shader_out,
     68                                                   glsl_vec4_type(),
     69                                                   "gl_FragColor");
     70    frag_color->data.location = FRAG_RESULT_COLOR;
     71 
     72    nir_copy_var(&b, frag_color, v_color);
     73 
     74    struct brw_wm_prog_key wm_key;
     75    brw_blorp_init_wm_prog_key(&wm_key);
     76 
     77    struct brw_wm_prog_data prog_data;
     78    const unsigned *program =
     79       blorp_compile_fs(blorp, mem_ctx, b.shader, &wm_key, use_replicated_data,
     80                        &prog_data);
     81 
     82    bool result =
     83       blorp->upload_shader(blorp, &blorp_key, sizeof(blorp_key),
     84                            program, prog_data.base.program_size,
     85                            &prog_data.base, sizeof(prog_data),
     86                            &params->wm_prog_kernel, &params->wm_prog_data);
     87 
     88    ralloc_free(mem_ctx);
     89    return result;
     90 }
     91 
     92 struct layer_offset_vs_key {
     93    enum blorp_shader_type shader_type;
     94    unsigned num_inputs;
     95 };
     96 
     97 /* In the case of doing attachment clears, we are using a surface state that
     98  * is handed to us so we can't set (and don't even know) the base array layer.
     99  * In order to do a layered clear in this scenario, we need some way of adding
    100  * the base array layer to the instance id.  Unfortunately, our hardware has
    101  * no real concept of "base instance", so we have to do it manually in a
    102  * vertex shader.
    103  */
    104 static bool
    105 blorp_params_get_layer_offset_vs(struct blorp_context *blorp,
    106                                  struct blorp_params *params)
    107 {
    108    struct layer_offset_vs_key blorp_key = {
    109       .shader_type = BLORP_SHADER_TYPE_LAYER_OFFSET_VS,
    110    };
    111 
    112    if (params->wm_prog_data)
    113       blorp_key.num_inputs = params->wm_prog_data->num_varying_inputs;
    114 
    115    if (blorp->lookup_shader(blorp, &blorp_key, sizeof(blorp_key),
    116                             &params->vs_prog_kernel, &params->vs_prog_data))
    117       return true;
    118 
    119    void *mem_ctx = ralloc_context(NULL);
    120 
    121    nir_builder b;
    122    nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_VERTEX, NULL);
    123    b.shader->info.name = ralloc_strdup(b.shader, "BLORP-layer-offset-vs");
    124 
    125    const struct glsl_type *uvec4_type = glsl_vector_type(GLSL_TYPE_UINT, 4);
    126 
    127    /* First we deal with the header which has instance and base instance */
    128    nir_variable *a_header = nir_variable_create(b.shader, nir_var_shader_in,
    129                                                 uvec4_type, "header");
    130    a_header->data.location = VERT_ATTRIB_GENERIC0;
    131 
    132    nir_variable *v_layer = nir_variable_create(b.shader, nir_var_shader_out,
    133                                                glsl_int_type(), "layer_id");
    134    v_layer->data.location = VARYING_SLOT_LAYER;
    135 
    136    /* Compute the layer id */
    137    nir_ssa_def *header = nir_load_var(&b, a_header);
    138    nir_ssa_def *base_layer = nir_channel(&b, header, 0);
    139    nir_ssa_def *instance = nir_channel(&b, header, 1);
    140    nir_store_var(&b, v_layer, nir_iadd(&b, instance, base_layer), 0x1);
    141 
    142    /* Then we copy the vertex from the next slot to VARYING_SLOT_POS */
    143    nir_variable *a_vertex = nir_variable_create(b.shader, nir_var_shader_in,
    144                                                 glsl_vec4_type(), "a_vertex");
    145    a_vertex->data.location = VERT_ATTRIB_GENERIC1;
    146 
    147    nir_variable *v_pos = nir_variable_create(b.shader, nir_var_shader_out,
    148                                              glsl_vec4_type(), "v_pos");
    149    v_pos->data.location = VARYING_SLOT_POS;
    150 
    151    nir_copy_var(&b, v_pos, a_vertex);
    152 
    153    /* Then we copy everything else */
    154    for (unsigned i = 0; i < blorp_key.num_inputs; i++) {
    155       nir_variable *a_in = nir_variable_create(b.shader, nir_var_shader_in,
    156                                                uvec4_type, "input");
    157       a_in->data.location = VERT_ATTRIB_GENERIC2 + i;
    158 
    159       nir_variable *v_out = nir_variable_create(b.shader, nir_var_shader_out,
    160                                                 uvec4_type, "output");
    161       v_out->data.location = VARYING_SLOT_VAR0 + i;
    162 
    163       nir_copy_var(&b, v_out, a_in);
    164    }
    165 
    166    struct brw_vs_prog_data vs_prog_data;
    167    memset(&vs_prog_data, 0, sizeof(vs_prog_data));
    168 
    169    const unsigned *program =
    170       blorp_compile_vs(blorp, mem_ctx, b.shader, &vs_prog_data);
    171 
    172    bool result =
    173       blorp->upload_shader(blorp, &blorp_key, sizeof(blorp_key),
    174                            program, vs_prog_data.base.base.program_size,
    175                            &vs_prog_data.base.base, sizeof(vs_prog_data),
    176                            &params->vs_prog_kernel, &params->vs_prog_data);
    177 
    178    ralloc_free(mem_ctx);
    179    return result;
    180 }
    181 
    182 /* The x0, y0, x1, and y1 parameters must already be populated with the render
    183  * area of the framebuffer to be cleared.
    184  */
    185 static void
    186 get_fast_clear_rect(const struct isl_device *dev,
    187                     const struct isl_surf *aux_surf,
    188                     unsigned *x0, unsigned *y0,
    189                     unsigned *x1, unsigned *y1)
    190 {
    191    unsigned int x_align, y_align;
    192    unsigned int x_scaledown, y_scaledown;
    193 
    194    /* Only single sampled surfaces need to (and actually can) be resolved. */
    195    if (aux_surf->usage == ISL_SURF_USAGE_CCS_BIT) {
    196       /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
    197        * Target(s)", beneath the "Fast Color Clear" bullet (p327):
    198        *
    199        *     Clear pass must have a clear rectangle that must follow
    200        *     alignment rules in terms of pixels and lines as shown in the
    201        *     table below. Further, the clear-rectangle height and width
    202        *     must be multiple of the following dimensions. If the height
    203        *     and width of the render target being cleared do not meet these
    204        *     requirements, an MCS buffer can be created such that it
    205        *     follows the requirement and covers the RT.
    206        *
    207        * The alignment size in the table that follows is related to the
    208        * alignment size that is baked into the CCS surface format but with X
    209        * alignment multiplied by 16 and Y alignment multiplied by 32.
    210        */
    211       x_align = isl_format_get_layout(aux_surf->format)->bw;
    212       y_align = isl_format_get_layout(aux_surf->format)->bh;
    213 
    214       x_align *= 16;
    215 
    216       /* SKL+ line alignment requirement for Y-tiled are half those of the prior
    217        * generations.
    218        */
    219       if (dev->info->gen >= 9)
    220          y_align *= 16;
    221       else
    222          y_align *= 32;
    223 
    224       /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
    225        * Target(s)", beneath the "Fast Color Clear" bullet (p327):
    226        *
    227        *     In order to optimize the performance MCS buffer (when bound to
    228        *     1X RT) clear similarly to MCS buffer clear for MSRT case,
    229        *     clear rect is required to be scaled by the following factors
    230        *     in the horizontal and vertical directions:
    231        *
    232        * The X and Y scale down factors in the table that follows are each
    233        * equal to half the alignment value computed above.
    234        */
    235       x_scaledown = x_align / 2;
    236       y_scaledown = y_align / 2;
    237 
    238       /* From BSpec: 3D-Media-GPGPU Engine > 3D Pipeline > Pixel > Pixel
    239        * Backend > MCS Buffer for Render Target(s) [DevIVB+] > Table "Color
    240        * Clear of Non-MultiSampled Render Target Restrictions":
    241        *
    242        *   Clear rectangle must be aligned to two times the number of
    243        *   pixels in the table shown below due to 16x16 hashing across the
    244        *   slice.
    245        */
    246       x_align *= 2;
    247       y_align *= 2;
    248    } else {
    249       assert(aux_surf->usage == ISL_SURF_USAGE_MCS_BIT);
    250 
    251       /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
    252        * Target(s)", beneath the "MSAA Compression" bullet (p326):
    253        *
    254        *     Clear pass for this case requires that scaled down primitive
    255        *     is sent down with upper left co-ordinate to coincide with
    256        *     actual rectangle being cleared. For MSAA, clear rectangles
    257        *     height and width need to as show in the following table in
    258        *     terms of (width,height) of the RT.
    259        *
    260        *     MSAA  Width of Clear Rect  Height of Clear Rect
    261        *      2X     Ceil(1/8*width)      Ceil(1/2*height)
    262        *      4X     Ceil(1/8*width)      Ceil(1/2*height)
    263        *      8X     Ceil(1/2*width)      Ceil(1/2*height)
    264        *     16X         width            Ceil(1/2*height)
    265        *
    266        * The text "with upper left co-ordinate to coincide with actual
    267        * rectangle being cleared" is a little confusing--it seems to imply
    268        * that to clear a rectangle from (x,y) to (x+w,y+h), one needs to
    269        * feed the pipeline using the rectangle (x,y) to
    270        * (x+Ceil(w/N),y+Ceil(h/2)), where N is either 2 or 8 depending on
    271        * the number of samples.  Experiments indicate that this is not
    272        * quite correct; actually, what the hardware appears to do is to
    273        * align whatever rectangle is sent down the pipeline to the nearest
    274        * multiple of 2x2 blocks, and then scale it up by a factor of N
    275        * horizontally and 2 vertically.  So the resulting alignment is 4
    276        * vertically and either 4 or 16 horizontally, and the scaledown
    277        * factor is 2 vertically and either 2 or 8 horizontally.
    278        */
    279       switch (aux_surf->format) {
    280       case ISL_FORMAT_MCS_2X:
    281       case ISL_FORMAT_MCS_4X:
    282          x_scaledown = 8;
    283          break;
    284       case ISL_FORMAT_MCS_8X:
    285          x_scaledown = 2;
    286          break;
    287       case ISL_FORMAT_MCS_16X:
    288          x_scaledown = 1;
    289          break;
    290       default:
    291          unreachable("Unexpected MCS format for fast clear");
    292       }
    293       y_scaledown = 2;
    294       x_align = x_scaledown * 2;
    295       y_align = y_scaledown * 2;
    296    }
    297 
    298    *x0 = ROUND_DOWN_TO(*x0,  x_align) / x_scaledown;
    299    *y0 = ROUND_DOWN_TO(*y0, y_align) / y_scaledown;
    300    *x1 = ALIGN(*x1, x_align) / x_scaledown;
    301    *y1 = ALIGN(*y1, y_align) / y_scaledown;
    302 }
    303 
    304 void
    305 blorp_fast_clear(struct blorp_batch *batch,
    306                  const struct blorp_surf *surf, enum isl_format format,
    307                  uint32_t level, uint32_t start_layer, uint32_t num_layers,
    308                  uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1)
    309 {
    310    /* Ensure that all layers undergoing the clear have an auxiliary buffer. */
    311    assert(start_layer + num_layers <=
    312           MAX2(surf->aux_surf->logical_level0_px.depth >> level,
    313                surf->aux_surf->logical_level0_px.array_len));
    314 
    315    struct blorp_params params;
    316    blorp_params_init(&params);
    317    params.num_layers = num_layers;
    318 
    319    params.x0 = x0;
    320    params.y0 = y0;
    321    params.x1 = x1;
    322    params.y1 = y1;
    323 
    324    memset(&params.wm_inputs.clear_color, 0xff, 4*sizeof(float));
    325    params.fast_clear_op = BLORP_FAST_CLEAR_OP_CLEAR;
    326 
    327    get_fast_clear_rect(batch->blorp->isl_dev, surf->aux_surf,
    328                        &params.x0, &params.y0, &params.x1, &params.y1);
    329 
    330    if (!blorp_params_get_clear_kernel(batch->blorp, &params, true))
    331       return;
    332 
    333    brw_blorp_surface_info_init(batch->blorp, &params.dst, surf, level,
    334                                start_layer, format, true);
    335    params.num_samples = params.dst.surf.samples;
    336 
    337    batch->blorp->exec(batch, &params);
    338 }
    339 
    340 static union isl_color_value
    341 swizzle_color_value(union isl_color_value src, struct isl_swizzle swizzle)
    342 {
    343    union isl_color_value dst = { .u32 = { 0, } };
    344 
    345    /* We assign colors in ABGR order so that the first one will be taken in
    346     * RGBA precedence order.  According to the PRM docs for shader channel
    347     * select, this matches Haswell hardware behavior.
    348     */
    349    if ((unsigned)(swizzle.a - ISL_CHANNEL_SELECT_RED) < 4)
    350       dst.u32[swizzle.a - ISL_CHANNEL_SELECT_RED] = src.u32[3];
    351    if ((unsigned)(swizzle.b - ISL_CHANNEL_SELECT_RED) < 4)
    352       dst.u32[swizzle.b - ISL_CHANNEL_SELECT_RED] = src.u32[2];
    353    if ((unsigned)(swizzle.g - ISL_CHANNEL_SELECT_RED) < 4)
    354       dst.u32[swizzle.g - ISL_CHANNEL_SELECT_RED] = src.u32[1];
    355    if ((unsigned)(swizzle.r - ISL_CHANNEL_SELECT_RED) < 4)
    356       dst.u32[swizzle.r - ISL_CHANNEL_SELECT_RED] = src.u32[0];
    357 
    358    return dst;
    359 }
    360 
    361 void
    362 blorp_clear(struct blorp_batch *batch,
    363             const struct blorp_surf *surf,
    364             enum isl_format format, struct isl_swizzle swizzle,
    365             uint32_t level, uint32_t start_layer, uint32_t num_layers,
    366             uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1,
    367             union isl_color_value clear_color,
    368             const bool color_write_disable[4])
    369 {
    370    struct blorp_params params;
    371    blorp_params_init(&params);
    372 
    373    /* Manually apply the clear destination swizzle.  This way swizzled clears
    374     * will work for swizzles which we can't normally use for rendering and it
    375     * also ensures that they work on pre-Haswell hardware which can't swizlle
    376     * at all.
    377     */
    378    clear_color = swizzle_color_value(clear_color, swizzle);
    379    swizzle = ISL_SWIZZLE_IDENTITY;
    380 
    381    if (format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) {
    382       clear_color.u32[0] = float3_to_rgb9e5(clear_color.f32);
    383       format = ISL_FORMAT_R32_UINT;
    384    } else if (format == ISL_FORMAT_L8_UNORM_SRGB) {
    385       clear_color.f32[0] = util_format_linear_to_srgb_float(clear_color.f32[0]);
    386       format = ISL_FORMAT_R8_UNORM;
    387    } else if (format == ISL_FORMAT_A4B4G4R4_UNORM) {
    388       /* Broadwell and earlier cannot render to this format so we need to work
    389        * around it by swapping the colors around and using B4G4R4A4 instead.
    390        */
    391       const struct isl_swizzle ARGB = ISL_SWIZZLE(ALPHA, RED, GREEN, BLUE);
    392       clear_color = swizzle_color_value(clear_color, ARGB);
    393       format = ISL_FORMAT_B4G4R4A4_UNORM;
    394    }
    395 
    396    memcpy(&params.wm_inputs.clear_color, clear_color.f32, sizeof(float) * 4);
    397 
    398    bool use_simd16_replicated_data = true;
    399 
    400    /* From the SNB PRM (Vol4_Part1):
    401     *
    402     *     "Replicated data (Message Type = 111) is only supported when
    403     *      accessing tiled memory.  Using this Message Type to access linear
    404     *      (untiled) memory is UNDEFINED."
    405     */
    406    if (surf->surf->tiling == ISL_TILING_LINEAR)
    407       use_simd16_replicated_data = false;
    408 
    409    /* Replicated clears don't work yet before gen6 */
    410    if (batch->blorp->isl_dev->info->gen < 6)
    411       use_simd16_replicated_data = false;
    412 
    413    /* Constant color writes ignore everyting in blend and color calculator
    414     * state.  This is not documented.
    415     */
    416    if (color_write_disable) {
    417       for (unsigned i = 0; i < 4; i++) {
    418          params.color_write_disable[i] = color_write_disable[i];
    419          if (color_write_disable[i])
    420             use_simd16_replicated_data = false;
    421       }
    422    }
    423 
    424    if (!blorp_params_get_clear_kernel(batch->blorp, &params,
    425                                       use_simd16_replicated_data))
    426       return;
    427 
    428    if (!blorp_ensure_sf_program(batch->blorp, &params))
    429       return;
    430 
    431    while (num_layers > 0) {
    432       brw_blorp_surface_info_init(batch->blorp, &params.dst, surf, level,
    433                                   start_layer, format, true);
    434       params.dst.view.swizzle = swizzle;
    435 
    436       params.x0 = x0;
    437       params.y0 = y0;
    438       params.x1 = x1;
    439       params.y1 = y1;
    440 
    441       /* The MinLOD and MinimumArrayElement don't work properly for cube maps.
    442        * Convert them to a single slice on gen4.
    443        */
    444       if (batch->blorp->isl_dev->info->gen == 4 &&
    445           (params.dst.surf.usage & ISL_SURF_USAGE_CUBE_BIT)) {
    446          blorp_surf_convert_to_single_slice(batch->blorp->isl_dev, &params.dst);
    447       }
    448 
    449       if (isl_format_is_compressed(params.dst.surf.format)) {
    450          blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev, &params.dst,
    451                                             NULL, NULL, NULL, NULL);
    452                                             //&dst_x, &dst_y, &dst_w, &dst_h);
    453       }
    454 
    455       if (params.dst.tile_x_sa || params.dst.tile_y_sa) {
    456          /* Either we're on gen4 where there is no multisampling or the
    457           * surface is compressed which also implies no multisampling.
    458           * Therefore, sa == px and we don't need to do a conversion.
    459           */
    460          assert(params.dst.surf.samples == 1);
    461          params.x0 += params.dst.tile_x_sa;
    462          params.y0 += params.dst.tile_y_sa;
    463          params.x1 += params.dst.tile_x_sa;
    464          params.y1 += params.dst.tile_y_sa;
    465       }
    466 
    467       params.num_samples = params.dst.surf.samples;
    468 
    469       /* We may be restricted on the number of layers we can bind at any one
    470        * time.  In particular, Sandy Bridge has a maximum number of layers of
    471        * 512 but a maximum 3D texture size is much larger.
    472        */
    473       params.num_layers = MIN2(params.dst.view.array_len, num_layers);
    474       batch->blorp->exec(batch, &params);
    475 
    476       start_layer += params.num_layers;
    477       num_layers -= params.num_layers;
    478    }
    479 }
    480 
    481 void
    482 blorp_clear_depth_stencil(struct blorp_batch *batch,
    483                           const struct blorp_surf *depth,
    484                           const struct blorp_surf *stencil,
    485                           uint32_t level, uint32_t start_layer,
    486                           uint32_t num_layers,
    487                           uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1,
    488                           bool clear_depth, float depth_value,
    489                           uint8_t stencil_mask, uint8_t stencil_value)
    490 {
    491    struct blorp_params params;
    492    blorp_params_init(&params);
    493 
    494    params.x0 = x0;
    495    params.y0 = y0;
    496    params.x1 = x1;
    497    params.y1 = y1;
    498 
    499    if (ISL_DEV_GEN(batch->blorp->isl_dev) == 6) {
    500       /* For some reason, Sandy Bridge gets occlusion queries wrong if we
    501        * don't have a shader.  In particular, it records samples even though
    502        * we disable statistics in 3DSTATE_WM.  Give it the usual clear shader
    503        * to work around the issue.
    504        */
    505       if (!blorp_params_get_clear_kernel(batch->blorp, &params, false))
    506          return;
    507    }
    508 
    509    while (num_layers > 0) {
    510       params.num_layers = num_layers;
    511 
    512       if (stencil_mask) {
    513          brw_blorp_surface_info_init(batch->blorp, &params.stencil, stencil,
    514                                      level, start_layer,
    515                                      ISL_FORMAT_UNSUPPORTED, true);
    516          params.stencil_mask = stencil_mask;
    517          params.stencil_ref = stencil_value;
    518 
    519          params.dst.surf.samples = params.stencil.surf.samples;
    520          params.dst.surf.logical_level0_px =
    521             params.stencil.surf.logical_level0_px;
    522          params.dst.view = params.depth.view;
    523 
    524          params.num_samples = params.stencil.surf.samples;
    525 
    526          /* We may be restricted on the number of layers we can bind at any
    527           * one time.  In particular, Sandy Bridge has a maximum number of
    528           * layers of 512 but a maximum 3D texture size is much larger.
    529           */
    530          if (params.stencil.view.array_len < params.num_layers)
    531             params.num_layers = params.stencil.view.array_len;
    532       }
    533 
    534       if (clear_depth) {
    535          brw_blorp_surface_info_init(batch->blorp, &params.depth, depth,
    536                                      level, start_layer,
    537                                      ISL_FORMAT_UNSUPPORTED, true);
    538          params.z = depth_value;
    539          params.depth_format =
    540             isl_format_get_depth_format(depth->surf->format, false);
    541 
    542          params.dst.surf.samples = params.depth.surf.samples;
    543          params.dst.surf.logical_level0_px =
    544             params.depth.surf.logical_level0_px;
    545          params.dst.view = params.depth.view;
    546 
    547          params.num_samples = params.depth.surf.samples;
    548 
    549          /* We may be restricted on the number of layers we can bind at any
    550           * one time.  In particular, Sandy Bridge has a maximum number of
    551           * layers of 512 but a maximum 3D texture size is much larger.
    552           */
    553          if (params.depth.view.array_len < params.num_layers)
    554             params.num_layers = params.depth.view.array_len;
    555       }
    556 
    557       batch->blorp->exec(batch, &params);
    558 
    559       start_layer += params.num_layers;
    560       num_layers -= params.num_layers;
    561    }
    562 }
    563 
    564 bool
    565 blorp_can_hiz_clear_depth(uint8_t gen, enum isl_format format,
    566                           uint32_t num_samples,
    567                           uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1)
    568 {
    569    /* This function currently doesn't support any gen prior to gen8 */
    570    assert(gen >= 8);
    571 
    572    if (gen == 8 && format == ISL_FORMAT_R16_UNORM) {
    573       /* Apply the D16 alignment restrictions. On BDW, HiZ has an 8x4 sample
    574        * block with the following property: as the number of samples increases,
    575        * the number of pixels representable by this block decreases by a factor
    576        * of the sample dimensions. Sample dimensions scale following the MSAA
    577        * interleaved pattern.
    578        *
    579        * Sample|Sample|Pixel
    580        * Count |Dim   |Dim
    581        * ===================
    582        *    1  | 1x1  | 8x4
    583        *    2  | 2x1  | 4x4
    584        *    4  | 2x2  | 4x2
    585        *    8  | 4x2  | 2x2
    586        *   16  | 4x4  | 2x1
    587        *
    588        * Table: Pixel Dimensions in a HiZ Sample Block Pre-SKL
    589        */
    590       const struct isl_extent2d sa_block_dim =
    591          isl_get_interleaved_msaa_px_size_sa(num_samples);
    592       const uint8_t align_px_w = 8 / sa_block_dim.w;
    593       const uint8_t align_px_h = 4 / sa_block_dim.h;
    594 
    595       /* Fast depth clears clear an entire sample block at a time. As a result,
    596        * the rectangle must be aligned to the dimensions of the encompassing
    597        * pixel block for a successful operation.
    598        *
    599        * Fast clears can still work if the upper-left corner is aligned and the
    600        * bottom-rigtht corner touches the edge of a depth buffer whose extent
    601        * is unaligned. This is because each miplevel in the depth buffer is
    602        * padded by the Pixel Dim (similar to a standard compressed texture).
    603        * In this case, the clear rectangle could be padded by to match the full
    604        * depth buffer extent but to support multiple clearing techniques, we
    605        * chose to be unaware of the depth buffer's extent and thus don't handle
    606        * this case.
    607        */
    608       if (x0 % align_px_w || y0 % align_px_h ||
    609           x1 % align_px_w || y1 % align_px_h)
    610          return false;
    611    }
    612    return true;
    613 }
    614 
    615 /* Given a depth stencil attachment, this function performs a fast depth clear
    616  * on a depth portion and a regular clear on the stencil portion. When
    617  * performing a fast depth clear on the depth portion, the HiZ buffer is simply
    618  * tagged as cleared so the depth clear value is not actually needed.
    619  */
    620 void
    621 blorp_gen8_hiz_clear_attachments(struct blorp_batch *batch,
    622                                  uint32_t num_samples,
    623                                  uint32_t x0, uint32_t y0,
    624                                  uint32_t x1, uint32_t y1,
    625                                  bool clear_depth, bool clear_stencil,
    626                                  uint8_t stencil_value)
    627 {
    628    assert(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL);
    629 
    630    struct blorp_params params;
    631    blorp_params_init(&params);
    632    params.num_layers = 1;
    633    params.hiz_op = BLORP_HIZ_OP_DEPTH_CLEAR;
    634    params.x0 = x0;
    635    params.y0 = y0;
    636    params.x1 = x1;
    637    params.y1 = y1;
    638    params.num_samples = num_samples;
    639    params.depth.enabled = clear_depth;
    640    params.stencil.enabled = clear_stencil;
    641    params.stencil_ref = stencil_value;
    642    batch->blorp->exec(batch, &params);
    643 }
    644 
    645 /** Clear active color/depth/stencili attachments
    646  *
    647  * This function performs a clear operation on the currently bound
    648  * color/depth/stencil attachments.  It is assumed that any information passed
    649  * in here is valid, consistent, and in-bounds relative to the currently
    650  * attached depth/stencil.  The binding_table_offset parameter is the 32-bit
    651  * offset relative to surface state base address where pre-baked binding table
    652  * that we are to use lives.  If clear_color is false, binding_table_offset
    653  * must point to a binding table with one entry which is a valid null surface
    654  * that matches the currently bound depth and stencil.
    655  */
    656 void
    657 blorp_clear_attachments(struct blorp_batch *batch,
    658                         uint32_t binding_table_offset,
    659                         enum isl_format depth_format,
    660                         uint32_t num_samples,
    661                         uint32_t start_layer, uint32_t num_layers,
    662                         uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1,
    663                         bool clear_color, union isl_color_value color_value,
    664                         bool clear_depth, float depth_value,
    665                         uint8_t stencil_mask, uint8_t stencil_value)
    666 {
    667    struct blorp_params params;
    668    blorp_params_init(&params);
    669 
    670    assert(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL);
    671 
    672    params.x0 = x0;
    673    params.y0 = y0;
    674    params.x1 = x1;
    675    params.y1 = y1;
    676 
    677    params.use_pre_baked_binding_table = true;
    678    params.pre_baked_binding_table_offset = binding_table_offset;
    679 
    680    params.num_layers = num_layers;
    681    params.num_samples = num_samples;
    682 
    683    if (clear_color) {
    684       params.dst.enabled = true;
    685 
    686       memcpy(&params.wm_inputs.clear_color, color_value.f32, sizeof(float) * 4);
    687 
    688       /* Unfortunately, without knowing whether or not our destination surface
    689        * is tiled or not, we have to assume it may be linear.  This means no
    690        * SIMD16_REPDATA for us. :-(
    691        */
    692       if (!blorp_params_get_clear_kernel(batch->blorp, &params, false))
    693          return;
    694    }
    695 
    696    if (clear_depth) {
    697       params.depth.enabled = true;
    698 
    699       params.z = depth_value;
    700       params.depth_format = isl_format_get_depth_format(depth_format, false);
    701    }
    702 
    703    if (stencil_mask) {
    704       params.stencil.enabled = true;
    705 
    706       params.stencil_mask = stencil_mask;
    707       params.stencil_ref = stencil_value;
    708    }
    709 
    710    if (!blorp_params_get_layer_offset_vs(batch->blorp, &params))
    711       return;
    712 
    713    params.vs_inputs.base_layer = start_layer;
    714 
    715    batch->blorp->exec(batch, &params);
    716 }
    717 
    718 void
    719 blorp_ccs_resolve(struct blorp_batch *batch,
    720                   struct blorp_surf *surf, uint32_t level,
    721                   uint32_t start_layer, uint32_t num_layers,
    722                   enum isl_format format,
    723                   enum blorp_fast_clear_op resolve_op)
    724 {
    725    struct blorp_params params;
    726 
    727    blorp_params_init(&params);
    728    brw_blorp_surface_info_init(batch->blorp, &params.dst, surf,
    729                                level, start_layer, format, true);
    730 
    731    /* From the Ivy Bridge PRM, Vol2 Part1 11.9 "Render Target Resolve":
    732     *
    733     *     A rectangle primitive must be scaled down by the following factors
    734     *     with respect to render target being resolved.
    735     *
    736     * The scaledown factors in the table that follows are related to the block
    737     * size of the CCS format.  For IVB and HSW, we divide by two, for BDW we
    738     * multiply by 8 and 16. On Sky Lake, we multiply by 8.
    739     */
    740    const struct isl_format_layout *aux_fmtl =
    741       isl_format_get_layout(params.dst.aux_surf.format);
    742    assert(aux_fmtl->txc == ISL_TXC_CCS);
    743 
    744    unsigned x_scaledown, y_scaledown;
    745    if (ISL_DEV_GEN(batch->blorp->isl_dev) >= 9) {
    746       x_scaledown = aux_fmtl->bw * 8;
    747       y_scaledown = aux_fmtl->bh * 8;
    748    } else if (ISL_DEV_GEN(batch->blorp->isl_dev) >= 8) {
    749       x_scaledown = aux_fmtl->bw * 8;
    750       y_scaledown = aux_fmtl->bh * 16;
    751    } else {
    752       x_scaledown = aux_fmtl->bw / 2;
    753       y_scaledown = aux_fmtl->bh / 2;
    754    }
    755    params.x0 = params.y0 = 0;
    756    params.x1 = minify(params.dst.aux_surf.logical_level0_px.width, level);
    757    params.y1 = minify(params.dst.aux_surf.logical_level0_px.height, level);
    758    params.x1 = ALIGN(params.x1, x_scaledown) / x_scaledown;
    759    params.y1 = ALIGN(params.y1, y_scaledown) / y_scaledown;
    760 
    761    if (batch->blorp->isl_dev->info->gen >= 9) {
    762       assert(resolve_op == BLORP_FAST_CLEAR_OP_RESOLVE_FULL ||
    763              resolve_op == BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL);
    764    } else {
    765       /* Broadwell and earlier do not have a partial resolve */
    766       assert(resolve_op == BLORP_FAST_CLEAR_OP_RESOLVE_FULL);
    767    }
    768    params.fast_clear_op = resolve_op;
    769    params.num_layers = num_layers;
    770 
    771    /* Note: there is no need to initialize push constants because it doesn't
    772     * matter what data gets dispatched to the render target.  However, we must
    773     * ensure that the fragment shader delivers the data using the "replicated
    774     * color" message.
    775     */
    776 
    777    if (!blorp_params_get_clear_kernel(batch->blorp, &params, true))
    778       return;
    779 
    780    batch->blorp->exec(batch, &params);
    781 }
    782 
    783 struct blorp_mcs_partial_resolve_key
    784 {
    785    enum blorp_shader_type shader_type;
    786    uint32_t num_samples;
    787 };
    788 
    789 static bool
    790 blorp_params_get_mcs_partial_resolve_kernel(struct blorp_context *blorp,
    791                                             struct blorp_params *params)
    792 {
    793    const struct blorp_mcs_partial_resolve_key blorp_key = {
    794       .shader_type = BLORP_SHADER_TYPE_MCS_PARTIAL_RESOLVE,
    795       .num_samples = params->num_samples,
    796    };
    797 
    798    if (blorp->lookup_shader(blorp, &blorp_key, sizeof(blorp_key),
    799                             &params->wm_prog_kernel, &params->wm_prog_data))
    800       return true;
    801 
    802    void *mem_ctx = ralloc_context(NULL);
    803 
    804    nir_builder b;
    805    nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL);
    806    b.shader->info.name = ralloc_strdup(b.shader, "BLORP-mcs-partial-resolve");
    807 
    808    nir_variable *v_color =
    809       BLORP_CREATE_NIR_INPUT(b.shader, clear_color, glsl_vec4_type());
    810 
    811    nir_variable *frag_color =
    812       nir_variable_create(b.shader, nir_var_shader_out,
    813                           glsl_vec4_type(), "gl_FragColor");
    814    frag_color->data.location = FRAG_RESULT_COLOR;
    815 
    816    /* Do an MCS fetch and check if it is equal to the magic clear value */
    817    nir_ssa_def *mcs =
    818       blorp_nir_txf_ms_mcs(&b, nir_f2i32(&b, blorp_nir_frag_coord(&b)),
    819                                nir_load_layer_id(&b));
    820    nir_ssa_def *is_clear =
    821       blorp_nir_mcs_is_clear_color(&b, mcs, blorp_key.num_samples);
    822 
    823    /* If we aren't the clear value, discard. */
    824    nir_intrinsic_instr *discard =
    825       nir_intrinsic_instr_create(b.shader, nir_intrinsic_discard_if);
    826    discard->src[0] = nir_src_for_ssa(nir_inot(&b, is_clear));
    827    nir_builder_instr_insert(&b, &discard->instr);
    828 
    829    nir_copy_var(&b, frag_color, v_color);
    830 
    831    struct brw_wm_prog_key wm_key;
    832    brw_blorp_init_wm_prog_key(&wm_key);
    833    wm_key.tex.compressed_multisample_layout_mask = 1;
    834    wm_key.tex.msaa_16 = blorp_key.num_samples == 16;
    835    wm_key.multisample_fbo = true;
    836 
    837    struct brw_wm_prog_data prog_data;
    838    const unsigned *program =
    839       blorp_compile_fs(blorp, mem_ctx, b.shader, &wm_key, false,
    840                        &prog_data);
    841 
    842    bool result =
    843       blorp->upload_shader(blorp, &blorp_key, sizeof(blorp_key),
    844                            program, prog_data.base.program_size,
    845                            &prog_data.base, sizeof(prog_data),
    846                            &params->wm_prog_kernel, &params->wm_prog_data);
    847 
    848    ralloc_free(mem_ctx);
    849    return result;
    850 }
    851 
    852 void
    853 blorp_mcs_partial_resolve(struct blorp_batch *batch,
    854                           struct blorp_surf *surf,
    855                           enum isl_format format,
    856                           uint32_t start_layer, uint32_t num_layers)
    857 {
    858    struct blorp_params params;
    859    blorp_params_init(&params);
    860 
    861    assert(batch->blorp->isl_dev->info->gen >= 7);
    862 
    863    params.x0 = 0;
    864    params.y0 = 0;
    865    params.x1 = surf->surf->logical_level0_px.width;
    866    params.y1 = surf->surf->logical_level0_px.height;
    867 
    868    brw_blorp_surface_info_init(batch->blorp, &params.src, surf, 0,
    869                                start_layer, format, false);
    870    brw_blorp_surface_info_init(batch->blorp, &params.dst, surf, 0,
    871                                start_layer, format, true);
    872 
    873    params.num_samples = params.dst.surf.samples;
    874    params.num_layers = num_layers;
    875 
    876    memcpy(&params.wm_inputs.clear_color,
    877           surf->clear_color.f32, sizeof(float) * 4);
    878 
    879    if (!blorp_params_get_mcs_partial_resolve_kernel(batch->blorp, &params))
    880       return;
    881 
    882    batch->blorp->exec(batch, &params);
    883 }
    884