Home | History | Annotate | Download | only in i965
      1 /*
      2  * Copyright (c) 2015 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 #include "common/gen_l3_config.h"
     25 
     26 #include "brw_context.h"
     27 #include "brw_defines.h"
     28 #include "brw_state.h"
     29 #include "intel_batchbuffer.h"
     30 
     31 /**
     32  * Calculate the desired L3 partitioning based on the current state of the
     33  * pipeline.  For now this simply returns the conservative defaults calculated
     34  * by get_default_l3_weights(), but we could probably do better by gathering
     35  * more statistics from the pipeline state (e.g. guess of expected URB usage
     36  * and bound surfaces), or by using feed-back from performance counters.
     37  */
     38 static struct gen_l3_weights
     39 get_pipeline_state_l3_weights(const struct brw_context *brw)
     40 {
     41    const struct brw_stage_state *stage_states[] = {
     42       [MESA_SHADER_VERTEX] = &brw->vs.base,
     43       [MESA_SHADER_TESS_CTRL] = &brw->tcs.base,
     44       [MESA_SHADER_TESS_EVAL] = &brw->tes.base,
     45       [MESA_SHADER_GEOMETRY] = &brw->gs.base,
     46       [MESA_SHADER_FRAGMENT] = &brw->wm.base,
     47       [MESA_SHADER_COMPUTE] = &brw->cs.base
     48    };
     49    bool needs_dc = false, needs_slm = false;
     50 
     51    for (unsigned i = 0; i < ARRAY_SIZE(stage_states); i++) {
     52       const struct gl_shader_program *prog =
     53          brw->ctx._Shader->CurrentProgram[stage_states[i]->stage];
     54       const struct brw_stage_prog_data *prog_data = stage_states[i]->prog_data;
     55 
     56       needs_dc |= (prog && (prog->data->NumAtomicBuffers ||
     57                             prog->data->NumShaderStorageBlocks)) ||
     58          (prog_data && (prog_data->total_scratch || prog_data->nr_image_params));
     59       needs_slm |= prog_data && prog_data->total_shared;
     60    }
     61 
     62    return gen_get_default_l3_weights(&brw->screen->devinfo,
     63                                      needs_dc, needs_slm);
     64 }
     65 
     66 /**
     67  * Program the hardware to use the specified L3 configuration.
     68  */
     69 static void
     70 setup_l3_config(struct brw_context *brw, const struct gen_l3_config *cfg)
     71 {
     72    const bool has_dc = cfg->n[GEN_L3P_DC] || cfg->n[GEN_L3P_ALL];
     73    const bool has_is = cfg->n[GEN_L3P_IS] || cfg->n[GEN_L3P_RO] ||
     74                        cfg->n[GEN_L3P_ALL];
     75    const bool has_c = cfg->n[GEN_L3P_C] || cfg->n[GEN_L3P_RO] ||
     76                       cfg->n[GEN_L3P_ALL];
     77    const bool has_t = cfg->n[GEN_L3P_T] || cfg->n[GEN_L3P_RO] ||
     78                       cfg->n[GEN_L3P_ALL];
     79    const bool has_slm = cfg->n[GEN_L3P_SLM];
     80 
     81    /* According to the hardware docs, the L3 partitioning can only be changed
     82     * while the pipeline is completely drained and the caches are flushed,
     83     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
     84     */
     85    brw_emit_pipe_control_flush(brw,
     86                                PIPE_CONTROL_DATA_CACHE_FLUSH |
     87                                PIPE_CONTROL_NO_WRITE |
     88                                PIPE_CONTROL_CS_STALL);
     89 
     90    /* ...followed by a second pipelined PIPE_CONTROL that initiates
     91     * invalidation of the relevant caches.  Note that because RO invalidation
     92     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
     93     * command is processed by the CS) we cannot combine it with the previous
     94     * stalling flush as the hardware documentation suggests, because that
     95     * would cause the CS to stall on previous rendering *after* RO
     96     * invalidation and wouldn't prevent the RO caches from being polluted by
     97     * concurrent rendering before the stall completes.  This intentionally
     98     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
     99     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
    100     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
    101     * already guarantee that there is no concurrent GPGPU kernel execution
    102     * (see SKL HSD 2132585).
    103     */
    104    brw_emit_pipe_control_flush(brw,
    105                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
    106                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
    107                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
    108                                PIPE_CONTROL_STATE_CACHE_INVALIDATE |
    109                                PIPE_CONTROL_NO_WRITE);
    110 
    111    /* Now send a third stalling flush to make sure that invalidation is
    112     * complete when the L3 configuration registers are modified.
    113     */
    114    brw_emit_pipe_control_flush(brw,
    115                                PIPE_CONTROL_DATA_CACHE_FLUSH |
    116                                PIPE_CONTROL_NO_WRITE |
    117                                PIPE_CONTROL_CS_STALL);
    118 
    119    if (brw->gen >= 8) {
    120       assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]);
    121 
    122       BEGIN_BATCH(3);
    123       OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
    124 
    125       /* Set up the L3 partitioning. */
    126       OUT_BATCH(GEN8_L3CNTLREG);
    127       OUT_BATCH((has_slm ? GEN8_L3CNTLREG_SLM_ENABLE : 0) |
    128                 SET_FIELD(cfg->n[GEN_L3P_URB], GEN8_L3CNTLREG_URB_ALLOC) |
    129                 SET_FIELD(cfg->n[GEN_L3P_RO], GEN8_L3CNTLREG_RO_ALLOC) |
    130                 SET_FIELD(cfg->n[GEN_L3P_DC], GEN8_L3CNTLREG_DC_ALLOC) |
    131                 SET_FIELD(cfg->n[GEN_L3P_ALL], GEN8_L3CNTLREG_ALL_ALLOC));
    132 
    133       ADVANCE_BATCH();
    134 
    135    } else {
    136       assert(!cfg->n[GEN_L3P_ALL]);
    137 
    138       /* When enabled SLM only uses a portion of the L3 on half of the banks,
    139        * the matching space on the remaining banks has to be allocated to a
    140        * client (URB for all validated configurations) set to the
    141        * lower-bandwidth 2-bank address hashing mode.
    142        */
    143       const bool urb_low_bw = has_slm && !brw->is_baytrail;
    144       assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]);
    145 
    146       /* Minimum number of ways that can be allocated to the URB. */
    147       const unsigned n0_urb = (brw->is_baytrail ? 32 : 0);
    148       assert(cfg->n[GEN_L3P_URB] >= n0_urb);
    149 
    150       BEGIN_BATCH(7);
    151       OUT_BATCH(MI_LOAD_REGISTER_IMM | (7 - 2));
    152 
    153       /* Demote any clients with no ways assigned to LLC. */
    154       OUT_BATCH(GEN7_L3SQCREG1);
    155       OUT_BATCH((brw->is_haswell ? HSW_L3SQCREG1_SQGHPCI_DEFAULT :
    156                  brw->is_baytrail ? VLV_L3SQCREG1_SQGHPCI_DEFAULT :
    157                  IVB_L3SQCREG1_SQGHPCI_DEFAULT) |
    158                 (has_dc ? 0 : GEN7_L3SQCREG1_CONV_DC_UC) |
    159                 (has_is ? 0 : GEN7_L3SQCREG1_CONV_IS_UC) |
    160                 (has_c ? 0 : GEN7_L3SQCREG1_CONV_C_UC) |
    161                 (has_t ? 0 : GEN7_L3SQCREG1_CONV_T_UC));
    162 
    163       /* Set up the L3 partitioning. */
    164       OUT_BATCH(GEN7_L3CNTLREG2);
    165       OUT_BATCH((has_slm ? GEN7_L3CNTLREG2_SLM_ENABLE : 0) |
    166                 SET_FIELD(cfg->n[GEN_L3P_URB] - n0_urb, GEN7_L3CNTLREG2_URB_ALLOC) |
    167                 (urb_low_bw ? GEN7_L3CNTLREG2_URB_LOW_BW : 0) |
    168                 SET_FIELD(cfg->n[GEN_L3P_ALL], GEN7_L3CNTLREG2_ALL_ALLOC) |
    169                 SET_FIELD(cfg->n[GEN_L3P_RO], GEN7_L3CNTLREG2_RO_ALLOC) |
    170                 SET_FIELD(cfg->n[GEN_L3P_DC], GEN7_L3CNTLREG2_DC_ALLOC));
    171       OUT_BATCH(GEN7_L3CNTLREG3);
    172       OUT_BATCH(SET_FIELD(cfg->n[GEN_L3P_IS], GEN7_L3CNTLREG3_IS_ALLOC) |
    173                 SET_FIELD(cfg->n[GEN_L3P_C], GEN7_L3CNTLREG3_C_ALLOC) |
    174                 SET_FIELD(cfg->n[GEN_L3P_T], GEN7_L3CNTLREG3_T_ALLOC));
    175 
    176       ADVANCE_BATCH();
    177 
    178       if (can_do_hsw_l3_atomics(brw->screen)) {
    179          /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep
    180           * them disabled to avoid crashing the system hard.
    181           */
    182          BEGIN_BATCH(5);
    183          OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
    184          OUT_BATCH(HSW_SCRATCH1);
    185          OUT_BATCH(has_dc ? 0 : HSW_SCRATCH1_L3_ATOMIC_DISABLE);
    186          OUT_BATCH(HSW_ROW_CHICKEN3);
    187          OUT_BATCH(REG_MASK(HSW_ROW_CHICKEN3_L3_ATOMIC_DISABLE) |
    188                    (has_dc ? 0 : HSW_ROW_CHICKEN3_L3_ATOMIC_DISABLE));
    189          ADVANCE_BATCH();
    190       }
    191    }
    192 }
    193 
    194 /**
    195  * Update the URB size in the context state for the specified L3
    196  * configuration.
    197  */
    198 static void
    199 update_urb_size(struct brw_context *brw, const struct gen_l3_config *cfg)
    200 {
    201    const struct gen_device_info *devinfo = &brw->screen->devinfo;
    202    const unsigned sz = gen_get_l3_config_urb_size(devinfo, cfg);
    203 
    204    if (brw->urb.size != sz) {
    205       brw->urb.size = sz;
    206       brw->ctx.NewDriverState |= BRW_NEW_URB_SIZE;
    207    }
    208 }
    209 
    210 static void
    211 emit_l3_state(struct brw_context *brw)
    212 {
    213    const struct gen_l3_weights w = get_pipeline_state_l3_weights(brw);
    214    const float dw = gen_diff_l3_weights(w, gen_get_l3_config_weights(brw->l3.config));
    215    /* The distance between any two compatible weight vectors cannot exceed two
    216     * due to the triangle inequality.
    217     */
    218    const float large_dw_threshold = 2.0;
    219    /* Somewhat arbitrary, simply makes sure that there will be no repeated
    220     * transitions to the same L3 configuration, could probably do better here.
    221     */
    222    const float small_dw_threshold = 0.5;
    223    /* If we're emitting a new batch the caches should already be clean and the
    224     * transition should be relatively cheap, so it shouldn't hurt much to use
    225     * the smaller threshold.  Otherwise use the larger threshold so that we
    226     * only reprogram the L3 mid-batch if the most recently programmed
    227     * configuration is incompatible with the current pipeline state.
    228     */
    229    const float dw_threshold = (brw->ctx.NewDriverState & BRW_NEW_BATCH ?
    230                                small_dw_threshold : large_dw_threshold);
    231 
    232    if (dw > dw_threshold && can_do_pipelined_register_writes(brw->screen)) {
    233       const struct gen_l3_config *const cfg =
    234          gen_get_l3_config(&brw->screen->devinfo, w);
    235 
    236       setup_l3_config(brw, cfg);
    237       update_urb_size(brw, cfg);
    238       brw->l3.config = cfg;
    239 
    240       if (unlikely(INTEL_DEBUG & DEBUG_L3)) {
    241          fprintf(stderr, "L3 config transition (%f > %f): ", dw, dw_threshold);
    242          gen_dump_l3_config(cfg, stderr);
    243       }
    244    }
    245 }
    246 
    247 const struct brw_tracked_state gen7_l3_state = {
    248    .dirty = {
    249       .mesa = 0,
    250       .brw = BRW_NEW_BATCH |
    251              BRW_NEW_BLORP |
    252              BRW_NEW_CS_PROG_DATA |
    253              BRW_NEW_FS_PROG_DATA |
    254              BRW_NEW_GS_PROG_DATA |
    255              BRW_NEW_VS_PROG_DATA,
    256    },
    257    .emit = emit_l3_state
    258 };
    259 
    260 /**
    261  * Hack to restore the default L3 configuration.
    262  *
    263  * This will be called at the end of every batch in order to reset the L3
    264  * configuration to the default values for the time being until the kernel is
    265  * fixed.  Until kernel commit 6702cf16e0ba8b0129f5aa1b6609d4e9c70bc13b
    266  * (included in v4.1) we would set the MI_RESTORE_INHIBIT bit when submitting
    267  * batch buffers for the default context used by the DDX, which meant that any
    268  * context state changed by the GL would leak into the DDX, the assumption
    269  * being that the DDX would initialize any state it cares about manually.  The
    270  * DDX is however not careful enough to program an L3 configuration
    271  * explicitly, and it makes assumptions about it (URB size) which won't hold
    272  * and cause it to misrender if we let our L3 set-up leak into the DDX.
    273  *
    274  * Since v4.1 of the Linux kernel the default context is saved and restored
    275  * normally, so it's far less likely for our L3 programming to interfere with
    276  * other contexts -- In fact restoring the default L3 configuration at the end
    277  * of the batch will be redundant most of the time.  A kind of state leak is
    278  * still possible though if the context making assumptions about L3 state is
    279  * created immediately after our context was active (e.g. without the DDX
    280  * default context being scheduled in between) because at present the DRM
    281  * doesn't fully initialize the contents of newly created contexts and instead
    282  * sets the MI_RESTORE_INHIBIT flag causing it to inherit the state from the
    283  * last active context.
    284  *
    285  * It's possible to realize such a scenario if, say, an X server (or a GL
    286  * application using an outdated non-L3-aware Mesa version) is started while
    287  * another GL application is running and happens to have modified the L3
    288  * configuration, or if no X server is running at all and a GL application
    289  * using a non-L3-aware Mesa version is started after another GL application
    290  * ran and modified the L3 configuration -- The latter situation can actually
    291  * be reproduced easily on IVB in our CI system.
    292  */
    293 void
    294 gen7_restore_default_l3_config(struct brw_context *brw)
    295 {
    296    const struct gen_device_info *devinfo = &brw->screen->devinfo;
    297    const struct gen_l3_config *const cfg = gen_get_default_l3_config(devinfo);
    298 
    299    if (cfg != brw->l3.config &&
    300        can_do_pipelined_register_writes(brw->screen)) {
    301       setup_l3_config(brw, cfg);
    302       update_urb_size(brw, cfg);
    303       brw->l3.config = cfg;
    304    }
    305 }
    306