Home | History | Annotate | Download | only in shader
      1 /*
      2  * Mesa 3-D graphics library
      3  *
      4  * Copyright (C) 2012-2013 LunarG, Inc.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included
     14  * in all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     22  * DEALINGS IN THE SOFTWARE.
     23  *
     24  * Authors:
     25  *    Chia-I Wu <olv (at) lunarg.com>
     26  */
     27 
     28 #include "tgsi/tgsi_dump.h"
     29 #include "tgsi/tgsi_util.h"
     30 #include "toy_compiler.h"
     31 #include "toy_tgsi.h"
     32 #include "toy_legalize.h"
     33 #include "toy_optimize.h"
     34 #include "toy_helpers.h"
     35 #include "ilo_shader_internal.h"
     36 
     37 struct fs_compile_context {
     38    struct ilo_shader *shader;
     39    const struct ilo_shader_variant *variant;
     40 
     41    struct toy_compiler tc;
     42    struct toy_tgsi tgsi;
     43 
     44    int const_cache;
     45    int dispatch_mode;
     46 
     47    struct {
     48       int interp_perspective_pixel;
     49       int interp_perspective_centroid;
     50       int interp_perspective_sample;
     51       int interp_nonperspective_pixel;
     52       int interp_nonperspective_centroid;
     53       int interp_nonperspective_sample;
     54       int source_depth;
     55       int source_w;
     56       int pos_offset;
     57    } payloads[2];
     58 
     59    int first_const_grf;
     60    int first_attr_grf;
     61    int first_free_grf;
     62    int last_free_grf;
     63 
     64    int num_grf_per_vrf;
     65 
     66    int first_free_mrf;
     67    int last_free_mrf;
     68 };
     69 
     70 static void
     71 fetch_position(struct fs_compile_context *fcc, struct toy_dst dst)
     72 {
     73    struct toy_compiler *tc = &fcc->tc;
     74    const struct toy_src src_z =
     75       tsrc(TOY_FILE_GRF, fcc->payloads[0].source_depth, 0);
     76    const struct toy_src src_w =
     77       tsrc(TOY_FILE_GRF, fcc->payloads[0].source_w, 0);
     78    const int fb_height =
     79       (fcc->variant->u.fs.fb_height) ? fcc->variant->u.fs.fb_height : 1;
     80    const bool origin_upper_left =
     81       (fcc->tgsi.props.fs_coord_origin == TGSI_FS_COORD_ORIGIN_UPPER_LEFT);
     82    const bool pixel_center_integer =
     83       (fcc->tgsi.props.fs_coord_pixel_center ==
     84        TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
     85    struct toy_src subspan_x, subspan_y;
     86    struct toy_dst tmp, tmp_uw;
     87    struct toy_dst real_dst[4];
     88 
     89    tdst_transpose(dst, real_dst);
     90 
     91    subspan_x = tsrc_uw(tsrc(TOY_FILE_GRF, 1, 2 * 4));
     92    subspan_x = tsrc_rect(subspan_x, TOY_RECT_240);
     93 
     94    subspan_y = tsrc_offset(subspan_x, 0, 1);
     95 
     96    tmp_uw = tdst_uw(tc_alloc_tmp(tc));
     97    tmp = tc_alloc_tmp(tc);
     98 
     99    /* X */
    100    tc_ADD(tc, tmp_uw, subspan_x, tsrc_imm_v(0x10101010));
    101    tc_MOV(tc, tmp, tsrc_from(tmp_uw));
    102    if (pixel_center_integer)
    103       tc_MOV(tc, real_dst[0], tsrc_from(tmp));
    104    else
    105       tc_ADD(tc, real_dst[0], tsrc_from(tmp), tsrc_imm_f(0.5f));
    106 
    107    /* Y */
    108    tc_ADD(tc, tmp_uw, subspan_y, tsrc_imm_v(0x11001100));
    109    tc_MOV(tc, tmp, tsrc_from(tmp_uw));
    110    if (origin_upper_left && pixel_center_integer) {
    111       tc_MOV(tc, real_dst[1], tsrc_from(tmp));
    112    }
    113    else {
    114       struct toy_src y = tsrc_from(tmp);
    115       float offset = 0.0f;
    116 
    117       if (!pixel_center_integer)
    118          offset += 0.5f;
    119 
    120       if (!origin_upper_left) {
    121          offset += (float) (fb_height - 1);
    122          y = tsrc_negate(y);
    123       }
    124 
    125       tc_ADD(tc, real_dst[1], y, tsrc_imm_f(offset));
    126    }
    127 
    128    /* Z and W */
    129    tc_MOV(tc, real_dst[2], src_z);
    130    tc_INV(tc, real_dst[3], src_w);
    131 }
    132 
    133 static void
    134 fetch_face(struct fs_compile_context *fcc, struct toy_dst dst)
    135 {
    136    struct toy_compiler *tc = &fcc->tc;
    137    const struct toy_src r0 = tsrc_d(tsrc(TOY_FILE_GRF, 0, 0));
    138    struct toy_dst tmp_f, tmp;
    139    struct toy_dst real_dst[4];
    140 
    141    tdst_transpose(dst, real_dst);
    142 
    143    tmp_f = tc_alloc_tmp(tc);
    144    tmp = tdst_d(tmp_f);
    145    tc_SHR(tc, tmp, tsrc_rect(r0, TOY_RECT_010), tsrc_imm_d(15));
    146    tc_AND(tc, tmp, tsrc_from(tmp), tsrc_imm_d(1));
    147    tc_MOV(tc, tmp_f, tsrc_from(tmp));
    148 
    149    /* convert to 1.0 and -1.0 */
    150    tc_MUL(tc, tmp_f, tsrc_from(tmp_f), tsrc_imm_f(-2.0f));
    151    tc_ADD(tc, real_dst[0], tsrc_from(tmp_f), tsrc_imm_f(1.0f));
    152 
    153    tc_MOV(tc, real_dst[1], tsrc_imm_f(0.0f));
    154    tc_MOV(tc, real_dst[2], tsrc_imm_f(0.0f));
    155    tc_MOV(tc, real_dst[3], tsrc_imm_f(1.0f));
    156 }
    157 
    158 static void
    159 fetch_attr(struct fs_compile_context *fcc, struct toy_dst dst, int slot)
    160 {
    161    struct toy_compiler *tc = &fcc->tc;
    162    struct toy_dst real_dst[4];
    163    bool is_const = false;
    164    int grf, interp, ch;
    165 
    166    tdst_transpose(dst, real_dst);
    167 
    168    grf = fcc->first_attr_grf + slot * 2;
    169 
    170    switch (fcc->tgsi.inputs[slot].interp) {
    171    case TGSI_INTERPOLATE_CONSTANT:
    172       is_const = true;
    173       break;
    174    case TGSI_INTERPOLATE_LINEAR:
    175       if (fcc->tgsi.inputs[slot].centroid)
    176          interp = fcc->payloads[0].interp_nonperspective_centroid;
    177       else
    178          interp = fcc->payloads[0].interp_nonperspective_pixel;
    179       break;
    180    case TGSI_INTERPOLATE_COLOR:
    181       if (fcc->variant->u.fs.flatshade) {
    182          is_const = true;
    183          break;
    184       }
    185       /* fall through */
    186    case TGSI_INTERPOLATE_PERSPECTIVE:
    187       if (fcc->tgsi.inputs[slot].centroid)
    188          interp = fcc->payloads[0].interp_perspective_centroid;
    189       else
    190          interp = fcc->payloads[0].interp_perspective_pixel;
    191       break;
    192    default:
    193       assert(!"unexpected FS interpolation");
    194       interp = fcc->payloads[0].interp_perspective_pixel;
    195       break;
    196    }
    197 
    198    if (is_const) {
    199       struct toy_src a0[4];
    200 
    201       a0[0] = tsrc(TOY_FILE_GRF, grf + 0, 3 * 4);
    202       a0[1] = tsrc(TOY_FILE_GRF, grf + 0, 7 * 4);
    203       a0[2] = tsrc(TOY_FILE_GRF, grf + 1, 3 * 4);
    204       a0[3] = tsrc(TOY_FILE_GRF, grf + 1, 7 * 4);
    205 
    206       for (ch = 0; ch < 4; ch++)
    207          tc_MOV(tc, real_dst[ch], tsrc_rect(a0[ch], TOY_RECT_010));
    208    }
    209    else {
    210       struct toy_src attr[4], uv;
    211 
    212       attr[0] = tsrc(TOY_FILE_GRF, grf + 0, 0);
    213       attr[1] = tsrc(TOY_FILE_GRF, grf + 0, 4 * 4);
    214       attr[2] = tsrc(TOY_FILE_GRF, grf + 1, 0);
    215       attr[3] = tsrc(TOY_FILE_GRF, grf + 1, 4 * 4);
    216 
    217       uv = tsrc(TOY_FILE_GRF, interp, 0);
    218 
    219       for (ch = 0; ch < 4; ch++) {
    220          tc_add2(tc, GEN6_OPCODE_PLN, real_dst[ch],
    221                tsrc_rect(attr[ch], TOY_RECT_010), uv);
    222       }
    223    }
    224 
    225    if (fcc->tgsi.inputs[slot].semantic_name == TGSI_SEMANTIC_FOG) {
    226       tc_MOV(tc, real_dst[1], tsrc_imm_f(0.0f));
    227       tc_MOV(tc, real_dst[2], tsrc_imm_f(0.0f));
    228       tc_MOV(tc, real_dst[3], tsrc_imm_f(1.0f));
    229    }
    230 }
    231 
    232 static void
    233 fs_lower_opcode_tgsi_in(struct fs_compile_context *fcc,
    234                         struct toy_dst dst, int dim, int idx)
    235 {
    236    int slot;
    237 
    238    assert(!dim);
    239 
    240    slot = toy_tgsi_find_input(&fcc->tgsi, idx);
    241    if (slot < 0)
    242       return;
    243 
    244    switch (fcc->tgsi.inputs[slot].semantic_name) {
    245    case TGSI_SEMANTIC_POSITION:
    246       fetch_position(fcc, dst);
    247       break;
    248    case TGSI_SEMANTIC_FACE:
    249       fetch_face(fcc, dst);
    250       break;
    251    default:
    252       fetch_attr(fcc, dst, slot);
    253       break;
    254    }
    255 }
    256 
    257 static void
    258 fs_lower_opcode_tgsi_indirect_const(struct fs_compile_context *fcc,
    259                                     struct toy_dst dst, int dim,
    260                                     struct toy_src idx)
    261 {
    262    const struct toy_dst offset =
    263       tdst_ud(tdst(TOY_FILE_MRF, fcc->first_free_mrf, 0));
    264    struct toy_compiler *tc = &fcc->tc;
    265    unsigned simd_mode, param_size;
    266    struct toy_inst *inst;
    267    struct toy_src desc, real_src[4];
    268    struct toy_dst tmp, real_dst[4];
    269    unsigned i;
    270 
    271    tsrc_transpose(idx, real_src);
    272 
    273    /* set offset */
    274    inst = tc_MOV(tc, offset, real_src[0]);
    275    inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
    276 
    277    switch (inst->exec_size) {
    278    case GEN6_EXECSIZE_8:
    279       simd_mode = GEN6_MSG_SAMPLER_SIMD8;
    280       param_size = 1;
    281       break;
    282    case GEN6_EXECSIZE_16:
    283       simd_mode = GEN6_MSG_SAMPLER_SIMD16;
    284       param_size = 2;
    285       break;
    286    default:
    287       assert(!"unsupported execution size");
    288       tc_MOV(tc, dst, tsrc_imm_f(0.0f));
    289       return;
    290       break;
    291    }
    292 
    293    desc = tsrc_imm_mdesc_sampler(tc, param_size, param_size * 4, false,
    294          simd_mode,
    295          GEN6_MSG_SAMPLER_LD,
    296          0,
    297          fcc->shader->bt.const_base + dim);
    298 
    299    tmp = tdst(TOY_FILE_VRF, tc_alloc_vrf(tc, param_size * 4), 0);
    300    inst = tc_SEND(tc, tmp, tsrc_from(offset), desc, GEN6_SFID_SAMPLER);
    301    inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
    302 
    303    tdst_transpose(dst, real_dst);
    304    for (i = 0; i < 4; i++) {
    305       const struct toy_src src =
    306          tsrc_offset(tsrc_from(tmp), param_size * i, 0);
    307 
    308       /* cast to type D to make sure these are raw moves */
    309       tc_MOV(tc, tdst_d(real_dst[i]), tsrc_d(src));
    310    }
    311 }
    312 
    313 static bool
    314 fs_lower_opcode_tgsi_const_pcb(struct fs_compile_context *fcc,
    315                                struct toy_dst dst, int dim,
    316                                struct toy_src idx)
    317 {
    318    const int grf = fcc->first_const_grf + idx.val32 / 2;
    319    const int grf_subreg = (idx.val32 & 1) * 16;
    320    struct toy_src src;
    321    struct toy_dst real_dst[4];
    322    unsigned i;
    323 
    324    if (!fcc->variant->use_pcb || dim != 0 || idx.file != TOY_FILE_IMM ||
    325        grf >= fcc->first_attr_grf)
    326       return false;
    327 
    328    src = tsrc_rect(tsrc(TOY_FILE_GRF, grf, grf_subreg), TOY_RECT_010);
    329 
    330    tdst_transpose(dst, real_dst);
    331    for (i = 0; i < 4; i++) {
    332       /* cast to type D to make sure these are raw moves */
    333       tc_MOV(&fcc->tc, tdst_d(real_dst[i]), tsrc_d(tsrc_offset(src, 0, i)));
    334    }
    335 
    336    return true;
    337 }
    338 
    339 static void
    340 fs_lower_opcode_tgsi_const_gen6(struct fs_compile_context *fcc,
    341                                 struct toy_dst dst, int dim, struct toy_src idx)
    342 {
    343    const struct toy_dst header =
    344       tdst_ud(tdst(TOY_FILE_MRF, fcc->first_free_mrf, 0));
    345    const struct toy_dst global_offset =
    346       tdst_ud(tdst(TOY_FILE_MRF, fcc->first_free_mrf, 2 * 4));
    347    const struct toy_src r0 = tsrc_ud(tsrc(TOY_FILE_GRF, 0, 0));
    348    struct toy_compiler *tc = &fcc->tc;
    349    unsigned msg_type, msg_ctrl, msg_len;
    350    struct toy_inst *inst;
    351    struct toy_src desc;
    352    struct toy_dst tmp, real_dst[4];
    353    unsigned i;
    354 
    355    if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
    356       return;
    357 
    358    /* set message header */
    359    inst = tc_MOV(tc, header, r0);
    360    inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
    361 
    362    /* set global offset */
    363    inst = tc_MOV(tc, global_offset, idx);
    364    inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
    365    inst->exec_size = GEN6_EXECSIZE_1;
    366    inst->src[0].rect = TOY_RECT_010;
    367 
    368    msg_type = GEN6_MSG_DP_OWORD_BLOCK_READ;
    369    msg_ctrl = GEN6_MSG_DP_OWORD_BLOCK_SIZE_1_LO;
    370    msg_len = 1;
    371 
    372    desc = tsrc_imm_mdesc_data_port(tc, false, msg_len, 1, true, false,
    373          msg_type, msg_ctrl, fcc->shader->bt.const_base + dim);
    374 
    375    tmp = tc_alloc_tmp(tc);
    376 
    377    tc_SEND(tc, tmp, tsrc_from(header), desc, fcc->const_cache);
    378 
    379    tdst_transpose(dst, real_dst);
    380    for (i = 0; i < 4; i++) {
    381       const struct toy_src src =
    382          tsrc_offset(tsrc_rect(tsrc_from(tmp), TOY_RECT_010), 0, i);
    383 
    384       /* cast to type D to make sure these are raw moves */
    385       tc_MOV(tc, tdst_d(real_dst[i]), tsrc_d(src));
    386    }
    387 }
    388 
    389 static void
    390 fs_lower_opcode_tgsi_const_gen7(struct fs_compile_context *fcc,
    391                                 struct toy_dst dst, int dim, struct toy_src idx)
    392 {
    393    struct toy_compiler *tc = &fcc->tc;
    394    const struct toy_dst offset =
    395       tdst_ud(tdst(TOY_FILE_MRF, fcc->first_free_mrf, 0));
    396    struct toy_src desc;
    397    struct toy_inst *inst;
    398    struct toy_dst tmp, real_dst[4];
    399    unsigned i;
    400 
    401    if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
    402       return;
    403 
    404    /*
    405     * In 4c1fdae0a01b3f92ec03b61aac1d3df500d51fc6, pull constant load was
    406     * changed from OWord Block Read to ld to increase performance in the
    407     * classic driver.  Since we use the constant cache instead of the data
    408     * cache, I wonder if we still want to follow the classic driver.
    409     */
    410 
    411    /* set offset */
    412    inst = tc_MOV(tc, offset, tsrc_rect(idx, TOY_RECT_010));
    413    inst->exec_size = GEN6_EXECSIZE_8;
    414    inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
    415 
    416    desc = tsrc_imm_mdesc_sampler(tc, 1, 1, false,
    417          GEN6_MSG_SAMPLER_SIMD4X2,
    418          GEN6_MSG_SAMPLER_LD,
    419          0,
    420          fcc->shader->bt.const_base + dim);
    421 
    422    tmp = tc_alloc_tmp(tc);
    423    inst = tc_SEND(tc, tmp, tsrc_from(offset), desc, GEN6_SFID_SAMPLER);
    424    inst->exec_size = GEN6_EXECSIZE_8;
    425    inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
    426 
    427    tdst_transpose(dst, real_dst);
    428    for (i = 0; i < 4; i++) {
    429       const struct toy_src src =
    430          tsrc_offset(tsrc_rect(tsrc_from(tmp), TOY_RECT_010), 0, i);
    431 
    432       /* cast to type D to make sure these are raw moves */
    433       tc_MOV(tc, tdst_d(real_dst[i]), tsrc_d(src));
    434    }
    435 }
    436 
    437 static void
    438 fs_lower_opcode_tgsi_imm(struct fs_compile_context *fcc,
    439                          struct toy_dst dst, int idx)
    440 {
    441    const uint32_t *imm;
    442    struct toy_dst real_dst[4];
    443    int ch;
    444 
    445    imm = toy_tgsi_get_imm(&fcc->tgsi, idx, NULL);
    446 
    447    tdst_transpose(dst, real_dst);
    448    /* raw moves */
    449    for (ch = 0; ch < 4; ch++)
    450       tc_MOV(&fcc->tc, tdst_ud(real_dst[ch]), tsrc_imm_ud(imm[ch]));
    451 }
    452 
    453 static void
    454 fs_lower_opcode_tgsi_sv(struct fs_compile_context *fcc,
    455                         struct toy_dst dst, int dim, int idx)
    456 {
    457    struct toy_compiler *tc = &fcc->tc;
    458    const struct toy_tgsi *tgsi = &fcc->tgsi;
    459    int slot;
    460 
    461    assert(!dim);
    462 
    463    slot = toy_tgsi_find_system_value(tgsi, idx);
    464    if (slot < 0)
    465       return;
    466 
    467    switch (tgsi->system_values[slot].semantic_name) {
    468    case TGSI_SEMANTIC_PRIMID:
    469    case TGSI_SEMANTIC_INSTANCEID:
    470    case TGSI_SEMANTIC_VERTEXID:
    471    default:
    472       tc_fail(tc, "unhandled system value");
    473       tc_MOV(tc, dst, tsrc_imm_d(0));
    474       break;
    475    }
    476 }
    477 
    478 static void
    479 fs_lower_opcode_tgsi_direct(struct fs_compile_context *fcc,
    480                             struct toy_inst *inst)
    481 {
    482    struct toy_compiler *tc = &fcc->tc;
    483    int dim, idx;
    484 
    485    assert(inst->src[0].file == TOY_FILE_IMM);
    486    dim = inst->src[0].val32;
    487 
    488    assert(inst->src[1].file == TOY_FILE_IMM);
    489    idx = inst->src[1].val32;
    490 
    491    switch (inst->opcode) {
    492    case TOY_OPCODE_TGSI_IN:
    493       fs_lower_opcode_tgsi_in(fcc, inst->dst, dim, idx);
    494       break;
    495    case TOY_OPCODE_TGSI_CONST:
    496       if (ilo_dev_gen(tc->dev) >= ILO_GEN(7))
    497          fs_lower_opcode_tgsi_const_gen7(fcc, inst->dst, dim, inst->src[1]);
    498       else
    499          fs_lower_opcode_tgsi_const_gen6(fcc, inst->dst, dim, inst->src[1]);
    500       break;
    501    case TOY_OPCODE_TGSI_SV:
    502       fs_lower_opcode_tgsi_sv(fcc, inst->dst, dim, idx);
    503       break;
    504    case TOY_OPCODE_TGSI_IMM:
    505       assert(!dim);
    506       fs_lower_opcode_tgsi_imm(fcc, inst->dst, idx);
    507       break;
    508    default:
    509       tc_fail(tc, "unhandled TGSI fetch");
    510       break;
    511    }
    512 
    513    tc_discard_inst(tc, inst);
    514 }
    515 
    516 static void
    517 fs_lower_opcode_tgsi_indirect(struct fs_compile_context *fcc,
    518                               struct toy_inst *inst)
    519 {
    520    struct toy_compiler *tc = &fcc->tc;
    521    enum tgsi_file_type file;
    522    int dim, idx;
    523    struct toy_src indirect_dim, indirect_idx;
    524 
    525    assert(inst->src[0].file == TOY_FILE_IMM);
    526    file = inst->src[0].val32;
    527 
    528    assert(inst->src[1].file == TOY_FILE_IMM);
    529    dim = inst->src[1].val32;
    530    indirect_dim = inst->src[2];
    531 
    532    assert(inst->src[3].file == TOY_FILE_IMM);
    533    idx = inst->src[3].val32;
    534    indirect_idx = inst->src[4];
    535 
    536    /* no dimension indirection */
    537    assert(indirect_dim.file == TOY_FILE_IMM);
    538    dim += indirect_dim.val32;
    539 
    540    switch (inst->opcode) {
    541    case TOY_OPCODE_TGSI_INDIRECT_FETCH:
    542       if (file == TGSI_FILE_CONSTANT) {
    543          if (idx) {
    544             struct toy_dst tmp = tc_alloc_tmp(tc);
    545 
    546             tc_ADD(tc, tmp, indirect_idx, tsrc_imm_d(idx));
    547             indirect_idx = tsrc_from(tmp);
    548          }
    549 
    550          fs_lower_opcode_tgsi_indirect_const(fcc, inst->dst, dim, indirect_idx);
    551          break;
    552       }
    553       /* fall through */
    554    case TOY_OPCODE_TGSI_INDIRECT_STORE:
    555    default:
    556       tc_fail(tc, "unhandled TGSI indirection");
    557       break;
    558    }
    559 
    560    tc_discard_inst(tc, inst);
    561 }
    562 
    563 /**
    564  * Emit instructions to move sampling parameters to the message registers.
    565  */
    566 static int
    567 fs_add_sampler_params_gen6(struct toy_compiler *tc, int msg_type,
    568                            int base_mrf, int param_size,
    569                            struct toy_src *coords, int num_coords,
    570                            struct toy_src bias_or_lod, struct toy_src ref_or_si,
    571                            struct toy_src *ddx, struct toy_src *ddy,
    572                            int num_derivs)
    573 {
    574    int num_params, i;
    575 
    576    assert(num_coords <= 4);
    577    assert(num_derivs <= 3 && num_derivs <= num_coords);
    578 
    579 #define SAMPLER_PARAM(p) (tdst(TOY_FILE_MRF, base_mrf + (p) * param_size, 0))
    580    switch (msg_type) {
    581    case GEN6_MSG_SAMPLER_SAMPLE:
    582       for (i = 0; i < num_coords; i++)
    583          tc_MOV(tc, SAMPLER_PARAM(i), coords[i]);
    584       num_params = num_coords;
    585       break;
    586    case GEN6_MSG_SAMPLER_SAMPLE_B:
    587    case GEN6_MSG_SAMPLER_SAMPLE_L:
    588       for (i = 0; i < num_coords; i++)
    589          tc_MOV(tc, SAMPLER_PARAM(i), coords[i]);
    590       tc_MOV(tc, SAMPLER_PARAM(4), bias_or_lod);
    591       num_params = 5;
    592       break;
    593    case GEN6_MSG_SAMPLER_SAMPLE_C:
    594       for (i = 0; i < num_coords; i++)
    595          tc_MOV(tc, SAMPLER_PARAM(i), coords[i]);
    596       tc_MOV(tc, SAMPLER_PARAM(4), ref_or_si);
    597       num_params = 5;
    598       break;
    599    case GEN6_MSG_SAMPLER_SAMPLE_D:
    600       for (i = 0; i < num_coords; i++)
    601          tc_MOV(tc, SAMPLER_PARAM(i), coords[i]);
    602       for (i = 0; i < num_derivs; i++) {
    603          tc_MOV(tc, SAMPLER_PARAM(4 + i * 2), ddx[i]);
    604          tc_MOV(tc, SAMPLER_PARAM(5 + i * 2), ddy[i]);
    605       }
    606       num_params = 4 + num_derivs * 2;
    607       break;
    608    case GEN6_MSG_SAMPLER_SAMPLE_B_C:
    609    case GEN6_MSG_SAMPLER_SAMPLE_L_C:
    610       for (i = 0; i < num_coords; i++)
    611          tc_MOV(tc, SAMPLER_PARAM(i), coords[i]);
    612       tc_MOV(tc, SAMPLER_PARAM(4), ref_or_si);
    613       tc_MOV(tc, SAMPLER_PARAM(5), bias_or_lod);
    614       num_params = 6;
    615       break;
    616    case GEN6_MSG_SAMPLER_LD:
    617       assert(num_coords <= 3);
    618 
    619       for (i = 0; i < num_coords; i++)
    620          tc_MOV(tc, tdst_d(SAMPLER_PARAM(i)), coords[i]);
    621       tc_MOV(tc, tdst_d(SAMPLER_PARAM(3)), bias_or_lod);
    622       tc_MOV(tc, tdst_d(SAMPLER_PARAM(4)), ref_or_si);
    623       num_params = 5;
    624       break;
    625    case GEN6_MSG_SAMPLER_RESINFO:
    626       tc_MOV(tc, tdst_d(SAMPLER_PARAM(0)), bias_or_lod);
    627       num_params = 1;
    628       break;
    629    default:
    630       tc_fail(tc, "unknown sampler opcode");
    631       num_params = 0;
    632       break;
    633    }
    634 #undef SAMPLER_PARAM
    635 
    636    return num_params * param_size;
    637 }
    638 
    639 static int
    640 fs_add_sampler_params_gen7(struct toy_compiler *tc, int msg_type,
    641                            int base_mrf, int param_size,
    642                            struct toy_src *coords, int num_coords,
    643                            struct toy_src bias_or_lod, struct toy_src ref_or_si,
    644                            struct toy_src *ddx, struct toy_src *ddy,
    645                            int num_derivs)
    646 {
    647    int num_params, i;
    648 
    649    assert(num_coords <= 4);
    650    assert(num_derivs <= 3 && num_derivs <= num_coords);
    651 
    652 #define SAMPLER_PARAM(p) (tdst(TOY_FILE_MRF, base_mrf + (p) * param_size, 0))
    653    switch (msg_type) {
    654    case GEN6_MSG_SAMPLER_SAMPLE:
    655       for (i = 0; i < num_coords; i++)
    656          tc_MOV(tc, SAMPLER_PARAM(i), coords[i]);
    657       num_params = num_coords;
    658       break;
    659    case GEN6_MSG_SAMPLER_SAMPLE_B:
    660    case GEN6_MSG_SAMPLER_SAMPLE_L:
    661       tc_MOV(tc, SAMPLER_PARAM(0), bias_or_lod);
    662       for (i = 0; i < num_coords; i++)
    663          tc_MOV(tc, SAMPLER_PARAM(1 + i), coords[i]);
    664       num_params = 1 + num_coords;
    665       break;
    666    case GEN6_MSG_SAMPLER_SAMPLE_C:
    667       tc_MOV(tc, SAMPLER_PARAM(0), ref_or_si);
    668       for (i = 0; i < num_coords; i++)
    669          tc_MOV(tc, SAMPLER_PARAM(1 + i), coords[i]);
    670       num_params = 1 + num_coords;
    671       break;
    672    case GEN6_MSG_SAMPLER_SAMPLE_D:
    673       for (i = 0; i < num_coords; i++) {
    674          tc_MOV(tc, SAMPLER_PARAM(i * 3), coords[i]);
    675          if (i < num_derivs) {
    676             tc_MOV(tc, SAMPLER_PARAM(i * 3 + 1), ddx[i]);
    677             tc_MOV(tc, SAMPLER_PARAM(i * 3 + 2), ddy[i]);
    678          }
    679       }
    680       num_params = num_coords * 3 - ((num_coords > num_derivs) ? 2 : 0);
    681       break;
    682    case GEN6_MSG_SAMPLER_SAMPLE_B_C:
    683    case GEN6_MSG_SAMPLER_SAMPLE_L_C:
    684       tc_MOV(tc, SAMPLER_PARAM(0), ref_or_si);
    685       tc_MOV(tc, SAMPLER_PARAM(1), bias_or_lod);
    686       for (i = 0; i < num_coords; i++)
    687          tc_MOV(tc, SAMPLER_PARAM(2 + i), coords[i]);
    688       num_params = 2 + num_coords;
    689       break;
    690    case GEN6_MSG_SAMPLER_LD:
    691       assert(num_coords >= 1 && num_coords <= 3);
    692 
    693       tc_MOV(tc, tdst_d(SAMPLER_PARAM(0)), coords[0]);
    694       tc_MOV(tc, tdst_d(SAMPLER_PARAM(1)), bias_or_lod);
    695       for (i = 1; i < num_coords; i++)
    696          tc_MOV(tc, tdst_d(SAMPLER_PARAM(1 + i)), coords[i]);
    697       num_params = 1 + num_coords;
    698       break;
    699    case GEN6_MSG_SAMPLER_RESINFO:
    700       tc_MOV(tc, tdst_d(SAMPLER_PARAM(0)), bias_or_lod);
    701       num_params = 1;
    702       break;
    703    default:
    704       tc_fail(tc, "unknown sampler opcode");
    705       num_params = 0;
    706       break;
    707    }
    708 #undef SAMPLER_PARAM
    709 
    710    return num_params * param_size;
    711 }
    712 
    713 /**
    714  * Set up message registers and return the message descriptor for sampling.
    715  */
    716 static struct toy_src
    717 fs_prepare_tgsi_sampling(struct fs_compile_context *fcc,
    718                          const struct toy_inst *inst,
    719                          int base_mrf, const uint32_t *saturate_coords,
    720                          unsigned *ret_sampler_index)
    721 {
    722    struct toy_compiler *tc = &fcc->tc;
    723    unsigned simd_mode, msg_type, msg_len, sampler_index, binding_table_index;
    724    struct toy_src coords[4], ddx[4], ddy[4], bias_or_lod, ref_or_si;
    725    int num_coords, ref_pos, num_derivs;
    726    int sampler_src, param_size, i;
    727 
    728    switch (inst->exec_size) {
    729    case GEN6_EXECSIZE_8:
    730       simd_mode = GEN6_MSG_SAMPLER_SIMD8;
    731       param_size = 1;
    732       break;
    733    case GEN6_EXECSIZE_16:
    734       simd_mode = GEN6_MSG_SAMPLER_SIMD16;
    735       param_size = 2;
    736       break;
    737    default:
    738       tc_fail(tc, "unsupported execute size for sampling");
    739       return tsrc_null();
    740       break;
    741    }
    742 
    743    num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target);
    744    ref_pos = tgsi_util_get_shadow_ref_src_index(inst->tex.target);
    745 
    746    tsrc_transpose(inst->src[0], coords);
    747    bias_or_lod = tsrc_null();
    748    ref_or_si = tsrc_null();
    749    num_derivs = 0;
    750    sampler_src = 1;
    751 
    752    /*
    753     * For TXD,
    754     *
    755     *   src0 := (x, y, z, w)
    756     *   src1 := ddx
    757     *   src2 := ddy
    758     *   src3 := sampler
    759     *
    760     * For TEX2, TXB2, and TXL2,
    761     *
    762     *   src0 := (x, y, z, w)
    763     *   src1 := (v or bias or lod, ...)
    764     *   src2 := sampler
    765     *
    766     * For TEX, TXB, TXL, and TXP,
    767     *
    768     *   src0 := (x, y, z, w or bias or lod or projection)
    769     *   src1 := sampler
    770     *
    771     * For TXQ,
    772     *
    773     *   src0 := (lod, ...)
    774     *   src1 := sampler
    775     *
    776     * For TXQ_LZ,
    777     *
    778     *   src0 := sampler
    779     *
    780     * And for TXF,
    781     *
    782     *   src0 := (x, y, z, w or lod)
    783     *   src1 := sampler
    784     *
    785     * State trackers should not generate opcode+texture combinations with
    786     * which the two definitions conflict (e.g., TXB with SHADOW2DARRAY).
    787     */
    788    switch (inst->opcode) {
    789    case TOY_OPCODE_TGSI_TEX:
    790       if (ref_pos >= 0) {
    791          assert(ref_pos < 4);
    792 
    793          msg_type = GEN6_MSG_SAMPLER_SAMPLE_C;
    794          ref_or_si = coords[ref_pos];
    795       }
    796       else {
    797          msg_type = GEN6_MSG_SAMPLER_SAMPLE;
    798       }
    799       break;
    800    case TOY_OPCODE_TGSI_TXD:
    801       if (ref_pos >= 0) {
    802          assert(ref_pos < 4);
    803 
    804          msg_type = GEN7_MSG_SAMPLER_SAMPLE_D_C;
    805          ref_or_si = coords[ref_pos];
    806 
    807          if (ilo_dev_gen(tc->dev) < ILO_GEN(7.5))
    808             tc_fail(tc, "TXD with shadow sampler not supported");
    809       }
    810       else {
    811          msg_type = GEN6_MSG_SAMPLER_SAMPLE_D;
    812       }
    813 
    814       tsrc_transpose(inst->src[1], ddx);
    815       tsrc_transpose(inst->src[2], ddy);
    816       num_derivs = num_coords;
    817       sampler_src = 3;
    818       break;
    819    case TOY_OPCODE_TGSI_TXP:
    820       if (ref_pos >= 0) {
    821          assert(ref_pos < 3);
    822 
    823          msg_type = GEN6_MSG_SAMPLER_SAMPLE_C;
    824          ref_or_si = coords[ref_pos];
    825       }
    826       else {
    827          msg_type = GEN6_MSG_SAMPLER_SAMPLE;
    828       }
    829 
    830       /* project the coordinates */
    831       {
    832          struct toy_dst tmp[4];
    833 
    834          tc_alloc_tmp4(tc, tmp);
    835 
    836          tc_INV(tc, tmp[3], coords[3]);
    837          for (i = 0; i < num_coords && i < 3; i++) {
    838             tc_MUL(tc, tmp[i], coords[i], tsrc_from(tmp[3]));
    839             coords[i] = tsrc_from(tmp[i]);
    840          }
    841 
    842          if (ref_pos >= i) {
    843             tc_MUL(tc, tmp[ref_pos], ref_or_si, tsrc_from(tmp[3]));
    844             ref_or_si = tsrc_from(tmp[ref_pos]);
    845          }
    846       }
    847       break;
    848    case TOY_OPCODE_TGSI_TXB:
    849       if (ref_pos >= 0) {
    850          assert(ref_pos < 3);
    851 
    852          msg_type = GEN6_MSG_SAMPLER_SAMPLE_B_C;
    853          ref_or_si = coords[ref_pos];
    854       }
    855       else {
    856          msg_type = GEN6_MSG_SAMPLER_SAMPLE_B;
    857       }
    858 
    859       bias_or_lod = coords[3];
    860       break;
    861    case TOY_OPCODE_TGSI_TXL:
    862       if (ref_pos >= 0) {
    863          assert(ref_pos < 3);
    864 
    865          msg_type = GEN6_MSG_SAMPLER_SAMPLE_L_C;
    866          ref_or_si = coords[ref_pos];
    867       }
    868       else {
    869          msg_type = GEN6_MSG_SAMPLER_SAMPLE_L;
    870       }
    871 
    872       bias_or_lod = coords[3];
    873       break;
    874    case TOY_OPCODE_TGSI_TXF:
    875       msg_type = GEN6_MSG_SAMPLER_LD;
    876 
    877       switch (inst->tex.target) {
    878       case TGSI_TEXTURE_2D_MSAA:
    879       case TGSI_TEXTURE_2D_ARRAY_MSAA:
    880          assert(ref_pos >= 0 && ref_pos < 4);
    881          /* lod is always 0 */
    882          bias_or_lod = tsrc_imm_d(0);
    883          ref_or_si = coords[ref_pos];
    884          break;
    885       default:
    886          bias_or_lod = coords[3];
    887          break;
    888       }
    889 
    890       /* offset the coordinates */
    891       if (!tsrc_is_null(inst->tex.offsets[0])) {
    892          struct toy_dst tmp[4];
    893          struct toy_src offsets[4];
    894 
    895          tc_alloc_tmp4(tc, tmp);
    896          tsrc_transpose(inst->tex.offsets[0], offsets);
    897 
    898          for (i = 0; i < num_coords; i++) {
    899             tc_ADD(tc, tmp[i], coords[i], offsets[i]);
    900             coords[i] = tsrc_from(tmp[i]);
    901          }
    902       }
    903 
    904       sampler_src = 1;
    905       break;
    906    case TOY_OPCODE_TGSI_TXQ:
    907       msg_type = GEN6_MSG_SAMPLER_RESINFO;
    908       num_coords = 0;
    909       bias_or_lod = coords[0];
    910       break;
    911    case TOY_OPCODE_TGSI_TXQ_LZ:
    912       msg_type = GEN6_MSG_SAMPLER_RESINFO;
    913       num_coords = 0;
    914       sampler_src = 0;
    915       break;
    916    case TOY_OPCODE_TGSI_TEX2:
    917       if (ref_pos >= 0) {
    918          assert(ref_pos < 5);
    919 
    920          msg_type = GEN6_MSG_SAMPLER_SAMPLE_C;
    921 
    922          if (ref_pos >= 4) {
    923             struct toy_src src1[4];
    924             tsrc_transpose(inst->src[1], src1);
    925             ref_or_si = src1[ref_pos - 4];
    926          }
    927          else {
    928             ref_or_si = coords[ref_pos];
    929          }
    930       }
    931       else {
    932          msg_type = GEN6_MSG_SAMPLER_SAMPLE;
    933       }
    934 
    935       sampler_src = 2;
    936       break;
    937    case TOY_OPCODE_TGSI_TXB2:
    938       if (ref_pos >= 0) {
    939          assert(ref_pos < 4);
    940 
    941          msg_type = GEN6_MSG_SAMPLER_SAMPLE_B_C;
    942          ref_or_si = coords[ref_pos];
    943       }
    944       else {
    945          msg_type = GEN6_MSG_SAMPLER_SAMPLE_B;
    946       }
    947 
    948       {
    949          struct toy_src src1[4];
    950          tsrc_transpose(inst->src[1], src1);
    951          bias_or_lod = src1[0];
    952       }
    953 
    954       sampler_src = 2;
    955       break;
    956    case TOY_OPCODE_TGSI_TXL2:
    957       if (ref_pos >= 0) {
    958          assert(ref_pos < 4);
    959 
    960          msg_type = GEN6_MSG_SAMPLER_SAMPLE_L_C;
    961          ref_or_si = coords[ref_pos];
    962       }
    963       else {
    964          msg_type = GEN6_MSG_SAMPLER_SAMPLE_L;
    965       }
    966 
    967       {
    968          struct toy_src src1[4];
    969          tsrc_transpose(inst->src[1], src1);
    970          bias_or_lod = src1[0];
    971       }
    972 
    973       sampler_src = 2;
    974       break;
    975    default:
    976       assert(!"unhandled sampling opcode");
    977       return tsrc_null();
    978       break;
    979    }
    980 
    981    assert(inst->src[sampler_src].file == TOY_FILE_IMM);
    982    sampler_index = inst->src[sampler_src].val32;
    983    binding_table_index = fcc->shader->bt.tex_base + sampler_index;
    984 
    985    /*
    986     * From the Sandy Bridge PRM, volume 4 part 1, page 18:
    987     *
    988     *     "Note that the (cube map) coordinates delivered to the sampling
    989     *      engine must already have been divided by the component with the
    990     *      largest absolute value."
    991     */
    992    switch (inst->tex.target) {
    993    case TGSI_TEXTURE_CUBE:
    994    case TGSI_TEXTURE_SHADOWCUBE:
    995    case TGSI_TEXTURE_CUBE_ARRAY:
    996    case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
    997       /* TXQ does not need coordinates */
    998       if (num_coords >= 3) {
    999          struct toy_dst tmp[4];
   1000 
   1001          tc_alloc_tmp4(tc, tmp);
   1002 
   1003          tc_SEL(tc, tmp[3], tsrc_absolute(coords[0]),
   1004                tsrc_absolute(coords[1]), GEN6_COND_GE);
   1005          tc_SEL(tc, tmp[3], tsrc_from(tmp[3]),
   1006                tsrc_absolute(coords[2]), GEN6_COND_GE);
   1007          tc_INV(tc, tmp[3], tsrc_from(tmp[3]));
   1008 
   1009          for (i = 0; i < 3; i++) {
   1010             tc_MUL(tc, tmp[i], coords[i], tsrc_from(tmp[3]));
   1011             coords[i] = tsrc_from(tmp[i]);
   1012          }
   1013       }
   1014       break;
   1015    }
   1016 
   1017    /*
   1018     * Saturate (s, t, r).  saturate_coords is set for sampler and coordinate
   1019     * that uses linear filtering and PIPE_TEX_WRAP_CLAMP respectively.  It is
   1020     * so that sampling outside the border gets the correct colors.
   1021     */
   1022    for (i = 0; i < MIN2(num_coords, 3); i++) {
   1023       bool is_rect;
   1024 
   1025       if (!(saturate_coords[i] & (1 << sampler_index)))
   1026          continue;
   1027 
   1028       switch (inst->tex.target) {
   1029       case TGSI_TEXTURE_RECT:
   1030       case TGSI_TEXTURE_SHADOWRECT:
   1031          is_rect = true;
   1032          break;
   1033       default:
   1034          is_rect = false;
   1035          break;
   1036       }
   1037 
   1038       if (is_rect) {
   1039          struct toy_src min, max;
   1040          struct toy_dst tmp;
   1041 
   1042          tc_fail(tc, "GL_CLAMP with rectangle texture unsupported");
   1043          tmp = tc_alloc_tmp(tc);
   1044 
   1045          /* saturate to [0, width] or [0, height] */
   1046          /* TODO TXQ? */
   1047          min = tsrc_imm_f(0.0f);
   1048          max = tsrc_imm_f(2048.0f);
   1049 
   1050          tc_SEL(tc, tmp, coords[i], min, GEN6_COND_G);
   1051          tc_SEL(tc, tmp, tsrc_from(tmp), max, GEN6_COND_L);
   1052 
   1053          coords[i] = tsrc_from(tmp);
   1054       }
   1055       else {
   1056          struct toy_dst tmp;
   1057          struct toy_inst *inst2;
   1058 
   1059          tmp = tc_alloc_tmp(tc);
   1060 
   1061          /* saturate to [0.0f, 1.0f] */
   1062          inst2 = tc_MOV(tc, tmp, coords[i]);
   1063          inst2->saturate = true;
   1064 
   1065          coords[i] = tsrc_from(tmp);
   1066       }
   1067    }
   1068 
   1069    /* set up sampler parameters */
   1070    if (ilo_dev_gen(tc->dev) >= ILO_GEN(7)) {
   1071       msg_len = fs_add_sampler_params_gen7(tc, msg_type, base_mrf, param_size,
   1072             coords, num_coords, bias_or_lod, ref_or_si, ddx, ddy, num_derivs);
   1073    }
   1074    else {
   1075       msg_len = fs_add_sampler_params_gen6(tc, msg_type, base_mrf, param_size,
   1076             coords, num_coords, bias_or_lod, ref_or_si, ddx, ddy, num_derivs);
   1077    }
   1078 
   1079    /*
   1080     * From the Sandy Bridge PRM, volume 4 part 1, page 136:
   1081     *
   1082     *     "The maximum message length allowed to the sampler is 11. This would
   1083     *      disallow sample_d, sample_b_c, and sample_l_c with a SIMD Mode of
   1084     *      SIMD16."
   1085     */
   1086    if (msg_len > 11)
   1087       tc_fail(tc, "maximum length for messages to the sampler is 11");
   1088 
   1089    if (ret_sampler_index)
   1090       *ret_sampler_index = sampler_index;
   1091 
   1092    return tsrc_imm_mdesc_sampler(tc, msg_len, 4 * param_size,
   1093          false, simd_mode, msg_type, sampler_index, binding_table_index);
   1094 }
   1095 
   1096 static void
   1097 fs_lower_opcode_tgsi_sampling(struct fs_compile_context *fcc,
   1098                               struct toy_inst *inst)
   1099 {
   1100    struct toy_compiler *tc = &fcc->tc;
   1101    struct toy_dst dst[4], tmp[4];
   1102    struct toy_src desc;
   1103    unsigned sampler_index;
   1104    int swizzles[4], i;
   1105    bool need_filter;
   1106 
   1107    desc = fs_prepare_tgsi_sampling(fcc, inst,
   1108          fcc->first_free_mrf,
   1109          fcc->variant->saturate_tex_coords,
   1110          &sampler_index);
   1111 
   1112    switch (inst->opcode) {
   1113    case TOY_OPCODE_TGSI_TXF:
   1114    case TOY_OPCODE_TGSI_TXQ:
   1115    case TOY_OPCODE_TGSI_TXQ_LZ:
   1116       need_filter = false;
   1117       break;
   1118    default:
   1119       need_filter = true;
   1120       break;
   1121    }
   1122 
   1123    toy_compiler_lower_to_send(tc, inst, false, GEN6_SFID_SAMPLER);
   1124    inst->src[0] = tsrc(TOY_FILE_MRF, fcc->first_free_mrf, 0);
   1125    inst->src[1] = desc;
   1126    for (i = 2; i < ARRAY_SIZE(inst->src); i++)
   1127       inst->src[i] = tsrc_null();
   1128 
   1129    /* write to temps first */
   1130    tc_alloc_tmp4(tc, tmp);
   1131    for (i = 0; i < 4; i++)
   1132       tmp[i].type = inst->dst.type;
   1133    tdst_transpose(inst->dst, dst);
   1134    inst->dst = tmp[0];
   1135 
   1136    tc_move_inst(tc, inst);
   1137 
   1138    if (need_filter) {
   1139       assert(sampler_index < fcc->variant->num_sampler_views);
   1140       swizzles[0] = fcc->variant->sampler_view_swizzles[sampler_index].r;
   1141       swizzles[1] = fcc->variant->sampler_view_swizzles[sampler_index].g;
   1142       swizzles[2] = fcc->variant->sampler_view_swizzles[sampler_index].b;
   1143       swizzles[3] = fcc->variant->sampler_view_swizzles[sampler_index].a;
   1144    }
   1145    else {
   1146       swizzles[0] = PIPE_SWIZZLE_X;
   1147       swizzles[1] = PIPE_SWIZZLE_Y;
   1148       swizzles[2] = PIPE_SWIZZLE_Z;
   1149       swizzles[3] = PIPE_SWIZZLE_W;
   1150    }
   1151 
   1152    /* swizzle the results */
   1153    for (i = 0; i < 4; i++) {
   1154       switch (swizzles[i]) {
   1155       case PIPE_SWIZZLE_0:
   1156          tc_MOV(tc, dst[i], tsrc_imm_f(0.0f));
   1157          break;
   1158       case PIPE_SWIZZLE_1:
   1159          tc_MOV(tc, dst[i], tsrc_imm_f(1.0f));
   1160          break;
   1161       default:
   1162          tc_MOV(tc, dst[i], tsrc_from(tmp[swizzles[i]]));
   1163          break;
   1164       }
   1165    }
   1166 }
   1167 
   1168 static void
   1169 fs_lower_opcode_derivative(struct toy_compiler *tc, struct toy_inst *inst)
   1170 {
   1171    struct toy_dst dst[4];
   1172    struct toy_src src[4];
   1173    unsigned i;
   1174 
   1175    tdst_transpose(inst->dst, dst);
   1176    tsrc_transpose(inst->src[0], src);
   1177 
   1178    /*
   1179     * Every four fragments are from a 2x2 subspan, with
   1180     *
   1181     *   fragment 1 on the top-left,
   1182     *   fragment 2 on the top-right,
   1183     *   fragment 3 on the bottom-left,
   1184     *   fragment 4 on the bottom-right.
   1185     *
   1186     * DDX should thus produce
   1187     *
   1188     *   dst = src.yyww - src.xxzz
   1189     *
   1190     * and DDY should produce
   1191     *
   1192     *   dst = src.zzww - src.xxyy
   1193     *
   1194     * But since we are in GEN6_ALIGN_1, swizzling does not work and we have to
   1195     * play with the region parameters.
   1196     */
   1197    if (inst->opcode == TOY_OPCODE_DDX) {
   1198       for (i = 0; i < 4; i++) {
   1199          struct toy_src left, right;
   1200 
   1201          left = tsrc_rect(src[i], TOY_RECT_220);
   1202          right = tsrc_offset(left, 0, 1);
   1203 
   1204          tc_ADD(tc, dst[i], right, tsrc_negate(left));
   1205       }
   1206    }
   1207    else {
   1208       for (i = 0; i < 4; i++) {
   1209          struct toy_src top, bottom;
   1210 
   1211          /* approximate with dst = src.zzzz - src.xxxx */
   1212          top = tsrc_rect(src[i], TOY_RECT_440);
   1213          bottom = tsrc_offset(top, 0, 2);
   1214 
   1215          tc_ADD(tc, dst[i], bottom, tsrc_negate(top));
   1216       }
   1217    }
   1218 
   1219    tc_discard_inst(tc, inst);
   1220 }
   1221 
   1222 static void
   1223 fs_lower_opcode_fb_write(struct toy_compiler *tc, struct toy_inst *inst)
   1224 {
   1225    /* fs_write_fb() has set up the message registers */
   1226    toy_compiler_lower_to_send(tc, inst, true,
   1227          GEN6_SFID_DP_RC);
   1228 }
   1229 
   1230 static void
   1231 fs_lower_opcode_kil(struct toy_compiler *tc, struct toy_inst *inst)
   1232 {
   1233    struct toy_dst pixel_mask_dst;
   1234    struct toy_src f0, pixel_mask;
   1235    struct toy_inst *tmp;
   1236 
   1237    /* lower half of r1.7:ud */
   1238    pixel_mask_dst = tdst_uw(tdst(TOY_FILE_GRF, 1, 7 * 4));
   1239    pixel_mask = tsrc_rect(tsrc_from(pixel_mask_dst), TOY_RECT_010);
   1240 
   1241    f0 = tsrc_rect(tsrc_uw(tsrc(TOY_FILE_ARF, GEN6_ARF_F0, 0)), TOY_RECT_010);
   1242 
   1243    /* KILL or KILL_IF */
   1244    if (tsrc_is_null(inst->src[0])) {
   1245       struct toy_src dummy = tsrc_uw(tsrc(TOY_FILE_GRF, 0, 0));
   1246       struct toy_dst f0_dst = tdst_uw(tdst(TOY_FILE_ARF, GEN6_ARF_F0, 0));
   1247 
   1248       /* create a mask that masks out all pixels */
   1249       tmp = tc_MOV(tc, f0_dst, tsrc_rect(tsrc_imm_uw(0xffff), TOY_RECT_010));
   1250       tmp->exec_size = GEN6_EXECSIZE_1;
   1251       tmp->mask_ctrl = GEN6_MASKCTRL_NOMASK;
   1252 
   1253       tc_CMP(tc, tdst_null(), dummy, dummy, GEN6_COND_NZ);
   1254 
   1255       /* swapping the two src operands breaks glBitmap()!? */
   1256       tmp = tc_AND(tc, pixel_mask_dst, f0, pixel_mask);
   1257       tmp->exec_size = GEN6_EXECSIZE_1;
   1258       tmp->mask_ctrl = GEN6_MASKCTRL_NOMASK;
   1259    }
   1260    else {
   1261       struct toy_src src[4];
   1262       unsigned i;
   1263 
   1264       tsrc_transpose(inst->src[0], src);
   1265       /* mask out killed pixels */
   1266       for (i = 0; i < 4; i++) {
   1267          tc_CMP(tc, tdst_null(), src[i], tsrc_imm_f(0.0f),
   1268                GEN6_COND_GE);
   1269 
   1270          /* swapping the two src operands breaks glBitmap()!? */
   1271          tmp = tc_AND(tc, pixel_mask_dst, f0, pixel_mask);
   1272          tmp->exec_size = GEN6_EXECSIZE_1;
   1273          tmp->mask_ctrl = GEN6_MASKCTRL_NOMASK;
   1274       }
   1275    }
   1276 
   1277    tc_discard_inst(tc, inst);
   1278 }
   1279 
   1280 static void
   1281 fs_lower_virtual_opcodes(struct fs_compile_context *fcc)
   1282 {
   1283    struct toy_compiler *tc = &fcc->tc;
   1284    struct toy_inst *inst;
   1285 
   1286    /* lower TGSI's first, as they might be lowered to other virtual opcodes */
   1287    tc_head(tc);
   1288    while ((inst = tc_next(tc)) != NULL) {
   1289       switch (inst->opcode) {
   1290       case TOY_OPCODE_TGSI_IN:
   1291       case TOY_OPCODE_TGSI_CONST:
   1292       case TOY_OPCODE_TGSI_SV:
   1293       case TOY_OPCODE_TGSI_IMM:
   1294          fs_lower_opcode_tgsi_direct(fcc, inst);
   1295          break;
   1296       case TOY_OPCODE_TGSI_INDIRECT_FETCH:
   1297       case TOY_OPCODE_TGSI_INDIRECT_STORE:
   1298          fs_lower_opcode_tgsi_indirect(fcc, inst);
   1299          break;
   1300       case TOY_OPCODE_TGSI_TEX:
   1301       case TOY_OPCODE_TGSI_TXB:
   1302       case TOY_OPCODE_TGSI_TXD:
   1303       case TOY_OPCODE_TGSI_TXL:
   1304       case TOY_OPCODE_TGSI_TXP:
   1305       case TOY_OPCODE_TGSI_TXF:
   1306       case TOY_OPCODE_TGSI_TXQ:
   1307       case TOY_OPCODE_TGSI_TXQ_LZ:
   1308       case TOY_OPCODE_TGSI_TEX2:
   1309       case TOY_OPCODE_TGSI_TXB2:
   1310       case TOY_OPCODE_TGSI_TXL2:
   1311       case TOY_OPCODE_TGSI_SAMPLE:
   1312       case TOY_OPCODE_TGSI_SAMPLE_I:
   1313       case TOY_OPCODE_TGSI_SAMPLE_I_MS:
   1314       case TOY_OPCODE_TGSI_SAMPLE_B:
   1315       case TOY_OPCODE_TGSI_SAMPLE_C:
   1316       case TOY_OPCODE_TGSI_SAMPLE_C_LZ:
   1317       case TOY_OPCODE_TGSI_SAMPLE_D:
   1318       case TOY_OPCODE_TGSI_SAMPLE_L:
   1319       case TOY_OPCODE_TGSI_GATHER4:
   1320       case TOY_OPCODE_TGSI_SVIEWINFO:
   1321       case TOY_OPCODE_TGSI_SAMPLE_POS:
   1322       case TOY_OPCODE_TGSI_SAMPLE_INFO:
   1323          fs_lower_opcode_tgsi_sampling(fcc, inst);
   1324          break;
   1325       }
   1326    }
   1327 
   1328    tc_head(tc);
   1329    while ((inst = tc_next(tc)) != NULL) {
   1330       switch (inst->opcode) {
   1331       case TOY_OPCODE_INV:
   1332       case TOY_OPCODE_LOG:
   1333       case TOY_OPCODE_EXP:
   1334       case TOY_OPCODE_SQRT:
   1335       case TOY_OPCODE_RSQ:
   1336       case TOY_OPCODE_SIN:
   1337       case TOY_OPCODE_COS:
   1338       case TOY_OPCODE_FDIV:
   1339       case TOY_OPCODE_POW:
   1340       case TOY_OPCODE_INT_DIV_QUOTIENT:
   1341       case TOY_OPCODE_INT_DIV_REMAINDER:
   1342          toy_compiler_lower_math(tc, inst);
   1343          break;
   1344       case TOY_OPCODE_DDX:
   1345       case TOY_OPCODE_DDY:
   1346          fs_lower_opcode_derivative(tc, inst);
   1347          break;
   1348       case TOY_OPCODE_FB_WRITE:
   1349          fs_lower_opcode_fb_write(tc, inst);
   1350          break;
   1351       case TOY_OPCODE_KIL:
   1352          fs_lower_opcode_kil(tc, inst);
   1353          break;
   1354       default:
   1355          if (inst->opcode > 127)
   1356             tc_fail(tc, "unhandled virtual opcode");
   1357          break;
   1358       }
   1359    }
   1360 }
   1361 
   1362 /**
   1363  * Compile the shader.
   1364  */
   1365 static bool
   1366 fs_compile(struct fs_compile_context *fcc)
   1367 {
   1368    struct toy_compiler *tc = &fcc->tc;
   1369    struct ilo_shader *sh = fcc->shader;
   1370 
   1371    fs_lower_virtual_opcodes(fcc);
   1372    toy_compiler_legalize_for_ra(tc);
   1373    toy_compiler_optimize(tc);
   1374    toy_compiler_allocate_registers(tc,
   1375          fcc->first_free_grf,
   1376          fcc->last_free_grf,
   1377          fcc->num_grf_per_vrf);
   1378    toy_compiler_legalize_for_asm(tc);
   1379 
   1380    if (tc->fail) {
   1381       ilo_err("failed to legalize FS instructions: %s\n", tc->reason);
   1382       return false;
   1383    }
   1384 
   1385    if (ilo_debug & ILO_DEBUG_FS) {
   1386       ilo_printf("legalized instructions:\n");
   1387       toy_compiler_dump(tc);
   1388       ilo_printf("\n");
   1389    }
   1390 
   1391    if (true) {
   1392       sh->kernel = toy_compiler_assemble(tc, &sh->kernel_size);
   1393    }
   1394    else {
   1395       static const uint32_t microcode[] = {
   1396          /* fill in the microcode here */
   1397          0x0, 0x0, 0x0, 0x0,
   1398       };
   1399       const bool swap = true;
   1400 
   1401       sh->kernel_size = sizeof(microcode);
   1402       sh->kernel = MALLOC(sh->kernel_size);
   1403 
   1404       if (sh->kernel) {
   1405          const int num_dwords = sizeof(microcode) / 4;
   1406          const uint32_t *src = microcode;
   1407          uint32_t *dst = (uint32_t *) sh->kernel;
   1408          int i;
   1409 
   1410          for (i = 0; i < num_dwords; i += 4) {
   1411             if (swap) {
   1412                dst[i + 0] = src[i + 3];
   1413                dst[i + 1] = src[i + 2];
   1414                dst[i + 2] = src[i + 1];
   1415                dst[i + 3] = src[i + 0];
   1416             }
   1417             else {
   1418                memcpy(dst, src, 16);
   1419             }
   1420          }
   1421       }
   1422    }
   1423 
   1424    if (!sh->kernel) {
   1425       ilo_err("failed to compile FS: %s\n", tc->reason);
   1426       return false;
   1427    }
   1428 
   1429    if (ilo_debug & ILO_DEBUG_FS) {
   1430       ilo_printf("disassembly:\n");
   1431       toy_compiler_disassemble(tc->dev, sh->kernel, sh->kernel_size, false);
   1432       ilo_printf("\n");
   1433    }
   1434 
   1435    return true;
   1436 }
   1437 
   1438 /**
   1439  * Emit instructions to write the color buffers (and the depth buffer).
   1440  */
   1441 static void
   1442 fs_write_fb(struct fs_compile_context *fcc)
   1443 {
   1444    struct toy_compiler *tc = &fcc->tc;
   1445    int base_mrf = fcc->first_free_mrf;
   1446    const struct toy_dst header = tdst_ud(tdst(TOY_FILE_MRF, base_mrf, 0));
   1447    bool header_present = false;
   1448    struct toy_src desc;
   1449    unsigned msg_type, ctrl;
   1450    int color_slots[ILO_MAX_DRAW_BUFFERS], num_cbufs;
   1451    int pos_slot = -1, cbuf, i;
   1452 
   1453    for (i = 0; i < ARRAY_SIZE(color_slots); i++)
   1454       color_slots[i] = -1;
   1455 
   1456    for (i = 0; i < fcc->tgsi.num_outputs; i++) {
   1457       if (fcc->tgsi.outputs[i].semantic_name == TGSI_SEMANTIC_COLOR) {
   1458          assert(fcc->tgsi.outputs[i].semantic_index < ARRAY_SIZE(color_slots));
   1459          color_slots[fcc->tgsi.outputs[i].semantic_index] = i;
   1460       }
   1461       else if (fcc->tgsi.outputs[i].semantic_name == TGSI_SEMANTIC_POSITION) {
   1462          pos_slot = i;
   1463       }
   1464    }
   1465 
   1466    num_cbufs = fcc->variant->u.fs.num_cbufs;
   1467    /* still need to send EOT (and probably depth) */
   1468    if (!num_cbufs)
   1469       num_cbufs = 1;
   1470 
   1471    /* we need the header to specify the pixel mask or render target */
   1472    if (fcc->tgsi.uses_kill || num_cbufs > 1) {
   1473       const struct toy_src r0 = tsrc_ud(tsrc(TOY_FILE_GRF, 0, 0));
   1474       struct toy_inst *inst;
   1475 
   1476       inst = tc_MOV(tc, header, r0);
   1477       inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
   1478       base_mrf += fcc->num_grf_per_vrf;
   1479 
   1480       /* this is a two-register header */
   1481       if (fcc->dispatch_mode == GEN6_PS_DISPATCH_8) {
   1482          inst = tc_MOV(tc, tdst_offset(header, 1, 0), tsrc_offset(r0, 1, 0));
   1483          inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
   1484          base_mrf += fcc->num_grf_per_vrf;
   1485       }
   1486 
   1487       header_present = true;
   1488    }
   1489 
   1490    for (cbuf = 0; cbuf < num_cbufs; cbuf++) {
   1491       const int slot =
   1492          color_slots[(fcc->tgsi.props.fs_color0_writes_all_cbufs) ? 0 : cbuf];
   1493       int mrf = base_mrf, vrf;
   1494       struct toy_src src[4];
   1495 
   1496       if (slot >= 0) {
   1497          const unsigned undefined_mask =
   1498             fcc->tgsi.outputs[slot].undefined_mask;
   1499          const int index = fcc->tgsi.outputs[slot].index;
   1500 
   1501          vrf = toy_tgsi_get_vrf(&fcc->tgsi, TGSI_FILE_OUTPUT, 0, index);
   1502          if (vrf >= 0) {
   1503             const struct toy_src tmp = tsrc(TOY_FILE_VRF, vrf, 0);
   1504             tsrc_transpose(tmp, src);
   1505          }
   1506          else {
   1507             /* use (0, 0, 0, 0) */
   1508             tsrc_transpose(tsrc_imm_f(0.0f), src);
   1509          }
   1510 
   1511          for (i = 0; i < 4; i++) {
   1512             const struct toy_dst dst = tdst(TOY_FILE_MRF, mrf, 0);
   1513 
   1514             if (undefined_mask & (1 << i))
   1515                src[i] = tsrc_imm_f(0.0f);
   1516 
   1517             tc_MOV(tc, dst, src[i]);
   1518 
   1519             mrf += fcc->num_grf_per_vrf;
   1520          }
   1521       }
   1522       else {
   1523          /* use (0, 0, 0, 0) */
   1524          for (i = 0; i < 4; i++) {
   1525             const struct toy_dst dst = tdst(TOY_FILE_MRF, mrf, 0);
   1526 
   1527             tc_MOV(tc, dst, tsrc_imm_f(0.0f));
   1528             mrf += fcc->num_grf_per_vrf;
   1529          }
   1530       }
   1531 
   1532       /* select BLEND_STATE[rt] */
   1533       if (cbuf > 0) {
   1534          struct toy_inst *inst;
   1535 
   1536          inst = tc_MOV(tc, tdst_offset(header, 0, 2), tsrc_imm_ud(cbuf));
   1537          inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
   1538          inst->exec_size = GEN6_EXECSIZE_1;
   1539          inst->src[0].rect = TOY_RECT_010;
   1540       }
   1541 
   1542       if (cbuf == 0 && pos_slot >= 0) {
   1543          const int index = fcc->tgsi.outputs[pos_slot].index;
   1544          const struct toy_dst dst = tdst(TOY_FILE_MRF, mrf, 0);
   1545          struct toy_src src[4];
   1546          int vrf;
   1547 
   1548          vrf = toy_tgsi_get_vrf(&fcc->tgsi, TGSI_FILE_OUTPUT, 0, index);
   1549          if (vrf >= 0) {
   1550             const struct toy_src tmp = tsrc(TOY_FILE_VRF, vrf, 0);
   1551             tsrc_transpose(tmp, src);
   1552          }
   1553          else {
   1554             /* use (0, 0, 0, 0) */
   1555             tsrc_transpose(tsrc_imm_f(0.0f), src);
   1556          }
   1557 
   1558          /* only Z */
   1559          tc_MOV(tc, dst, src[2]);
   1560 
   1561          mrf += fcc->num_grf_per_vrf;
   1562       }
   1563 
   1564       msg_type = (fcc->dispatch_mode == GEN6_PS_DISPATCH_16) ?
   1565          GEN6_MSG_DP_RT_MODE_SIMD16 >> 8 :
   1566          GEN6_MSG_DP_RT_MODE_SIMD8_LO >> 8;
   1567 
   1568       ctrl = (cbuf == num_cbufs - 1) << 12 |
   1569              msg_type << 8;
   1570 
   1571       desc = tsrc_imm_mdesc_data_port(tc, cbuf == num_cbufs - 1,
   1572             mrf - fcc->first_free_mrf, 0,
   1573             header_present, false,
   1574             GEN6_MSG_DP_RT_WRITE,
   1575             ctrl, fcc->shader->bt.rt_base + cbuf);
   1576 
   1577       tc_add2(tc, TOY_OPCODE_FB_WRITE, tdst_null(),
   1578             tsrc(TOY_FILE_MRF, fcc->first_free_mrf, 0), desc);
   1579    }
   1580 }
   1581 
   1582 /**
   1583  * Set up shader outputs for fixed-function units.
   1584  */
   1585 static void
   1586 fs_setup_shader_out(struct ilo_shader *sh, const struct toy_tgsi *tgsi)
   1587 {
   1588    unsigned i;
   1589 
   1590    sh->out.count = tgsi->num_outputs;
   1591    for (i = 0; i < tgsi->num_outputs; i++) {
   1592       sh->out.register_indices[i] = tgsi->outputs[i].index;
   1593       sh->out.semantic_names[i] = tgsi->outputs[i].semantic_name;
   1594       sh->out.semantic_indices[i] = tgsi->outputs[i].semantic_index;
   1595 
   1596       if (tgsi->outputs[i].semantic_name == TGSI_SEMANTIC_POSITION)
   1597          sh->out.has_pos = true;
   1598    }
   1599 }
   1600 
   1601 /**
   1602  * Set up shader inputs for fixed-function units.
   1603  */
   1604 static void
   1605 fs_setup_shader_in(struct ilo_shader *sh, const struct toy_tgsi *tgsi,
   1606                    bool flatshade)
   1607 {
   1608    unsigned i;
   1609 
   1610    sh->in.count = tgsi->num_inputs;
   1611    for (i = 0; i < tgsi->num_inputs; i++) {
   1612       sh->in.semantic_names[i] = tgsi->inputs[i].semantic_name;
   1613       sh->in.semantic_indices[i] = tgsi->inputs[i].semantic_index;
   1614       sh->in.interp[i] = tgsi->inputs[i].interp;
   1615       sh->in.centroid[i] = tgsi->inputs[i].centroid;
   1616 
   1617       if (tgsi->inputs[i].semantic_name == TGSI_SEMANTIC_POSITION) {
   1618          sh->in.has_pos = true;
   1619          continue;
   1620       }
   1621       else if (tgsi->inputs[i].semantic_name == TGSI_SEMANTIC_FACE) {
   1622          continue;
   1623       }
   1624 
   1625       switch (tgsi->inputs[i].interp) {
   1626       case TGSI_INTERPOLATE_CONSTANT:
   1627          sh->in.const_interp_enable |= 1 << i;
   1628          break;
   1629       case TGSI_INTERPOLATE_LINEAR:
   1630          sh->in.has_linear_interp = true;
   1631 
   1632          if (tgsi->inputs[i].centroid) {
   1633             sh->in.barycentric_interpolation_mode |=
   1634                GEN6_INTERP_NONPERSPECTIVE_CENTROID;
   1635          }
   1636          else {
   1637             sh->in.barycentric_interpolation_mode |=
   1638                GEN6_INTERP_NONPERSPECTIVE_PIXEL;
   1639          }
   1640          break;
   1641       case TGSI_INTERPOLATE_COLOR:
   1642          if (flatshade) {
   1643             sh->in.const_interp_enable |= 1 << i;
   1644             break;
   1645          }
   1646          /* fall through */
   1647       case TGSI_INTERPOLATE_PERSPECTIVE:
   1648          if (tgsi->inputs[i].centroid) {
   1649             sh->in.barycentric_interpolation_mode |=
   1650                GEN6_INTERP_PERSPECTIVE_CENTROID;
   1651          }
   1652          else {
   1653             sh->in.barycentric_interpolation_mode |=
   1654                GEN6_INTERP_PERSPECTIVE_PIXEL;
   1655          }
   1656          break;
   1657       default:
   1658          break;
   1659       }
   1660    }
   1661 }
   1662 
   1663 static int
   1664 fs_setup_payloads(struct fs_compile_context *fcc)
   1665 {
   1666    const struct ilo_shader *sh = fcc->shader;
   1667    int grf, i;
   1668 
   1669    grf = 0;
   1670 
   1671    /* r0: header */
   1672    grf++;
   1673 
   1674    /* r1-r2: coordinates and etc. */
   1675    grf += (fcc->dispatch_mode == GEN6_PS_DISPATCH_32) ? 2 : 1;
   1676 
   1677    for (i = 0; i < ARRAY_SIZE(fcc->payloads); i++) {
   1678       const int reg_scale =
   1679          (fcc->dispatch_mode == GEN6_PS_DISPATCH_8) ? 1 : 2;
   1680 
   1681       /* r3-r26 or r32-r55: barycentric interpolation parameters */
   1682       if (sh->in.barycentric_interpolation_mode &
   1683             (GEN6_INTERP_PERSPECTIVE_PIXEL)) {
   1684          fcc->payloads[i].interp_perspective_pixel = grf;
   1685          grf += 2 * reg_scale;
   1686       }
   1687       if (sh->in.barycentric_interpolation_mode &
   1688             (GEN6_INTERP_PERSPECTIVE_CENTROID)) {
   1689          fcc->payloads[i].interp_perspective_centroid = grf;
   1690          grf += 2 * reg_scale;
   1691       }
   1692       if (sh->in.barycentric_interpolation_mode &
   1693             (GEN6_INTERP_PERSPECTIVE_SAMPLE)) {
   1694          fcc->payloads[i].interp_perspective_sample = grf;
   1695          grf += 2 * reg_scale;
   1696       }
   1697       if (sh->in.barycentric_interpolation_mode &
   1698             (GEN6_INTERP_NONPERSPECTIVE_PIXEL)) {
   1699          fcc->payloads[i].interp_nonperspective_pixel = grf;
   1700          grf += 2 * reg_scale;
   1701       }
   1702       if (sh->in.barycentric_interpolation_mode &
   1703             (GEN6_INTERP_NONPERSPECTIVE_CENTROID)) {
   1704          fcc->payloads[i].interp_nonperspective_centroid = grf;
   1705          grf += 2 * reg_scale;
   1706       }
   1707       if (sh->in.barycentric_interpolation_mode &
   1708             (GEN6_INTERP_NONPERSPECTIVE_SAMPLE)) {
   1709          fcc->payloads[i].interp_nonperspective_sample = grf;
   1710          grf += 2 * reg_scale;
   1711       }
   1712 
   1713       /* r27-r28 or r56-r57: interpoloated depth */
   1714       if (sh->in.has_pos) {
   1715          fcc->payloads[i].source_depth = grf;
   1716          grf += 1 * reg_scale;
   1717       }
   1718 
   1719       /* r29-r30 or r58-r59: interpoloated w */
   1720       if (sh->in.has_pos) {
   1721          fcc->payloads[i].source_w = grf;
   1722          grf += 1 * reg_scale;
   1723       }
   1724 
   1725       /* r31 or r60: position offset */
   1726       if (false) {
   1727          fcc->payloads[i].pos_offset = grf;
   1728          grf++;
   1729       }
   1730 
   1731       if (fcc->dispatch_mode != GEN6_PS_DISPATCH_32)
   1732          break;
   1733    }
   1734 
   1735    return grf;
   1736 }
   1737 
   1738 /**
   1739  * Translate the TGSI tokens.
   1740  */
   1741 static bool
   1742 fs_setup_tgsi(struct toy_compiler *tc, const struct tgsi_token *tokens,
   1743               struct toy_tgsi *tgsi)
   1744 {
   1745    if (ilo_debug & ILO_DEBUG_FS) {
   1746       ilo_printf("dumping fragment shader\n");
   1747       ilo_printf("\n");
   1748 
   1749       tgsi_dump(tokens, 0);
   1750       ilo_printf("\n");
   1751    }
   1752 
   1753    toy_compiler_translate_tgsi(tc, tokens, false, tgsi);
   1754    if (tc->fail) {
   1755       ilo_err("failed to translate FS TGSI tokens: %s\n", tc->reason);
   1756       return false;
   1757    }
   1758 
   1759    if (ilo_debug & ILO_DEBUG_FS) {
   1760       ilo_printf("TGSI translator:\n");
   1761       toy_tgsi_dump(tgsi);
   1762       ilo_printf("\n");
   1763       toy_compiler_dump(tc);
   1764       ilo_printf("\n");
   1765    }
   1766 
   1767    return true;
   1768 }
   1769 
   1770 /**
   1771  * Set up FS compile context.  This includes translating the TGSI tokens.
   1772  */
   1773 static bool
   1774 fs_setup(struct fs_compile_context *fcc,
   1775          const struct ilo_shader_state *state,
   1776          const struct ilo_shader_variant *variant)
   1777 {
   1778    int num_consts;
   1779 
   1780    memset(fcc, 0, sizeof(*fcc));
   1781 
   1782    fcc->shader = CALLOC_STRUCT(ilo_shader);
   1783    if (!fcc->shader)
   1784       return false;
   1785 
   1786    fcc->variant = variant;
   1787 
   1788    toy_compiler_init(&fcc->tc, state->info.dev);
   1789 
   1790    fcc->dispatch_mode = GEN6_PS_DISPATCH_8;
   1791 
   1792    fcc->tc.templ.access_mode = GEN6_ALIGN_1;
   1793    if (fcc->dispatch_mode == GEN6_PS_DISPATCH_16) {
   1794       fcc->tc.templ.qtr_ctrl = GEN6_QTRCTRL_1H;
   1795       fcc->tc.templ.exec_size = GEN6_EXECSIZE_16;
   1796    }
   1797    else {
   1798       fcc->tc.templ.qtr_ctrl = GEN6_QTRCTRL_1Q;
   1799       fcc->tc.templ.exec_size = GEN6_EXECSIZE_8;
   1800    }
   1801 
   1802    fcc->tc.rect_linear_width = 8;
   1803 
   1804    /*
   1805     * The classic driver uses the sampler cache (gen6) or the data cache
   1806     * (gen7).  Why?
   1807     */
   1808    fcc->const_cache = GEN6_SFID_DP_CC;
   1809 
   1810    if (!fs_setup_tgsi(&fcc->tc, state->info.tokens, &fcc->tgsi)) {
   1811       toy_compiler_cleanup(&fcc->tc);
   1812       FREE(fcc->shader);
   1813       return false;
   1814    }
   1815 
   1816    fs_setup_shader_in(fcc->shader, &fcc->tgsi, fcc->variant->u.fs.flatshade);
   1817    fs_setup_shader_out(fcc->shader, &fcc->tgsi);
   1818 
   1819    if (fcc->variant->use_pcb && !fcc->tgsi.const_indirect) {
   1820       num_consts = (fcc->tgsi.const_count + 1) / 2;
   1821 
   1822       /*
   1823        * From the Sandy Bridge PRM, volume 2 part 1, page 287:
   1824        *
   1825        *     "The sum of all four read length fields (each incremented to
   1826        *      represent the actual read length) must be less than or equal to
   1827        *      64"
   1828        *
   1829        * Since we are usually under a high register pressure, do not allow
   1830        * for more than 8.
   1831        */
   1832       if (num_consts > 8)
   1833          num_consts = 0;
   1834    }
   1835    else {
   1836       num_consts = 0;
   1837    }
   1838 
   1839    fcc->shader->skip_cbuf0_upload = (!fcc->tgsi.const_count || num_consts);
   1840    fcc->shader->pcb.cbuf0_size = num_consts * (sizeof(float) * 8);
   1841 
   1842    fcc->first_const_grf = fs_setup_payloads(fcc);
   1843    fcc->first_attr_grf = fcc->first_const_grf + num_consts;
   1844    fcc->first_free_grf = fcc->first_attr_grf + fcc->shader->in.count * 2;
   1845    fcc->last_free_grf = 127;
   1846 
   1847    /* m0 is reserved for system routines */
   1848    fcc->first_free_mrf = 1;
   1849    fcc->last_free_mrf = 15;
   1850 
   1851    /* instructions are compressed with GEN6_EXECSIZE_16 */
   1852    fcc->num_grf_per_vrf =
   1853       (fcc->dispatch_mode == GEN6_PS_DISPATCH_16) ? 2 : 1;
   1854 
   1855    if (ilo_dev_gen(fcc->tc.dev) >= ILO_GEN(7)) {
   1856       fcc->last_free_grf -= 15;
   1857       fcc->first_free_mrf = fcc->last_free_grf + 1;
   1858       fcc->last_free_mrf = fcc->first_free_mrf + 14;
   1859    }
   1860 
   1861    fcc->shader->in.start_grf = fcc->first_const_grf;
   1862    fcc->shader->has_kill = fcc->tgsi.uses_kill;
   1863    fcc->shader->dispatch_16 =
   1864       (fcc->dispatch_mode == GEN6_PS_DISPATCH_16);
   1865 
   1866    fcc->shader->bt.rt_base = 0;
   1867    fcc->shader->bt.rt_count = fcc->variant->u.fs.num_cbufs;
   1868    /* to send EOT */
   1869    if (!fcc->shader->bt.rt_count)
   1870       fcc->shader->bt.rt_count = 1;
   1871 
   1872    fcc->shader->bt.tex_base = fcc->shader->bt.rt_base +
   1873                               fcc->shader->bt.rt_count;
   1874    fcc->shader->bt.tex_count = fcc->variant->num_sampler_views;
   1875 
   1876    fcc->shader->bt.const_base = fcc->shader->bt.tex_base +
   1877                                 fcc->shader->bt.tex_count;
   1878    fcc->shader->bt.const_count = state->info.constant_buffer_count;
   1879 
   1880    fcc->shader->bt.total_count = fcc->shader->bt.const_base +
   1881                                  fcc->shader->bt.const_count;
   1882 
   1883    return true;
   1884 }
   1885 
   1886 /**
   1887  * Compile the fragment shader.
   1888  */
   1889 struct ilo_shader *
   1890 ilo_shader_compile_fs(const struct ilo_shader_state *state,
   1891                       const struct ilo_shader_variant *variant)
   1892 {
   1893    struct fs_compile_context fcc;
   1894 
   1895    if (!fs_setup(&fcc, state, variant))
   1896       return NULL;
   1897 
   1898    fs_write_fb(&fcc);
   1899 
   1900    if (!fs_compile(&fcc)) {
   1901       FREE(fcc.shader);
   1902       fcc.shader = NULL;
   1903    }
   1904 
   1905    toy_tgsi_cleanup(&fcc.tgsi);
   1906    toy_compiler_cleanup(&fcc.tc);
   1907 
   1908    return fcc.shader;
   1909 }
   1910