      1 /*
      2  * Copyright (c) 2012-2015 Etnaviv Project
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sub license,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the
     12  * next paragraph) shall be included in all copies or substantial portions
     13  * of the Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21  * DEALINGS IN THE SOFTWARE.
     22  *
     23  * Authors:
     24  *    Wladimir J. van der Laan <laanwj (at) gmail.com>
     25  */
     26 
     27 /* TGSI->Vivante shader ISA conversion */
     28 
     29 /* What does the compiler return (see etna_shader_object)?
     30  *  1) instruction data
     31  *  2) input-to-temporary mapping (fixed for ps)
     32  *      *) in case of ps, semantic -> varying id mapping
     33  *      *) for each varying: number of components used (r, rg, rgb, rgba)
     34  *  3) temporary-to-output mapping (in case of vs, fixed for ps)
     35  *  4) for each input/output: possible semantic (position, color, glpointcoord, ...)
     36  *  5) immediates base offset, immediates data
     37  *  6) used texture units (and possibly the TGSI_TEXTURE_* type); not needed to
     38  *     configure the hw, but useful for error checking
     39  *  7) enough information to add the z=(z+w)/2.0 necessary for older chips
     40  *     (output reg id is enough)
     41  *
     42  *  Empty shaders are not allowed; the compiler should always generate at
     43  *  least a NOP. Also, if there is a label at the end of the shader, an
     44  *  extra NOP should be generated as a jump target.
     45  *
     46  * TODO
     47  * * Use an instruction scheduler
     48  * * Indirect access to uniforms / temporaries using amode
     49  */
     50 
     51 #include "etnaviv_compiler.h"
     52 
     53 #include "etnaviv_asm.h"
     54 #include "etnaviv_context.h"
     55 #include "etnaviv_debug.h"
     56 #include "etnaviv_disasm.h"
     57 #include "etnaviv_uniforms.h"
     58 #include "etnaviv_util.h"
     59 
     60 #include "pipe/p_shader_tokens.h"
     61 #include "tgsi/tgsi_info.h"
     62 #include "tgsi/tgsi_iterate.h"
     63 #include "tgsi/tgsi_lowering.h"
     64 #include "tgsi/tgsi_strings.h"
     65 #include "tgsi/tgsi_util.h"
     66 #include "util/u_math.h"
     67 #include "util/u_memory.h"
     68 
     69 #include <fcntl.h>
     70 #include <stdio.h>
     71 #include <sys/stat.h>
     72 #include <sys/types.h>
     73 
     74 #define ETNA_MAX_INNER_TEMPS 2
     75 
     76 static const float sincos_const[2][4] = {
     77    {
     78       2., -1., 4., -4.,
     79    },
     80    {
     81       1. / (2. * M_PI), 0.75, 0.5, 0.0,
     82    },
     83 };
     84 
     85 /* Native register description structure */
     86 struct etna_native_reg {
     87    unsigned valid : 1;
     88    unsigned is_tex : 1; /* is texture unit, overrides rgroup */
     89    unsigned rgroup : 3;
     90    unsigned id : 9;
     91 };
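        /* Illustrative note (not from the original source): a plain temporary
         * register tN is described as { .valid = 1, .rgroup = INST_RGROUP_TEMP,
         * .id = N }, see etna_native_temp() below; texture samplers set is_tex
         * instead, in which case rgroup is ignored. */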
     92 
     93 /* Register description */
     94 struct etna_reg_desc {
     95    enum tgsi_file_type file; /* IN, OUT, TEMP, ... */
     96    int idx; /* index into file */
     97    bool active; /* used in program */
     98    int first_use; /* instruction id of first use (scope begin) */
     99    int last_use; /* instruction id of last use (scope end, inclusive) */
    100 
    101    struct etna_native_reg native; /* native register to map to */
    102    unsigned usage_mask : 4; /* usage, per channel */
    103    bool has_semantic; /* register has associated TGSI semantic */
    104    struct tgsi_declaration_semantic semantic; /* TGSI semantic */
    105    struct tgsi_declaration_interp interp; /* Interpolation type */
    106 };
    107 
    108 /* Label information structure */
    109 struct etna_compile_label {
    110    int inst_idx; /* Instruction id that label points to */
    111 };
    112 
    113 enum etna_compile_frame_type {
    114    ETNA_COMPILE_FRAME_IF, /* IF/ELSE/ENDIF */
    115    ETNA_COMPILE_FRAME_LOOP,
    116 };
    117 
    118 /* nesting scope frame (LOOP, IF, ...) during compilation
    119  */
    120 struct etna_compile_frame {
    121    enum etna_compile_frame_type type;
    122    int lbl_else_idx;
    123    int lbl_endif_idx;
    124    int lbl_loop_bgn_idx;
    125    int lbl_loop_end_idx;
    126 };
    127 
    128 struct etna_compile_file {
    129    /* Number of registers in each TGSI file (max register+1) */
    130    size_t reg_size;
    131    /* Register descriptions, per register index */
    132    struct etna_reg_desc *reg;
    133 };
    134 
    135 #define array_insert(arr, val)                          \
    136    do {                                                 \
    137       if (arr##_count == arr##_sz) {                    \
    138          arr##_sz = MAX2(2 * arr##_sz, 16);             \
    139          arr = realloc(arr, arr##_sz * sizeof(arr[0])); \
    140       }                                                 \
    141       arr[arr##_count++] = val;                         \
    142    } while (0)
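        /* Usage sketch (illustrative, not part of the original code): for the
         * growable label array declared in struct etna_compile below,
         *
         *    array_insert(c->labels, label);
         *
         * appends 'label', growing c->labels as needed via the companion
         * c->labels_count / c->labels_sz fields; see alloc_new_label(). */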
    143 
    144 
    145 /* scratch area for compiling shader, freed after compilation finishes */
    146 struct etna_compile {
    147    const struct tgsi_token *tokens;
    148    bool free_tokens;
    149 
    150    struct tgsi_shader_info info;
    151 
    152    /* Register descriptions, per TGSI file, per register index */
    153    struct etna_compile_file file[TGSI_FILE_COUNT];
    154 
    155    /* Keep track of TGSI register declarations */
    156    struct etna_reg_desc decl[ETNA_MAX_DECL];
    157    uint total_decls;
    158 
    159    /* Bitmap of dead instructions which are removed in a separate pass */
    160    bool dead_inst[ETNA_MAX_TOKENS];
    161 
    162    /* Immediate data */
    163    enum etna_immediate_contents imm_contents[ETNA_MAX_IMM];
    164    uint32_t imm_data[ETNA_MAX_IMM];
    165    uint32_t imm_base; /* base of immediates (in 32 bit units) */
    166    uint32_t imm_size; /* size of immediates (in 32 bit units) */
    167 
    168    /* Next free native register, for register allocation */
    169    uint32_t next_free_native;
    170 
    171    /* Temporary register for use within translated TGSI instruction,
    172     * only allocated when needed.
    173     */
    174    int inner_temps; /* number of inner temps used; at most
    175                        ETNA_MAX_INNER_TEMPS are available per instruction */
    176    struct etna_native_reg inner_temp[ETNA_MAX_INNER_TEMPS];
    177 
    178    /* Fields for handling nested conditionals */
    179    struct etna_compile_frame frame_stack[ETNA_MAX_DEPTH];
    180    int frame_sp;
    181    int lbl_usage[ETNA_MAX_INSTRUCTIONS];
    182 
    183    unsigned labels_count, labels_sz;
    184    struct etna_compile_label *labels;
    185 
    186    unsigned num_loops;
    187 
    188    /* Code generation */
    189    int inst_ptr; /* current instruction pointer */
    190    uint32_t code[ETNA_MAX_INSTRUCTIONS * ETNA_INST_SIZE];
    191 
    192    /* I/O */
    193 
    194    /* Number of varyings (PS only) */
    195    int num_varyings;
    196 
    197    /* GPU hardware specs */
    198    const struct etna_specs *specs;
    199 
    200    const struct etna_shader_key *key;
    201 };
    202 
    203 static struct etna_reg_desc *
    204 etna_get_dst_reg(struct etna_compile *c, struct tgsi_dst_register dst)
    205 {
    206    return &c->file[dst.File].reg[dst.Index];
    207 }
    208 
    209 static struct etna_reg_desc *
    210 etna_get_src_reg(struct etna_compile *c, struct tgsi_src_register src)
    211 {
    212    return &c->file[src.File].reg[src.Index];
    213 }
    214 
    215 static struct etna_native_reg
    216 etna_native_temp(unsigned reg)
    217 {
    218    return (struct etna_native_reg) {
    219       .valid = 1,
    220       .rgroup = INST_RGROUP_TEMP,
    221       .id = reg
    222    };
    223 }
    224 
    225 /** Register allocation **/
    226 enum reg_sort_order {
    227    FIRST_USE_ASC,
    228    FIRST_USE_DESC,
    229    LAST_USE_ASC,
    230    LAST_USE_DESC
    231 };
    232 
    233 /* Augmented register description for sorting */
    234 struct sort_rec {
    235    struct etna_reg_desc *ptr;
    236    int key;
    237 };
    238 
    239 static int
    240 sort_rec_compar(const struct sort_rec *a, const struct sort_rec *b)
    241 {
    242    if (a->key < b->key)
    243       return -1;
    244 
    245    if (a->key > b->key)
    246       return 1;
    247 
    248    return 0;
    249 }
    250 
    251 /* create an index on a register set based on certain criteria. */
    252 static int
    253 sort_registers(struct sort_rec *sorted, struct etna_compile_file *file,
    254                enum reg_sort_order so)
    255 {
    256    struct etna_reg_desc *regs = file->reg;
    257    int ptr = 0;
    258 
    259    /* pre-populate keys from active registers */
    260    for (int idx = 0; idx < file->reg_size; ++idx) {
    261       /* only interested in active registers now; will only assign inactive ones
    262        * if no space in active ones */
    263       if (regs[idx].active) {
    264          sorted[ptr].ptr = &regs[idx];
    265 
    266          switch (so) {
    267          case FIRST_USE_ASC:
    268             sorted[ptr].key = regs[idx].first_use;
    269             break;
    270          case LAST_USE_ASC:
    271             sorted[ptr].key = regs[idx].last_use;
    272             break;
    273          case FIRST_USE_DESC:
    274             sorted[ptr].key = -regs[idx].first_use;
    275             break;
    276          case LAST_USE_DESC:
    277             sorted[ptr].key = -regs[idx].last_use;
    278             break;
    279          }
    280          ptr++;
    281       }
    282    }
    283 
    284    /* sort index by key */
    285    qsort(sorted, ptr, sizeof(struct sort_rec),
    286          (int (*)(const void *, const void *))sort_rec_compar);
    287 
    288    return ptr;
    289 }
    290 
    291 /* Allocate a new, unused, native temp register */
    292 static struct etna_native_reg
    293 alloc_new_native_reg(struct etna_compile *c)
    294 {
    295    assert(c->next_free_native < ETNA_MAX_TEMPS);
    296    return etna_native_temp(c->next_free_native++);
    297 }
    298 
    299 /* assign TEMPs to native registers */
    300 static void
    301 assign_temporaries_to_native(struct etna_compile *c,
    302                              struct etna_compile_file *file)
    303 {
    304    struct etna_reg_desc *temps = file->reg;
    305 
    306    for (int idx = 0; idx < file->reg_size; ++idx)
    307       temps[idx].native = alloc_new_native_reg(c);
    308 }
    309 
    310 /* Assign inputs and outputs to temporaries.
    311  * Gallium assumes that the hardware has separate registers for taking input
    312  * and producing output, whereas Vivante GPUs use temporaries both for passing
    313  * in inputs and for passing back outputs.
    314  * Try to re-use temporary registers where possible. */
    315 static void
    316 assign_inouts_to_temporaries(struct etna_compile *c, uint file)
    317 {
    318    bool mode_inputs = (file == TGSI_FILE_INPUT);
    319    int inout_ptr = 0, num_inouts;
    320    int temp_ptr = 0, num_temps;
    321    struct sort_rec inout_order[ETNA_MAX_TEMPS];
    322    struct sort_rec temps_order[ETNA_MAX_TEMPS];
    323    num_inouts = sort_registers(inout_order, &c->file[file],
    324                                mode_inputs ? LAST_USE_ASC : FIRST_USE_ASC);
    325    num_temps = sort_registers(temps_order, &c->file[TGSI_FILE_TEMPORARY],
    326                               mode_inputs ? FIRST_USE_ASC : LAST_USE_ASC);
    327 
    328    while (inout_ptr < num_inouts && temp_ptr < num_temps) {
    329       struct etna_reg_desc *inout = inout_order[inout_ptr].ptr;
    330       struct etna_reg_desc *temp = temps_order[temp_ptr].ptr;
    331       if (!inout->active || inout->native.valid) { /* Skip if a native register is already assigned */
    332       if (!inout->active || inout->native.valid) { /* Skip if already a native register assigned */
    333          inout_ptr++;
    334          continue;
    335       }
    336 
    337       /* Is the last usage of this input before, or in the same instruction
    338        * as, the first use of the temporary? */
    339       if (mode_inputs ? (inout->last_use <= temp->first_use)
    340                       : (inout->first_use >= temp->last_use)) {
    341          /* assign it and advance to next input */
    342          inout->native = temp->native;
    343          inout_ptr++;
    344       }
    345 
    346       temp_ptr++;
    347    }
    348 
    349    /* if we couldn't reuse current ones, allocate new temporaries */
    350    for (inout_ptr = 0; inout_ptr < num_inouts; ++inout_ptr) {
    351       struct etna_reg_desc *inout = inout_order[inout_ptr].ptr;
    352 
    353       if (inout->active && !inout->native.valid)
    354          inout->native = alloc_new_native_reg(c);
    355    }
    356 }
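        /* Worked example (illustrative, not from the original source): with
         * inputs sorted by last use and temporaries by first use, an input whose
         * last use is at instruction 2 can share the native register of a
         * temporary whose first use is at instruction 2 or later; for outputs
         * the test is reversed (first use of the output at or after the last
         * use of the temporary). */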
    357 
    358 /* Allocate an immediate with a certain value and return the index. If
    359  * there is already an immediate with that value, return that.
    360  */
    361 static struct etna_inst_src
    362 alloc_imm(struct etna_compile *c, enum etna_immediate_contents contents,
    363           uint32_t value)
    364 {
    365    int idx;
    366 
    367    /* Could use a hash table to speed this up */
    368    for (idx = 0; idx < c->imm_size; ++idx) {
    369       if (c->imm_contents[idx] == contents && c->imm_data[idx] == value)
    370          break;
    371    }
    372 
    373    /* check whether there is an unused slot */
    374    if (idx == c->imm_size) {
    375       for (idx = 0; idx < c->imm_size; ++idx) {
    376          if (c->imm_contents[idx] == ETNA_IMMEDIATE_UNUSED)
    377             break;
    378       }
    379    }
    380 
    381    /* allocate new immediate */
    382    if (idx == c->imm_size) {
    383       assert(c->imm_size < ETNA_MAX_IMM);
    384       idx = c->imm_size++;
    385       c->imm_data[idx] = value;
    386       c->imm_contents[idx] = contents;
    387    }
    388 
    389    /* swizzle so that the component holding the value is broadcast to all components */
    390    idx += c->imm_base;
    391    struct etna_inst_src imm_src = {
    392       .use = 1,
    393       .rgroup = INST_RGROUP_UNIFORM_0,
    394       .reg = idx / 4,
    395       .swiz = INST_SWIZ_BROADCAST(idx & 3)
    396    };
    397 
    398    return imm_src;
    399 }
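        /* Worked example (illustrative): with imm_base == 0, immediate slot 5
         * maps to uniform register 5 / 4 == 1 and is broadcast from component
         * 5 & 3 == 1, i.e. the returned source reads u1.yyyy. */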
    400 
    401 static struct etna_inst_src
    402 alloc_imm_u32(struct etna_compile *c, uint32_t value)
    403 {
    404    return alloc_imm(c, ETNA_IMMEDIATE_CONSTANT, value);
    405 }
    406 
    407 static struct etna_inst_src
    408 alloc_imm_vec4u(struct etna_compile *c, enum etna_immediate_contents contents,
    409                 const uint32_t *values)
    410 {
    411    struct etna_inst_src imm_src = { };
    412    int idx, i;
    413 
    414    for (idx = 0; idx + 3 < c->imm_size; idx += 4) {
    415       /* What if we can use a uniform with a different swizzle? */
    416       for (i = 0; i < 4; i++)
    417          if (c->imm_contents[idx + i] != contents || c->imm_data[idx + i] != values[i])
    418             break;
    419       if (i == 4)
    420          break;
    421    }
    422 
    423    if (idx + 3 >= c->imm_size) {
    424       idx = align(c->imm_size, 4);
    425       assert(idx + 4 <= ETNA_MAX_IMM);
    426 
    427       for (i = 0; i < 4; i++) {
    428          c->imm_data[idx + i] = values[i];
    429          c->imm_contents[idx + i] = contents;
    430       }
    431 
    432       c->imm_size = idx + 4;
    433    }
    434 
    435    assert((c->imm_base & 3) == 0);
    436    idx += c->imm_base;
    437    imm_src.use = 1;
    438    imm_src.rgroup = INST_RGROUP_UNIFORM_0;
    439    imm_src.reg = idx / 4;
    440    imm_src.swiz = INST_SWIZ_IDENTITY;
    441 
    442    return imm_src;
    443 }
    444 
    445 static uint32_t
    446 get_imm_u32(struct etna_compile *c, const struct etna_inst_src *imm,
    447             unsigned swiz_idx)
    448 {
    449    assert(imm->use == 1 && imm->rgroup == INST_RGROUP_UNIFORM_0);
    450    unsigned int idx = imm->reg * 4 + ((imm->swiz >> (swiz_idx * 2)) & 3);
    451 
    452    return c->imm_data[idx];
    453 }
    454 
    455 /* Allocate immediate with a certain float value. If there is already an
    456  * immediate with that value, return that.
    457  */
    458 static struct etna_inst_src
    459 alloc_imm_f32(struct etna_compile *c, float value)
    460 {
    461    return alloc_imm_u32(c, fui(value));
    462 }
    463 
    464 static struct etna_inst_src
    465 etna_imm_vec4f(struct etna_compile *c, const float *vec4)
    466 {
    467    uint32_t val[4];
    468 
    469    for (int i = 0; i < 4; i++)
    470       val[i] = fui(vec4[i]);
    471 
    472    return alloc_imm_vec4u(c, ETNA_IMMEDIATE_CONSTANT, val);
    473 }
    474 
    475 /* Pass -- check register file declarations and immediates */
    476 static void
    477 etna_compile_parse_declarations(struct etna_compile *c)
    478 {
    479    struct tgsi_parse_context ctx = { };
    480    unsigned status = TGSI_PARSE_OK;
    481    status = tgsi_parse_init(&ctx, c->tokens);
    482    assert(status == TGSI_PARSE_OK);
    483 
    484    while (!tgsi_parse_end_of_tokens(&ctx)) {
    485       tgsi_parse_token(&ctx);
    486 
    487       switch (ctx.FullToken.Token.Type) {
    488       case TGSI_TOKEN_TYPE_IMMEDIATE: {
    489          /* immediates are handled differently from other files; they are
    490           * not declared explicitly, and always add four components */
    491          const struct tgsi_full_immediate *imm = &ctx.FullToken.FullImmediate;
    492          assert(c->imm_size <= (ETNA_MAX_IMM - 4));
    493 
    494          for (int i = 0; i < 4; ++i) {
    495             unsigned idx = c->imm_size++;
    496 
    497             c->imm_data[idx] = imm->u[i].Uint;
    498             c->imm_contents[idx] = ETNA_IMMEDIATE_CONSTANT;
    499          }
    500       }
    501       break;
    502       }
    503    }
    504 
    505    tgsi_parse_free(&ctx);
    506 }
    507 
    508 /* Allocate register declarations for the registers in all register files */
    509 static void
    510 etna_allocate_decls(struct etna_compile *c)
    511 {
    512    uint idx = 0;
    513 
    514    for (int x = 0; x < TGSI_FILE_COUNT; ++x) {
    515       c->file[x].reg = &c->decl[idx];
    516       c->file[x].reg_size = c->info.file_max[x] + 1;
    517 
    518       for (int sub = 0; sub < c->file[x].reg_size; ++sub) {
    519          c->decl[idx].file = x;
    520          c->decl[idx].idx = sub;
    521          idx++;
    522       }
    523    }
    524 
    525    c->total_decls = idx;
    526 }
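        /* Example (illustrative): if the highest TEMP register referenced is
         * TEMP[3], file[TGSI_FILE_TEMPORARY].reg_size becomes 4 and its reg
         * pointer covers four consecutive entries of c->decl. */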
    527 
    528 /* Pass -- check and record usage of temporaries, inputs, outputs */
    529 static void
    530 etna_compile_pass_check_usage(struct etna_compile *c)
    531 {
    532    struct tgsi_parse_context ctx = { };
    533    unsigned status = TGSI_PARSE_OK;
    534    status = tgsi_parse_init(&ctx, c->tokens);
    535    assert(status == TGSI_PARSE_OK);
    536 
    537    for (int idx = 0; idx < c->total_decls; ++idx) {
    538       c->decl[idx].active = false;
    539       c->decl[idx].first_use = c->decl[idx].last_use = -1;
    540    }
    541 
    542    int inst_idx = 0;
    543    while (!tgsi_parse_end_of_tokens(&ctx)) {
    544       tgsi_parse_token(&ctx);
    545       /* Find the maximum register numbers used.
    546        *
    547        * For every register, mark the first and last instruction index where
    548        * it is used; this allows finding the ranges in which a temporary can
    549        * be borrowed as an input and/or output register.
    550        *
    551        * XXX In the case of loops this needs special care, or may even need
    552        * to be disabled completely: the last usage of a register inside a
    553        * loop means it can still be used on the next loop iteration
    554        * (execution is no longer chronological), so the register can only be
    555        * declared "free" after the loop finishes.
    556        *
    557        * The same holds for inputs: the first usage of a register inside a
    558        * loop does not mean that the register won't have been overwritten in
    559        * a previous iteration, so the register can only be declared free
    560        * before the loop starts.
    561        *
    562        * The proper way to handle this would be full dominator /
    563        * post-dominator analysis (especially with more complicated control
    564        * flow such as direct branch instructions), but that is not done for
    565        * now...
    566        */
    567       switch (ctx.FullToken.Token.Type) {
    568       case TGSI_TOKEN_TYPE_DECLARATION: {
    569          /* Declaration: fill in file details */
    570          const struct tgsi_full_declaration *decl = &ctx.FullToken.FullDeclaration;
    571          struct etna_compile_file *file = &c->file[decl->Declaration.File];
    572 
    573          for (int idx = decl->Range.First; idx <= decl->Range.Last; ++idx) {
    574             file->reg[idx].usage_mask = 0; // we'll compute this ourselves
    575             file->reg[idx].has_semantic = decl->Declaration.Semantic;
    576             file->reg[idx].semantic = decl->Semantic;
    577             file->reg[idx].interp = decl->Interp;
    578          }
    579       } break;
    580       case TGSI_TOKEN_TYPE_INSTRUCTION: {
    581          /* Instruction: iterate over operands of instruction */
    582          const struct tgsi_full_instruction *inst = &ctx.FullToken.FullInstruction;
    583 
    584          /* iterate over destination registers */
    585          for (int idx = 0; idx < inst->Instruction.NumDstRegs; ++idx) {
    586             struct etna_reg_desc *reg_desc = &c->file[inst->Dst[idx].Register.File].reg[inst->Dst[idx].Register.Index];
    587 
    588             if (reg_desc->first_use == -1)
    589                reg_desc->first_use = inst_idx;
    590 
    591             reg_desc->last_use = inst_idx;
    592             reg_desc->active = true;
    593          }
    594 
    595          /* iterate over source registers */
    596          for (int idx = 0; idx < inst->Instruction.NumSrcRegs; ++idx) {
    597             struct etna_reg_desc *reg_desc = &c->file[inst->Src[idx].Register.File].reg[inst->Src[idx].Register.Index];
    598 
    599             if (reg_desc->first_use == -1)
    600                reg_desc->first_use = inst_idx;
    601 
    602             reg_desc->last_use = inst_idx;
    603             reg_desc->active = true;
    604             /* accumulate the usage mask for the register; this is used to
    605              * determine how many slots should be allocated for
    606              * varyings */
    607             reg_desc->usage_mask |= tgsi_util_get_inst_usage_mask(inst, idx);
    608          }
    609          inst_idx += 1;
    610       } break;
    611       default:
    612          break;
    613       }
    614    }
    615 
    616    tgsi_parse_free(&ctx);
    617 }
    618 
    619 /* assign inputs that need to be assigned to specific registers */
    620 static void
    621 assign_special_inputs(struct etna_compile *c)
    622 {
    623    if (c->info.processor == PIPE_SHADER_FRAGMENT) {
    624       /* never assign t0 as it is the position output, start assigning at t1 */
    625       c->next_free_native = 1;
    626 
    627       /* hardwire TGSI_SEMANTIC_POSITION (input and output) to t0 */
    628       for (int idx = 0; idx < c->total_decls; ++idx) {
    629          struct etna_reg_desc *reg = &c->decl[idx];
    630 
    631          if (reg->active && reg->semantic.Name == TGSI_SEMANTIC_POSITION)
    632             reg->native = etna_native_temp(0);
    633       }
    634    }
    635 }
    636 
    637 /* Check that a move instruction does not swizzle any of the components
    638  * that it writes.
    639  */
    640 static bool
    641 etna_mov_check_no_swizzle(const struct tgsi_dst_register dst,
    642                           const struct tgsi_src_register src)
    643 {
    644    return (!(dst.WriteMask & TGSI_WRITEMASK_X) || src.SwizzleX == TGSI_SWIZZLE_X) &&
    645           (!(dst.WriteMask & TGSI_WRITEMASK_Y) || src.SwizzleY == TGSI_SWIZZLE_Y) &&
    646           (!(dst.WriteMask & TGSI_WRITEMASK_Z) || src.SwizzleZ == TGSI_SWIZZLE_Z) &&
    647           (!(dst.WriteMask & TGSI_WRITEMASK_W) || src.SwizzleW == TGSI_SWIZZLE_W);
    648 }
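        /* Example (illustrative): MOV OUT[0].xy, TEMP[0].yxzw fails this check
         * because the written .x and .y components are sourced from the swizzled
         * .y and .x, while MOV OUT[0].xy, TEMP[0].xyww passes since only
         * unwritten components are swizzled. */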
    649 
    650 /* Pass -- optimize outputs
    651  * Mesa tends to generate code like this at the end of its shaders:
    652  *   MOV OUT[1], TEMP[2]
    653  *   MOV OUT[0], TEMP[0]
    654  *   MOV OUT[2], TEMP[1]
    655  * Recognize if
    656  * a) there is only a single assignment to an output register and
    657  * b) the temporary is not used after that
    658  * Also recognize direct assignment of IN to OUT (passthrough)
    659  **/
    660 static void
    661 etna_compile_pass_optimize_outputs(struct etna_compile *c)
    662 {
    663    struct tgsi_parse_context ctx = { };
    664    int inst_idx = 0;
    665    unsigned status = TGSI_PARSE_OK;
    666    status = tgsi_parse_init(&ctx, c->tokens);
    667    assert(status == TGSI_PARSE_OK);
    668 
    669    while (!tgsi_parse_end_of_tokens(&ctx)) {
    670       tgsi_parse_token(&ctx);
    671 
    672       switch (ctx.FullToken.Token.Type) {
    673       case TGSI_TOKEN_TYPE_INSTRUCTION: {
    674          const struct tgsi_full_instruction *inst = &ctx.FullToken.FullInstruction;
    675 
    676          /* iterate over operands */
    677          switch (inst->Instruction.Opcode) {
    678          case TGSI_OPCODE_MOV: {
    679             /* We are only interested in eliminating MOVs which write to
    680              * the shader outputs. Test for this early. */
    681             if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
    682                break;
    683             /* Elimination of a MOV must have no visible effect on the
    684              * resulting shader: this means the MOV must not swizzle or
    685              * saturate, and its source must not have the negate or
    686              * absolute modifiers. */
    687             if (!etna_mov_check_no_swizzle(inst->Dst[0].Register, inst->Src[0].Register) ||
    688                 inst->Instruction.Saturate || inst->Src[0].Register.Negate ||
    689                 inst->Src[0].Register.Absolute)
    690                break;
    691 
    692             uint out_idx = inst->Dst[0].Register.Index;
    693             uint in_idx = inst->Src[0].Register.Index;
    694             /* assignment of temporary to output --
    695              * and the output doesn't yet have a native register assigned
    696              * and the last use of the temporary is this instruction
    697              * and the MOV does not do a swizzle
    698              */
    699             if (inst->Src[0].Register.File == TGSI_FILE_TEMPORARY &&
    700                 !c->file[TGSI_FILE_OUTPUT].reg[out_idx].native.valid &&
    701                 c->file[TGSI_FILE_TEMPORARY].reg[in_idx].last_use == inst_idx) {
    702                c->file[TGSI_FILE_OUTPUT].reg[out_idx].native =
    703                   c->file[TGSI_FILE_TEMPORARY].reg[in_idx].native;
    704                /* prevent temp from being re-used for the rest of the shader */
    705                c->file[TGSI_FILE_TEMPORARY].reg[in_idx].last_use = ETNA_MAX_TOKENS;
    706                /* mark this MOV instruction as a no-op */
    707                c->dead_inst[inst_idx] = true;
    708             }
    709             /* direct assignment of input to output --
    710              * and the input or output doesn't yet have a native register
    711              * assigned
    712              * and the output is only used in this instruction,
    713              * allocate a new register, and associate both input and output to
    714              * it
    715              * and the MOV does not do a swizzle
    716              */
    717             if (inst->Src[0].Register.File == TGSI_FILE_INPUT &&
    718                 !c->file[TGSI_FILE_INPUT].reg[in_idx].native.valid &&
    719                 !c->file[TGSI_FILE_OUTPUT].reg[out_idx].native.valid &&
    720                 c->file[TGSI_FILE_OUTPUT].reg[out_idx].last_use == inst_idx &&
    721                 c->file[TGSI_FILE_OUTPUT].reg[out_idx].first_use == inst_idx) {
    722                c->file[TGSI_FILE_OUTPUT].reg[out_idx].native =
    723                   c->file[TGSI_FILE_INPUT].reg[in_idx].native =
    724                      alloc_new_native_reg(c);
    725                /* mark this MOV instruction as a no-op */
    726                c->dead_inst[inst_idx] = true;
    727             }
    728          } break;
    729          default:;
    730          }
    731          inst_idx += 1;
    732       } break;
    733       }
    734    }
    735 
    736    tgsi_parse_free(&ctx);
    737 }
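        /* Illustrative effect of this pass (not from the original source): for
         *    MOV OUT[0], TEMP[0]
         * where this is the last use of TEMP[0] and OUT[0] has no native
         * register yet, OUT[0] simply aliases TEMP[0]'s native register and the
         * MOV is marked dead; for a passthrough
         *    MOV OUT[1], IN[2]
         * a fresh native register is shared by IN[2] and OUT[1] and the MOV is
         * likewise dropped. */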
    738 
    739 /* Get a temporary to be used within one TGSI instruction.
    740  * Each call returns the next inner temporary (at most ETNA_MAX_INNER_TEMPS
    741  * per instruction); the backing native register is allocated the first time
    742  * it is needed and then reused. */
    743 static struct etna_native_reg
    744 etna_compile_get_inner_temp(struct etna_compile *c)
    745 {
    746    int inner_temp = c->inner_temps;
    747 
    748    if (inner_temp < ETNA_MAX_INNER_TEMPS) {
    749       if (!c->inner_temp[inner_temp].valid)
    750          c->inner_temp[inner_temp] = alloc_new_native_reg(c);
    751 
    752       /* alloc_new_native_reg() handles lack of registers */
    753       c->inner_temps += 1;
    754    } else {
    755       BUG("Too many inner temporaries (%i) requested in one instruction",
    756           inner_temp + 1);
    757    }
    758 
    759    return c->inner_temp[inner_temp];
    760 }
    761 
    762 static struct etna_inst_dst
    763 etna_native_to_dst(struct etna_native_reg native, unsigned comps)
    764 {
    765    /* Can only assign to temporaries */
    766    assert(native.valid && !native.is_tex && native.rgroup == INST_RGROUP_TEMP);
    767 
    768    struct etna_inst_dst rv = {
    769       .comps = comps,
    770       .use = 1,
    771       .reg = native.id,
    772    };
    773 
    774    return rv;
    775 }
    776 
    777 static struct etna_inst_src
    778 etna_native_to_src(struct etna_native_reg native, uint32_t swizzle)
    779 {
    780    assert(native.valid && !native.is_tex);
    781 
    782    struct etna_inst_src rv = {
    783       .use = 1,
    784       .swiz = swizzle,
    785       .rgroup = native.rgroup,
    786       .reg = native.id,
    787       .amode = INST_AMODE_DIRECT,
    788    };
    789 
    790    return rv;
    791 }
    792 
    793 static inline struct etna_inst_src
    794 negate(struct etna_inst_src src)
    795 {
    796    src.neg = !src.neg;
    797 
    798    return src;
    799 }
    800 
    801 static inline struct etna_inst_src
    802 absolute(struct etna_inst_src src)
    803 {
    804    src.abs = 1;
    805 
    806    return src;
    807 }
    808 
    809 static inline struct etna_inst_src
    810 swizzle(struct etna_inst_src src, unsigned swizzle)
    811 {
    812    src.swiz = inst_swiz_compose(src.swiz, swizzle);
    813 
    814    return src;
    815 }
    816 
    817 /* Emit instruction and append it to program */
    818 static void
    819 emit_inst(struct etna_compile *c, struct etna_inst *inst)
    820 {
    821    assert(c->inst_ptr <= ETNA_MAX_INSTRUCTIONS);
    822 
    823    /* Check for uniform conflicts (each instruction can only access one
    824     * uniform); if a conflict is detected, copy the conflicting uniform
    825     * into an intermediate temporary first. */
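           /* Hypothetical example (for illustration only): an instruction such
            * as ADD t2, u0.xxxx, u1.yyyy reads two different uniform registers;
            * the code below first emits MOV t_inner, u1.yyyy and then rewrites
            * the ADD to read t_inner instead of u1. */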
    826    unsigned uni_rgroup = -1;
    827    unsigned uni_reg = -1;
    828 
    829    for (int src = 0; src < ETNA_NUM_SRC; ++src) {
    830       if (etna_rgroup_is_uniform(inst->src[src].rgroup)) {
    831          if (uni_reg == -1) { /* first unique uniform used */
    832             uni_rgroup = inst->src[src].rgroup;
    833             uni_reg = inst->src[src].reg;
    834          } else { /* second or later; check that it is a re-use */
    835             if (uni_rgroup != inst->src[src].rgroup ||
    836                 uni_reg != inst->src[src].reg) {
    837                DBG_F(ETNA_DBG_COMPILER_MSGS, "perf warning: instruction that "
    838                                              "accesses different uniforms, "
    839                                              "need to generate extra MOV");
    840                struct etna_native_reg inner_temp = etna_compile_get_inner_temp(c);
    841 
    842                /* Generate move instruction to temporary */
    843                etna_assemble(&c->code[c->inst_ptr * 4], &(struct etna_inst) {
    844                   .opcode = INST_OPCODE_MOV,
    845                   .dst = etna_native_to_dst(inner_temp, INST_COMPS_X | INST_COMPS_Y |
    846                                                         INST_COMPS_Z | INST_COMPS_W),
    847                   .src[2] = inst->src[src]
    848                });
    849 
    850                c->inst_ptr++;
    851 
    852                /* Modify instruction to use temp register instead of uniform */
    853                inst->src[src].use = 1;
    854                inst->src[src].rgroup = INST_RGROUP_TEMP;
    855                inst->src[src].reg = inner_temp.id;
    856                inst->src[src].swiz = INST_SWIZ_IDENTITY; /* swizzling happens on MOV */
    857                inst->src[src].neg = 0; /* negation happens on MOV */
    858                inst->src[src].abs = 0; /* abs happens on MOV */
    859                inst->src[src].amode = 0; /* amode effects happen on MOV */
    860             }
    861          }
    862       }
    863    }
    864 
    865    /* Finally assemble the actual instruction */
    866    etna_assemble(&c->code[c->inst_ptr * 4], inst);
    867    c->inst_ptr++;
    868 }
    869 
    870 static unsigned int
    871 etna_amode(struct tgsi_ind_register indirect)
    872 {
    873    assert(indirect.File == TGSI_FILE_ADDRESS);
    874    assert(indirect.Index == 0);
    875 
    876    switch (indirect.Swizzle) {
    877    case TGSI_SWIZZLE_X:
    878       return INST_AMODE_ADD_A_X;
    879    case TGSI_SWIZZLE_Y:
    880       return INST_AMODE_ADD_A_Y;
    881    case TGSI_SWIZZLE_Z:
    882       return INST_AMODE_ADD_A_Z;
    883    case TGSI_SWIZZLE_W:
    884       return INST_AMODE_ADD_A_W;
    885    default:
    886       assert(!"Invalid swizzle");
    887    }
    888 
    889    unreachable("bad swizzle");
    890 }
    891 
    892 /* convert destination operand */
    893 static struct etna_inst_dst
    894 convert_dst(struct etna_compile *c, const struct tgsi_full_dst_register *in)
    895 {
    896    struct etna_inst_dst rv = {
    897       /// XXX .amode
    898       .comps = in->Register.WriteMask,
    899    };
    900 
    901    if (in->Register.File == TGSI_FILE_ADDRESS) {
    902       assert(in->Register.Index == 0);
    903       rv.reg = in->Register.Index;
    904       rv.use = 0;
    905    } else {
    906       rv = etna_native_to_dst(etna_get_dst_reg(c, in->Register)->native,
    907                               in->Register.WriteMask);
    908    }
    909 
    910    if (in->Register.Indirect)
    911       rv.amode = etna_amode(in->Indirect);
    912 
    913    return rv;
    914 }
    915 
    916 /* convert texture operand */
    917 static struct etna_inst_tex
    918 convert_tex(struct etna_compile *c, const struct tgsi_full_src_register *in,
    919             const struct tgsi_instruction_texture *tex)
    920 {
    921    struct etna_native_reg native_reg = etna_get_src_reg(c, in->Register)->native;
    922    struct etna_inst_tex rv = {
    923       // XXX .amode (to allow for an array of samplers?)
    924       .swiz = INST_SWIZ_IDENTITY
    925    };
    926 
    927    assert(native_reg.is_tex && native_reg.valid);
    928    rv.id = native_reg.id;
    929 
    930    return rv;
    931 }
    932 
    933 /* convert source operand */
    934 static struct etna_inst_src
    935 etna_create_src(const struct tgsi_full_src_register *tgsi,
    936                 const struct etna_native_reg *native)
    937 {
    938    const struct tgsi_src_register *reg = &tgsi->Register;
    939    struct etna_inst_src rv = {
    940       .use = 1,
    941       .swiz = INST_SWIZ(reg->SwizzleX, reg->SwizzleY, reg->SwizzleZ, reg->SwizzleW),
    942       .neg = reg->Negate,
    943       .abs = reg->Absolute,
    944       .rgroup = native->rgroup,
    945       .reg = native->id,
    946       .amode = INST_AMODE_DIRECT,
    947    };
    948 
    949    assert(native->valid && !native->is_tex);
    950 
    951    if (reg->Indirect)
    952       rv.amode = etna_amode(tgsi->Indirect);
    953 
    954    return rv;
    955 }
    956 
    957 static struct etna_inst_src
    958 etna_mov_src_to_temp(struct etna_compile *c, struct etna_inst_src src,
    959                      struct etna_native_reg temp)
    960 {
    961    struct etna_inst mov = { };
    962 
    963    mov.opcode = INST_OPCODE_MOV;
    964    mov.sat = 0;
    965    mov.dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
    966                                       INST_COMPS_Z | INST_COMPS_W);
    967    mov.src[2] = src;
    968    emit_inst(c, &mov);
    969 
    970    src.swiz = INST_SWIZ_IDENTITY;
    971    src.neg = src.abs = 0;
    972    src.rgroup = temp.rgroup;
    973    src.reg = temp.id;
    974 
    975    return src;
    976 }
    977 
    978 static struct etna_inst_src
    979 etna_mov_src(struct etna_compile *c, struct etna_inst_src src)
    980 {
    981    struct etna_native_reg temp = etna_compile_get_inner_temp(c);
    982 
    983    return etna_mov_src_to_temp(c, src, temp);
    984 }
    985 
    986 static bool
    987 etna_src_uniforms_conflict(struct etna_inst_src a, struct etna_inst_src b)
    988 {
    989    return etna_rgroup_is_uniform(a.rgroup) &&
    990           etna_rgroup_is_uniform(b.rgroup) &&
    991           (a.rgroup != b.rgroup || a.reg != b.reg);
    992 }
    993 
    994 /* create a new label */
    995 static unsigned int
    996 alloc_new_label(struct etna_compile *c)
    997 {
    998    struct etna_compile_label label = {
    999       .inst_idx = -1, /* start by pointing to no specific instruction */
   1000    };
   1001 
   1002    array_insert(c->labels, label);
   1003 
   1004    return c->labels_count - 1;
   1005 }
   1006 
   1007 /* place label at current instruction pointer */
   1008 static void
   1009 label_place(struct etna_compile *c, struct etna_compile_label *label)
   1010 {
   1011    label->inst_idx = c->inst_ptr;
   1012 }
   1013 
   1014 /* Mark label use at the current instruction.
   1015  * The target of the label will be filled in in the marked
   1016  * instruction's src2.imm slot as soon as the value becomes
   1017  * known.
   1018  */
   1019 static void
   1020 label_mark_use(struct etna_compile *c, int lbl_idx)
   1021 {
   1022    assert(c->inst_ptr < ETNA_MAX_INSTRUCTIONS);
   1023    c->lbl_usage[c->inst_ptr] = lbl_idx;
   1024 }
   1025 
   1026 /* walk the frame stack and return first frame with matching type */
   1027 static struct etna_compile_frame *
   1028 find_frame(struct etna_compile *c, enum etna_compile_frame_type type)
   1029 {
   1030    for (int sp = c->frame_sp; sp >= 0; sp--)
   1031       if (c->frame_stack[sp].type == type)
   1032          return &c->frame_stack[sp];
   1033 
   1034    assert(0);
   1035    return NULL;
   1036 }
   1037 
   1038 struct instr_translater {
   1039    void (*fxn)(const struct instr_translater *t, struct etna_compile *c,
   1040                const struct tgsi_full_instruction *inst,
   1041                struct etna_inst_src *src);
   1042    unsigned tgsi_opc;
   1043    uint8_t opc;
   1044 
   1045    /* tgsi src -> etna src swizzle */
   1046    int src[3];
   1047 
   1048    unsigned cond;
   1049 };
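        /* Illustrative note (assumption, the opcode table itself is defined
         * further down in this file): the src[] remap routes TGSI operand i into
         * Vivante source slot src[i], since many Vivante instructions read
         * src0/src2 rather than src0/src1 (compare e.g. trans_deriv() below). */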
   1050 
   1051 static void
   1052 trans_instr(const struct instr_translater *t, struct etna_compile *c,
   1053             const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1054 {
   1055    const struct tgsi_opcode_info *info = tgsi_get_opcode_info(inst->Instruction.Opcode);
   1056    struct etna_inst instr = { };
   1057 
   1058    instr.opcode = t->opc;
   1059    instr.cond = t->cond;
   1060    instr.sat = inst->Instruction.Saturate;
   1061 
   1062    assert(info->num_dst <= 1);
   1063    if (info->num_dst)
   1064       instr.dst = convert_dst(c, &inst->Dst[0]);
   1065 
   1066    assert(info->num_src <= ETNA_NUM_SRC);
   1067 
   1068    for (unsigned i = 0; i < info->num_src; i++) {
   1069       int swizzle = t->src[i];
   1070 
   1071       assert(swizzle != -1);
   1072       instr.src[swizzle] = src[i];
   1073    }
   1074 
   1075    emit_inst(c, &instr);
   1076 }
   1077 
   1078 static void
   1079 trans_min_max(const struct instr_translater *t, struct etna_compile *c,
   1080               const struct tgsi_full_instruction *inst,
   1081               struct etna_inst_src *src)
   1082 {
   1083    emit_inst(c, &(struct etna_inst) {
   1084       .opcode = INST_OPCODE_SELECT,
   1085        .cond = t->cond,
   1086        .sat = inst->Instruction.Saturate,
   1087        .dst = convert_dst(c, &inst->Dst[0]),
   1088        .src[0] = src[0],
   1089        .src[1] = src[1],
   1090        .src[2] = src[0],
   1091     });
   1092 }
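        /* Illustrative note (SELECT semantics inferred from its uses in this
         * file, so treat this as an assumption): SELECT.cond dst, s0, s1, s2
         * appears to yield s1 when cond(s0, s1) holds and s2 otherwise; with
         * s2 == s0 this turns SELECT.GT into min(s0, s1) and SELECT.LT into
         * max(s0, s1). */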
   1093 
   1094 static void
   1095 trans_if(const struct instr_translater *t, struct etna_compile *c,
   1096          const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1097 {
   1098    struct etna_compile_frame *f = &c->frame_stack[c->frame_sp++];
   1099    struct etna_inst_src imm_0 = alloc_imm_f32(c, 0.0f);
   1100 
   1101    /* push IF to stack */
   1102    f->type = ETNA_COMPILE_FRAME_IF;
   1103    /* create "else" label */
   1104    f->lbl_else_idx = alloc_new_label(c);
   1105    f->lbl_endif_idx = -1;
   1106 
   1107    /* We need to avoid the emit_inst() below becoming two instructions */
   1108    if (etna_src_uniforms_conflict(src[0], imm_0))
   1109       src[0] = etna_mov_src(c, src[0]);
   1110 
   1111    /* mark position in instruction stream of label reference so that it can be
   1112     * filled in in next pass */
   1113    label_mark_use(c, f->lbl_else_idx);
   1114 
   1115    /* create conditional branch to label if src0 EQ 0 */
   1116    emit_inst(c, &(struct etna_inst){
   1117       .opcode = INST_OPCODE_BRANCH,
   1118       .cond = INST_CONDITION_EQ,
   1119       .src[0] = src[0],
   1120       .src[1] = imm_0,
   1121     /* imm is filled in later */
   1122    });
   1123 }
   1124 
   1125 static void
   1126 trans_else(const struct instr_translater *t, struct etna_compile *c,
   1127            const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1128 {
   1129    assert(c->frame_sp > 0);
   1130    struct etna_compile_frame *f = &c->frame_stack[c->frame_sp - 1];
   1131    assert(f->type == ETNA_COMPILE_FRAME_IF);
   1132 
   1133    /* create "endif" label, and branch to endif label */
   1134    f->lbl_endif_idx = alloc_new_label(c);
   1135    label_mark_use(c, f->lbl_endif_idx);
   1136    emit_inst(c, &(struct etna_inst) {
   1137       .opcode = INST_OPCODE_BRANCH,
   1138       .cond = INST_CONDITION_TRUE,
   1139       /* imm is filled in later */
   1140    });
   1141 
   1142    /* mark "else" label at this position in instruction stream */
   1143    label_place(c, &c->labels[f->lbl_else_idx]);
   1144 }
   1145 
   1146 static void
   1147 trans_endif(const struct instr_translater *t, struct etna_compile *c,
   1148             const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1149 {
   1150    assert(c->frame_sp > 0);
   1151    struct etna_compile_frame *f = &c->frame_stack[--c->frame_sp];
   1152    assert(f->type == ETNA_COMPILE_FRAME_IF);
   1153 
   1154    /* assign "endif" or "else" (if no ELSE) label to current position in
   1155     * instruction stream, pop IF */
   1156    if (f->lbl_endif_idx != -1)
   1157       label_place(c, &c->labels[f->lbl_endif_idx]);
   1158    else
   1159       label_place(c, &c->labels[f->lbl_else_idx]);
   1160 }
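        /* Resulting instruction layout for IF/ELSE/ENDIF (illustrative sketch,
         * not from the original source):
         *
         *      BRANCH.EQ  src0, 0   --> lbl_else     (emitted by trans_if)
         *      ... "then" block ...
         *      BRANCH.TRUE          --> lbl_endif    (emitted by trans_else)
         *   lbl_else:
         *      ... "else" block ...
         *   lbl_endif:
         */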
   1161 
   1162 static void
   1163 trans_loop_bgn(const struct instr_translater *t, struct etna_compile *c,
   1164                const struct tgsi_full_instruction *inst,
   1165                struct etna_inst_src *src)
   1166 {
   1167    struct etna_compile_frame *f = &c->frame_stack[c->frame_sp++];
   1168 
   1169    /* push LOOP to stack */
   1170    f->type = ETNA_COMPILE_FRAME_LOOP;
   1171    f->lbl_loop_bgn_idx = alloc_new_label(c);
   1172    f->lbl_loop_end_idx = alloc_new_label(c);
   1173 
   1174    label_place(c, &c->labels[f->lbl_loop_bgn_idx]);
   1175 
   1176    c->num_loops++;
   1177 }
   1178 
   1179 static void
   1180 trans_loop_end(const struct instr_translater *t, struct etna_compile *c,
   1181                const struct tgsi_full_instruction *inst,
   1182                struct etna_inst_src *src)
   1183 {
   1184    assert(c->frame_sp > 0);
   1185    struct etna_compile_frame *f = &c->frame_stack[--c->frame_sp];
   1186    assert(f->type == ETNA_COMPILE_FRAME_LOOP);
   1187 
   1188    /* mark position in instruction stream of label reference so that it can be
   1189     * filled in in next pass */
   1190    label_mark_use(c, f->lbl_loop_bgn_idx);
   1191 
   1192    /* create branch to loop_bgn label */
   1193    emit_inst(c, &(struct etna_inst) {
   1194       .opcode = INST_OPCODE_BRANCH,
   1195       .cond = INST_CONDITION_TRUE,
   1196       .src[0] = src[0],
   1197       /* imm is filled in later */
   1198    });
   1199 
   1200    label_place(c, &c->labels[f->lbl_loop_end_idx]);
   1201 }
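        /* Resulting instruction layout for BGNLOOP/ENDLOOP (illustrative
         * sketch): lbl_loop_bgn is placed at the top of the body, the body is
         * followed by an unconditional BRANCH back to lbl_loop_bgn, and
         * lbl_loop_end is placed right after that branch; BRK branches to
         * lbl_loop_end, CONT to lbl_loop_bgn (see below). */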
   1202 
   1203 static void
   1204 trans_brk(const struct instr_translater *t, struct etna_compile *c,
   1205           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1206 {
   1207    assert(c->frame_sp > 0);
   1208    struct etna_compile_frame *f = find_frame(c, ETNA_COMPILE_FRAME_LOOP);
   1209 
   1210    /* mark position in instruction stream of label reference so that it can be
   1211     * filled in in next pass */
   1212    label_mark_use(c, f->lbl_loop_end_idx);
   1213 
   1214    /* create branch to loop_end label */
   1215    emit_inst(c, &(struct etna_inst) {
   1216       .opcode = INST_OPCODE_BRANCH,
   1217       .cond = INST_CONDITION_TRUE,
   1218       .src[0] = src[0],
   1219       /* imm is filled in later */
   1220    });
   1221 }
   1222 
   1223 static void
   1224 trans_cont(const struct instr_translater *t, struct etna_compile *c,
   1225            const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1226 {
   1227    assert(c->frame_sp > 0);
   1228    struct etna_compile_frame *f = find_frame(c, ETNA_COMPILE_FRAME_LOOP);
   1229 
   1230    /* mark position in instruction stream of label reference so that it can be
   1231     * filled in in next pass */
   1232    label_mark_use(c, f->lbl_loop_bgn_idx);
   1233 
   1234    /* create branch to loop_bgn label */
   1235    emit_inst(c, &(struct etna_inst) {
   1236       .opcode = INST_OPCODE_BRANCH,
   1237       .cond = INST_CONDITION_TRUE,
   1238       .src[0] = src[0],
   1239       /* imm is filled in later */
   1240    });
   1241 }
   1242 
   1243 static void
   1244 trans_deriv(const struct instr_translater *t, struct etna_compile *c,
   1245             const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1246 {
   1247    emit_inst(c, &(struct etna_inst) {
   1248       .opcode = t->opc,
   1249       .sat = inst->Instruction.Saturate,
   1250       .dst = convert_dst(c, &inst->Dst[0]),
   1251       .src[0] = src[0],
   1252       .src[2] = src[0],
   1253    });
   1254 }
   1255 
   1256 static void
   1257 trans_arl(const struct instr_translater *t, struct etna_compile *c,
   1258           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1259 {
   1260    struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1261    struct etna_inst arl = { };
   1262    struct etna_inst_dst dst;
   1263 
   1264    dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y | INST_COMPS_Z |
   1265                                   INST_COMPS_W);
   1266 
   1267    if (c->specs->has_sign_floor_ceil) {
   1268       struct etna_inst floor = { };
   1269 
   1270       floor.opcode = INST_OPCODE_FLOOR;
   1271       floor.src[2] = src[0];
   1272       floor.dst = dst;
   1273 
   1274       emit_inst(c, &floor);
   1275    } else {
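              /* Note (not from the original source): without a native FLOOR,
               * floor(x) is synthesized as x - frc(x) by emitting FRC into the
               * temporary and then adding src to the negated temporary. */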
   1276       struct etna_inst floor[2] = { };
   1277 
   1278       floor[0].opcode = INST_OPCODE_FRC;
   1279       floor[0].sat = inst->Instruction.Saturate;
   1280       floor[0].dst = dst;
   1281       floor[0].src[2] = src[0];
   1282 
   1283       floor[1].opcode = INST_OPCODE_ADD;
   1284       floor[1].sat = inst->Instruction.Saturate;
   1285       floor[1].dst = dst;
   1286       floor[1].src[0] = src[0];
   1287       floor[1].src[2].use = 1;
   1288       floor[1].src[2].swiz = INST_SWIZ_IDENTITY;
   1289       floor[1].src[2].neg = 1;
   1290       floor[1].src[2].rgroup = temp.rgroup;
   1291       floor[1].src[2].reg = temp.id;
   1292 
   1293       emit_inst(c, &floor[0]);
   1294       emit_inst(c, &floor[1]);
   1295    }
   1296 
   1297    arl.opcode = INST_OPCODE_MOVAR;
   1298    arl.sat = inst->Instruction.Saturate;
   1299    arl.dst = convert_dst(c, &inst->Dst[0]);
   1300    arl.src[2] = etna_native_to_src(temp, INST_SWIZ_IDENTITY);
   1301 
   1302    emit_inst(c, &arl);
   1303 }
   1304 
   1305 static void
   1306 trans_lrp(const struct instr_translater *t, struct etna_compile *c,
   1307           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1308 {
   1309    /* dst = src0 * src1 + (1 - src0) * src2
   1310     *     => src0 * src1 - (src0 - 1) * src2
   1311     *     => src0 * src1 - (src0 * src2 - src2)
   1312     * MAD tTEMP.xyzw, tSRC0.xyzw, tSRC2.xyzw, -tSRC2.xyzw
   1313     * MAD tDST.xyzw, tSRC0.xyzw, tSRC1.xyzw, -tTEMP.xyzw
   1314     */
   1315    struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1316    if (etna_src_uniforms_conflict(src[0], src[1]) ||
   1317        etna_src_uniforms_conflict(src[0], src[2])) {
   1318       src[0] = etna_mov_src(c, src[0]);
   1319    }
   1320 
   1321    struct etna_inst mad[2] = { };
   1322    mad[0].opcode = INST_OPCODE_MAD;
   1323    mad[0].sat = 0;
   1324    mad[0].dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
   1325                                          INST_COMPS_Z | INST_COMPS_W);
   1326    mad[0].src[0] = src[0];
   1327    mad[0].src[1] = src[2];
   1328    mad[0].src[2] = negate(src[2]);
   1329    mad[1].opcode = INST_OPCODE_MAD;
   1330    mad[1].sat = inst->Instruction.Saturate;
   1331    mad[1].dst = convert_dst(c, &inst->Dst[0]); mad[1].src[0] = src[0];
   1332    mad[1].src[1] = src[1];
   1333    mad[1].src[2] = negate(etna_native_to_src(temp, INST_SWIZ_IDENTITY));
   1334 
   1335    emit_inst(c, &mad[0]);
   1336    emit_inst(c, &mad[1]);
   1337 }
   1338 
   1339 static void
   1340 trans_lit(const struct instr_translater *t, struct etna_compile *c,
   1341           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1342 {
   1343    /* SELECT.LT tmp._y__, 0, src.yyyy, 0
   1344     *  - can be eliminated if src.y is a uniform and >= 0
   1345     * SELECT.GT tmp.___w, 128, src.wwww, 128
   1346     * SELECT.LT tmp.___w, -128, tmp.wwww, -128
   1347     *  - can be eliminated if src.w is a uniform and fits clamp
   1348     * LOG tmp.x, void, void, tmp.yyyy
   1349     * MUL tmp.x, tmp.xxxx, tmp.wwww, void
   1350     * LITP dst, undef, src.xxxx, tmp.xxxx
   1351     */
   1352    struct etna_native_reg inner_temp = etna_compile_get_inner_temp(c);
   1353    struct etna_inst_src src_y = { };
   1354 
   1355    if (!etna_rgroup_is_uniform(src[0].rgroup)) {
   1356       src_y = etna_native_to_src(inner_temp, SWIZZLE(Y, Y, Y, Y));
   1357 
   1358       struct etna_inst ins = { };
   1359       ins.opcode = INST_OPCODE_SELECT;
   1360       ins.cond = INST_CONDITION_LT;
   1361       ins.dst = etna_native_to_dst(inner_temp, INST_COMPS_Y);
   1362       ins.src[0] = ins.src[2] = alloc_imm_f32(c, 0.0);
   1363       ins.src[1] = swizzle(src[0], SWIZZLE(Y, Y, Y, Y));
   1364       emit_inst(c, &ins);
   1365    } else if (uif(get_imm_u32(c, &src[0], 1)) < 0)
   1366       src_y = alloc_imm_f32(c, 0.0);
   1367    else
   1368       src_y = swizzle(src[0], SWIZZLE(Y, Y, Y, Y));
   1369 
   1370    struct etna_inst_src src_w = { };
   1371 
   1372    if (!etna_rgroup_is_uniform(src[0].rgroup)) {
   1373       src_w = etna_native_to_src(inner_temp, SWIZZLE(W, W, W, W));
   1374 
   1375       struct etna_inst ins = { };
   1376       ins.opcode = INST_OPCODE_SELECT;
   1377       ins.cond = INST_CONDITION_GT;
   1378       ins.dst = etna_native_to_dst(inner_temp, INST_COMPS_W);
   1379       ins.src[0] = ins.src[2] = alloc_imm_f32(c, 128.);
   1380       ins.src[1] = swizzle(src[0], SWIZZLE(W, W, W, W));
   1381       emit_inst(c, &ins);
   1382       ins.cond = INST_CONDITION_LT;
   1383       ins.src[0].neg = !ins.src[0].neg;
   1384       ins.src[2].neg = !ins.src[2].neg;
   1385       ins.src[1] = src_w;
   1386       emit_inst(c, &ins);
   1387    } else if (uif(get_imm_u32(c, &src[0], 3)) < -128.)
   1388       src_w = alloc_imm_f32(c, -128.);
   1389    else if (uif(get_imm_u32(c, &src[0], 3)) > 128.)
   1390       src_w = alloc_imm_f32(c, 128.);
   1391    else
   1392       src_w = swizzle(src[0], SWIZZLE(W, W, W, W));
   1393 
   1394    if (c->specs->has_new_transcendentals) { /* Alternative LOG sequence */
   1395       emit_inst(c, &(struct etna_inst) {
   1396          .opcode = INST_OPCODE_LOG,
   1397          .dst = etna_native_to_dst(inner_temp, INST_COMPS_X | INST_COMPS_Y),
   1398          .src[2] = src_y,
   1399          .tex = { .amode=1 }, /* Unknown bit needs to be set */
   1400       });
   1401       emit_inst(c, &(struct etna_inst) {
   1402          .opcode = INST_OPCODE_MUL,
   1403          .dst = etna_native_to_dst(inner_temp, INST_COMPS_X),
   1404          .src[0] = etna_native_to_src(inner_temp, SWIZZLE(X, X, X, X)),
   1405          .src[1] = etna_native_to_src(inner_temp, SWIZZLE(Y, Y, Y, Y)),
   1406       });
   1407    } else {
   1408       struct etna_inst ins[3] = { };
   1409       ins[0].opcode = INST_OPCODE_LOG;
   1410       ins[0].dst = etna_native_to_dst(inner_temp, INST_COMPS_X);
   1411       ins[0].src[2] = src_y;
   1412 
   1413       emit_inst(c, &ins[0]);
   1414    }
   1415    emit_inst(c, &(struct etna_inst) {
   1416       .opcode = INST_OPCODE_MUL,
   1417       .sat = 0,
   1418       .dst = etna_native_to_dst(inner_temp, INST_COMPS_X),
   1419       .src[0] = etna_native_to_src(inner_temp, SWIZZLE(X, X, X, X)),
   1420       .src[1] = src_w,
   1421    });
   1422    emit_inst(c, &(struct etna_inst) {
   1423       .opcode = INST_OPCODE_LITP,
   1424       .sat = 0,
   1425       .dst = convert_dst(c, &inst->Dst[0]),
   1426       .src[0] = swizzle(src[0], SWIZZLE(X, X, X, X)),
   1427       .src[1] = swizzle(src[0], SWIZZLE(X, X, X, X)),
   1428       .src[2] = etna_native_to_src(inner_temp, SWIZZLE(X, X, X, X)),
   1429    });
   1430 }
   1431 
   1432 static void
   1433 trans_ssg(const struct instr_translater *t, struct etna_compile *c,
   1434           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1435 {
   1436    if (c->specs->has_sign_floor_ceil) {
   1437       emit_inst(c, &(struct etna_inst){
   1438          .opcode = INST_OPCODE_SIGN,
   1439          .sat = inst->Instruction.Saturate,
   1440          .dst = convert_dst(c, &inst->Dst[0]),
   1441          .src[2] = src[0],
   1442       });
   1443    } else {
   1444       struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1445       struct etna_inst ins[2] = { };
   1446 
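               /* No native SIGN instruction here: SET.NZ writes 1.0 to the
                * temp for every component where src is non-zero (0.0
                * otherwise), and SELECT.LZ then picks -temp for negative
                * components and +temp otherwise, giving the expected
                * -1/0/+1 result.
                */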
   1447       ins[0].opcode = INST_OPCODE_SET;
   1448       ins[0].cond = INST_CONDITION_NZ;
   1449       ins[0].dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
   1450                                             INST_COMPS_Z | INST_COMPS_W);
   1451       ins[0].src[0] = src[0];
   1452 
   1453       ins[1].opcode = INST_OPCODE_SELECT;
   1454       ins[1].cond = INST_CONDITION_LZ;
   1455       ins[1].sat = inst->Instruction.Saturate;
   1456       ins[1].dst = convert_dst(c, &inst->Dst[0]);
   1457       ins[1].src[0] = src[0];
   1458       ins[1].src[2] = etna_native_to_src(temp, INST_SWIZ_IDENTITY);
   1459       ins[1].src[1] = negate(ins[1].src[2]);
   1460 
   1461       emit_inst(c, &ins[0]);
   1462       emit_inst(c, &ins[1]);
   1463    }
   1464 }
   1465 
   1466 static void
   1467 trans_trig(const struct instr_translater *t, struct etna_compile *c,
   1468            const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1469 {
   1470    if (c->specs->has_new_transcendentals) { /* Alternative SIN/COS */
   1471       /* On newer chips alternative SIN/COS instructions are implemented,
   1472        * which:
   1473        * - Need their input scaled by 1/pi instead of 2/pi
   1474        * - Output an x and y component, which need to be multiplied to
   1475        *   get the result
   1476        */
   1477       struct etna_native_reg temp = etna_compile_get_inner_temp(c); /* only using .xyz */
   1478       emit_inst(c, &(struct etna_inst) {
   1479          .opcode = INST_OPCODE_MUL,
   1480          .sat = 0,
   1481          .dst = etna_native_to_dst(temp, INST_COMPS_Z),
   1482          .src[0] = src[0], /* any swizzling happens here */
   1483          .src[1] = alloc_imm_f32(c, 1.0f / M_PI),
   1484       });
   1485       emit_inst(c, &(struct etna_inst) {
   1486          .opcode = inst->Instruction.Opcode == TGSI_OPCODE_COS
   1487                     ? INST_OPCODE_COS
   1488                     : INST_OPCODE_SIN,
   1489          .sat = 0,
   1490          .dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y),
   1491          .src[2] = etna_native_to_src(temp, SWIZZLE(Z, Z, Z, Z)),
   1492          .tex = { .amode=1 }, /* Unknown bit needs to be set */
   1493       });
   1494       emit_inst(c, &(struct etna_inst) {
   1495          .opcode = INST_OPCODE_MUL,
   1496          .sat = inst->Instruction.Saturate,
   1497          .dst = convert_dst(c, &inst->Dst[0]),
   1498          .src[0] = etna_native_to_src(temp, SWIZZLE(X, X, X, X)),
   1499          .src[1] = etna_native_to_src(temp, SWIZZLE(Y, Y, Y, Y)),
   1500       });
   1501 
   1502    } else if (c->specs->has_sin_cos_sqrt) {
   1503       struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1504       /* add divide by PI/2, using a temp register. GC2000
   1505        * fails with src==dst for the trig instruction. */
   1506       emit_inst(c, &(struct etna_inst) {
   1507          .opcode = INST_OPCODE_MUL,
   1508          .sat = 0,
   1509          .dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
   1510                                          INST_COMPS_Z | INST_COMPS_W),
   1511          .src[0] = src[0], /* any swizzling happens here */
   1512          .src[1] = alloc_imm_f32(c, 2.0f / M_PI),
   1513       });
   1514       emit_inst(c, &(struct etna_inst) {
   1515          .opcode = inst->Instruction.Opcode == TGSI_OPCODE_COS
   1516                     ? INST_OPCODE_COS
   1517                     : INST_OPCODE_SIN,
   1518          .sat = inst->Instruction.Saturate,
   1519          .dst = convert_dst(c, &inst->Dst[0]),
   1520          .src[2] = etna_native_to_src(temp, INST_SWIZ_IDENTITY),
   1521       });
   1522    } else {
   1523       /* Implement Nick's fast sine/cosine. Taken from:
   1524        * http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
   1525        * A=(1/2*PI 0 1/2*PI 0) B=(0.75 0 0.5 0) C=(-4 4 X X)
   1526        *  MAD t.x_zw, src.xxxx, A, B
   1527        *  FRC t.x_z_, void, void, t.xwzw
   1528        *  MAD t.x_z_, t.xwzw, 2, -1
   1529        *  MUL t._y__, t.wzww, |t.wzww|, void  (for sin/scs)
   1530        *  DP3 t.x_z_, t.zyww, C, void         (for sin)
   1531        *  DP3 t.__z_, t.zyww, C, void         (for scs)
   1532        *  MUL t._y__, t.wxww, |t.wxww|, void  (for cos/scs)
   1533        *  DP3 t.x_z_, t.xyww, C, void         (for cos)
   1534        *  DP3 t.x___, t.xyww, C, void         (for scs)
    1535        *  MAD t._y_w, t.xxzz, |t.xxzz|, -t.xxzz
   1536        *  MAD dst, t.ywyw, .2225, t.xzxz
   1537        */
   1538       struct etna_inst *p, ins[9] = { };
   1539       struct etna_native_reg t0 = etna_compile_get_inner_temp(c);
   1540       struct etna_inst_src t0s = etna_native_to_src(t0, INST_SWIZ_IDENTITY);
   1541       struct etna_inst_src sincos[3], in = src[0];
   1542       sincos[0] = etna_imm_vec4f(c, sincos_const[0]);
   1543       sincos[1] = etna_imm_vec4f(c, sincos_const[1]);
   1544 
   1545       /* A uniform source will cause the inner temp limit to
   1546        * be exceeded.  Explicitly deal with that scenario.
   1547        */
   1548       if (etna_rgroup_is_uniform(src[0].rgroup)) {
   1549          struct etna_inst ins = { };
   1550          ins.opcode = INST_OPCODE_MOV;
   1551          ins.dst = etna_native_to_dst(t0, INST_COMPS_X);
   1552          ins.src[2] = in;
   1553          emit_inst(c, &ins);
   1554          in = t0s;
   1555       }
   1556 
   1557       ins[0].opcode = INST_OPCODE_MAD;
   1558       ins[0].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z | INST_COMPS_W);
   1559       ins[0].src[0] = swizzle(in, SWIZZLE(X, X, X, X));
   1560       ins[0].src[1] = swizzle(sincos[1], SWIZZLE(X, W, X, W)); /* 1/2*PI */
   1561       ins[0].src[2] = swizzle(sincos[1], SWIZZLE(Y, W, Z, W)); /* 0.75, 0, 0.5, 0 */
   1562 
   1563       ins[1].opcode = INST_OPCODE_FRC;
   1564       ins[1].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z);
   1565       ins[1].src[2] = swizzle(t0s, SWIZZLE(X, W, Z, W));
   1566 
   1567       ins[2].opcode = INST_OPCODE_MAD;
   1568       ins[2].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z);
   1569       ins[2].src[0] = swizzle(t0s, SWIZZLE(X, W, Z, W));
   1570       ins[2].src[1] = swizzle(sincos[0], SWIZZLE(X, X, X, X)); /* 2 */
   1571       ins[2].src[2] = swizzle(sincos[0], SWIZZLE(Y, Y, Y, Y)); /* -1 */
   1572 
   1573       unsigned mul_swiz, dp3_swiz;
   1574       if (inst->Instruction.Opcode == TGSI_OPCODE_SIN) {
   1575          mul_swiz = SWIZZLE(W, Z, W, W);
   1576          dp3_swiz = SWIZZLE(Z, Y, W, W);
   1577       } else {
   1578          mul_swiz = SWIZZLE(W, X, W, W);
   1579          dp3_swiz = SWIZZLE(X, Y, W, W);
   1580       }
   1581 
   1582       ins[3].opcode = INST_OPCODE_MUL;
   1583       ins[3].dst = etna_native_to_dst(t0, INST_COMPS_Y);
   1584       ins[3].src[0] = swizzle(t0s, mul_swiz);
   1585       ins[3].src[1] = absolute(ins[3].src[0]);
   1586 
   1587       ins[4].opcode = INST_OPCODE_DP3;
   1588       ins[4].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z);
   1589       ins[4].src[0] = swizzle(t0s, dp3_swiz);
   1590       ins[4].src[1] = swizzle(sincos[0], SWIZZLE(Z, W, W, W));
   1591 
   1592       p = &ins[5];
   1593       p->opcode = INST_OPCODE_MAD;
   1594       p->dst = etna_native_to_dst(t0, INST_COMPS_Y | INST_COMPS_W);
   1595       p->src[0] = swizzle(t0s, SWIZZLE(X, X, Z, Z));
   1596       p->src[1] = absolute(p->src[0]);
   1597       p->src[2] = negate(p->src[0]);
   1598 
   1599       p++;
   1600       p->opcode = INST_OPCODE_MAD;
   1601       p->sat = inst->Instruction.Saturate;
    1602       p->dst = convert_dst(c, &inst->Dst[0]);
   1603       p->src[0] = swizzle(t0s, SWIZZLE(Y, W, Y, W));
   1604       p->src[1] = alloc_imm_f32(c, 0.2225);
   1605       p->src[2] = swizzle(t0s, SWIZZLE(X, Z, X, Z));
   1606 
   1607       for (int i = 0; &ins[i] <= p; i++)
   1608          emit_inst(c, &ins[i]);
   1609    }
   1610 }
   1611 
   1612 static void
   1613 trans_lg2(const struct instr_translater *t, struct etna_compile *c,
   1614             const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1615 {
   1616    if (c->specs->has_new_transcendentals) {
   1617       /* On newer chips alternative LOG instruction is implemented,
   1618        * which outputs an x and y component, which need to be multiplied to
   1619        * get the result.
   1620        */
   1621       struct etna_native_reg temp = etna_compile_get_inner_temp(c); /* only using .xy */
   1622       emit_inst(c, &(struct etna_inst) {
   1623          .opcode = INST_OPCODE_LOG,
   1624          .sat = 0,
   1625          .dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y),
   1626          .src[2] = src[0],
   1627          .tex = { .amode=1 }, /* Unknown bit needs to be set */
   1628       });
   1629       emit_inst(c, &(struct etna_inst) {
   1630          .opcode = INST_OPCODE_MUL,
   1631          .sat = inst->Instruction.Saturate,
   1632          .dst = convert_dst(c, &inst->Dst[0]),
   1633          .src[0] = etna_native_to_src(temp, SWIZZLE(X, X, X, X)),
   1634          .src[1] = etna_native_to_src(temp, SWIZZLE(Y, Y, Y, Y)),
   1635       });
   1636    } else {
   1637       emit_inst(c, &(struct etna_inst) {
   1638          .opcode = INST_OPCODE_LOG,
   1639          .sat = inst->Instruction.Saturate,
   1640          .dst = convert_dst(c, &inst->Dst[0]),
   1641          .src[2] = src[0],
   1642       });
   1643    }
   1644 }
   1645 
   1646 static void
   1647 trans_sampler(const struct instr_translater *t, struct etna_compile *c,
   1648               const struct tgsi_full_instruction *inst,
   1649               struct etna_inst_src *src)
   1650 {
   1651    /* There is no native support for GL texture rectangle coordinates, so
   1652     * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, 1]). */
   1653    if (inst->Texture.Texture == TGSI_TEXTURE_RECT) {
   1654       uint32_t unit = inst->Src[1].Register.Index;
   1655       struct etna_inst ins[2] = { };
   1656       struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1657 
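               /* ETNA_IMMEDIATE_TEXRECT_SCALE_X/_Y are placeholder uniforms
                * that are expected to be patched by the driver with 1/width
                * and 1/height of the bound texture, so the two MULs below
                * rescale the unnormalized rectangle coordinates into [0, 1].
                */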
   1658       ins[0].opcode = INST_OPCODE_MUL;
   1659       ins[0].dst = etna_native_to_dst(temp, INST_COMPS_X);
   1660       ins[0].src[0] = src[0];
   1661       ins[0].src[1] = alloc_imm(c, ETNA_IMMEDIATE_TEXRECT_SCALE_X, unit);
   1662 
   1663       ins[1].opcode = INST_OPCODE_MUL;
   1664       ins[1].dst = etna_native_to_dst(temp, INST_COMPS_Y);
   1665       ins[1].src[0] = src[0];
   1666       ins[1].src[1] = alloc_imm(c, ETNA_IMMEDIATE_TEXRECT_SCALE_Y, unit);
   1667 
   1668       emit_inst(c, &ins[0]);
   1669       emit_inst(c, &ins[1]);
   1670 
   1671       src[0] = etna_native_to_src(temp, INST_SWIZ_IDENTITY); /* temp.xyzw */
   1672    }
   1673 
   1674    switch (inst->Instruction.Opcode) {
   1675    case TGSI_OPCODE_TEX:
   1676       emit_inst(c, &(struct etna_inst) {
   1677          .opcode = INST_OPCODE_TEXLD,
   1678          .sat = 0,
   1679          .dst = convert_dst(c, &inst->Dst[0]),
   1680          .tex = convert_tex(c, &inst->Src[1], &inst->Texture),
   1681          .src[0] = src[0],
   1682       });
   1683       break;
   1684 
   1685    case TGSI_OPCODE_TXB:
   1686       emit_inst(c, &(struct etna_inst) {
   1687          .opcode = INST_OPCODE_TEXLDB,
   1688          .sat = 0,
   1689          .dst = convert_dst(c, &inst->Dst[0]),
   1690          .tex = convert_tex(c, &inst->Src[1], &inst->Texture),
   1691          .src[0] = src[0],
   1692       });
   1693       break;
   1694 
   1695    case TGSI_OPCODE_TXL:
   1696       emit_inst(c, &(struct etna_inst) {
   1697          .opcode = INST_OPCODE_TEXLDL,
   1698          .sat = 0,
   1699          .dst = convert_dst(c, &inst->Dst[0]),
   1700          .tex = convert_tex(c, &inst->Src[1], &inst->Texture),
   1701          .src[0] = src[0],
   1702       });
   1703       break;
   1704 
   1705    case TGSI_OPCODE_TXP: { /* divide src.xyz by src.w */
   1706       struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1707 
   1708       emit_inst(c, &(struct etna_inst) {
   1709          .opcode = INST_OPCODE_RCP,
   1710          .sat = 0,
   1711          .dst = etna_native_to_dst(temp, INST_COMPS_W), /* tmp.w */
   1712          .src[2] = swizzle(src[0], SWIZZLE(W, W, W, W)),
   1713       });
   1714       emit_inst(c, &(struct etna_inst) {
   1715          .opcode = INST_OPCODE_MUL,
   1716          .sat = 0,
   1717          .dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
   1718                                          INST_COMPS_Z), /* tmp.xyz */
   1719          .src[0] = etna_native_to_src(temp, SWIZZLE(W, W, W, W)),
   1720          .src[1] = src[0], /* src.xyzw */
   1721       });
   1722       emit_inst(c, &(struct etna_inst) {
   1723          .opcode = INST_OPCODE_TEXLD,
   1724          .sat = 0,
   1725          .dst = convert_dst(c, &inst->Dst[0]),
   1726          .tex = convert_tex(c, &inst->Src[1], &inst->Texture),
   1727          .src[0] = etna_native_to_src(temp, INST_SWIZ_IDENTITY), /* tmp.xyzw */
   1728       });
   1729    } break;
   1730 
   1731    default:
   1732       BUG("Unhandled instruction %s",
   1733           tgsi_get_opcode_name(inst->Instruction.Opcode));
   1734       assert(0);
   1735       break;
   1736    }
   1737 }
   1738 
   1739 static void
   1740 trans_dummy(const struct instr_translater *t, struct etna_compile *c,
   1741             const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1742 {
   1743    /* nothing to do */
   1744 }
   1745 
   1746 static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
   1747 #define INSTR(n, f, ...) \
   1748    [TGSI_OPCODE_##n] = {.fxn = (f), .tgsi_opc = TGSI_OPCODE_##n, ##__VA_ARGS__}
   1749 
   1750    INSTR(MOV, trans_instr, .opc = INST_OPCODE_MOV, .src = {2, -1, -1}),
   1751    INSTR(RCP, trans_instr, .opc = INST_OPCODE_RCP, .src = {2, -1, -1}),
   1752    INSTR(RSQ, trans_instr, .opc = INST_OPCODE_RSQ, .src = {2, -1, -1}),
   1753    INSTR(MUL, trans_instr, .opc = INST_OPCODE_MUL, .src = {0, 1, -1}),
   1754    INSTR(ADD, trans_instr, .opc = INST_OPCODE_ADD, .src = {0, 2, -1}),
   1755    INSTR(DP2, trans_instr, .opc = INST_OPCODE_DP2, .src = {0, 1, -1}),
   1756    INSTR(DP3, trans_instr, .opc = INST_OPCODE_DP3, .src = {0, 1, -1}),
   1757    INSTR(DP4, trans_instr, .opc = INST_OPCODE_DP4, .src = {0, 1, -1}),
   1758    INSTR(DST, trans_instr, .opc = INST_OPCODE_DST, .src = {0, 1, -1}),
   1759    INSTR(MAD, trans_instr, .opc = INST_OPCODE_MAD, .src = {0, 1, 2}),
   1760    INSTR(EX2, trans_instr, .opc = INST_OPCODE_EXP, .src = {2, -1, -1}),
   1761    INSTR(LG2, trans_lg2),
   1762    INSTR(SQRT, trans_instr, .opc = INST_OPCODE_SQRT, .src = {2, -1, -1}),
   1763    INSTR(FRC, trans_instr, .opc = INST_OPCODE_FRC, .src = {2, -1, -1}),
   1764    INSTR(CEIL, trans_instr, .opc = INST_OPCODE_CEIL, .src = {2, -1, -1}),
   1765    INSTR(FLR, trans_instr, .opc = INST_OPCODE_FLOOR, .src = {2, -1, -1}),
   1766    INSTR(CMP, trans_instr, .opc = INST_OPCODE_SELECT, .src = {0, 1, 2}, .cond = INST_CONDITION_LZ),
   1767 
   1768    INSTR(KILL, trans_instr, .opc = INST_OPCODE_TEXKILL),
   1769    INSTR(KILL_IF, trans_instr, .opc = INST_OPCODE_TEXKILL, .src = {0, -1, -1}, .cond = INST_CONDITION_LZ),
   1770 
   1771    INSTR(DDX, trans_deriv, .opc = INST_OPCODE_DSX),
   1772    INSTR(DDY, trans_deriv, .opc = INST_OPCODE_DSY),
   1773 
   1774    INSTR(IF, trans_if),
   1775    INSTR(ELSE, trans_else),
   1776    INSTR(ENDIF, trans_endif),
   1777 
   1778    INSTR(BGNLOOP, trans_loop_bgn),
   1779    INSTR(ENDLOOP, trans_loop_end),
   1780    INSTR(BRK, trans_brk),
   1781    INSTR(CONT, trans_cont),
   1782 
   1783    INSTR(MIN, trans_min_max, .opc = INST_OPCODE_SELECT, .cond = INST_CONDITION_GT),
   1784    INSTR(MAX, trans_min_max, .opc = INST_OPCODE_SELECT, .cond = INST_CONDITION_LT),
   1785 
   1786    INSTR(ARL, trans_arl),
   1787    INSTR(LRP, trans_lrp),
   1788    INSTR(LIT, trans_lit),
   1789    INSTR(SSG, trans_ssg),
   1790 
   1791    INSTR(SIN, trans_trig),
   1792    INSTR(COS, trans_trig),
   1793 
   1794    INSTR(SLT, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_LT),
   1795    INSTR(SGE, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_GE),
   1796    INSTR(SEQ, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_EQ),
   1797    INSTR(SGT, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_GT),
   1798    INSTR(SLE, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_LE),
   1799    INSTR(SNE, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_NE),
   1800 
   1801    INSTR(TEX, trans_sampler),
   1802    INSTR(TXB, trans_sampler),
   1803    INSTR(TXL, trans_sampler),
   1804    INSTR(TXP, trans_sampler),
   1805 
   1806    INSTR(NOP, trans_dummy),
   1807    INSTR(END, trans_dummy),
   1808 };
   1809 
   1810 /* Pass -- compile instructions */
   1811 static void
   1812 etna_compile_pass_generate_code(struct etna_compile *c)
   1813 {
   1814    struct tgsi_parse_context ctx = { };
   1815    unsigned status = tgsi_parse_init(&ctx, c->tokens);
   1816    assert(status == TGSI_PARSE_OK);
   1817 
   1818    int inst_idx = 0;
   1819    while (!tgsi_parse_end_of_tokens(&ctx)) {
   1820       const struct tgsi_full_instruction *inst = 0;
   1821 
   1822       /* No inner temps used yet for this instruction, clear counter */
   1823       c->inner_temps = 0;
   1824 
   1825       tgsi_parse_token(&ctx);
   1826 
   1827       switch (ctx.FullToken.Token.Type) {
   1828       case TGSI_TOKEN_TYPE_INSTRUCTION:
   1829          /* iterate over operands */
   1830          inst = &ctx.FullToken.FullInstruction;
   1831          if (c->dead_inst[inst_idx]) { /* skip dead instructions */
   1832             inst_idx++;
   1833             continue;
   1834          }
   1835 
   1836          /* Lookup the TGSI information and generate the source arguments */
   1837          struct etna_inst_src src[ETNA_NUM_SRC];
   1838          memset(src, 0, sizeof(src));
   1839 
   1840          const struct tgsi_opcode_info *tgsi = tgsi_get_opcode_info(inst->Instruction.Opcode);
   1841 
   1842          for (int i = 0; i < tgsi->num_src && i < ETNA_NUM_SRC; i++) {
   1843             const struct tgsi_full_src_register *reg = &inst->Src[i];
   1844             const struct etna_native_reg *n = &etna_get_src_reg(c, reg->Register)->native;
   1845 
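                     /* Sampler sources (is_tex) and sources without a valid
                      * native register are not ALU operands; samplers are
                      * handled via convert_tex() instead, so skip them here.
                      */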
   1846             if (!n->valid || n->is_tex)
   1847                continue;
   1848 
   1849             src[i] = etna_create_src(reg, n);
   1850          }
   1851 
   1852          const unsigned opc = inst->Instruction.Opcode;
   1853          const struct instr_translater *t = &translaters[opc];
   1854 
   1855          if (t->fxn) {
   1856             t->fxn(t, c, inst, src);
   1857 
   1858             inst_idx += 1;
   1859          } else {
   1860             BUG("Unhandled instruction %s", tgsi_get_opcode_name(opc));
   1861             assert(0);
   1862          }
   1863          break;
   1864       }
   1865    }
   1866    tgsi_parse_free(&ctx);
   1867 }
   1868 
   1869 /* Look up register by semantic */
   1870 static struct etna_reg_desc *
   1871 find_decl_by_semantic(struct etna_compile *c, uint file, uint name, uint index)
   1872 {
   1873    for (int idx = 0; idx < c->file[file].reg_size; ++idx) {
   1874       struct etna_reg_desc *reg = &c->file[file].reg[idx];
   1875 
   1876       if (reg->semantic.Name == name && reg->semantic.Index == index)
   1877          return reg;
   1878    }
   1879 
   1880    return NULL; /* not found */
   1881 }
   1882 
    1883 /** Add ADD and MUL instructions to map Z from -1..1 to 0..1 (z = (z + w) / 2.0) when:
    1884  * - this is a vertex shader
    1885  * - and this is an older GPU that requires it (specs->vs_need_z_div)
    1886  */
   1887 static void
   1888 etna_compile_add_z_div_if_needed(struct etna_compile *c)
   1889 {
   1890    if (c->info.processor == PIPE_SHADER_VERTEX && c->specs->vs_need_z_div) {
   1891       /* find position out */
   1892       struct etna_reg_desc *pos_reg =
   1893          find_decl_by_semantic(c, TGSI_FILE_OUTPUT, TGSI_SEMANTIC_POSITION, 0);
   1894 
   1895       if (pos_reg != NULL) {
   1896          /*
   1897           * ADD tX.__z_, tX.zzzz, void, tX.wwww
   1898           * MUL tX.__z_, tX.zzzz, 0.5, void
   1899          */
   1900          emit_inst(c, &(struct etna_inst) {
   1901             .opcode = INST_OPCODE_ADD,
   1902             .dst = etna_native_to_dst(pos_reg->native, INST_COMPS_Z),
   1903             .src[0] = etna_native_to_src(pos_reg->native, SWIZZLE(Z, Z, Z, Z)),
   1904             .src[2] = etna_native_to_src(pos_reg->native, SWIZZLE(W, W, W, W)),
   1905          });
   1906          emit_inst(c, &(struct etna_inst) {
   1907             .opcode = INST_OPCODE_MUL,
   1908             .dst = etna_native_to_dst(pos_reg->native, INST_COMPS_Z),
   1909             .src[0] = etna_native_to_src(pos_reg->native, SWIZZLE(Z, Z, Z, Z)),
   1910             .src[1] = alloc_imm_f32(c, 0.5f),
   1911          });
   1912       }
   1913    }
   1914 }
   1915 
   1916 static void
   1917 etna_compile_frag_rb_swap(struct etna_compile *c)
   1918 {
   1919    if (c->info.processor == PIPE_SHADER_FRAGMENT && c->key->frag_rb_swap) {
   1920       /* find color out */
   1921       struct etna_reg_desc *color_reg =
   1922          find_decl_by_semantic(c, TGSI_FILE_OUTPUT, TGSI_SEMANTIC_COLOR, 0);
   1923 
   1924       emit_inst(c, &(struct etna_inst) {
   1925          .opcode = INST_OPCODE_MOV,
   1926          .dst = etna_native_to_dst(color_reg->native, INST_COMPS_X | INST_COMPS_Y | INST_COMPS_Z | INST_COMPS_W),
   1927          .src[2] = etna_native_to_src(color_reg->native, SWIZZLE(Z, Y, X, W)),
   1928       });
   1929    }
   1930 }
   1931 
   1932 /** add a NOP to the shader if
   1933  * a) the shader is empty
   1934  * or
   1935  * b) there is a label at the end of the shader
   1936  */
   1937 static void
   1938 etna_compile_add_nop_if_needed(struct etna_compile *c)
   1939 {
   1940    bool label_at_last_inst = false;
   1941 
   1942    for (int idx = 0; idx < c->labels_count; ++idx) {
   1943       if (c->labels[idx].inst_idx == c->inst_ptr)
   1944          label_at_last_inst = true;
   1945 
   1946    }
   1947 
   1948    if (c->inst_ptr == 0 || label_at_last_inst)
   1949       emit_inst(c, &(struct etna_inst){.opcode = INST_OPCODE_NOP});
   1950 }
   1951 
   1952 static void
   1953 assign_uniforms(struct etna_compile_file *file, unsigned base)
   1954 {
   1955    for (int idx = 0; idx < file->reg_size; ++idx) {
   1956       file->reg[idx].native.valid = 1;
   1957       file->reg[idx].native.rgroup = INST_RGROUP_UNIFORM_0;
   1958       file->reg[idx].native.id = base + idx;
   1959    }
   1960 }
   1961 
    1962 /* Allocate CONST and IMM to native ETNA_RGROUP_UNIFORM(x).
    1963  * CONST must be consecutive, as const buffers are supposed to be
    1964  * consecutive, and must be placed before IMM.
    1965  * This ordering is more convenient because the compilation process
    1966  * itself may generate extra immediates for constants such as pi,
    1967  * one and zero.
    1968  */
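         /* Worked example (hypothetical sizes): with 8 CONST vec4 registers,
          * imm_base becomes 8 * 4 = 32 scalar slots, so the IMM registers are
          * assigned starting at uniform register 32 / 4 = 8.
          */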
   1969 static void
   1970 assign_constants_and_immediates(struct etna_compile *c)
   1971 {
   1972    assign_uniforms(&c->file[TGSI_FILE_CONSTANT], 0);
   1973    /* immediates start after the constants */
   1974    c->imm_base = c->file[TGSI_FILE_CONSTANT].reg_size * 4;
   1975    assign_uniforms(&c->file[TGSI_FILE_IMMEDIATE], c->imm_base / 4);
   1976    DBG_F(ETNA_DBG_COMPILER_MSGS, "imm base: %i size: %i", c->imm_base,
   1977          c->imm_size);
   1978 }
   1979 
   1980 /* Assign declared samplers to native texture units */
   1981 static void
   1982 assign_texture_units(struct etna_compile *c)
   1983 {
   1984    uint tex_base = 0;
   1985 
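            /* Vertex shader samplers share the texture unit namespace with
             * the fragment shader samplers but start at an offset (the "+8"
             * noted for SAMP in etna_compile_shader), taken from
             * specs->vertex_sampler_offset.
             */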
   1986    if (c->info.processor == PIPE_SHADER_VERTEX)
   1987       tex_base = c->specs->vertex_sampler_offset;
   1988 
   1989    for (int idx = 0; idx < c->file[TGSI_FILE_SAMPLER].reg_size; ++idx) {
   1990       c->file[TGSI_FILE_SAMPLER].reg[idx].native.valid = 1;
   1991       c->file[TGSI_FILE_SAMPLER].reg[idx].native.is_tex = 1; // overrides rgroup
   1992       c->file[TGSI_FILE_SAMPLER].reg[idx].native.id = tex_base + idx;
   1993    }
   1994 }
   1995 
    1996 /* Additional pass to fill in branch targets. This pass must run last,
    1997  * as no instruction reordering, removal or addition can be done anymore
    1998  * once the branch targets are computed.
   1999  */
   2000 static void
   2001 etna_compile_fill_in_labels(struct etna_compile *c)
   2002 {
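            /* lbl_usage[idx] holds the index of the label referenced by
             * instruction idx, or -1 if that instruction is not a branch; the
             * label's instruction index is patched into the immediate field
             * of the four-word instruction at idx.
             */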
   2003    for (int idx = 0; idx < c->inst_ptr; ++idx) {
   2004       if (c->lbl_usage[idx] != -1)
   2005          etna_assemble_set_imm(&c->code[idx * 4],
   2006                                c->labels[c->lbl_usage[idx]].inst_idx);
   2007    }
   2008 }
   2009 
   2010 /* compare two etna_native_reg structures, return true if equal */
   2011 static bool
   2012 cmp_etna_native_reg(const struct etna_native_reg to,
   2013                     const struct etna_native_reg from)
   2014 {
   2015    return to.valid == from.valid && to.is_tex == from.is_tex &&
   2016           to.rgroup == from.rgroup && to.id == from.id;
   2017 }
   2018 
   2019 /* go through all declarations and swap native registers *to* and *from* */
   2020 static void
   2021 swap_native_registers(struct etna_compile *c, const struct etna_native_reg to,
   2022                       const struct etna_native_reg from)
   2023 {
   2024    if (cmp_etna_native_reg(from, to))
   2025       return; /* Nothing to do */
   2026 
   2027    for (int idx = 0; idx < c->total_decls; ++idx) {
   2028       if (cmp_etna_native_reg(c->decl[idx].native, from)) {
   2029          c->decl[idx].native = to;
   2030       } else if (cmp_etna_native_reg(c->decl[idx].native, to)) {
   2031          c->decl[idx].native = from;
   2032       }
   2033    }
   2034 }
   2035 
   2036 /* For PS we need to permute so that inputs are always in temporary 0..N-1.
   2037  * Semantic POS is always t0. If that semantic is not used, avoid t0.
   2038  */
   2039 static void
   2040 permute_ps_inputs(struct etna_compile *c)
   2041 {
   2042    /* Special inputs:
   2043     * gl_FragCoord  VARYING_SLOT_POS   TGSI_SEMANTIC_POSITION
   2044     * gl_PointCoord VARYING_SLOT_PNTC  TGSI_SEMANTIC_PCOORD
   2045     */
   2046    uint native_idx = 1;
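            /* Varyings are packed into t1..tN: t0 is reserved for the
             * position input, or deliberately left unused (see the comment
             * above).
             */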
   2047 
   2048    for (int idx = 0; idx < c->file[TGSI_FILE_INPUT].reg_size; ++idx) {
   2049       struct etna_reg_desc *reg = &c->file[TGSI_FILE_INPUT].reg[idx];
   2050       uint input_id;
   2051       assert(reg->has_semantic);
   2052 
   2053       if (!reg->active || reg->semantic.Name == TGSI_SEMANTIC_POSITION)
   2054          continue;
   2055 
   2056       input_id = native_idx++;
   2057       swap_native_registers(c, etna_native_temp(input_id),
   2058                             c->file[TGSI_FILE_INPUT].reg[idx].native);
   2059    }
   2060 
   2061    c->num_varyings = native_idx - 1;
   2062 
   2063    if (native_idx > c->next_free_native)
   2064       c->next_free_native = native_idx;
   2065 }
   2066 
   2067 /* fill in ps inputs into shader object */
   2068 static void
   2069 fill_in_ps_inputs(struct etna_shader_variant *sobj, struct etna_compile *c)
   2070 {
   2071    struct etna_shader_io_file *sf = &sobj->infile;
   2072 
   2073    sf->num_reg = 0;
   2074 
   2075    for (int idx = 0; idx < c->file[TGSI_FILE_INPUT].reg_size; ++idx) {
   2076       struct etna_reg_desc *reg = &c->file[TGSI_FILE_INPUT].reg[idx];
   2077 
   2078       if (reg->native.id > 0) {
   2079          assert(sf->num_reg < ETNA_NUM_INPUTS);
   2080          sf->reg[sf->num_reg].reg = reg->native.id;
   2081          sf->reg[sf->num_reg].semantic = reg->semantic;
   2082          /* convert usage mask to number of components (*=wildcard)
   2083           *   .r    (0..1)  -> 1 component
    2084           *   .*g   (2..3)  -> 2 components
   2085           *   .**b  (4..7)  -> 3 components
   2086           *   .***a (8..15) -> 4 components
   2087           */
   2088          sf->reg[sf->num_reg].num_components = util_last_bit(reg->usage_mask);
   2089          sf->num_reg++;
   2090       }
   2091    }
   2092 
   2093    assert(sf->num_reg == c->num_varyings);
   2094    sobj->input_count_unk8 = 31; /* XXX what is this */
   2095 }
   2096 
   2097 /* fill in output mapping for ps into shader object */
   2098 static void
   2099 fill_in_ps_outputs(struct etna_shader_variant *sobj, struct etna_compile *c)
   2100 {
   2101    sobj->outfile.num_reg = 0;
   2102 
   2103    for (int idx = 0; idx < c->file[TGSI_FILE_OUTPUT].reg_size; ++idx) {
   2104       struct etna_reg_desc *reg = &c->file[TGSI_FILE_OUTPUT].reg[idx];
   2105 
   2106       switch (reg->semantic.Name) {
   2107       case TGSI_SEMANTIC_COLOR: /* FRAG_RESULT_COLOR */
   2108          sobj->ps_color_out_reg = reg->native.id;
   2109          break;
   2110       case TGSI_SEMANTIC_POSITION: /* FRAG_RESULT_DEPTH */
   2111          sobj->ps_depth_out_reg = reg->native.id; /* =always native reg 0, only z component should be assigned */
   2112          break;
   2113       default:
   2114          assert(0); /* only outputs supported are COLOR and POSITION at the moment */
   2115       }
   2116    }
   2117 }
   2118 
   2119 /* fill in inputs for vs into shader object */
   2120 static void
   2121 fill_in_vs_inputs(struct etna_shader_variant *sobj, struct etna_compile *c)
   2122 {
   2123    struct etna_shader_io_file *sf = &sobj->infile;
   2124 
   2125    sf->num_reg = 0;
   2126    for (int idx = 0; idx < c->file[TGSI_FILE_INPUT].reg_size; ++idx) {
   2127       struct etna_reg_desc *reg = &c->file[TGSI_FILE_INPUT].reg[idx];
   2128       assert(sf->num_reg < ETNA_NUM_INPUTS);
   2129 
   2130       if (!reg->native.valid)
   2131          continue;
   2132 
   2133       /* XXX exclude inputs with special semantics such as gl_frontFacing */
   2134       sf->reg[sf->num_reg].reg = reg->native.id;
   2135       sf->reg[sf->num_reg].semantic = reg->semantic;
   2136       sf->reg[sf->num_reg].num_components = util_last_bit(reg->usage_mask);
   2137       sf->num_reg++;
   2138    }
   2139 
   2140    sobj->input_count_unk8 = (sf->num_reg + 19) / 16; /* XXX what is this */
   2141 }
   2142 
   2143 /* build two-level output index [Semantic][Index] for fast linking */
   2144 static void
   2145 build_output_index(struct etna_shader_variant *sobj)
   2146 {
   2147    int total = 0;
   2148    int offset = 0;
   2149 
   2150    for (int name = 0; name < TGSI_SEMANTIC_COUNT; ++name)
   2151       total += sobj->output_count_per_semantic[name];
   2152 
   2153    sobj->output_per_semantic_list = CALLOC(total, sizeof(struct etna_shader_inout *));
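            /* output_per_semantic[name] is a window into the single flat
             * output_per_semantic_list allocation; each window is indexed by
             * semantic Index, so lookups are output_per_semantic[Name][Index].
             */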
   2154 
   2155    for (int name = 0; name < TGSI_SEMANTIC_COUNT; ++name) {
   2156       sobj->output_per_semantic[name] = &sobj->output_per_semantic_list[offset];
   2157       offset += sobj->output_count_per_semantic[name];
   2158    }
   2159 
   2160    for (int idx = 0; idx < sobj->outfile.num_reg; ++idx) {
   2161       sobj->output_per_semantic[sobj->outfile.reg[idx].semantic.Name]
   2162                                [sobj->outfile.reg[idx].semantic.Index] =
   2163          &sobj->outfile.reg[idx];
   2164    }
   2165 }
   2166 
   2167 /* fill in outputs for vs into shader object */
   2168 static void
   2169 fill_in_vs_outputs(struct etna_shader_variant *sobj, struct etna_compile *c)
   2170 {
   2171    struct etna_shader_io_file *sf = &sobj->outfile;
   2172 
   2173    sf->num_reg = 0;
   2174    for (int idx = 0; idx < c->file[TGSI_FILE_OUTPUT].reg_size; ++idx) {
   2175       struct etna_reg_desc *reg = &c->file[TGSI_FILE_OUTPUT].reg[idx];
   2176       assert(sf->num_reg < ETNA_NUM_INPUTS);
   2177 
   2178       switch (reg->semantic.Name) {
   2179       case TGSI_SEMANTIC_POSITION:
   2180          sobj->vs_pos_out_reg = reg->native.id;
   2181          break;
   2182       case TGSI_SEMANTIC_PSIZE:
   2183          sobj->vs_pointsize_out_reg = reg->native.id;
   2184          break;
   2185       default:
   2186          sf->reg[sf->num_reg].reg = reg->native.id;
   2187          sf->reg[sf->num_reg].semantic = reg->semantic;
   2188          sf->reg[sf->num_reg].num_components = 4; // XXX reg->num_components;
   2189          sf->num_reg++;
   2190          sobj->output_count_per_semantic[reg->semantic.Name] =
   2191             MAX2(reg->semantic.Index + 1,
   2192                  sobj->output_count_per_semantic[reg->semantic.Name]);
   2193       }
   2194    }
   2195 
   2196    /* build two-level index for linking */
   2197    build_output_index(sobj);
   2198 
    2199    /* Fill in the "mystery meat" load balancing value. This value
    2200     * determines how work is scheduled between VS and PS in the
    2201     * unified shader architecture. More precisely, it is derived
    2202     * from the number of VS outputs, as well as the chip-specific
    2203     * vertex output buffer size, vertex cache size and number of
    2204     * shader cores.
    2205     *
    2206     * XXX this is a conservative estimate, the "optimal" value is
    2207     * only known for sure at link time because some outputs may be
    2208     * unused and thus unmapped. Then again, in the general use case
    2209     * with GLSL the vertex and fragment shaders are linked before
    2210     * being submitted to Gallium, so all outputs are used.
    2211     */
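            /* Rough worked example with hypothetical GC2000-like values
             * (vertex_output_buffer_size = 512, vertex_cache_size = 16,
             * shader_core_count = 4) and two VS outputs (half_out = 1):
             *   b = (20480 / (512 - 2 * 1 * 16) + 9) / 10 = (42 + 9) / 10 = 5
             *   a = (5 + 256 / (4 * 1)) / 2 = 34
             * so the register gets A = 34, B = 5, C = 0x3f, D = 0x0f.
             */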
   2213    int half_out = (c->file[TGSI_FILE_OUTPUT].reg_size + 1) / 2;
   2214    assert(half_out);
   2215 
   2216    uint32_t b = ((20480 / (c->specs->vertex_output_buffer_size -
   2217                            2 * half_out * c->specs->vertex_cache_size)) +
   2218                  9) /
   2219                 10;
   2220    uint32_t a = (b + 256 / (c->specs->shader_core_count * half_out)) / 2;
   2221    sobj->vs_load_balancing = VIVS_VS_LOAD_BALANCING_A(MIN2(a, 255)) |
   2222                              VIVS_VS_LOAD_BALANCING_B(MIN2(b, 255)) |
   2223                              VIVS_VS_LOAD_BALANCING_C(0x3f) |
   2224                              VIVS_VS_LOAD_BALANCING_D(0x0f);
   2225 }
   2226 
   2227 static bool
   2228 etna_compile_check_limits(struct etna_compile *c)
   2229 {
   2230    int max_uniforms = (c->info.processor == PIPE_SHADER_VERTEX)
   2231                          ? c->specs->max_vs_uniforms
   2232                          : c->specs->max_ps_uniforms;
   2233    /* round up number of uniforms, including immediates, in units of four */
   2234    int num_uniforms = c->imm_base / 4 + (c->imm_size + 3) / 4;
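            /* e.g. (hypothetical counts) imm_base = 64 (16 constant vec4s)
             * and imm_size = 10 scalar immediates give
             * 64 / 4 + (10 + 3) / 4 = 16 + 3 = 19 uniform vec4s.
             */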
   2235 
   2236    if (!c->specs->has_icache && c->inst_ptr > c->specs->max_instructions) {
   2237       DBG("Number of instructions (%d) exceeds maximum %d", c->inst_ptr,
   2238           c->specs->max_instructions);
   2239       return false;
   2240    }
   2241 
   2242    if (c->next_free_native > c->specs->max_registers) {
   2243       DBG("Number of registers (%d) exceeds maximum %d", c->next_free_native,
   2244           c->specs->max_registers);
   2245       return false;
   2246    }
   2247 
   2248    if (num_uniforms > max_uniforms) {
   2249       DBG("Number of uniforms (%d) exceeds maximum %d", num_uniforms,
   2250           max_uniforms);
   2251       return false;
   2252    }
   2253 
   2254    if (c->num_varyings > c->specs->max_varyings) {
   2255       DBG("Number of varyings (%d) exceeds maximum %d", c->num_varyings,
   2256           c->specs->max_varyings);
   2257       return false;
   2258    }
   2259 
   2260    if (c->imm_base > c->specs->num_constants) {
   2261       DBG("Number of constants (%d) exceeds maximum %d", c->imm_base,
   2262           c->specs->num_constants);
   2263    }
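            /* Exceeding the constant limit is only reported; unlike the
             * checks above it does not currently make the compile fail.
             */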
   2264 
   2265    return true;
   2266 }
   2267 
   2268 static void
   2269 copy_uniform_state_to_shader(struct etna_compile *c, struct etna_shader_variant *sobj)
   2270 {
   2271    uint32_t count = c->imm_size;
   2272    struct etna_shader_uniform_info *uinfo = &sobj->uniforms;
   2273 
   2274    uinfo->const_count = c->imm_base;
   2275    uinfo->imm_count = count;
   2276    uinfo->imm_data = mem_dup(c->imm_data, count * sizeof(*c->imm_data));
   2277    uinfo->imm_contents = mem_dup(c->imm_contents, count * sizeof(*c->imm_contents));
   2278 
   2279    etna_set_shader_uniforms_dirty_flags(sobj);
   2280 }
   2281 
   2282 bool
   2283 etna_compile_shader(struct etna_shader_variant *v)
   2284 {
   2285    /* Create scratch space that may be too large to fit on stack
   2286     */
   2287    bool ret;
   2288    struct etna_compile *c;
   2289 
   2290    if (unlikely(!v))
   2291       return false;
   2292 
   2293    const struct etna_specs *specs = v->shader->specs;
   2294 
   2295    struct tgsi_lowering_config lconfig = {
   2296       .lower_FLR = !specs->has_sign_floor_ceil,
   2297       .lower_CEIL = !specs->has_sign_floor_ceil,
   2298       .lower_POW = true,
   2299       .lower_EXP = true,
   2300       .lower_LOG = true,
   2301       .lower_DP2 = !specs->has_halti2_instructions,
   2302       .lower_TRUNC = true,
   2303    };
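            /* tgsi_lowering rewrites operations the ISA lacks into supported
             * sequences before compilation (e.g. POW into LG2/MUL/EX2); FLR
             * and CEIL are only lowered when there is no native
             * sign/floor/ceil support.
             */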
   2304 
   2305    c = CALLOC_STRUCT(etna_compile);
   2306    if (!c)
   2307       return false;
   2308 
   2309    memset(&c->lbl_usage, -1, sizeof(c->lbl_usage));
   2310 
   2311    const struct tgsi_token *tokens = v->shader->tokens;
   2312 
   2313    c->specs = specs;
   2314    c->key = &v->key;
   2315    c->tokens = tgsi_transform_lowering(&lconfig, tokens, &c->info);
   2316    c->free_tokens = !!c->tokens;
   2317    if (!c->tokens) {
   2318       /* no lowering */
   2319       c->tokens = tokens;
   2320    }
   2321 
   2322    /* Build a map from gallium register to native registers for files
   2323     * CONST, SAMP, IMM, OUT, IN, TEMP.
   2324     * SAMP will map as-is for fragment shaders, there will be a +8 offset for
   2325     * vertex shaders.
   2326     */
   2327    /* Pass one -- check register file declarations and immediates */
   2328    etna_compile_parse_declarations(c);
   2329 
   2330    etna_allocate_decls(c);
   2331 
   2332    /* Pass two -- check usage of temporaries, inputs, outputs */
   2333    etna_compile_pass_check_usage(c);
   2334 
   2335    assign_special_inputs(c);
   2336 
   2337    /* Assign native temp register to TEMPs */
   2338    assign_temporaries_to_native(c, &c->file[TGSI_FILE_TEMPORARY]);
   2339 
   2340    /* optimize outputs */
   2341    etna_compile_pass_optimize_outputs(c);
   2342 
   2343    /* XXX assign special inputs: gl_FrontFacing (VARYING_SLOT_FACE)
   2344     *     this is part of RGROUP_INTERNAL
   2345     */
   2346 
   2347    /* assign inputs: last usage of input should be <= first usage of temp */
    2348    /*   potential optimization case:
    2349     *     if there is a single MOV TEMP[y], IN[x] before which temp y is
    2350     *     not used, and after which IN[x] is not read, then temp[y] can
    2351     *     be used as the input register as-is
    2352     */
   2353    /*   sort temporaries by first use
   2354     *   sort inputs by last usage
   2355     *   iterate over inputs, temporaries
   2356     *     if last usage of input <= first usage of temp:
   2357     *       assign input to temp
   2358     *       advance input, temporary pointer
   2359     *     else
   2360     *       advance temporary pointer
   2361     *
    2362     *   potential problem: an instruction with multiple inputs, of which
    2363     *      one is the temp and the other is the input; however, as the
    2364     *      temp is not used before this, how would that make sense?
    2365     *      Uninitialized temporaries have an undefined value, so this
    2366     *      would be ok
   2367     */
   2368    assign_inouts_to_temporaries(c, TGSI_FILE_INPUT);
   2369 
   2370    /* assign outputs: first usage of output should be >= last usage of temp */
    2371    /*   potential optimization case:
    2372     *      if there is a single MOV OUT[x], TEMP[y] (with full write mask,
    2373     *      or at least writing all components that are used in the shader)
    2374     *      after which temp y is no longer used, then temp[y] can be used
    2375     *      as the output register as-is
    2376     *
    2377     *   potential problem: an instruction with multiple outputs, of which
    2378     *      one is the temp and the other is the output; however, as the
    2379     *      temp is not used after this, how would that make sense? We could
    2380     *      just discard the output value
    2381     */
   2382    /*   sort temporaries by last use
   2383     *   sort outputs by first usage
   2384     *   iterate over outputs, temporaries
   2385     *     if first usage of output >= last usage of temp:
   2386     *       assign output to temp
   2387     *       advance output, temporary pointer
   2388     *     else
   2389     *       advance temporary pointer
   2390     */
   2391    assign_inouts_to_temporaries(c, TGSI_FILE_OUTPUT);
   2392 
   2393    assign_constants_and_immediates(c);
   2394    assign_texture_units(c);
   2395 
   2396    /* list declarations */
   2397    for (int x = 0; x < c->total_decls; ++x) {
   2398       DBG_F(ETNA_DBG_COMPILER_MSGS, "%i: %s,%d active=%i first_use=%i "
   2399                                     "last_use=%i native=%i usage_mask=%x "
   2400                                     "has_semantic=%i",
   2401             x, tgsi_file_name(c->decl[x].file), c->decl[x].idx,
   2402             c->decl[x].active, c->decl[x].first_use, c->decl[x].last_use,
   2403             c->decl[x].native.valid ? c->decl[x].native.id : -1,
   2404             c->decl[x].usage_mask, c->decl[x].has_semantic);
   2405       if (c->decl[x].has_semantic)
   2406          DBG_F(ETNA_DBG_COMPILER_MSGS, " semantic_name=%s semantic_idx=%i",
   2407                tgsi_semantic_names[c->decl[x].semantic.Name],
   2408                c->decl[x].semantic.Index);
   2409    }
   2410    /* XXX for PS we need to permute so that inputs are always in temporary
   2411     * 0..N-1.
   2412     * There is no "switchboard" for varyings (AFAIK!). The output color,
   2413     * however, can be routed
   2414     * from an arbitrary temporary.
   2415     */
   2416    if (c->info.processor == PIPE_SHADER_FRAGMENT)
   2417       permute_ps_inputs(c);
   2418 
   2419 
   2420    /* list declarations */
   2421    for (int x = 0; x < c->total_decls; ++x) {
   2422       DBG_F(ETNA_DBG_COMPILER_MSGS, "%i: %s,%d active=%i first_use=%i "
   2423                                     "last_use=%i native=%i usage_mask=%x "
   2424                                     "has_semantic=%i",
   2425             x, tgsi_file_name(c->decl[x].file), c->decl[x].idx,
   2426             c->decl[x].active, c->decl[x].first_use, c->decl[x].last_use,
   2427             c->decl[x].native.valid ? c->decl[x].native.id : -1,
   2428             c->decl[x].usage_mask, c->decl[x].has_semantic);
   2429       if (c->decl[x].has_semantic)
   2430          DBG_F(ETNA_DBG_COMPILER_MSGS, " semantic_name=%s semantic_idx=%i",
   2431                tgsi_semantic_names[c->decl[x].semantic.Name],
   2432                c->decl[x].semantic.Index);
   2433    }
   2434 
   2435    /* pass 3: generate instructions */
   2436    etna_compile_pass_generate_code(c);
   2437    etna_compile_add_z_div_if_needed(c);
   2438    etna_compile_frag_rb_swap(c);
   2439    etna_compile_add_nop_if_needed(c);
   2440 
   2441    ret = etna_compile_check_limits(c);
   2442    if (!ret)
   2443       goto out;
   2444 
   2445    etna_compile_fill_in_labels(c);
   2446 
   2447    /* fill in output structure */
   2448    v->processor = c->info.processor;
   2449    v->code_size = c->inst_ptr * 4;
   2450    v->code = mem_dup(c->code, c->inst_ptr * 16);
   2451    v->num_loops = c->num_loops;
   2452    v->num_temps = c->next_free_native;
   2453    v->vs_pos_out_reg = -1;
   2454    v->vs_pointsize_out_reg = -1;
   2455    v->ps_color_out_reg = -1;
   2456    v->ps_depth_out_reg = -1;
   2457    v->needs_icache = c->inst_ptr > c->specs->max_instructions;
   2458    copy_uniform_state_to_shader(c, v);
   2459 
   2460    if (c->info.processor == PIPE_SHADER_VERTEX) {
   2461       fill_in_vs_inputs(v, c);
   2462       fill_in_vs_outputs(v, c);
   2463    } else if (c->info.processor == PIPE_SHADER_FRAGMENT) {
   2464       fill_in_ps_inputs(v, c);
   2465       fill_in_ps_outputs(v, c);
   2466    }
   2467 
   2468 out:
   2469    if (c->free_tokens)
   2470       FREE((void *)c->tokens);
   2471 
   2472    FREE(c->labels);
   2473    FREE(c);
   2474 
   2475    return ret;
   2476 }
   2477 
   2478 extern const char *tgsi_swizzle_names[];
   2479 void
   2480 etna_dump_shader(const struct etna_shader_variant *shader)
   2481 {
   2482    if (shader->processor == PIPE_SHADER_VERTEX)
   2483       printf("VERT\n");
   2484    else
   2485       printf("FRAG\n");
   2486 
   2487 
   2488    etna_disasm(shader->code, shader->code_size, PRINT_RAW);
   2489 
   2490    printf("num loops: %i\n", shader->num_loops);
   2491    printf("num temps: %i\n", shader->num_temps);
   2492    printf("num const: %i\n", shader->uniforms.const_count);
   2493    printf("immediates:\n");
   2494    for (int idx = 0; idx < shader->uniforms.imm_count; ++idx) {
   2495       printf(" [%i].%s = %f (0x%08x)\n",
   2496              (idx + shader->uniforms.const_count) / 4,
   2497              tgsi_swizzle_names[idx % 4],
   2498              *((float *)&shader->uniforms.imm_data[idx]),
   2499              shader->uniforms.imm_data[idx]);
   2500    }
   2501    printf("inputs:\n");
   2502    for (int idx = 0; idx < shader->infile.num_reg; ++idx) {
   2503       printf(" [%i] name=%s index=%i comps=%i\n", shader->infile.reg[idx].reg,
   2504              tgsi_semantic_names[shader->infile.reg[idx].semantic.Name],
   2505              shader->infile.reg[idx].semantic.Index,
   2506              shader->infile.reg[idx].num_components);
   2507    }
   2508    printf("outputs:\n");
   2509    for (int idx = 0; idx < shader->outfile.num_reg; ++idx) {
   2510       printf(" [%i] name=%s index=%i comps=%i\n", shader->outfile.reg[idx].reg,
   2511              tgsi_semantic_names[shader->outfile.reg[idx].semantic.Name],
   2512              shader->outfile.reg[idx].semantic.Index,
   2513              shader->outfile.reg[idx].num_components);
   2514    }
   2515    printf("special:\n");
   2516    if (shader->processor == PIPE_SHADER_VERTEX) {
   2517       printf("  vs_pos_out_reg=%i\n", shader->vs_pos_out_reg);
   2518       printf("  vs_pointsize_out_reg=%i\n", shader->vs_pointsize_out_reg);
   2519       printf("  vs_load_balancing=0x%08x\n", shader->vs_load_balancing);
   2520    } else {
   2521       printf("  ps_color_out_reg=%i\n", shader->ps_color_out_reg);
   2522       printf("  ps_depth_out_reg=%i\n", shader->ps_depth_out_reg);
   2523    }
   2524    printf("  input_count_unk8=0x%08x\n", shader->input_count_unk8);
   2525 }
   2526 
   2527 void
   2528 etna_destroy_shader(struct etna_shader_variant *shader)
   2529 {
   2530    assert(shader);
   2531 
   2532    FREE(shader->code);
   2533    FREE(shader->uniforms.imm_data);
   2534    FREE(shader->uniforms.imm_contents);
   2535    FREE(shader->output_per_semantic_list);
   2536    FREE(shader);
   2537 }
   2538 
   2539 static const struct etna_shader_inout *
   2540 etna_shader_vs_lookup(const struct etna_shader_variant *sobj,
   2541                       const struct etna_shader_inout *in)
   2542 {
   2543    if (in->semantic.Index < sobj->output_count_per_semantic[in->semantic.Name])
   2544       return sobj->output_per_semantic[in->semantic.Name][in->semantic.Index];
   2545 
   2546    return NULL;
   2547 }
   2548 
   2549 bool
   2550 etna_link_shader(struct etna_shader_link_info *info,
   2551                  const struct etna_shader_variant *vs, const struct etna_shader_variant *fs)
   2552 {
   2553    int comp_ofs = 0;
   2554    /* For each fragment input we need to find the associated vertex shader
   2555     * output, which can be found by matching on semantic name and index. A
   2556     * binary search could be used because the vs outputs are sorted by their
   2557     * semantic index and grouped by semantic type by fill_in_vs_outputs.
   2558     */
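            /* Note the return convention: true is returned on a link error
             * (see the BUG path below), false on success.
             */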
   2559    assert(fs->infile.num_reg < ETNA_NUM_INPUTS);
   2560    info->pcoord_varying_comp_ofs = -1;
   2561 
   2562    for (int idx = 0; idx < fs->infile.num_reg; ++idx) {
   2563       const struct etna_shader_inout *fsio = &fs->infile.reg[idx];
   2564       const struct etna_shader_inout *vsio = etna_shader_vs_lookup(vs, fsio);
   2565       struct etna_varying *varying;
   2566       bool interpolate_always = fsio->semantic.Name != TGSI_SEMANTIC_COLOR;
   2567 
   2568       assert(fsio->reg > 0 && fsio->reg <= ARRAY_SIZE(info->varyings));
   2569 
   2570       if (fsio->reg > info->num_varyings)
   2571          info->num_varyings = fsio->reg;
   2572 
   2573       varying = &info->varyings[fsio->reg - 1];
   2574       varying->num_components = fsio->num_components;
   2575 
   2576       if (!interpolate_always) /* colors affected by flat shading */
   2577          varying->pa_attributes = 0x200;
   2578       else /* texture coord or other bypasses flat shading */
   2579          varying->pa_attributes = 0x2f1;
   2580 
   2581       varying->use[0] = interpolate_always ? VARYING_COMPONENT_USE_POINTCOORD_X : VARYING_COMPONENT_USE_USED;
   2582       varying->use[1] = interpolate_always ? VARYING_COMPONENT_USE_POINTCOORD_Y : VARYING_COMPONENT_USE_USED;
   2583       varying->use[2] = VARYING_COMPONENT_USE_USED;
   2584       varying->use[3] = VARYING_COMPONENT_USE_USED;
   2585 
   2586 
   2587       /* point coord is an input to the PS without matching VS output,
   2588        * so it gets a varying slot without being assigned a VS register.
   2589        */
   2590       if (fsio->semantic.Name == TGSI_SEMANTIC_PCOORD) {
   2591          info->pcoord_varying_comp_ofs = comp_ofs;
   2592       } else {
   2593          if (vsio == NULL) { /* not found -- link error */
    2594             BUG("Semantic %d index %d not found in vertex shader outputs\n", fsio->semantic.Name, fsio->semantic.Index);
   2595             return true;
   2596          }
   2597 
   2598          varying->reg = vsio->reg;
   2599       }
   2600 
   2601       comp_ofs += varying->num_components;
   2602    }
   2603 
   2604    assert(info->num_varyings == fs->infile.num_reg);
   2605 
   2606    return false;
   2607 }
   2608