      1 /*
      2  * Copyright (c) 2012-2015 Etnaviv Project
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sub license,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the
     12  * next paragraph) shall be included in all copies or substantial portions
     13  * of the Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21  * DEALINGS IN THE SOFTWARE.
     22  *
     23  * Authors:
     24  *    Wladimir J. van der Laan <laanwj (at) gmail.com>
     25  */
     26 
     27 /* TGSI->Vivante shader ISA conversion */
     28 
     29 /* What does the compiler return (see etna_shader_object)?
     30  *  1) instruction data
     31  *  2) input-to-temporary mapping (fixed for ps)
     32  *      *) in case of ps, semantic -> varying id mapping
     33  *      *) for each varying: number of components used (r, rg, rgb, rgba)
     34  *  3) temporary-to-output mapping (in case of vs, fixed for ps)
     35  *  4) for each input/output: possible semantic (position, color, glpointcoord, ...)
     36  *  5) immediates base offset, immediates data
     37  *  6) used texture units (and possibly the TGSI_TEXTURE_* type); not needed to
     38  *     configure the hw, but useful for error checking
     39  *  7) enough information to add the z=(z+w)/2.0 necessary for older chips
     40  *     (output reg id is enough)
     41  *
      42  *  Empty shaders are not allowed; the compiler should always generate at
      43  *  least a NOP. Also, if there is a label at the end of the shader, an extra
      44  *  NOP should be generated as a jump target.
     45  *
     46  * TODO
     47  * * Use an instruction scheduler
     48  * * Indirect access to uniforms / temporaries using amode
     49  */
     50 
     51 #include "etnaviv_compiler.h"
     52 
     53 #include "etnaviv_asm.h"
     54 #include "etnaviv_context.h"
     55 #include "etnaviv_debug.h"
     56 #include "etnaviv_disasm.h"
     57 #include "etnaviv_uniforms.h"
     58 #include "etnaviv_util.h"
     59 
     60 #include "pipe/p_shader_tokens.h"
     61 #include "tgsi/tgsi_info.h"
     62 #include "tgsi/tgsi_iterate.h"
     63 #include "tgsi/tgsi_lowering.h"
     64 #include "tgsi/tgsi_strings.h"
     65 #include "tgsi/tgsi_util.h"
     66 #include "util/u_math.h"
     67 #include "util/u_memory.h"
     68 
     69 #include <fcntl.h>
     70 #include <stdio.h>
     71 #include <sys/stat.h>
     72 #include <sys/types.h>
     73 
     74 #define ETNA_MAX_INNER_TEMPS 2
     75 
     76 static const float sincos_const[2][4] = {
     77    {
     78       2., -1., 4., -4.,
     79    },
     80    {
     81       1. / (2. * M_PI), 0.75, 0.5, 0.0,
     82    },
     83 };
     84 
     85 /* Native register description structure */
     86 struct etna_native_reg {
     87    unsigned valid : 1;
     88    unsigned is_tex : 1; /* is texture unit, overrides rgroup */
     89    unsigned rgroup : 3;
     90    unsigned id : 9;
     91 };
     92 
     93 /* Register description */
     94 struct etna_reg_desc {
     95    enum tgsi_file_type file; /* IN, OUT, TEMP, ... */
     96    int idx; /* index into file */
     97    bool active; /* used in program */
     98    int first_use; /* instruction id of first use (scope begin) */
     99    int last_use; /* instruction id of last use (scope end, inclusive) */
    100 
    101    struct etna_native_reg native; /* native register to map to */
    102    unsigned usage_mask : 4; /* usage, per channel */
    103    bool has_semantic; /* register has associated TGSI semantic */
    104    struct tgsi_declaration_semantic semantic; /* TGSI semantic */
    105    struct tgsi_declaration_interp interp; /* Interpolation type */
    106 };
    107 
    108 /* Label information structure */
    109 struct etna_compile_label {
    110    int inst_idx; /* Instruction id that label points to */
    111 };
    112 
    113 enum etna_compile_frame_type {
    114    ETNA_COMPILE_FRAME_IF, /* IF/ELSE/ENDIF */
    115    ETNA_COMPILE_FRAME_LOOP,
    116 };
    117 
    118 /* nesting scope frame (LOOP, IF, ...) during compilation
    119  */
    120 struct etna_compile_frame {
    121    enum etna_compile_frame_type type;
    122    struct etna_compile_label *lbl_else;
    123    struct etna_compile_label *lbl_endif;
    124    struct etna_compile_label *lbl_loop_bgn;
    125    struct etna_compile_label *lbl_loop_end;
    126 };
    127 
    128 struct etna_compile_file {
    129    /* Number of registers in each TGSI file (max register+1) */
    130    size_t reg_size;
    131    /* Register descriptions, per register index */
    132    struct etna_reg_desc *reg;
    133 };
    134 
    135 #define array_insert(arr, val)                          \
    136    do {                                                 \
    137       if (arr##_count == arr##_sz) {                    \
    138          arr##_sz = MAX2(2 * arr##_sz, 16);             \
    139          arr = realloc(arr, arr##_sz * sizeof(arr[0])); \
    140       }                                                 \
    141       arr[arr##_count++] = val;                         \
    142    } while (0)
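
/* Illustrative usage sketch (not part of the original source): array_insert()
 * assumes the caller pairs the array pointer with matching <name>_count and
 * <name>_sz variables, as struct etna_compile does for its "labels" array
 * below. A hypothetical growable array of ints would look like:
 *
 *    int *vals = NULL;
 *    unsigned vals_count = 0, vals_sz = 0;
 *    array_insert(vals, 42);   // grows via vals_sz, appends at vals_count
 *    free(vals);
 */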
    143 
    144 
    145 /* scratch area for compiling shader, freed after compilation finishes */
    146 struct etna_compile {
    147    const struct tgsi_token *tokens;
    148    bool free_tokens;
    149 
    150    struct tgsi_shader_info info;
    151 
    152    /* Register descriptions, per TGSI file, per register index */
    153    struct etna_compile_file file[TGSI_FILE_COUNT];
    154 
    155    /* Keep track of TGSI register declarations */
    156    struct etna_reg_desc decl[ETNA_MAX_DECL];
    157    uint total_decls;
    158 
    159    /* Bitmap of dead instructions which are removed in a separate pass */
    160    bool dead_inst[ETNA_MAX_TOKENS];
    161 
    162    /* Immediate data */
    163    enum etna_immediate_contents imm_contents[ETNA_MAX_IMM];
    164    uint32_t imm_data[ETNA_MAX_IMM];
    165    uint32_t imm_base; /* base of immediates (in 32 bit units) */
    166    uint32_t imm_size; /* size of immediates (in 32 bit units) */
    167 
    168    /* Next free native register, for register allocation */
    169    uint32_t next_free_native;
    170 
    171    /* Temporary register for use within translated TGSI instruction,
    172     * only allocated when needed.
    173     */
     174    int inner_temps; /* number of inner temps used; up to ETNA_MAX_INNER_TEMPS
     175                        are available per instruction */
    176    struct etna_native_reg inner_temp[ETNA_MAX_INNER_TEMPS];
    177 
    178    /* Fields for handling nested conditionals */
    179    struct etna_compile_frame frame_stack[ETNA_MAX_DEPTH];
    180    int frame_sp;
    181    struct etna_compile_label *lbl_usage[ETNA_MAX_INSTRUCTIONS];
    182 
    183    unsigned labels_count, labels_sz;
    184    struct etna_compile_label *labels;
    185 
    186    /* Code generation */
    187    int inst_ptr; /* current instruction pointer */
    188    uint32_t code[ETNA_MAX_INSTRUCTIONS * ETNA_INST_SIZE];
    189 
    190    /* I/O */
    191 
    192    /* Number of varyings (PS only) */
    193    int num_varyings;
    194 
    195    /* GPU hardware specs */
    196    const struct etna_specs *specs;
    197 };
    198 
    199 static struct etna_reg_desc *
    200 etna_get_dst_reg(struct etna_compile *c, struct tgsi_dst_register dst)
    201 {
    202    return &c->file[dst.File].reg[dst.Index];
    203 }
    204 
    205 static struct etna_reg_desc *
    206 etna_get_src_reg(struct etna_compile *c, struct tgsi_src_register src)
    207 {
    208    return &c->file[src.File].reg[src.Index];
    209 }
    210 
    211 static struct etna_native_reg
    212 etna_native_temp(unsigned reg)
    213 {
    214    return (struct etna_native_reg) {
    215       .valid = 1,
    216       .rgroup = INST_RGROUP_TEMP,
    217       .id = reg
    218    };
    219 }
    220 
    221 /** Register allocation **/
    222 enum reg_sort_order {
    223    FIRST_USE_ASC,
    224    FIRST_USE_DESC,
    225    LAST_USE_ASC,
    226    LAST_USE_DESC
    227 };
    228 
    229 /* Augmented register description for sorting */
    230 struct sort_rec {
    231    struct etna_reg_desc *ptr;
    232    int key;
    233 };
    234 
    235 static int
    236 sort_rec_compar(const struct sort_rec *a, const struct sort_rec *b)
    237 {
    238    if (a->key < b->key)
    239       return -1;
    240 
    241    if (a->key > b->key)
    242       return 1;
    243 
    244    return 0;
    245 }
    246 
    247 /* create an index on a register set based on certain criteria. */
    248 static int
    249 sort_registers(struct sort_rec *sorted, struct etna_compile_file *file,
    250                enum reg_sort_order so)
    251 {
    252    struct etna_reg_desc *regs = file->reg;
    253    int ptr = 0;
    254 
    255    /* pre-populate keys from active registers */
    256    for (int idx = 0; idx < file->reg_size; ++idx) {
    257       /* only interested in active registers now; will only assign inactive ones
    258        * if no space in active ones */
    259       if (regs[idx].active) {
    260          sorted[ptr].ptr = &regs[idx];
    261 
    262          switch (so) {
    263          case FIRST_USE_ASC:
    264             sorted[ptr].key = regs[idx].first_use;
    265             break;
    266          case LAST_USE_ASC:
    267             sorted[ptr].key = regs[idx].last_use;
    268             break;
    269          case FIRST_USE_DESC:
    270             sorted[ptr].key = -regs[idx].first_use;
    271             break;
    272          case LAST_USE_DESC:
    273             sorted[ptr].key = -regs[idx].last_use;
    274             break;
    275          }
    276          ptr++;
    277       }
    278    }
    279 
    280    /* sort index by key */
    281    qsort(sorted, ptr, sizeof(struct sort_rec),
    282          (int (*)(const void *, const void *))sort_rec_compar);
    283 
    284    return ptr;
    285 }
    286 
    287 /* Allocate a new, unused, native temp register */
    288 static struct etna_native_reg
    289 alloc_new_native_reg(struct etna_compile *c)
    290 {
    291    assert(c->next_free_native < ETNA_MAX_TEMPS);
    292    return etna_native_temp(c->next_free_native++);
    293 }
    294 
    295 /* assign TEMPs to native registers */
    296 static void
    297 assign_temporaries_to_native(struct etna_compile *c,
    298                              struct etna_compile_file *file)
    299 {
    300    struct etna_reg_desc *temps = file->reg;
    301 
    302    for (int idx = 0; idx < file->reg_size; ++idx)
    303       temps[idx].native = alloc_new_native_reg(c);
    304 }
    305 
    306 /* assign inputs and outputs to temporaries
    307  * Gallium assumes that the hardware has separate registers for taking input and
    308  * output, however Vivante GPUs use temporaries both for passing in inputs and
    309  * passing back outputs.
    310  * Try to re-use temporary registers where possible. */
    311 static void
    312 assign_inouts_to_temporaries(struct etna_compile *c, uint file)
    313 {
    314    bool mode_inputs = (file == TGSI_FILE_INPUT);
    315    int inout_ptr = 0, num_inouts;
    316    int temp_ptr = 0, num_temps;
    317    struct sort_rec inout_order[ETNA_MAX_TEMPS];
    318    struct sort_rec temps_order[ETNA_MAX_TEMPS];
    319    num_inouts = sort_registers(inout_order, &c->file[file],
    320                                mode_inputs ? LAST_USE_ASC : FIRST_USE_ASC);
    321    num_temps = sort_registers(temps_order, &c->file[TGSI_FILE_TEMPORARY],
    322                               mode_inputs ? FIRST_USE_ASC : LAST_USE_ASC);
    323 
    324    while (inout_ptr < num_inouts && temp_ptr < num_temps) {
    325       struct etna_reg_desc *inout = inout_order[inout_ptr].ptr;
    326       struct etna_reg_desc *temp = temps_order[temp_ptr].ptr;
    327 
    328       if (!inout->active || inout->native.valid) { /* Skip if already a native register assigned */
    329          inout_ptr++;
    330          continue;
    331       }
    332 
     333       /* is the last usage of this input before or in the same instruction as
     334        * the first use of the temporary? */
    335       if (mode_inputs ? (inout->last_use <= temp->first_use)
    336                       : (inout->first_use >= temp->last_use)) {
    337          /* assign it and advance to next input */
    338          inout->native = temp->native;
    339          inout_ptr++;
    340       }
    341 
    342       temp_ptr++;
    343    }
    344 
    345    /* if we couldn't reuse current ones, allocate new temporaries */
    346    for (inout_ptr = 0; inout_ptr < num_inouts; ++inout_ptr) {
    347       struct etna_reg_desc *inout = inout_order[inout_ptr].ptr;
    348 
    349       if (inout->active && !inout->native.valid)
    350          inout->native = alloc_new_native_reg(c);
    351    }
    352 }
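
/* Worked example (illustrative, not from the original source): in a fragment
 * shader, suppose IN[0] has last_use == 2 while TEMP[0] has first_use == 4.
 * The input is dead before the temporary becomes live (2 <= 4), so IN[0] can
 * simply be mapped onto TEMP[0]'s native register. For outputs the test is
 * mirrored: OUT[n] may share a temporary whose last_use is no later than the
 * output's first_use. */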
    353 
    354 /* Allocate an immediate with a certain value and return the index. If
    355  * there is already an immediate with that value, return that.
    356  */
    357 static struct etna_inst_src
    358 alloc_imm(struct etna_compile *c, enum etna_immediate_contents contents,
    359           uint32_t value)
    360 {
    361    int idx;
    362 
    363    /* Could use a hash table to speed this up */
    364    for (idx = 0; idx < c->imm_size; ++idx) {
    365       if (c->imm_contents[idx] == contents && c->imm_data[idx] == value)
    366          break;
    367    }
    368 
     369    /* see if there is an unused slot */
    370    if (idx == c->imm_size) {
    371       for (idx = 0; idx < c->imm_size; ++idx) {
    372          if (c->imm_contents[idx] == ETNA_IMMEDIATE_UNUSED)
    373             break;
    374       }
    375    }
    376 
    377    /* allocate new immediate */
    378    if (idx == c->imm_size) {
    379       assert(c->imm_size < ETNA_MAX_IMM);
    380       idx = c->imm_size++;
    381       c->imm_data[idx] = value;
    382       c->imm_contents[idx] = contents;
    383    }
    384 
    385    /* swizzle so that component with value is returned in all components */
    386    idx += c->imm_base;
    387    struct etna_inst_src imm_src = {
    388       .use = 1,
    389       .rgroup = INST_RGROUP_UNIFORM_0,
    390       .reg = idx / 4,
    391       .swiz = INST_SWIZ_BROADCAST(idx & 3)
    392    };
    393 
    394    return imm_src;
    395 }
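
/* Worked example (illustrative): with imm_base == 0, immediate slot idx == 6
 * ends up in uniform register 6 / 4 == 1, component 6 & 3 == 2 (z), so the
 * returned source reads that uniform with a .zzzz broadcast swizzle. */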
    396 
    397 static struct etna_inst_src
    398 alloc_imm_u32(struct etna_compile *c, uint32_t value)
    399 {
    400    return alloc_imm(c, ETNA_IMMEDIATE_CONSTANT, value);
    401 }
    402 
    403 static struct etna_inst_src
    404 alloc_imm_vec4u(struct etna_compile *c, enum etna_immediate_contents contents,
    405                 const uint32_t *values)
    406 {
    407    struct etna_inst_src imm_src = { };
    408    int idx, i;
    409 
    410    for (idx = 0; idx + 3 < c->imm_size; idx += 4) {
    411       /* What if we can use a uniform with a different swizzle? */
    412       for (i = 0; i < 4; i++)
    413          if (c->imm_contents[idx + i] != contents || c->imm_data[idx + i] != values[i])
    414             break;
    415       if (i == 4)
    416          break;
    417    }
    418 
    419    if (idx + 3 >= c->imm_size) {
    420       idx = align(c->imm_size, 4);
    421       assert(idx + 4 <= ETNA_MAX_IMM);
    422 
    423       for (i = 0; i < 4; i++) {
    424          c->imm_data[idx + i] = values[i];
    425          c->imm_contents[idx + i] = contents;
    426       }
    427 
    428       c->imm_size = idx + 4;
    429    }
    430 
    431    assert((c->imm_base & 3) == 0);
    432    idx += c->imm_base;
    433    imm_src.use = 1;
    434    imm_src.rgroup = INST_RGROUP_UNIFORM_0;
    435    imm_src.reg = idx / 4;
    436    imm_src.swiz = INST_SWIZ_IDENTITY;
    437 
    438    return imm_src;
    439 }
    440 
    441 static uint32_t
    442 get_imm_u32(struct etna_compile *c, const struct etna_inst_src *imm,
    443             unsigned swiz_idx)
    444 {
    445    assert(imm->use == 1 && imm->rgroup == INST_RGROUP_UNIFORM_0);
    446    unsigned int idx = imm->reg * 4 + ((imm->swiz >> (swiz_idx * 2)) & 3);
    447 
    448    return c->imm_data[idx];
    449 }
    450 
    451 /* Allocate immediate with a certain float value. If there is already an
    452  * immediate with that value, return that.
    453  */
    454 static struct etna_inst_src
    455 alloc_imm_f32(struct etna_compile *c, float value)
    456 {
    457    return alloc_imm_u32(c, fui(value));
    458 }
    459 
    460 static struct etna_inst_src
    461 etna_imm_vec4f(struct etna_compile *c, const float *vec4)
    462 {
    463    uint32_t val[4];
    464 
    465    for (int i = 0; i < 4; i++)
    466       val[i] = fui(vec4[i]);
    467 
    468    return alloc_imm_vec4u(c, ETNA_IMMEDIATE_CONSTANT, val);
    469 }
    470 
    471 /* Pass -- check register file declarations and immediates */
    472 static void
    473 etna_compile_parse_declarations(struct etna_compile *c)
    474 {
    475    struct tgsi_parse_context ctx = { };
    476    unsigned status = TGSI_PARSE_OK;
    477    status = tgsi_parse_init(&ctx, c->tokens);
    478    assert(status == TGSI_PARSE_OK);
    479 
    480    while (!tgsi_parse_end_of_tokens(&ctx)) {
    481       tgsi_parse_token(&ctx);
    482 
    483       switch (ctx.FullToken.Token.Type) {
    484       case TGSI_TOKEN_TYPE_IMMEDIATE: {
    485          /* immediates are handled differently from other files; they are
    486           * not declared explicitly, and always add four components */
    487          const struct tgsi_full_immediate *imm = &ctx.FullToken.FullImmediate;
    488          assert(c->imm_size <= (ETNA_MAX_IMM - 4));
    489 
    490          for (int i = 0; i < 4; ++i) {
    491             unsigned idx = c->imm_size++;
    492 
    493             c->imm_data[idx] = imm->u[i].Uint;
    494             c->imm_contents[idx] = ETNA_IMMEDIATE_CONSTANT;
    495          }
    496       }
    497       break;
    498       }
    499    }
    500 
    501    tgsi_parse_free(&ctx);
    502 }
    503 
    504 /* Allocate register declarations for the registers in all register files */
    505 static void
    506 etna_allocate_decls(struct etna_compile *c)
    507 {
    508    uint idx = 0;
    509 
    510    for (int x = 0; x < TGSI_FILE_COUNT; ++x) {
    511       c->file[x].reg = &c->decl[idx];
    512       c->file[x].reg_size = c->info.file_max[x] + 1;
    513 
    514       for (int sub = 0; sub < c->file[x].reg_size; ++sub) {
    515          c->decl[idx].file = x;
    516          c->decl[idx].idx = sub;
    517          idx++;
    518       }
    519    }
    520 
    521    c->total_decls = idx;
    522 }
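
/* Illustrative example: if c->info.file_max[TGSI_FILE_INPUT] == 2 and
 * file_max[TGSI_FILE_TEMPORARY] == 4, the INPUT file is given reg_size == 3
 * and the TEMPORARY file reg_size == 5, with their etna_reg_desc entries laid
 * out back to back in c->decl and each entry's file/idx filled in here. */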
    523 
    524 /* Pass -- check and record usage of temporaries, inputs, outputs */
    525 static void
    526 etna_compile_pass_check_usage(struct etna_compile *c)
    527 {
    528    struct tgsi_parse_context ctx = { };
    529    unsigned status = TGSI_PARSE_OK;
    530    status = tgsi_parse_init(&ctx, c->tokens);
    531    assert(status == TGSI_PARSE_OK);
    532 
    533    for (int idx = 0; idx < c->total_decls; ++idx) {
    534       c->decl[idx].active = false;
    535       c->decl[idx].first_use = c->decl[idx].last_use = -1;
    536    }
    537 
    538    int inst_idx = 0;
    539    while (!tgsi_parse_end_of_tokens(&ctx)) {
    540       tgsi_parse_token(&ctx);
     541       /* find out max register #s used
     542        * For every register, mark the first and last instruction index
     543        * where it is used; this allows finding ranges where a
     544        * temporary can be borrowed as an input and/or output register.
     545        *
     546        * XXX in the case of loops this needs special care, or may even
     547        * have to be disabled completely, as the last usage of a register
     548        * inside a loop means it can still be used on the next loop
     549        * iteration (execution is no longer chronological).
     550        * The register can only be declared "free" after the loop
     551        * finishes.
     552        *
     553        * Same for inputs: the first usage of a register inside a loop
     554        * does not mean that the register won't have been overwritten
     555        * in a previous iteration.
     556        * The register can only be declared free before the loop
     557        * starts.
     558        *
     559        * The proper way would be to do full dominator / post-dominator
     560        * analysis (especially with more complicated control flow
     561        * such as direct branch instructions), but not for now...
     562        */
    563       switch (ctx.FullToken.Token.Type) {
    564       case TGSI_TOKEN_TYPE_DECLARATION: {
    565          /* Declaration: fill in file details */
    566          const struct tgsi_full_declaration *decl = &ctx.FullToken.FullDeclaration;
    567          struct etna_compile_file *file = &c->file[decl->Declaration.File];
    568 
    569          for (int idx = decl->Range.First; idx <= decl->Range.Last; ++idx) {
    570             file->reg[idx].usage_mask = 0; // we'll compute this ourselves
    571             file->reg[idx].has_semantic = decl->Declaration.Semantic;
    572             file->reg[idx].semantic = decl->Semantic;
    573             file->reg[idx].interp = decl->Interp;
    574          }
    575       } break;
    576       case TGSI_TOKEN_TYPE_INSTRUCTION: {
    577          /* Instruction: iterate over operands of instruction */
    578          const struct tgsi_full_instruction *inst = &ctx.FullToken.FullInstruction;
    579 
    580          /* iterate over destination registers */
    581          for (int idx = 0; idx < inst->Instruction.NumDstRegs; ++idx) {
    582             struct etna_reg_desc *reg_desc = &c->file[inst->Dst[idx].Register.File].reg[inst->Dst[idx].Register.Index];
    583 
    584             if (reg_desc->first_use == -1)
    585                reg_desc->first_use = inst_idx;
    586 
    587             reg_desc->last_use = inst_idx;
    588             reg_desc->active = true;
    589          }
    590 
    591          /* iterate over source registers */
    592          for (int idx = 0; idx < inst->Instruction.NumSrcRegs; ++idx) {
    593             struct etna_reg_desc *reg_desc = &c->file[inst->Src[idx].Register.File].reg[inst->Src[idx].Register.Index];
    594 
    595             if (reg_desc->first_use == -1)
    596                reg_desc->first_use = inst_idx;
    597 
    598             reg_desc->last_use = inst_idx;
    599             reg_desc->active = true;
     600             /* accumulate usage mask for register; this is used
     601              * to determine how many varying slots
     602              * should be allocated */
    603             reg_desc->usage_mask |= tgsi_util_get_inst_usage_mask(inst, idx);
    604          }
    605          inst_idx += 1;
    606       } break;
    607       default:
    608          break;
    609       }
    610    }
    611 
    612    tgsi_parse_free(&ctx);
    613 }
    614 
    615 /* assign inputs that need to be assigned to specific registers */
    616 static void
    617 assign_special_inputs(struct etna_compile *c)
    618 {
    619    if (c->info.processor == PIPE_SHADER_FRAGMENT) {
    620       /* never assign t0 as it is the position output, start assigning at t1 */
    621       c->next_free_native = 1;
    622 
    623       /* hardwire TGSI_SEMANTIC_POSITION (input and output) to t0 */
    624       for (int idx = 0; idx < c->total_decls; ++idx) {
    625          struct etna_reg_desc *reg = &c->decl[idx];
    626 
    627          if (reg->active && reg->semantic.Name == TGSI_SEMANTIC_POSITION)
    628             reg->native = etna_native_temp(0);
    629       }
    630    }
    631 }
    632 
    633 /* Check that a move instruction does not swizzle any of the components
    634  * that it writes.
    635  */
    636 static bool
    637 etna_mov_check_no_swizzle(const struct tgsi_dst_register dst,
    638                           const struct tgsi_src_register src)
    639 {
    640    return (!(dst.WriteMask & TGSI_WRITEMASK_X) || src.SwizzleX == TGSI_SWIZZLE_X) &&
    641           (!(dst.WriteMask & TGSI_WRITEMASK_Y) || src.SwizzleY == TGSI_SWIZZLE_Y) &&
    642           (!(dst.WriteMask & TGSI_WRITEMASK_Z) || src.SwizzleZ == TGSI_SWIZZLE_Z) &&
    643           (!(dst.WriteMask & TGSI_WRITEMASK_W) || src.SwizzleW == TGSI_SWIZZLE_W);
    644 }
    645 
    646 /* Pass -- optimize outputs
     647  * Mesa tends to generate code like this at the end of its shaders:
    648  *   MOV OUT[1], TEMP[2]
    649  *   MOV OUT[0], TEMP[0]
    650  *   MOV OUT[2], TEMP[1]
    651  * Recognize if
    652  * a) there is only a single assignment to an output register and
    653  * b) the temporary is not used after that
    654  * Also recognize direct assignment of IN to OUT (passthrough)
    655  **/
    656 static void
    657 etna_compile_pass_optimize_outputs(struct etna_compile *c)
    658 {
    659    struct tgsi_parse_context ctx = { };
    660    int inst_idx = 0;
    661    unsigned status = TGSI_PARSE_OK;
    662    status = tgsi_parse_init(&ctx, c->tokens);
    663    assert(status == TGSI_PARSE_OK);
    664 
    665    while (!tgsi_parse_end_of_tokens(&ctx)) {
    666       tgsi_parse_token(&ctx);
    667 
    668       switch (ctx.FullToken.Token.Type) {
    669       case TGSI_TOKEN_TYPE_INSTRUCTION: {
    670          const struct tgsi_full_instruction *inst = &ctx.FullToken.FullInstruction;
    671 
    672          /* iterate over operands */
    673          switch (inst->Instruction.Opcode) {
    674          case TGSI_OPCODE_MOV: {
    675             /* We are only interested in eliminating MOVs which write to
    676              * the shader outputs. Test for this early. */
    677             if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
    678                break;
    679             /* Elimination of a MOV must have no visible effect on the
    680              * resulting shader: this means the MOV must not swizzle or
    681              * saturate, and its source must not have the negate or
    682              * absolute modifiers. */
    683             if (!etna_mov_check_no_swizzle(inst->Dst[0].Register, inst->Src[0].Register) ||
    684                 inst->Instruction.Saturate || inst->Src[0].Register.Negate ||
    685                 inst->Src[0].Register.Absolute)
    686                break;
    687 
    688             uint out_idx = inst->Dst[0].Register.Index;
    689             uint in_idx = inst->Src[0].Register.Index;
    690             /* assignment of temporary to output --
    691              * and the output doesn't yet have a native register assigned
    692              * and the last use of the temporary is this instruction
    693              * and the MOV does not do a swizzle
    694              */
    695             if (inst->Src[0].Register.File == TGSI_FILE_TEMPORARY &&
    696                 !c->file[TGSI_FILE_OUTPUT].reg[out_idx].native.valid &&
    697                 c->file[TGSI_FILE_TEMPORARY].reg[in_idx].last_use == inst_idx) {
    698                c->file[TGSI_FILE_OUTPUT].reg[out_idx].native =
    699                   c->file[TGSI_FILE_TEMPORARY].reg[in_idx].native;
    700                /* prevent temp from being re-used for the rest of the shader */
    701                c->file[TGSI_FILE_TEMPORARY].reg[in_idx].last_use = ETNA_MAX_TOKENS;
    702                /* mark this MOV instruction as a no-op */
    703                c->dead_inst[inst_idx] = true;
    704             }
     705             /* direct assignment of input to output --
     706              * if neither the input nor the output has a native register
     707              * assigned yet,
     708              * and the output is only used in this instruction,
     709              * and the MOV does not do a swizzle,
     710              * then allocate a new register and associate both the input
     711              * and the output with it
     712              */
    713             if (inst->Src[0].Register.File == TGSI_FILE_INPUT &&
    714                 !c->file[TGSI_FILE_INPUT].reg[in_idx].native.valid &&
    715                 !c->file[TGSI_FILE_OUTPUT].reg[out_idx].native.valid &&
    716                 c->file[TGSI_FILE_OUTPUT].reg[out_idx].last_use == inst_idx &&
    717                 c->file[TGSI_FILE_OUTPUT].reg[out_idx].first_use == inst_idx) {
    718                c->file[TGSI_FILE_OUTPUT].reg[out_idx].native =
    719                   c->file[TGSI_FILE_INPUT].reg[in_idx].native =
    720                      alloc_new_native_reg(c);
    721                /* mark this MOV instruction as a no-op */
    722                c->dead_inst[inst_idx] = true;
    723             }
    724          } break;
    725          default:;
    726          }
    727          inst_idx += 1;
    728       } break;
    729       }
    730    }
    731 
    732    tgsi_parse_free(&ctx);
    733 }
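
/* Illustrative effect (sketch): for a trailing "MOV OUT[0], TEMP[2]" where
 * TEMP[2] is not read afterwards, OUT[0] simply inherits TEMP[2]'s native
 * temporary, TEMP[2] is blocked from further reuse, and the MOV is marked in
 * c->dead_inst so no hardware instruction is emitted for it. */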
    734 
    735 /* Get a temporary to be used within one TGSI instruction.
    736  * The first time that this function is called the temporary will be allocated.
    737  * Each call to this function will return the same temporary.
    738  */
    739 static struct etna_native_reg
    740 etna_compile_get_inner_temp(struct etna_compile *c)
    741 {
    742    int inner_temp = c->inner_temps;
    743 
    744    if (inner_temp < ETNA_MAX_INNER_TEMPS) {
    745       if (!c->inner_temp[inner_temp].valid)
    746          c->inner_temp[inner_temp] = alloc_new_native_reg(c);
    747 
    748       /* alloc_new_native_reg() handles lack of registers */
    749       c->inner_temps += 1;
    750    } else {
    751       BUG("Too many inner temporaries (%i) requested in one instruction",
    752           inner_temp + 1);
    753    }
    754 
    755    return c->inner_temp[inner_temp];
    756 }
    757 
    758 static struct etna_inst_dst
    759 etna_native_to_dst(struct etna_native_reg native, unsigned comps)
    760 {
    761    /* Can only assign to temporaries */
    762    assert(native.valid && !native.is_tex && native.rgroup == INST_RGROUP_TEMP);
    763 
    764    struct etna_inst_dst rv = {
    765       .comps = comps,
    766       .use = 1,
    767       .reg = native.id,
    768    };
    769 
    770    return rv;
    771 }
    772 
    773 static struct etna_inst_src
    774 etna_native_to_src(struct etna_native_reg native, uint32_t swizzle)
    775 {
    776    assert(native.valid && !native.is_tex);
    777 
    778    struct etna_inst_src rv = {
    779       .use = 1,
    780       .swiz = swizzle,
    781       .rgroup = native.rgroup,
    782       .reg = native.id,
    783       .amode = INST_AMODE_DIRECT,
    784    };
    785 
    786    return rv;
    787 }
    788 
    789 static inline struct etna_inst_src
    790 negate(struct etna_inst_src src)
    791 {
    792    src.neg = !src.neg;
    793 
    794    return src;
    795 }
    796 
    797 static inline struct etna_inst_src
    798 absolute(struct etna_inst_src src)
    799 {
    800    src.abs = 1;
    801 
    802    return src;
    803 }
    804 
    805 static inline struct etna_inst_src
    806 swizzle(struct etna_inst_src src, unsigned swizzle)
    807 {
    808    src.swiz = inst_swiz_compose(src.swiz, swizzle);
    809 
    810    return src;
    811 }
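
/* Illustrative usage (sketch): these helpers compose on an etna_inst_src
 * value, e.g. negate(swizzle(src, SWIZZLE(X, X, X, X))) yields -src.xxxx,
 * with the broadcast folded into the source's existing swizzle. */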
    812 
    813 /* Emit instruction and append it to program */
    814 static void
    815 emit_inst(struct etna_compile *c, struct etna_inst *inst)
    816 {
    817    assert(c->inst_ptr <= ETNA_MAX_INSTRUCTIONS);
    818 
     819    /* Check for uniform conflicts (each instruction can only access
     820     * one uniform);
     821     * if a conflict is detected, use an intermediate temporary */
    822    unsigned uni_rgroup = -1;
    823    unsigned uni_reg = -1;
    824 
    825    for (int src = 0; src < ETNA_NUM_SRC; ++src) {
    826       if (etna_rgroup_is_uniform(inst->src[src].rgroup)) {
    827          if (uni_reg == -1) { /* first unique uniform used */
    828             uni_rgroup = inst->src[src].rgroup;
    829             uni_reg = inst->src[src].reg;
    830          } else { /* second or later; check that it is a re-use */
    831             if (uni_rgroup != inst->src[src].rgroup ||
    832                 uni_reg != inst->src[src].reg) {
    833                DBG_F(ETNA_DBG_COMPILER_MSGS, "perf warning: instruction that "
    834                                              "accesses different uniforms, "
    835                                              "need to generate extra MOV");
    836                struct etna_native_reg inner_temp = etna_compile_get_inner_temp(c);
    837 
    838                /* Generate move instruction to temporary */
    839                etna_assemble(&c->code[c->inst_ptr * 4], &(struct etna_inst) {
    840                   .opcode = INST_OPCODE_MOV,
    841                   .dst = etna_native_to_dst(inner_temp, INST_COMPS_X | INST_COMPS_Y |
    842                                                         INST_COMPS_Z | INST_COMPS_W),
    843                   .src[2] = inst->src[src]
    844                });
    845 
    846                c->inst_ptr++;
    847 
    848                /* Modify instruction to use temp register instead of uniform */
    849                inst->src[src].use = 1;
    850                inst->src[src].rgroup = INST_RGROUP_TEMP;
    851                inst->src[src].reg = inner_temp.id;
    852                inst->src[src].swiz = INST_SWIZ_IDENTITY; /* swizzling happens on MOV */
    853                inst->src[src].neg = 0; /* negation happens on MOV */
    854                inst->src[src].abs = 0; /* abs happens on MOV */
    855                inst->src[src].amode = 0; /* amode effects happen on MOV */
    856             }
    857          }
    858       }
    859    }
    860 
    861    /* Finally assemble the actual instruction */
    862    etna_assemble(&c->code[c->inst_ptr * 4], inst);
    863    c->inst_ptr++;
    864 }
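
/* Worked example (illustrative): an instruction such as
 *
 *    ADD t0.x___, u0.xxxx, void, u1.yyyy
 *
 * reads two different uniform registers, which the hardware cannot do in a
 * single instruction. emit_inst() therefore rewrites it to roughly
 *
 *    MOV tI, void, void, u1.yyyy
 *    ADD t0.x___, u0.xxxx, void, tI
 *
 * where tI is an inner temporary from etna_compile_get_inner_temp() and the
 * original swizzle/negate/absolute modifiers are applied by the MOV. */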
    865 
    866 static unsigned int
    867 etna_amode(struct tgsi_ind_register indirect)
    868 {
    869    assert(indirect.File == TGSI_FILE_ADDRESS);
    870    assert(indirect.Index == 0);
    871 
    872    switch (indirect.Swizzle) {
    873    case TGSI_SWIZZLE_X:
    874       return INST_AMODE_ADD_A_X;
    875    case TGSI_SWIZZLE_Y:
    876       return INST_AMODE_ADD_A_Y;
    877    case TGSI_SWIZZLE_Z:
    878       return INST_AMODE_ADD_A_Z;
    879    case TGSI_SWIZZLE_W:
    880       return INST_AMODE_ADD_A_W;
    881    default:
    882       assert(!"Invalid swizzle");
    883    }
    884 }
    885 
    886 /* convert destination operand */
    887 static struct etna_inst_dst
    888 convert_dst(struct etna_compile *c, const struct tgsi_full_dst_register *in)
    889 {
    890    struct etna_inst_dst rv = {
    891       /// XXX .amode
    892       .comps = in->Register.WriteMask,
    893    };
    894 
    895    if (in->Register.File == TGSI_FILE_ADDRESS) {
    896       assert(in->Register.Index == 0);
    897       rv.reg = in->Register.Index;
    898       rv.use = 0;
    899    } else {
    900       rv = etna_native_to_dst(etna_get_dst_reg(c, in->Register)->native,
    901                               in->Register.WriteMask);
    902    }
    903 
    904    if (in->Register.Indirect)
    905       rv.amode = etna_amode(in->Indirect);
    906 
    907    return rv;
    908 }
    909 
    910 /* convert texture operand */
    911 static struct etna_inst_tex
    912 convert_tex(struct etna_compile *c, const struct tgsi_full_src_register *in,
    913             const struct tgsi_instruction_texture *tex)
    914 {
    915    struct etna_native_reg native_reg = etna_get_src_reg(c, in->Register)->native;
    916    struct etna_inst_tex rv = {
    917       // XXX .amode (to allow for an array of samplers?)
    918       .swiz = INST_SWIZ_IDENTITY
    919    };
    920 
    921    assert(native_reg.is_tex && native_reg.valid);
    922    rv.id = native_reg.id;
    923 
    924    return rv;
    925 }
    926 
    927 /* convert source operand */
    928 static struct etna_inst_src
    929 etna_create_src(const struct tgsi_full_src_register *tgsi,
    930                 const struct etna_native_reg *native)
    931 {
    932    const struct tgsi_src_register *reg = &tgsi->Register;
    933    struct etna_inst_src rv = {
    934       .use = 1,
    935       .swiz = INST_SWIZ(reg->SwizzleX, reg->SwizzleY, reg->SwizzleZ, reg->SwizzleW),
    936       .neg = reg->Negate,
    937       .abs = reg->Absolute,
    938       .rgroup = native->rgroup,
    939       .reg = native->id,
    940       .amode = INST_AMODE_DIRECT,
    941    };
    942 
    943    assert(native->valid && !native->is_tex);
    944 
    945    if (reg->Indirect)
    946       rv.amode = etna_amode(tgsi->Indirect);
    947 
    948    return rv;
    949 }
    950 
    951 static struct etna_inst_src
    952 etna_mov_src_to_temp(struct etna_compile *c, struct etna_inst_src src,
    953                      struct etna_native_reg temp)
    954 {
    955    struct etna_inst mov = { };
    956 
    957    mov.opcode = INST_OPCODE_MOV;
    958    mov.sat = 0;
    959    mov.dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
    960                                       INST_COMPS_Z | INST_COMPS_W);
    961    mov.src[2] = src;
    962    emit_inst(c, &mov);
    963 
    964    src.swiz = INST_SWIZ_IDENTITY;
    965    src.neg = src.abs = 0;
    966    src.rgroup = temp.rgroup;
    967    src.reg = temp.id;
    968 
    969    return src;
    970 }
    971 
    972 static struct etna_inst_src
    973 etna_mov_src(struct etna_compile *c, struct etna_inst_src src)
    974 {
    975    struct etna_native_reg temp = etna_compile_get_inner_temp(c);
    976 
    977    return etna_mov_src_to_temp(c, src, temp);
    978 }
    979 
    980 static bool
    981 etna_src_uniforms_conflict(struct etna_inst_src a, struct etna_inst_src b)
    982 {
    983    return etna_rgroup_is_uniform(a.rgroup) &&
    984           etna_rgroup_is_uniform(b.rgroup) &&
    985           (a.rgroup != b.rgroup || a.reg != b.reg);
    986 }
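
/* Illustrative example: a source reading u0 and one reading u1 conflict
 * (different uniform registers), while two sources reading different
 * components of u0 do not, since the restriction is per register rather
 * than per component. */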
    987 
    988 /* create a new label */
    989 static struct etna_compile_label *
    990 alloc_new_label(struct etna_compile *c)
    991 {
    992    struct etna_compile_label label = {
     993       .inst_idx = -1, /* start by pointing to no specific instruction */
    994    };
    995 
    996    array_insert(c->labels, label);
    997 
    998    return &c->labels[c->labels_count - 1];
    999 }
   1000 
   1001 /* place label at current instruction pointer */
   1002 static void
   1003 label_place(struct etna_compile *c, struct etna_compile_label *label)
   1004 {
   1005    label->inst_idx = c->inst_ptr;
   1006 }
   1007 
    1008 /* mark label use at current instruction.
    1009  * The target of the label will be filled in to the marked
    1010  * instruction's src2.imm slot as soon as the value becomes
    1011  * known.
    1012  */
   1013 static void
   1014 label_mark_use(struct etna_compile *c, struct etna_compile_label *label)
   1015 {
   1016    assert(c->inst_ptr < ETNA_MAX_INSTRUCTIONS);
   1017    c->lbl_usage[c->inst_ptr] = label;
   1018 }
   1019 
   1020 /* walk the frame stack and return first frame with matching type */
   1021 static struct etna_compile_frame *
   1022 find_frame(struct etna_compile *c, enum etna_compile_frame_type type)
   1023 {
   1024    for (int sp = c->frame_sp; sp >= 0; sp--)
   1025       if (c->frame_stack[sp].type == type)
   1026          return &c->frame_stack[sp];
   1027 
   1028    assert(0);
   1029    return NULL;
   1030 }
   1031 
   1032 struct instr_translater {
   1033    void (*fxn)(const struct instr_translater *t, struct etna_compile *c,
   1034                const struct tgsi_full_instruction *inst,
   1035                struct etna_inst_src *src);
   1036    unsigned tgsi_opc;
   1037    uint8_t opc;
   1038 
   1039    /* tgsi src -> etna src swizzle */
   1040    int src[3];
   1041 
   1042    unsigned cond;
   1043 };
   1044 
   1045 static void
   1046 trans_instr(const struct instr_translater *t, struct etna_compile *c,
   1047             const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1048 {
   1049    const struct tgsi_opcode_info *info = tgsi_get_opcode_info(inst->Instruction.Opcode);
   1050    struct etna_inst instr = { };
   1051 
   1052    instr.opcode = t->opc;
   1053    instr.cond = t->cond;
   1054    instr.sat = inst->Instruction.Saturate;
   1055 
   1056    assert(info->num_dst <= 1);
   1057    if (info->num_dst)
   1058       instr.dst = convert_dst(c, &inst->Dst[0]);
   1059 
   1060    assert(info->num_src <= ETNA_NUM_SRC);
   1061 
   1062    for (unsigned i = 0; i < info->num_src; i++) {
   1063       int swizzle = t->src[i];
   1064 
   1065       assert(swizzle != -1);
   1066       instr.src[swizzle] = src[i];
   1067    }
   1068 
   1069    emit_inst(c, &instr);
   1070 }
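
/* Hypothetical table entry (illustrative sketch; the field values here are
 * assumptions, not taken from the original file): a simple ALU op whose TGSI
 * sources map one-to-one onto Vivante source slots could be described as
 *
 *    { .fxn = trans_instr, .tgsi_opc = TGSI_OPCODE_MUL,
 *      .opc = INST_OPCODE_MUL, .src = { 0, 1, -1 } },
 *
 * with -1 marking unused Vivante source slots. */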
   1071 
   1072 static void
   1073 trans_min_max(const struct instr_translater *t, struct etna_compile *c,
   1074               const struct tgsi_full_instruction *inst,
   1075               struct etna_inst_src *src)
   1076 {
   1077    emit_inst(c, &(struct etna_inst) {
   1078       .opcode = INST_OPCODE_SELECT,
   1079        .cond = t->cond,
   1080        .sat = inst->Instruction.Saturate,
   1081        .dst = convert_dst(c, &inst->Dst[0]),
   1082        .src[0] = src[0],
   1083        .src[1] = src[1],
   1084        .src[2] = src[0],
   1085     });
   1086 }
   1087 
   1088 static void
   1089 trans_if(const struct instr_translater *t, struct etna_compile *c,
   1090          const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1091 {
   1092    struct etna_compile_frame *f = &c->frame_stack[c->frame_sp++];
   1093    struct etna_inst_src imm_0 = alloc_imm_f32(c, 0.0f);
   1094 
   1095    /* push IF to stack */
   1096    f->type = ETNA_COMPILE_FRAME_IF;
   1097    /* create "else" label */
   1098    f->lbl_else = alloc_new_label(c);
   1099    f->lbl_endif = NULL;
   1100 
   1101    /* We need to avoid the emit_inst() below becoming two instructions */
   1102    if (etna_src_uniforms_conflict(src[0], imm_0))
   1103       src[0] = etna_mov_src(c, src[0]);
   1104 
   1105    /* mark position in instruction stream of label reference so that it can be
   1106     * filled in in next pass */
   1107    label_mark_use(c, f->lbl_else);
   1108 
   1109    /* create conditional branch to label if src0 EQ 0 */
   1110    emit_inst(c, &(struct etna_inst){
   1111       .opcode = INST_OPCODE_BRANCH,
   1112       .cond = INST_CONDITION_EQ,
   1113       .src[0] = src[0],
   1114       .src[1] = imm_0,
   1115     /* imm is filled in later */
   1116    });
   1117 }
   1118 
   1119 static void
   1120 trans_else(const struct instr_translater *t, struct etna_compile *c,
   1121            const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1122 {
   1123    assert(c->frame_sp > 0);
   1124    struct etna_compile_frame *f = &c->frame_stack[c->frame_sp - 1];
   1125    assert(f->type == ETNA_COMPILE_FRAME_IF);
   1126 
   1127    /* create "endif" label, and branch to endif label */
   1128    f->lbl_endif = alloc_new_label(c);
   1129    label_mark_use(c, f->lbl_endif);
   1130    emit_inst(c, &(struct etna_inst) {
   1131       .opcode = INST_OPCODE_BRANCH,
   1132       .cond = INST_CONDITION_TRUE,
   1133       /* imm is filled in later */
   1134    });
   1135 
   1136    /* mark "else" label at this position in instruction stream */
   1137    label_place(c, f->lbl_else);
   1138 }
   1139 
   1140 static void
   1141 trans_endif(const struct instr_translater *t, struct etna_compile *c,
   1142             const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1143 {
   1144    assert(c->frame_sp > 0);
   1145    struct etna_compile_frame *f = &c->frame_stack[--c->frame_sp];
   1146    assert(f->type == ETNA_COMPILE_FRAME_IF);
   1147 
   1148    /* assign "endif" or "else" (if no ELSE) label to current position in
   1149     * instruction stream, pop IF */
   1150    if (f->lbl_endif != NULL)
   1151       label_place(c, f->lbl_endif);
   1152    else
   1153       label_place(c, f->lbl_else);
   1154 }
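
/* Illustrative lowering (sketch) of a TGSI IF/ELSE/ENDIF block:
 *
 *       BRANCH.EQ src0, 0     ; -> lbl_else   (trans_if)
 *       ...then instructions...
 *       BRANCH.TRUE           ; -> lbl_endif  (trans_else)
 *    lbl_else:
 *       ...else instructions...
 *    lbl_endif:
 *
 * The branch targets are recorded with label_mark_use() and patched into the
 * branches' src2.imm once label_place() has fixed the instruction indices. */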
   1155 
   1156 static void
   1157 trans_loop_bgn(const struct instr_translater *t, struct etna_compile *c,
   1158                const struct tgsi_full_instruction *inst,
   1159                struct etna_inst_src *src)
   1160 {
   1161    struct etna_compile_frame *f = &c->frame_stack[c->frame_sp++];
   1162 
   1163    /* push LOOP to stack */
   1164    f->type = ETNA_COMPILE_FRAME_LOOP;
   1165    f->lbl_loop_bgn = alloc_new_label(c);
   1166    f->lbl_loop_end = alloc_new_label(c);
   1167 
   1168    label_place(c, f->lbl_loop_bgn);
   1169 }
   1170 
   1171 static void
   1172 trans_loop_end(const struct instr_translater *t, struct etna_compile *c,
   1173                const struct tgsi_full_instruction *inst,
   1174                struct etna_inst_src *src)
   1175 {
   1176    assert(c->frame_sp > 0);
   1177    struct etna_compile_frame *f = &c->frame_stack[--c->frame_sp];
   1178    assert(f->type == ETNA_COMPILE_FRAME_LOOP);
   1179 
   1180    /* mark position in instruction stream of label reference so that it can be
   1181     * filled in in next pass */
   1182    label_mark_use(c, f->lbl_loop_bgn);
   1183 
   1184    /* create branch to loop_bgn label */
   1185    emit_inst(c, &(struct etna_inst) {
   1186       .opcode = INST_OPCODE_BRANCH,
   1187       .cond = INST_CONDITION_TRUE,
   1188       .src[0] = src[0],
   1189       /* imm is filled in later */
   1190    });
   1191 
   1192    label_place(c, f->lbl_loop_end);
   1193 }
   1194 
   1195 static void
   1196 trans_brk(const struct instr_translater *t, struct etna_compile *c,
   1197           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1198 {
   1199    assert(c->frame_sp > 0);
   1200    struct etna_compile_frame *f = find_frame(c, ETNA_COMPILE_FRAME_LOOP);
   1201 
   1202    /* mark position in instruction stream of label reference so that it can be
   1203     * filled in in next pass */
   1204    label_mark_use(c, f->lbl_loop_end);
   1205 
   1206    /* create branch to loop_end label */
   1207    emit_inst(c, &(struct etna_inst) {
   1208       .opcode = INST_OPCODE_BRANCH,
   1209       .cond = INST_CONDITION_TRUE,
   1210       .src[0] = src[0],
   1211       /* imm is filled in later */
   1212    });
   1213 }
   1214 
   1215 static void
   1216 trans_cont(const struct instr_translater *t, struct etna_compile *c,
   1217            const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1218 {
   1219    assert(c->frame_sp > 0);
   1220    struct etna_compile_frame *f = find_frame(c, ETNA_COMPILE_FRAME_LOOP);
   1221 
   1222    /* mark position in instruction stream of label reference so that it can be
   1223     * filled in in next pass */
   1224    label_mark_use(c, f->lbl_loop_bgn);
   1225 
    1226    /* create branch back to loop_bgn label */
   1227    emit_inst(c, &(struct etna_inst) {
   1228       .opcode = INST_OPCODE_BRANCH,
   1229       .cond = INST_CONDITION_TRUE,
   1230       .src[0] = src[0],
   1231       /* imm is filled in later */
   1232    });
   1233 }
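
/* Illustrative lowering (sketch) of a TGSI BGNLOOP/ENDLOOP pair:
 *
 *    lbl_loop_bgn:
 *       ...loop body, where BRK branches to lbl_loop_end and CONT
 *          branches back to lbl_loop_bgn...
 *       BRANCH.TRUE           ; -> lbl_loop_bgn  (trans_loop_end)
 *    lbl_loop_end:
 */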
   1234 
   1235 static void
   1236 trans_deriv(const struct instr_translater *t, struct etna_compile *c,
   1237             const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1238 {
   1239    emit_inst(c, &(struct etna_inst) {
   1240       .opcode = t->opc,
   1241       .sat = inst->Instruction.Saturate,
   1242       .dst = convert_dst(c, &inst->Dst[0]),
   1243       .src[0] = src[0],
   1244       .src[2] = src[0],
   1245    });
   1246 }
   1247 
   1248 static void
   1249 trans_arl(const struct instr_translater *t, struct etna_compile *c,
   1250           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1251 {
   1252    struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1253    struct etna_inst arl = { };
   1254    struct etna_inst_dst dst;
   1255 
   1256    dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y | INST_COMPS_Z |
   1257                                   INST_COMPS_W);
   1258 
   1259    if (c->specs->has_sign_floor_ceil) {
   1260       struct etna_inst floor = { };
   1261 
   1262       floor.opcode = INST_OPCODE_FLOOR;
   1263       floor.src[2] = src[0];
   1264       floor.dst = dst;
   1265 
   1266       emit_inst(c, &floor);
   1267    } else {
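      /* No FLOOR instruction on this core: emulate floor(x) as x - frc(x),
       * using an FRC into the temporary followed by an ADD of the negated
       * fraction. */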
   1268       struct etna_inst floor[2] = { };
   1269 
   1270       floor[0].opcode = INST_OPCODE_FRC;
   1271       floor[0].sat = inst->Instruction.Saturate;
   1272       floor[0].dst = dst;
   1273       floor[0].src[2] = src[0];
   1274 
   1275       floor[1].opcode = INST_OPCODE_ADD;
   1276       floor[1].sat = inst->Instruction.Saturate;
   1277       floor[1].dst = dst;
   1278       floor[1].src[0] = src[0];
   1279       floor[1].src[2].use = 1;
   1280       floor[1].src[2].swiz = INST_SWIZ_IDENTITY;
   1281       floor[1].src[2].neg = 1;
   1282       floor[1].src[2].rgroup = temp.rgroup;
   1283       floor[1].src[2].reg = temp.id;
   1284 
   1285       emit_inst(c, &floor[0]);
   1286       emit_inst(c, &floor[1]);
   1287    }
   1288 
   1289    arl.opcode = INST_OPCODE_MOVAR;
   1290    arl.sat = inst->Instruction.Saturate;
   1291    arl.dst = convert_dst(c, &inst->Dst[0]);
   1292    arl.src[2] = etna_native_to_src(temp, INST_SWIZ_IDENTITY);
   1293 
   1294    emit_inst(c, &arl);
   1295 }
   1296 
   1297 static void
   1298 trans_lrp(const struct instr_translater *t, struct etna_compile *c,
   1299           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1300 {
   1301    /* dst = src0 * src1 + (1 - src0) * src2
   1302     *     => src0 * src1 - (src0 - 1) * src2
   1303     *     => src0 * src1 - (src0 * src2 - src2)
   1304     * MAD tTEMP.xyzw, tSRC0.xyzw, tSRC2.xyzw, -tSRC2.xyzw
   1305     * MAD tDST.xyzw, tSRC0.xyzw, tSRC1.xyzw, -tTEMP.xyzw
   1306     */
   1307    struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1308    if (etna_src_uniforms_conflict(src[0], src[1]) ||
   1309        etna_src_uniforms_conflict(src[0], src[2])) {
   1310       src[0] = etna_mov_src(c, src[0]);
   1311    }
   1312 
   1313    struct etna_inst mad[2] = { };
   1314    mad[0].opcode = INST_OPCODE_MAD;
   1315    mad[0].sat = 0;
   1316    mad[0].dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
   1317                                          INST_COMPS_Z | INST_COMPS_W);
   1318    mad[0].src[0] = src[0];
   1319    mad[0].src[1] = src[2];
   1320    mad[0].src[2] = negate(src[2]);
   1321    mad[1].opcode = INST_OPCODE_MAD;
   1322    mad[1].sat = inst->Instruction.Saturate;
   1323    mad[1].dst = convert_dst(c, &inst->Dst[0]), mad[1].src[0] = src[0];
   1324    mad[1].src[1] = src[1];
   1325    mad[1].src[2] = negate(etna_native_to_src(temp, INST_SWIZ_IDENTITY));
   1326 
   1327    emit_inst(c, &mad[0]);
   1328    emit_inst(c, &mad[1]);
   1329 }
   1330 
   1331 static void
   1332 trans_lit(const struct instr_translater *t, struct etna_compile *c,
   1333           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1334 {
   1335    /* SELECT.LT tmp._y__, 0, src.yyyy, 0
   1336     *  - can be eliminated if src.y is a uniform and >= 0
   1337     * SELECT.GT tmp.___w, 128, src.wwww, 128
   1338     * SELECT.LT tmp.___w, -128, tmp.wwww, -128
   1339     *  - can be eliminated if src.w is a uniform and fits clamp
   1340     * LOG tmp.x, void, void, tmp.yyyy
   1341     * MUL tmp.x, tmp.xxxx, tmp.wwww, void
   1342     * LITP dst, undef, src.xxxx, tmp.xxxx
   1343     */
   1344    struct etna_native_reg inner_temp = etna_compile_get_inner_temp(c);
   1345    struct etna_inst_src src_y = { };
   1346 
   1347    if (!etna_rgroup_is_uniform(src[0].rgroup)) {
   1348       src_y = etna_native_to_src(inner_temp, SWIZZLE(Y, Y, Y, Y));
   1349 
   1350       struct etna_inst ins = { };
   1351       ins.opcode = INST_OPCODE_SELECT;
   1352       ins.cond = INST_CONDITION_LT;
   1353       ins.dst = etna_native_to_dst(inner_temp, INST_COMPS_Y);
   1354       ins.src[0] = ins.src[2] = alloc_imm_f32(c, 0.0);
   1355       ins.src[1] = swizzle(src[0], SWIZZLE(Y, Y, Y, Y));
   1356       emit_inst(c, &ins);
   1357    } else if (uif(get_imm_u32(c, &src[0], 1)) < 0)
   1358       src_y = alloc_imm_f32(c, 0.0);
   1359    else
   1360       src_y = swizzle(src[0], SWIZZLE(Y, Y, Y, Y));
   1361 
   1362    struct etna_inst_src src_w = { };
   1363 
   1364    if (!etna_rgroup_is_uniform(src[0].rgroup)) {
   1365       src_w = etna_native_to_src(inner_temp, SWIZZLE(W, W, W, W));
   1366 
   1367       struct etna_inst ins = { };
   1368       ins.opcode = INST_OPCODE_SELECT;
   1369       ins.cond = INST_CONDITION_GT;
   1370       ins.dst = etna_native_to_dst(inner_temp, INST_COMPS_W);
   1371       ins.src[0] = ins.src[2] = alloc_imm_f32(c, 128.);
   1372       ins.src[1] = swizzle(src[0], SWIZZLE(W, W, W, W));
   1373       emit_inst(c, &ins);
   1374       ins.cond = INST_CONDITION_LT;
   1375       ins.src[0].neg = !ins.src[0].neg;
   1376       ins.src[2].neg = !ins.src[2].neg;
   1377       ins.src[1] = src_w;
   1378       emit_inst(c, &ins);
   1379    } else if (uif(get_imm_u32(c, &src[0], 3)) < -128.)
   1380       src_w = alloc_imm_f32(c, -128.);
   1381    else if (uif(get_imm_u32(c, &src[0], 3)) > 128.)
   1382       src_w = alloc_imm_f32(c, 128.);
   1383    else
   1384       src_w = swizzle(src[0], SWIZZLE(W, W, W, W));
   1385 
   1386    struct etna_inst ins[3] = { };
   1387    ins[0].opcode = INST_OPCODE_LOG;
   1388    ins[0].dst = etna_native_to_dst(inner_temp, INST_COMPS_X);
   1389    ins[0].src[2] = src_y;
   1390 
   1391    emit_inst(c, &ins[0]);
   1392    emit_inst(c, &(struct etna_inst) {
   1393       .opcode = INST_OPCODE_MUL,
   1394       .sat = 0,
   1395       .dst = etna_native_to_dst(inner_temp, INST_COMPS_X),
   1396       .src[0] = etna_native_to_src(inner_temp, SWIZZLE(X, X, X, X)),
   1397       .src[1] = src_w,
   1398    });
   1399    emit_inst(c, &(struct etna_inst) {
   1400       .opcode = INST_OPCODE_LITP,
   1401       .sat = 0,
   1402       .dst = convert_dst(c, &inst->Dst[0]),
   1403       .src[0] = swizzle(src[0], SWIZZLE(X, X, X, X)),
   1404       .src[1] = swizzle(src[0], SWIZZLE(X, X, X, X)),
   1405       .src[2] = etna_native_to_src(inner_temp, SWIZZLE(X, X, X, X)),
   1406    });
   1407 }
   1408 
   1409 static void
   1410 trans_ssg(const struct instr_translater *t, struct etna_compile *c,
   1411           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1412 {
   1413    if (c->specs->has_sign_floor_ceil) {
   1414       emit_inst(c, &(struct etna_inst){
   1415          .opcode = INST_OPCODE_SIGN,
   1416          .sat = inst->Instruction.Saturate,
   1417          .dst = convert_dst(c, &inst->Dst[0]),
   1418          .src[2] = src[0],
   1419       });
   1420    } else {
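      /* No SIGN instruction on this core: SET.NZ first makes temp equal
       * (src != 0) ? 1.0 : 0.0, then SELECT.LZ picks -temp for negative
       * src and temp otherwise, yielding -1/0/+1. */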
   1421       struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1422       struct etna_inst ins[2] = { };
   1423 
   1424       ins[0].opcode = INST_OPCODE_SET;
   1425       ins[0].cond = INST_CONDITION_NZ;
   1426       ins[0].dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
   1427                                             INST_COMPS_Z | INST_COMPS_W);
   1428       ins[0].src[0] = src[0];
   1429 
   1430       ins[1].opcode = INST_OPCODE_SELECT;
   1431       ins[1].cond = INST_CONDITION_LZ;
   1432       ins[1].sat = inst->Instruction.Saturate;
   1433       ins[1].dst = convert_dst(c, &inst->Dst[0]);
   1434       ins[1].src[0] = src[0];
   1435       ins[1].src[2] = etna_native_to_src(temp, INST_SWIZ_IDENTITY);
   1436       ins[1].src[1] = negate(ins[1].src[2]);
   1437 
   1438       emit_inst(c, &ins[0]);
   1439       emit_inst(c, &ins[1]);
   1440    }
   1441 }
   1442 
   1443 static void
   1444 trans_trig(const struct instr_translater *t, struct etna_compile *c,
   1445            const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1446 {
   1447    if (c->specs->has_new_sin_cos) { /* Alternative SIN/COS */
   1448       /* On newer chips alternative SIN/COS instructions are implemented,
   1449        * which:
   1450        * - Need their input scaled by 1/pi instead of 2/pi
   1451        * - Output an x and y component, which need to be multiplied to
   1452        *   get the result
   1453        */
   1454       /* TGSI lowering should deal with SCS */
   1455       assert(inst->Instruction.Opcode != TGSI_OPCODE_SCS);
   1456 
   1457       struct etna_native_reg temp = etna_compile_get_inner_temp(c); /* only using .xyz */
   1458       emit_inst(c, &(struct etna_inst) {
   1459          .opcode = INST_OPCODE_MUL,
   1460          .sat = 0,
   1461          .dst = etna_native_to_dst(temp, INST_COMPS_Z),
   1462          .src[0] = src[0], /* any swizzling happens here */
   1463          .src[1] = alloc_imm_f32(c, 1.0f / M_PI),
   1464       });
   1465       emit_inst(c, &(struct etna_inst) {
   1466          .opcode = inst->Instruction.Opcode == TGSI_OPCODE_COS
   1467                     ? INST_OPCODE_COS
   1468                     : INST_OPCODE_SIN,
   1469          .sat = 0,
   1470          .dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y),
   1471          .src[2] = etna_native_to_src(temp, SWIZZLE(Z, Z, Z, Z)),
   1472          .tex = { .amode=1 }, /* Unknown bit needs to be set */
   1473       });
   1474       emit_inst(c, &(struct etna_inst) {
   1475          .opcode = INST_OPCODE_MUL,
   1476          .sat = inst->Instruction.Saturate,
   1477          .dst = convert_dst(c, &inst->Dst[0]),
   1478          .src[0] = etna_native_to_src(temp, SWIZZLE(X, X, X, X)),
   1479          .src[1] = etna_native_to_src(temp, SWIZZLE(Y, Y, Y, Y)),
   1480       });
   1481 
   1482    } else if (c->specs->has_sin_cos_sqrt) {
   1483       /* TGSI lowering should deal with SCS */
   1484       assert(inst->Instruction.Opcode != TGSI_OPCODE_SCS);
   1485 
   1486       struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1487       /* add divide by PI/2, using a temp register. GC2000
   1488        * fails with src==dst for the trig instruction. */
   1489       emit_inst(c, &(struct etna_inst) {
   1490          .opcode = INST_OPCODE_MUL,
   1491          .sat = 0,
   1492          .dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
   1493                                          INST_COMPS_Z | INST_COMPS_W),
   1494          .src[0] = src[0], /* any swizzling happens here */
   1495          .src[1] = alloc_imm_f32(c, 2.0f / M_PI),
   1496       });
   1497       emit_inst(c, &(struct etna_inst) {
   1498          .opcode = inst->Instruction.Opcode == TGSI_OPCODE_COS
   1499                     ? INST_OPCODE_COS
   1500                     : INST_OPCODE_SIN,
   1501          .sat = inst->Instruction.Saturate,
   1502          .dst = convert_dst(c, &inst->Dst[0]),
   1503          .src[2] = etna_native_to_src(temp, INST_SWIZ_IDENTITY),
   1504       });
   1505    } else {
   1506       /* Implement Nick's fast sine/cosine. Taken from:
   1507        * http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
   1508        * A=(1/2*PI 0 1/2*PI 0) B=(0.75 0 0.5 0) C=(-4 4 X X)
   1509        *  MAD t.x_zw, src.xxxx, A, B
   1510        *  FRC t.x_z_, void, void, t.xwzw
   1511        *  MAD t.x_z_, t.xwzw, 2, -1
   1512        *  MUL t._y__, t.wzww, |t.wzww|, void  (for sin/scs)
   1513        *  DP3 t.x_z_, t.zyww, C, void         (for sin)
   1514        *  DP3 t.__z_, t.zyww, C, void         (for scs)
   1515        *  MUL t._y__, t.wxww, |t.wxww|, void  (for cos/scs)
   1516        *  DP3 t.x_z_, t.xyww, C, void         (for cos)
   1517        *  DP3 t.x___, t.xyww, C, void         (for scs)
   1518        *  MAD t._y_w, t.xxzz, |t.xxzz|, -t.xxzz
   1519        *  MAD dst, t.ywyw, .2225, t.xzxz
   1520        *
   1521        * TODO: we don't set dst.zw correctly for SCS.
   1522        */
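              /* Sketch of the math from the post above: with u the input angle
               * wrapped to [-pi, pi) and divided by pi, the parabola
               * s = 4*u*(1 - |u|) is a first approximation of sin(u*pi), refined
               * as s' = P*(s*|s| - s) + s with P = 0.2225 here. The MAD/FRC/MAD
               * sequence does the wrapping (the 0.75 offset selects the cosine
               * phase), the MUL/DP3 pair evaluates the parabola, and the final
               * two MADs apply the refinement.
               */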
   1523       struct etna_inst *p, ins[9] = { };
   1524       struct etna_native_reg t0 = etna_compile_get_inner_temp(c);
   1525       struct etna_inst_src t0s = etna_native_to_src(t0, INST_SWIZ_IDENTITY);
   1526       struct etna_inst_src sincos[3], in = src[0];
   1527       sincos[0] = etna_imm_vec4f(c, sincos_const[0]);
   1528       sincos[1] = etna_imm_vec4f(c, sincos_const[1]);
   1529 
   1530       /* A uniform source will cause the inner temp limit to
   1531        * be exceeded.  Explicitly deal with that scenario.
   1532        */
   1533       if (etna_rgroup_is_uniform(src[0].rgroup)) {
   1534          struct etna_inst ins = { };
   1535          ins.opcode = INST_OPCODE_MOV;
   1536          ins.dst = etna_native_to_dst(t0, INST_COMPS_X);
   1537          ins.src[2] = in;
   1538          emit_inst(c, &ins);
   1539          in = t0s;
   1540       }
   1541 
   1542       ins[0].opcode = INST_OPCODE_MAD;
   1543       ins[0].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z | INST_COMPS_W);
   1544       ins[0].src[0] = swizzle(in, SWIZZLE(X, X, X, X));
   1545       ins[0].src[1] = swizzle(sincos[1], SWIZZLE(X, W, X, W)); /* 1/2*PI */
   1546       ins[0].src[2] = swizzle(sincos[1], SWIZZLE(Y, W, Z, W)); /* 0.75, 0, 0.5, 0 */
   1547 
   1548       ins[1].opcode = INST_OPCODE_FRC;
   1549       ins[1].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z);
   1550       ins[1].src[2] = swizzle(t0s, SWIZZLE(X, W, Z, W));
   1551 
   1552       ins[2].opcode = INST_OPCODE_MAD;
   1553       ins[2].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z);
   1554       ins[2].src[0] = swizzle(t0s, SWIZZLE(X, W, Z, W));
   1555       ins[2].src[1] = swizzle(sincos[0], SWIZZLE(X, X, X, X)); /* 2 */
   1556       ins[2].src[2] = swizzle(sincos[0], SWIZZLE(Y, Y, Y, Y)); /* -1 */
   1557 
   1558       unsigned mul_swiz, dp3_swiz;
   1559       if (inst->Instruction.Opcode == TGSI_OPCODE_SIN) {
   1560          mul_swiz = SWIZZLE(W, Z, W, W);
   1561          dp3_swiz = SWIZZLE(Z, Y, W, W);
   1562       } else {
   1563          mul_swiz = SWIZZLE(W, X, W, W);
   1564          dp3_swiz = SWIZZLE(X, Y, W, W);
   1565       }
   1566 
   1567       ins[3].opcode = INST_OPCODE_MUL;
   1568       ins[3].dst = etna_native_to_dst(t0, INST_COMPS_Y);
   1569       ins[3].src[0] = swizzle(t0s, mul_swiz);
   1570       ins[3].src[1] = absolute(ins[3].src[0]);
   1571 
   1572       ins[4].opcode = INST_OPCODE_DP3;
   1573       ins[4].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z);
   1574       ins[4].src[0] = swizzle(t0s, dp3_swiz);
   1575       ins[4].src[1] = swizzle(sincos[0], SWIZZLE(Z, W, W, W));
   1576 
   1577       if (inst->Instruction.Opcode == TGSI_OPCODE_SCS) {
   1578          ins[5] = ins[3];
   1579          ins[6] = ins[4];
   1580          ins[4].dst.comps = INST_COMPS_X;
   1581          ins[6].dst.comps = INST_COMPS_Z;
   1582          ins[5].src[0] = swizzle(t0s, SWIZZLE(W, Z, W, W));
   1583          ins[6].src[0] = swizzle(t0s, SWIZZLE(Z, Y, W, W));
   1584          ins[5].src[1] = absolute(ins[5].src[0]);
   1585          p = &ins[7];
   1586       } else {
   1587          p = &ins[5];
   1588       }
   1589 
   1590       p->opcode = INST_OPCODE_MAD;
   1591       p->dst = etna_native_to_dst(t0, INST_COMPS_Y | INST_COMPS_W);
   1592       p->src[0] = swizzle(t0s, SWIZZLE(X, X, Z, Z));
   1593       p->src[1] = absolute(p->src[0]);
   1594       p->src[2] = negate(p->src[0]);
   1595 
   1596       p++;
   1597       p->opcode = INST_OPCODE_MAD;
   1598       p->sat = inst->Instruction.Saturate;
   1599       p->dst = convert_dst(c, &inst->Dst[0]);
   1600       p->src[0] = swizzle(t0s, SWIZZLE(Y, W, Y, W));
   1601       p->src[1] = alloc_imm_f32(c, 0.2225);
   1602       p->src[2] = swizzle(t0s, SWIZZLE(X, Z, X, Z));
   1603 
   1604       for (int i = 0; &ins[i] <= p; i++)
   1605          emit_inst(c, &ins[i]);
   1606    }
   1607 }
   1608 
   1609 static void
   1610 trans_dph(const struct instr_translater *t, struct etna_compile *c,
   1611           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1612 {
   1613    /*
   1614    DP3 tmp.xyzw, src0.xyzw, src1.xyzw, void
   1615    ADD dst.xyzw, tmp.xyzw, void, src1.wwww
   1616    */
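           /* i.e. TGSI DPH: dst = src0.x*src1.x + src0.y*src1.y + src0.z*src1.z
            * + src1.w, a homogeneous dot product with src0.w taken as 1.0.
            */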
   1617    struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1618    struct etna_inst ins[2] = { };
   1619 
   1620    ins[0].opcode = INST_OPCODE_DP3;
   1621    ins[0].dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
   1622                                          INST_COMPS_Z | INST_COMPS_W);
   1623    ins[0].src[0] = src[0];
   1624    ins[0].src[1] = src[1];
   1625 
   1626    ins[1].opcode = INST_OPCODE_ADD;
   1627    ins[1].sat = inst->Instruction.Saturate;
   1628    ins[1].dst = convert_dst(c, &inst->Dst[0]);
   1629    ins[1].src[0] = etna_native_to_src(temp, INST_SWIZ_IDENTITY);
   1630    ins[1].src[2] = swizzle(src[1], SWIZZLE(W, W, W, W));
   1631 
   1632    emit_inst(c, &ins[0]);
   1633    emit_inst(c, &ins[1]);
   1634 }
   1635 
   1636 static void
   1637 trans_sampler(const struct instr_translater *t, struct etna_compile *c,
   1638               const struct tgsi_full_instruction *inst,
   1639               struct etna_inst_src *src)
   1640 {
   1641    /* There is no native support for GL texture rectangle coordinates, so
   1642     * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, 1]). */
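           /* The rescale below multiplies the coordinate by the per-sampler
            * ETNA_IMMEDIATE_TEXRECT_SCALE_{X,Y} immediates, which presumably
            * hold 1/width and 1/height of the texture bound to this unit and
            * get patched in when uniforms are uploaded.
            */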
   1643    if (inst->Texture.Texture == TGSI_TEXTURE_RECT) {
   1644       uint32_t unit = inst->Src[1].Register.Index;
   1645       struct etna_inst ins[2] = { };
   1646       struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1647 
   1648       ins[0].opcode = INST_OPCODE_MUL;
   1649       ins[0].dst = etna_native_to_dst(temp, INST_COMPS_X);
   1650       ins[0].src[0] = src[0];
   1651       ins[0].src[1] = alloc_imm(c, ETNA_IMMEDIATE_TEXRECT_SCALE_X, unit);
   1652 
   1653       ins[1].opcode = INST_OPCODE_MUL;
   1654       ins[1].dst = etna_native_to_dst(temp, INST_COMPS_Y);
   1655       ins[1].src[0] = src[0];
   1656       ins[1].src[1] = alloc_imm(c, ETNA_IMMEDIATE_TEXRECT_SCALE_Y, unit);
   1657 
   1658       emit_inst(c, &ins[0]);
   1659       emit_inst(c, &ins[1]);
   1660 
   1661       src[0] = etna_native_to_src(temp, INST_SWIZ_IDENTITY); /* temp.xyzw */
   1662    }
   1663 
   1664    switch (inst->Instruction.Opcode) {
   1665    case TGSI_OPCODE_TEX:
   1666       emit_inst(c, &(struct etna_inst) {
   1667          .opcode = INST_OPCODE_TEXLD,
   1668          .sat = 0,
   1669          .dst = convert_dst(c, &inst->Dst[0]),
   1670          .tex = convert_tex(c, &inst->Src[1], &inst->Texture),
   1671          .src[0] = src[0],
   1672       });
   1673       break;
   1674 
   1675    case TGSI_OPCODE_TXB:
   1676       emit_inst(c, &(struct etna_inst) {
   1677          .opcode = INST_OPCODE_TEXLDB,
   1678          .sat = 0,
   1679          .dst = convert_dst(c, &inst->Dst[0]),
   1680          .tex = convert_tex(c, &inst->Src[1], &inst->Texture),
   1681          .src[0] = src[0],
   1682       });
   1683       break;
   1684 
   1685    case TGSI_OPCODE_TXL:
   1686       emit_inst(c, &(struct etna_inst) {
   1687          .opcode = INST_OPCODE_TEXLDL,
   1688          .sat = 0,
   1689          .dst = convert_dst(c, &inst->Dst[0]),
   1690          .tex = convert_tex(c, &inst->Src[1], &inst->Texture),
   1691          .src[0] = src[0],
   1692       });
   1693       break;
   1694 
   1695    case TGSI_OPCODE_TXP: { /* divide src.xyz by src.w */
   1696       struct etna_native_reg temp = etna_compile_get_inner_temp(c);
   1697 
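              /* Projective texturing: coord' = src.xyz / src.w, done as
               * tmp.w = rcp(src.w), tmp.xyz = src.xyz * tmp.w, followed by a
               * regular TEXLD on tmp.
               */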
   1698       emit_inst(c, &(struct etna_inst) {
   1699          .opcode = INST_OPCODE_RCP,
   1700          .sat = 0,
   1701          .dst = etna_native_to_dst(temp, INST_COMPS_W), /* tmp.w */
   1702          .src[2] = swizzle(src[0], SWIZZLE(W, W, W, W)),
   1703       });
   1704       emit_inst(c, &(struct etna_inst) {
   1705          .opcode = INST_OPCODE_MUL,
   1706          .sat = 0,
   1707          .dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
   1708                                          INST_COMPS_Z), /* tmp.xyz */
   1709          .src[0] = etna_native_to_src(temp, SWIZZLE(W, W, W, W)),
   1710          .src[1] = src[0], /* src.xyzw */
   1711       });
   1712       emit_inst(c, &(struct etna_inst) {
   1713          .opcode = INST_OPCODE_TEXLD,
   1714          .sat = 0,
   1715          .dst = convert_dst(c, &inst->Dst[0]),
   1716          .tex = convert_tex(c, &inst->Src[1], &inst->Texture),
   1717          .src[0] = etna_native_to_src(temp, INST_SWIZ_IDENTITY), /* tmp.xyzw */
   1718       });
   1719    } break;
   1720 
   1721    default:
   1722       BUG("Unhandled instruction %s",
   1723           tgsi_get_opcode_name(inst->Instruction.Opcode));
   1724       assert(0);
   1725       break;
   1726    }
   1727 }
   1728 
   1729 static void
   1730 trans_dummy(const struct instr_translater *t, struct etna_compile *c,
   1731             const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
   1732 {
   1733    /* nothing to do */
   1734 }
   1735 
   1736 static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
   1737 #define INSTR(n, f, ...) \
   1738    [TGSI_OPCODE_##n] = {.fxn = (f), .tgsi_opc = TGSI_OPCODE_##n, ##__VA_ARGS__}
   1739 
   1740    INSTR(MOV, trans_instr, .opc = INST_OPCODE_MOV, .src = {2, -1, -1}),
   1741    INSTR(RCP, trans_instr, .opc = INST_OPCODE_RCP, .src = {2, -1, -1}),
   1742    INSTR(RSQ, trans_instr, .opc = INST_OPCODE_RSQ, .src = {2, -1, -1}),
   1743    INSTR(MUL, trans_instr, .opc = INST_OPCODE_MUL, .src = {0, 1, -1}),
   1744    INSTR(ADD, trans_instr, .opc = INST_OPCODE_ADD, .src = {0, 2, -1}),
   1745    INSTR(DP3, trans_instr, .opc = INST_OPCODE_DP3, .src = {0, 1, -1}),
   1746    INSTR(DP4, trans_instr, .opc = INST_OPCODE_DP4, .src = {0, 1, -1}),
   1747    INSTR(DST, trans_instr, .opc = INST_OPCODE_DST, .src = {0, 1, -1}),
   1748    INSTR(MAD, trans_instr, .opc = INST_OPCODE_MAD, .src = {0, 1, 2}),
   1749    INSTR(EX2, trans_instr, .opc = INST_OPCODE_EXP, .src = {2, -1, -1}),
   1750    INSTR(LG2, trans_instr, .opc = INST_OPCODE_LOG, .src = {2, -1, -1}),
   1751    INSTR(SQRT, trans_instr, .opc = INST_OPCODE_SQRT, .src = {2, -1, -1}),
   1752    INSTR(FRC, trans_instr, .opc = INST_OPCODE_FRC, .src = {2, -1, -1}),
   1753    INSTR(CEIL, trans_instr, .opc = INST_OPCODE_CEIL, .src = {2, -1, -1}),
   1754    INSTR(FLR, trans_instr, .opc = INST_OPCODE_FLOOR, .src = {2, -1, -1}),
   1755    INSTR(CMP, trans_instr, .opc = INST_OPCODE_SELECT, .src = {0, 1, 2}, .cond = INST_CONDITION_LZ),
   1756 
   1757    INSTR(KILL, trans_instr, .opc = INST_OPCODE_TEXKILL),
   1758    INSTR(KILL_IF, trans_instr, .opc = INST_OPCODE_TEXKILL, .src = {0, -1, -1}, .cond = INST_CONDITION_LZ),
   1759 
   1760    INSTR(DDX, trans_deriv, .opc = INST_OPCODE_DSX),
   1761    INSTR(DDY, trans_deriv, .opc = INST_OPCODE_DSY),
   1762 
   1763    INSTR(IF, trans_if),
   1764    INSTR(ELSE, trans_else),
   1765    INSTR(ENDIF, trans_endif),
   1766 
   1767    INSTR(BGNLOOP, trans_loop_bgn),
   1768    INSTR(ENDLOOP, trans_loop_end),
   1769    INSTR(BRK, trans_brk),
   1770    INSTR(CONT, trans_cont),
   1771 
   1772    INSTR(MIN, trans_min_max, .opc = INST_OPCODE_SELECT, .cond = INST_CONDITION_GT),
   1773    INSTR(MAX, trans_min_max, .opc = INST_OPCODE_SELECT, .cond = INST_CONDITION_LT),
   1774 
   1775    INSTR(ARL, trans_arl),
   1776    INSTR(LRP, trans_lrp),
   1777    INSTR(LIT, trans_lit),
   1778    INSTR(SSG, trans_ssg),
   1779    INSTR(DPH, trans_dph),
   1780 
   1781    INSTR(SIN, trans_trig),
   1782    INSTR(COS, trans_trig),
   1783    INSTR(SCS, trans_trig),
   1784 
   1785    INSTR(SLT, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_LT),
   1786    INSTR(SGE, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_GE),
   1787    INSTR(SEQ, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_EQ),
   1788    INSTR(SGT, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_GT),
   1789    INSTR(SLE, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_LE),
   1790    INSTR(SNE, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_NE),
   1791 
   1792    INSTR(TEX, trans_sampler),
   1793    INSTR(TXB, trans_sampler),
   1794    INSTR(TXL, trans_sampler),
   1795    INSTR(TXP, trans_sampler),
   1796 
   1797    INSTR(NOP, trans_dummy),
   1798    INSTR(END, trans_dummy),
   1799 };
   1800 
   1801 /* Pass -- compile instructions */
   1802 static void
   1803 etna_compile_pass_generate_code(struct etna_compile *c)
   1804 {
   1805    struct tgsi_parse_context ctx = { };
   1806    unsigned status = tgsi_parse_init(&ctx, c->tokens);
   1807    assert(status == TGSI_PARSE_OK);
   1808 
   1809    int inst_idx = 0;
   1810    while (!tgsi_parse_end_of_tokens(&ctx)) {
   1811       const struct tgsi_full_instruction *inst = 0;
   1812 
   1813       /* No inner temps used yet for this instruction, clear counter */
   1814       c->inner_temps = 0;
   1815 
   1816       tgsi_parse_token(&ctx);
   1817 
   1818       switch (ctx.FullToken.Token.Type) {
   1819       case TGSI_TOKEN_TYPE_INSTRUCTION:
   1820          /* iterate over operands */
   1821          inst = &ctx.FullToken.FullInstruction;
   1822          if (c->dead_inst[inst_idx]) { /* skip dead instructions */
   1823             inst_idx++;
   1824             continue;
   1825          }
   1826 
   1827          /* Lookup the TGSI information and generate the source arguments */
   1828          struct etna_inst_src src[ETNA_NUM_SRC];
   1829          memset(src, 0, sizeof(src));
   1830 
   1831          const struct tgsi_opcode_info *tgsi = tgsi_get_opcode_info(inst->Instruction.Opcode);
   1832 
   1833          for (int i = 0; i < tgsi->num_src && i < ETNA_NUM_SRC; i++) {
   1834             const struct tgsi_full_src_register *reg = &inst->Src[i];
   1835             const struct etna_native_reg *n = &etna_get_src_reg(c, reg->Register)->native;
   1836 
   1837             if (!n->valid || n->is_tex)
   1838                continue;
   1839 
   1840             src[i] = etna_create_src(reg, n);
   1841          }
   1842 
   1843          const unsigned opc = inst->Instruction.Opcode;
   1844          const struct instr_translater *t = &translaters[opc];
   1845 
   1846          if (t->fxn) {
   1847             t->fxn(t, c, inst, src);
   1848 
   1849             inst_idx += 1;
   1850          } else {
   1851             BUG("Unhandled instruction %s", tgsi_get_opcode_name(opc));
   1852             assert(0);
   1853          }
   1854          break;
   1855       }
   1856    }
   1857    tgsi_parse_free(&ctx);
   1858 }
   1859 
   1860 /* Look up register by semantic */
   1861 static struct etna_reg_desc *
   1862 find_decl_by_semantic(struct etna_compile *c, uint file, uint name, uint index)
   1863 {
   1864    for (int idx = 0; idx < c->file[file].reg_size; ++idx) {
   1865       struct etna_reg_desc *reg = &c->file[file].reg[idx];
   1866 
   1867       if (reg->semantic.Name == name && reg->semantic.Index == index)
   1868          return reg;
   1869    }
   1870 
   1871    return NULL; /* not found */
   1872 }
   1873 
   1874 /** Add ADD and MUL instructions to bring Z/W from -1..1 to 0..1 if:
   1875  * - this is a vertex shader
   1876  * - and this is an older GPU
   1877  */
   1878 static void
   1879 etna_compile_add_z_div_if_needed(struct etna_compile *c)
   1880 {
   1881    if (c->info.processor == PIPE_SHADER_VERTEX && c->specs->vs_need_z_div) {
   1882       /* find position out */
   1883       struct etna_reg_desc *pos_reg =
   1884          find_decl_by_semantic(c, TGSI_FILE_OUTPUT, TGSI_SEMANTIC_POSITION, 0);
   1885 
   1886       if (pos_reg != NULL) {
   1887          /*
   1888           * ADD tX.__z_, tX.zzzz, void, tX.wwww
   1889           * MUL tX.__z_, tX.zzzz, 0.5, void
   1890          */
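                 /* i.e. z' = (z + w) / 2, which after the perspective divide
                  * maps the GL-style clip z/w range of [-1, 1] onto the
                  * [0, 1] range these older GPUs appear to expect.
                  */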
   1891          emit_inst(c, &(struct etna_inst) {
   1892             .opcode = INST_OPCODE_ADD,
   1893             .dst = etna_native_to_dst(pos_reg->native, INST_COMPS_Z),
   1894             .src[0] = etna_native_to_src(pos_reg->native, SWIZZLE(Z, Z, Z, Z)),
   1895             .src[2] = etna_native_to_src(pos_reg->native, SWIZZLE(W, W, W, W)),
   1896          });
   1897          emit_inst(c, &(struct etna_inst) {
   1898             .opcode = INST_OPCODE_MUL,
   1899             .dst = etna_native_to_dst(pos_reg->native, INST_COMPS_Z),
   1900             .src[0] = etna_native_to_src(pos_reg->native, SWIZZLE(Z, Z, Z, Z)),
   1901             .src[1] = alloc_imm_f32(c, 0.5f),
   1902          });
   1903       }
   1904    }
   1905 }
   1906 
   1907 /** add a NOP to the shader if
   1908  * a) the shader is empty
   1909  * or
   1910  * b) there is a label at the end of the shader
   1911  */
   1912 static void
   1913 etna_compile_add_nop_if_needed(struct etna_compile *c)
   1914 {
   1915    bool label_at_last_inst = false;
   1916 
   1917    for (int idx = 0; idx < c->labels_count; ++idx) {
   1918       if (c->labels[idx].inst_idx == c->inst_ptr)
   1919          label_at_last_inst = true;
   1920 
   1921    }
   1922 
   1923    if (c->inst_ptr == 0 || label_at_last_inst)
   1924       emit_inst(c, &(struct etna_inst){.opcode = INST_OPCODE_NOP});
   1925 }
   1926 
   1927 static void
   1928 assign_uniforms(struct etna_compile_file *file, unsigned base)
   1929 {
   1930    for (int idx = 0; idx < file->reg_size; ++idx) {
   1931       file->reg[idx].native.valid = 1;
   1932       file->reg[idx].native.rgroup = INST_RGROUP_UNIFORM_0;
   1933       file->reg[idx].native.id = base + idx;
   1934    }
   1935 }
   1936 
   1937 /* Allocate CONST and IMM to native ETNA_RGROUP_UNIFORM(x).
   1938  * CONST must be consecutive, as const buffers are supposed to be
   1939  * consecutive, and must come before IMM. This is more convenient
   1940  * because it is possible for the compilation process itself to
   1941  * generate extra immediates for constants such as pi, one and
   1942  * zero.
   1943  */
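        /* For example (numbers chosen purely for illustration): with three
         * CONST vec4 registers, CONST maps to u0..u2, imm_base becomes 12
         * scalar slots, and the immediates start at u3.
         */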
   1944 static void
   1945 assign_constants_and_immediates(struct etna_compile *c)
   1946 {
   1947    assign_uniforms(&c->file[TGSI_FILE_CONSTANT], 0);
   1948    /* immediates start after the constants */
   1949    c->imm_base = c->file[TGSI_FILE_CONSTANT].reg_size * 4;
   1950    assign_uniforms(&c->file[TGSI_FILE_IMMEDIATE], c->imm_base / 4);
   1951    DBG_F(ETNA_DBG_COMPILER_MSGS, "imm base: %i size: %i", c->imm_base,
   1952          c->imm_size);
   1953 }
   1954 
   1955 /* Assign declared samplers to native texture units */
   1956 static void
   1957 assign_texture_units(struct etna_compile *c)
   1958 {
   1959    uint tex_base = 0;
   1960 
   1961    if (c->info.processor == PIPE_SHADER_VERTEX)
   1962       tex_base = c->specs->vertex_sampler_offset;
   1963 
   1964    for (int idx = 0; idx < c->file[TGSI_FILE_SAMPLER].reg_size; ++idx) {
   1965       c->file[TGSI_FILE_SAMPLER].reg[idx].native.valid = 1;
   1966       c->file[TGSI_FILE_SAMPLER].reg[idx].native.is_tex = 1; // overrides rgroup
   1967       c->file[TGSI_FILE_SAMPLER].reg[idx].native.id = tex_base + idx;
   1968    }
   1969 }
   1970 
   1971 /* Additional pass to fill in branch targets. This pass should be last
   1972  * as no instructions can be reordered, removed or added anymore
   1973  * once the branch targets are computed.
   1974  */
   1975 static void
   1976 etna_compile_fill_in_labels(struct etna_compile *c)
   1977 {
   1978    for (int idx = 0; idx < c->inst_ptr; ++idx) {
   1979       if (c->lbl_usage[idx])
   1980          etna_assemble_set_imm(&c->code[idx * 4], c->lbl_usage[idx]->inst_idx);
   1981    }
   1982 }
   1983 
   1984 /* compare two etna_native_reg structures, return true if equal */
   1985 static bool
   1986 cmp_etna_native_reg(const struct etna_native_reg to,
   1987                     const struct etna_native_reg from)
   1988 {
   1989    return to.valid == from.valid && to.is_tex == from.is_tex &&
   1990           to.rgroup == from.rgroup && to.id == from.id;
   1991 }
   1992 
   1993 /* go through all declarations and swap native registers *to* and *from* */
   1994 static void
   1995 swap_native_registers(struct etna_compile *c, const struct etna_native_reg to,
   1996                       const struct etna_native_reg from)
   1997 {
   1998    if (cmp_etna_native_reg(from, to))
   1999       return; /* Nothing to do */
   2000 
   2001    for (int idx = 0; idx < c->total_decls; ++idx) {
   2002       if (cmp_etna_native_reg(c->decl[idx].native, from)) {
   2003          c->decl[idx].native = to;
   2004       } else if (cmp_etna_native_reg(c->decl[idx].native, to)) {
   2005          c->decl[idx].native = from;
   2006       }
   2007    }
   2008 }
   2009 
   2010 /* For PS we need to permute so that inputs are always in temporary 0..N-1.
   2011  * Semantic POS is always t0. If that semantic is not used, avoid t0.
   2012  */
   2013 static void
   2014 permute_ps_inputs(struct etna_compile *c)
   2015 {
   2016    /* Special inputs:
   2017     * gl_FragCoord  VARYING_SLOT_POS   TGSI_SEMANTIC_POSITION
   2018     * gl_PointCoord VARYING_SLOT_PNTC  TGSI_SEMANTIC_PCOORD
   2019     */
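           /* Illustrative example: with active inputs POSITION, GENERIC[0] and
            * GENERIC[1], the generics end up in t1 and t2, num_varyings becomes
            * 2, and t0 stays reserved for POSITION.
            */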
   2020    uint native_idx = 1;
   2021 
   2022    for (int idx = 0; idx < c->file[TGSI_FILE_INPUT].reg_size; ++idx) {
   2023       struct etna_reg_desc *reg = &c->file[TGSI_FILE_INPUT].reg[idx];
   2024       uint input_id;
   2025       assert(reg->has_semantic);
   2026 
   2027       if (!reg->active || reg->semantic.Name == TGSI_SEMANTIC_POSITION)
   2028          continue;
   2029 
   2030       input_id = native_idx++;
   2031       swap_native_registers(c, etna_native_temp(input_id),
   2032                             c->file[TGSI_FILE_INPUT].reg[idx].native);
   2033    }
   2034 
   2035    c->num_varyings = native_idx - 1;
   2036 
   2037    if (native_idx > c->next_free_native)
   2038       c->next_free_native = native_idx;
   2039 }
   2040 
   2041 /* fill in ps inputs into shader object */
   2042 static void
   2043 fill_in_ps_inputs(struct etna_shader *sobj, struct etna_compile *c)
   2044 {
   2045    struct etna_shader_io_file *sf = &sobj->infile;
   2046 
   2047    sf->num_reg = 0;
   2048 
   2049    for (int idx = 0; idx < c->file[TGSI_FILE_INPUT].reg_size; ++idx) {
   2050       struct etna_reg_desc *reg = &c->file[TGSI_FILE_INPUT].reg[idx];
   2051 
   2052       if (reg->native.id > 0) {
   2053          assert(sf->num_reg < ETNA_NUM_INPUTS);
   2054          sf->reg[sf->num_reg].reg = reg->native.id;
   2055          sf->reg[sf->num_reg].semantic = reg->semantic;
   2056          /* convert usage mask to number of components (*=wildcard)
   2057           *   .r    (0..1)  -> 1 component
   2058           *   .*g   (2..3)  -> 2 components
   2059           *   .**b  (4..7)  -> 3 components
   2060           *   .***a (8..15) -> 4 components
   2061           */
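                 /* e.g. usage_mask 0x5 (.r and .b read) gives
                  * util_last_bit() == 3, so three components are counted even
                  * though .g itself is unused.
                  */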
   2062          sf->reg[sf->num_reg].num_components = util_last_bit(reg->usage_mask);
   2063          sf->num_reg++;
   2064       }
   2065    }
   2066 
   2067    assert(sf->num_reg == c->num_varyings);
   2068    sobj->input_count_unk8 = 31; /* XXX what is this */
   2069 }
   2070 
   2071 /* fill in output mapping for ps into shader object */
   2072 static void
   2073 fill_in_ps_outputs(struct etna_shader *sobj, struct etna_compile *c)
   2074 {
   2075    sobj->outfile.num_reg = 0;
   2076 
   2077    for (int idx = 0; idx < c->file[TGSI_FILE_OUTPUT].reg_size; ++idx) {
   2078       struct etna_reg_desc *reg = &c->file[TGSI_FILE_OUTPUT].reg[idx];
   2079 
   2080       switch (reg->semantic.Name) {
   2081       case TGSI_SEMANTIC_COLOR: /* FRAG_RESULT_COLOR */
   2082          sobj->ps_color_out_reg = reg->native.id;
   2083          break;
   2084       case TGSI_SEMANTIC_POSITION: /* FRAG_RESULT_DEPTH */
   2085          sobj->ps_depth_out_reg = reg->native.id; /* =always native reg 0, only z component should be assigned */
   2086          break;
   2087       default:
   2088          assert(0); /* only outputs supported are COLOR and POSITION at the moment */
   2089       }
   2090    }
   2091 }
   2092 
   2093 /* fill in inputs for vs into shader object */
   2094 static void
   2095 fill_in_vs_inputs(struct etna_shader *sobj, struct etna_compile *c)
   2096 {
   2097    struct etna_shader_io_file *sf = &sobj->infile;
   2098 
   2099    sf->num_reg = 0;
   2100    for (int idx = 0; idx < c->file[TGSI_FILE_INPUT].reg_size; ++idx) {
   2101       struct etna_reg_desc *reg = &c->file[TGSI_FILE_INPUT].reg[idx];
   2102       assert(sf->num_reg < ETNA_NUM_INPUTS);
   2103       /* XXX exclude inputs with special semantics such as gl_FrontFacing */
   2104       sf->reg[sf->num_reg].reg = reg->native.id;
   2105       sf->reg[sf->num_reg].semantic = reg->semantic;
   2106       sf->reg[sf->num_reg].num_components = util_last_bit(reg->usage_mask);
   2107       sf->num_reg++;
   2108    }
   2109 
   2110    sobj->input_count_unk8 = (sf->num_reg + 19) / 16; /* XXX what is this */
   2111 }
   2112 
   2113 /* build two-level output index [Semantic][Index] for fast linking */
   2114 static void
   2115 build_output_index(struct etna_shader *sobj)
   2116 {
   2117    int total = 0;
   2118    int offset = 0;
   2119 
   2120    for (int name = 0; name < TGSI_SEMANTIC_COUNT; ++name)
   2121       total += sobj->output_count_per_semantic[name];
   2122 
   2123    sobj->output_per_semantic_list = CALLOC(total, sizeof(struct etna_shader_inout *));
   2124 
   2125    for (int name = 0; name < TGSI_SEMANTIC_COUNT; ++name) {
   2126       sobj->output_per_semantic[name] = &sobj->output_per_semantic_list[offset];
   2127       offset += sobj->output_count_per_semantic[name];
   2128    }
   2129 
   2130    for (int idx = 0; idx < sobj->outfile.num_reg; ++idx) {
   2131       sobj->output_per_semantic[sobj->outfile.reg[idx].semantic.Name]
   2132                                [sobj->outfile.reg[idx].semantic.Index] =
   2133          &sobj->outfile.reg[idx];
   2134    }
   2135 }
   2136 
   2137 /* fill in outputs for vs into shader object */
   2138 static void
   2139 fill_in_vs_outputs(struct etna_shader *sobj, struct etna_compile *c)
   2140 {
   2141    struct etna_shader_io_file *sf = &sobj->outfile;
   2142 
   2143    sf->num_reg = 0;
   2144    for (int idx = 0; idx < c->file[TGSI_FILE_OUTPUT].reg_size; ++idx) {
   2145       struct etna_reg_desc *reg = &c->file[TGSI_FILE_OUTPUT].reg[idx];
   2146       assert(sf->num_reg < ETNA_NUM_INPUTS);
   2147 
   2148       switch (reg->semantic.Name) {
   2149       case TGSI_SEMANTIC_POSITION:
   2150          sobj->vs_pos_out_reg = reg->native.id;
   2151          break;
   2152       case TGSI_SEMANTIC_PSIZE:
   2153          sobj->vs_pointsize_out_reg = reg->native.id;
   2154          break;
   2155       default:
   2156          sf->reg[sf->num_reg].reg = reg->native.id;
   2157          sf->reg[sf->num_reg].semantic = reg->semantic;
   2158          sf->reg[sf->num_reg].num_components = 4; // XXX reg->num_components;
   2159          sf->num_reg++;
   2160          sobj->output_count_per_semantic[reg->semantic.Name] =
   2161             MAX2(reg->semantic.Index + 1,
   2162                  sobj->output_count_per_semantic[reg->semantic.Name]);
   2163       }
   2164    }
   2165 
   2166    /* build two-level index for linking */
   2167    build_output_index(sobj);
   2168 
   2169    /* fill in "mystery meat" load balancing value. This value
   2170     * determines how work is scheduled between VS and PS in the
   2171     * unified shader architecture. More precisely, it is derived
   2172     * from the number of VS outputs, as well as the chip-specific
   2173     * vertex output buffer size, vertex cache size, and the number
   2174     * of shader cores.
   2175     *
   2176     * XXX this is a conservative estimate; the "optimal" value is
   2177     * only known for sure at link time because some outputs may be
   2178     * unused and thus unmapped. Then again, in the general use case
   2179     * with GLSL the vertex and fragment shaders are linked already
   2180     * before submitting to Gallium, thus all outputs
   2181     * are used.
   2182     */
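           /* Worked example with made-up specs (vertex_output_buffer_size = 512,
            * vertex_cache_size = 16, shader_core_count = 4) and two output
            * registers: half_out = 1, b = ((20480 / 480) + 9) / 10 = 5 and
            * a = (5 + 256 / 4) / 2 = 34, so A = 34 and B = 5 below.
            */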
   2183    int half_out = (c->file[TGSI_FILE_OUTPUT].reg_size + 1) / 2;
   2184    assert(half_out);
   2185 
   2186    uint32_t b = ((20480 / (c->specs->vertex_output_buffer_size -
   2187                            2 * half_out * c->specs->vertex_cache_size)) +
   2188                  9) /
   2189                 10;
   2190    uint32_t a = (b + 256 / (c->specs->shader_core_count * half_out)) / 2;
   2191    sobj->vs_load_balancing = VIVS_VS_LOAD_BALANCING_A(MIN2(a, 255)) |
   2192                              VIVS_VS_LOAD_BALANCING_B(MIN2(b, 255)) |
   2193                              VIVS_VS_LOAD_BALANCING_C(0x3f) |
   2194                              VIVS_VS_LOAD_BALANCING_D(0x0f);
   2195 }
   2196 
   2197 static bool
   2198 etna_compile_check_limits(struct etna_compile *c)
   2199 {
   2200    int max_uniforms = (c->info.processor == PIPE_SHADER_VERTEX)
   2201                          ? c->specs->max_vs_uniforms
   2202                          : c->specs->max_ps_uniforms;
   2203    /* round up number of uniforms, including immediates, in units of four */
   2204    int num_uniforms = c->imm_base / 4 + (c->imm_size + 3) / 4;
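           /* e.g. imm_base = 12 and imm_size = 5 count as 12/4 + (5+3)/4 = 5
            * vec4 uniform registers.
            */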
   2205 
   2206    if (c->inst_ptr > c->specs->max_instructions) {
   2207       DBG("Number of instructions (%d) exceeds maximum %d", c->inst_ptr,
   2208           c->specs->max_instructions);
   2209       return false;
   2210    }
   2211 
   2212    if (c->next_free_native > c->specs->max_registers) {
   2213       DBG("Number of registers (%d) exceeds maximum %d", c->next_free_native,
   2214           c->specs->max_registers);
   2215       return false;
   2216    }
   2217 
   2218    if (num_uniforms > max_uniforms) {
   2219       DBG("Number of uniforms (%d) exceeds maximum %d", num_uniforms,
   2220           max_uniforms);
   2221       return false;
   2222    }
   2223 
   2224    if (c->num_varyings > c->specs->max_varyings) {
   2225       DBG("Number of varyings (%d) exceeds maximum %d", c->num_varyings,
   2226           c->specs->max_varyings);
   2227       return false;
   2228    }
   2229 
   2230    if (c->imm_base > c->specs->num_constants) {
   2231       DBG("Number of constants (%d) exceeds maximum %d", c->imm_base,
   2232           c->specs->num_constants);
   2233    }
   2234 
   2235    return true;
   2236 }
   2237 
   2238 static void
   2239 copy_uniform_state_to_shader(struct etna_compile *c, struct etna_shader *sobj)
   2240 {
   2241    uint32_t count = c->imm_size;
   2242    struct etna_shader_uniform_info *uinfo = &sobj->uniforms;
   2243 
   2244    uinfo->const_count = c->imm_base;
   2245    uinfo->imm_count = count;
   2246    uinfo->imm_data = mem_dup(c->imm_data, count * sizeof(*c->imm_data));
   2247    uinfo->imm_contents = mem_dup(c->imm_contents, count * sizeof(*c->imm_contents));
   2248 
   2249    etna_set_shader_uniforms_dirty_flags(sobj);
   2250 }
   2251 
   2252 struct etna_shader *
   2253 etna_compile_shader(const struct etna_specs *specs,
   2254                     const struct tgsi_token *tokens)
   2255 {
   2256    /* Create scratch space that may be too large to fit on stack
   2257     */
   2258    bool ret;
   2259    struct etna_compile *c;
   2260    struct etna_shader *shader;
   2261 
   2262    struct tgsi_lowering_config lconfig = {
   2263       .lower_SCS = specs->has_sin_cos_sqrt,
   2264       .lower_FLR = !specs->has_sign_floor_ceil,
   2265       .lower_CEIL = !specs->has_sign_floor_ceil,
   2266       .lower_POW = true,
   2267       .lower_EXP = true,
   2268       .lower_LOG = true,
   2269       .lower_DP2 = true,
   2270       .lower_DP2A = true,
   2271       .lower_TRUNC = true,
   2272       .lower_XPD = true
   2273    };
   2274 
   2275    c = CALLOC_STRUCT(etna_compile);
   2276    if (!c)
   2277       return NULL;
   2278 
   2279    shader = CALLOC_STRUCT(etna_shader);
   2280    if (!shader)
   2281       goto out;
   2282 
   2283    c->specs = specs;
   2284    c->tokens = tgsi_transform_lowering(&lconfig, tokens, &c->info);
   2285    c->free_tokens = !!c->tokens;
   2286    if (!c->tokens) {
   2287       /* no lowering */
   2288       c->tokens = tokens;
   2289    }
   2290 
   2291    /* Build a map from gallium register to native registers for files
   2292     * CONST, SAMP, IMM, OUT, IN, TEMP.
   2293     * SAMP will map as-is for fragment shaders, there will be a +8 offset for
   2294     * vertex shaders.
   2295     */
   2296    /* Pass one -- check register file declarations and immediates */
   2297    etna_compile_parse_declarations(c);
   2298 
   2299    etna_allocate_decls(c);
   2300 
   2301    /* Pass two -- check usage of temporaries, inputs, outputs */
   2302    etna_compile_pass_check_usage(c);
   2303 
   2304    assign_special_inputs(c);
   2305 
   2306    /* Assign native temp register to TEMPs */
   2307    assign_temporaries_to_native(c, &c->file[TGSI_FILE_TEMPORARY]);
   2308 
   2309    /* optimize outputs */
   2310    etna_compile_pass_optimize_outputs(c);
   2311 
   2312    /* XXX assign special inputs: gl_FrontFacing (VARYING_SLOT_FACE)
   2313     *     this is part of RGROUP_INTERNAL
   2314     */
   2315 
   2316    /* assign inputs: last usage of input should be <= first usage of temp */
   2317    /*   potential optimization case:
   2318     *     if single MOV TEMP[y], IN[x] before which temp y is not used, and
   2319     * after which IN[x]
   2320     *     is not read, temp[y] can be used as input register as-is
   2321     */
   2322    /*   sort temporaries by first use
   2323     *   sort inputs by last usage
   2324     *   iterate over inputs, temporaries
   2325     *     if last usage of input <= first usage of temp:
   2326     *       assign input to temp
   2327     *       advance input, temporary pointer
   2328     *     else
   2329     *       advance temporary pointer
   2330     *
   2331     *   potential problem: instruction with multiple inputs of which one is the
   2332     * temp and the other is the input;
   2333     *      however, as the temp is not used before this, how would this make
   2334     * sense? uninitialized temporaries have an undefined
   2335     *      value, so this would be ok
   2336     */
   2337    assign_inouts_to_temporaries(c, TGSI_FILE_INPUT);
   2338 
   2339    /* assign outputs: first usage of output should be >= last usage of temp */
   2340    /*   potential optimization case:
   2341     *      if single MOV OUT[x], TEMP[y] (with full write mask, or at least
   2342     * writing all components that are used in
   2343     *        the shader) after which temp y is no longer used temp[y] can be
   2344     * used as output register as-is
   2345     *
   2346     *   potential problem: instruction with multiple outputs of which one is the
   2347     * temp and the other is the output;
   2348     *      however, as the temp is not used after this, how would this make
   2349     * sense? could just discard the output value
   2350     */
   2351    /*   sort temporaries by last use
   2352     *   sort outputs by first usage
   2353     *   iterate over outputs, temporaries
   2354     *     if first usage of output >= last usage of temp:
   2355     *       assign output to temp
   2356     *       advance output, temporary pointer
   2357     *     else
   2358     *       advance temporary pointer
   2359     */
   2360    assign_inouts_to_temporaries(c, TGSI_FILE_OUTPUT);
   2361 
   2362    assign_constants_and_immediates(c);
   2363    assign_texture_units(c);
   2364 
   2365    /* list declarations */
   2366    for (int x = 0; x < c->total_decls; ++x) {
   2367       DBG_F(ETNA_DBG_COMPILER_MSGS, "%i: %s,%d active=%i first_use=%i "
   2368                                     "last_use=%i native=%i usage_mask=%x "
   2369                                     "has_semantic=%i",
   2370             x, tgsi_file_name(c->decl[x].file), c->decl[x].idx,
   2371             c->decl[x].active, c->decl[x].first_use, c->decl[x].last_use,
   2372             c->decl[x].native.valid ? c->decl[x].native.id : -1,
   2373             c->decl[x].usage_mask, c->decl[x].has_semantic);
   2374       if (c->decl[x].has_semantic)
   2375          DBG_F(ETNA_DBG_COMPILER_MSGS, " semantic_name=%s semantic_idx=%i",
   2376                tgsi_semantic_names[c->decl[x].semantic.Name],
   2377                c->decl[x].semantic.Index);
   2378    }
   2379    /* XXX for PS we need to permute so that inputs are always in
   2380     * temporary registers 0..N-1.
   2381     * There is no "switchboard" for varyings (AFAIK!).
   2382     * The output color, however, can be routed from an arbitrary
   2383     * temporary.
   2384     */
   2385    if (c->info.processor == PIPE_SHADER_FRAGMENT)
   2386       permute_ps_inputs(c);
   2387 
   2388 
   2389    /* list declarations */
   2390    for (int x = 0; x < c->total_decls; ++x) {
   2391       DBG_F(ETNA_DBG_COMPILER_MSGS, "%i: %s,%d active=%i first_use=%i "
   2392                                     "last_use=%i native=%i usage_mask=%x "
   2393                                     "has_semantic=%i",
   2394             x, tgsi_file_name(c->decl[x].file), c->decl[x].idx,
   2395             c->decl[x].active, c->decl[x].first_use, c->decl[x].last_use,
   2396             c->decl[x].native.valid ? c->decl[x].native.id : -1,
   2397             c->decl[x].usage_mask, c->decl[x].has_semantic);
   2398       if (c->decl[x].has_semantic)
   2399          DBG_F(ETNA_DBG_COMPILER_MSGS, " semantic_name=%s semantic_idx=%i",
   2400                tgsi_semantic_names[c->decl[x].semantic.Name],
   2401                c->decl[x].semantic.Index);
   2402    }
   2403 
   2404    /* pass 3: generate instructions */
   2405    etna_compile_pass_generate_code(c);
   2406    etna_compile_add_z_div_if_needed(c);
   2407    etna_compile_add_nop_if_needed(c);
   2408    etna_compile_fill_in_labels(c);
   2409 
   2410    ret = etna_compile_check_limits(c);
   2411    if (!ret) {
   2412       FREE(shader);
   2413       shader = NULL;
   2414       goto out;
   2415    }
   2416 
   2417    /* fill in output structure */
   2418    shader->processor = c->info.processor;
   2419    shader->code_size = c->inst_ptr * 4;
   2420    shader->code = mem_dup(c->code, c->inst_ptr * 16);
   2421    shader->num_temps = c->next_free_native;
   2422    shader->vs_pos_out_reg = -1;
   2423    shader->vs_pointsize_out_reg = -1;
   2424    shader->ps_color_out_reg = -1;
   2425    shader->ps_depth_out_reg = -1;
   2426    copy_uniform_state_to_shader(c, shader);
   2427 
   2428    if (c->info.processor == PIPE_SHADER_VERTEX) {
   2429       fill_in_vs_inputs(shader, c);
   2430       fill_in_vs_outputs(shader, c);
   2431    } else if (c->info.processor == PIPE_SHADER_FRAGMENT) {
   2432       fill_in_ps_inputs(shader, c);
   2433       fill_in_ps_outputs(shader, c);
   2434    }
   2435 
   2436 out:
   2437    if (c->free_tokens)
   2438       FREE((void *)c->tokens);
   2439 
   2440    FREE(c->labels);
   2441    FREE(c);
   2442 
   2443    return shader;
   2444 }
   2445 
   2446 extern const char *tgsi_swizzle_names[];
   2447 void
   2448 etna_dump_shader(const struct etna_shader *shader)
   2449 {
   2450    if (shader->processor == PIPE_SHADER_VERTEX)
   2451       printf("VERT\n");
   2452    else
   2453       printf("FRAG\n");
   2454 
   2455 
   2456    etna_disasm(shader->code, shader->code_size, PRINT_RAW);
   2457 
   2458    printf("num temps: %i\n", shader->num_temps);
   2459    printf("num const: %i\n", shader->uniforms.const_count);
   2460    printf("immediates:\n");
   2461    for (int idx = 0; idx < shader->uniforms.imm_count; ++idx) {
   2462       printf(" [%i].%s = %f (0x%08x)\n",
   2463              (idx + shader->uniforms.const_count) / 4,
   2464              tgsi_swizzle_names[idx % 4],
   2465              *((float *)&shader->uniforms.imm_data[idx]),
   2466              shader->uniforms.imm_data[idx]);
   2467    }
   2468    printf("inputs:\n");
   2469    for (int idx = 0; idx < shader->infile.num_reg; ++idx) {
   2470       printf(" [%i] name=%s index=%i comps=%i\n", shader->infile.reg[idx].reg,
   2471              tgsi_semantic_names[shader->infile.reg[idx].semantic.Name],
   2472              shader->infile.reg[idx].semantic.Index,
   2473              shader->infile.reg[idx].num_components);
   2474    }
   2475    printf("outputs:\n");
   2476    for (int idx = 0; idx < shader->outfile.num_reg; ++idx) {
   2477       printf(" [%i] name=%s index=%i comps=%i\n", shader->outfile.reg[idx].reg,
   2478              tgsi_semantic_names[shader->outfile.reg[idx].semantic.Name],
   2479              shader->outfile.reg[idx].semantic.Index,
   2480              shader->outfile.reg[idx].num_components);
   2481    }
   2482    printf("special:\n");
   2483    if (shader->processor == PIPE_SHADER_VERTEX) {
   2484       printf("  vs_pos_out_reg=%i\n", shader->vs_pos_out_reg);
   2485       printf("  vs_pointsize_out_reg=%i\n", shader->vs_pointsize_out_reg);
   2486       printf("  vs_load_balancing=0x%08x\n", shader->vs_load_balancing);
   2487    } else {
   2488       printf("  ps_color_out_reg=%i\n", shader->ps_color_out_reg);
   2489       printf("  ps_depth_out_reg=%i\n", shader->ps_depth_out_reg);
   2490    }
   2491    printf("  input_count_unk8=0x%08x\n", shader->input_count_unk8);
   2492 }
   2493 
   2494 void
   2495 etna_destroy_shader(struct etna_shader *shader)
   2496 {
   2497    assert(shader);
   2498 
   2499    FREE(shader->code);
   2500    FREE(shader->uniforms.imm_data);
   2501    FREE(shader->uniforms.imm_contents);
   2502    FREE(shader->output_per_semantic_list);
   2503    FREE(shader);
   2504 }
   2505 
   2506 static const struct etna_shader_inout *
   2507 etna_shader_vs_lookup(const struct etna_shader *sobj,
   2508                       const struct etna_shader_inout *in)
   2509 {
   2510    if (in->semantic.Index < sobj->output_count_per_semantic[in->semantic.Name])
   2511       return sobj->output_per_semantic[in->semantic.Name][in->semantic.Index];
   2512 
   2513    return NULL;
   2514 }
   2515 
   2516 bool
   2517 etna_link_shader(struct etna_shader_link_info *info,
   2518                  const struct etna_shader *vs, const struct etna_shader *fs)
   2519 {
   2520    /* For each fragment input we need to find the associated vertex shader
   2521     * output, which can be found by matching on semantic name and index. A
   2522     * binary search could be used because the vs outputs are sorted by their
   2523     * semantic index and grouped by semantic type by fill_in_vs_outputs.
   2524     */
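           /* For example, an FS input with semantic GENERIC[2] resolves through
            * vs->output_per_semantic[TGSI_SEMANTIC_GENERIC][2], the index built
            * by build_output_index().
            */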
   2525    assert(fs->infile.num_reg < ETNA_NUM_INPUTS);
   2526 
   2527    for (int idx = 0; idx < fs->infile.num_reg; ++idx) {
   2528       const struct etna_shader_inout *fsio = &fs->infile.reg[idx];
   2529       const struct etna_shader_inout *vsio = etna_shader_vs_lookup(vs, fsio);
   2530       struct etna_varying *varying;
   2531 
   2532       assert(fsio->reg > 0 && fsio->reg <= ARRAY_SIZE(info->varyings));
   2533 
   2534       if (fsio->reg > info->num_varyings)
   2535          info->num_varyings = fsio->reg;
   2536 
   2537       varying = &info->varyings[fsio->reg - 1];
   2538       varying->num_components = fsio->num_components;
   2539 
   2540       if (fsio->semantic.Name == TGSI_SEMANTIC_COLOR) /* colors affected by flat shading */
   2541          varying->pa_attributes = 0x200;
   2542       else /* texture coord or other bypasses flat shading */
   2543          varying->pa_attributes = 0x2f1;
   2544 
   2545       if (fsio->semantic.Name == TGSI_SEMANTIC_PCOORD) {
   2546          varying->use[0] = VARYING_COMPONENT_USE_POINTCOORD_X;
   2547          varying->use[1] = VARYING_COMPONENT_USE_POINTCOORD_Y;
   2548          varying->use[2] = VARYING_COMPONENT_USE_USED;
   2549          varying->use[3] = VARYING_COMPONENT_USE_USED;
   2550          varying->reg = 0; /* replaced by point coord -- doesn't matter */
   2551          continue;
   2552       }
   2553 
   2554       if (vsio == NULL)
   2555          return true; /* not found -- link error */
   2556 
   2557       varying->use[0] = VARYING_COMPONENT_USE_USED;
   2558       varying->use[1] = VARYING_COMPONENT_USE_USED;
   2559       varying->use[2] = VARYING_COMPONENT_USE_USED;
   2560       varying->use[3] = VARYING_COMPONENT_USE_USED;
   2561       varying->reg = vsio->reg;
   2562    }
   2563 
   2564    assert(info->num_varyings == fs->infile.num_reg);
   2565 
   2566    return false;
   2567 }
   2568