Home | History | Annotate | Download | only in r200
      1 /**************************************************************************
      2 
      3 Copyright (C) 2005 Aapo Tahkola.
      4 
      5 All Rights Reserved.
      6 
      7 Permission is hereby granted, free of charge, to any person obtaining a
      8 copy of this software and associated documentation files (the "Software"),
      9 to deal in the Software without restriction, including without limitation
     10 on the rights to use, copy, modify, merge, publish, distribute, sub
     11 license, and/or sell copies of the Software, and to permit persons to whom
     12 the Software is furnished to do so, subject to the following conditions:
     13 
     14 The above copyright notice and this permission notice (including the next
     15 paragraph) shall be included in all copies or substantial portions of the
     16 Software.
     17 
     18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     20 FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     21 THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
     22 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     23 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     24 USE OR OTHER DEALINGS IN THE SOFTWARE.
     25 
     26 **************************************************************************/
     27 
     28 /*
     29  * Authors:
     30  *   Aapo Tahkola <aet (at) rasterburn.org>
     31  *   Roland Scheidegger <rscheidegger_lists (at) hispeed.ch>
     32  */
     33 #include "main/glheader.h"
     34 #include "main/macros.h"
     35 #include "main/enums.h"
     36 #include "program/program.h"
     37 #include "program/prog_instruction.h"
     38 #include "program/prog_parameter.h"
     39 #include "program/prog_statevars.h"
     40 #include "program/programopt.h"
     41 #include "tnl/tnl.h"
     42 
     43 #include "r200_context.h"
     44 #include "r200_vertprog.h"
     45 #include "r200_ioctl.h"
     46 #include "r200_tcl.h"
     47 
     48 #if SWIZZLE_X != VSF_IN_COMPONENT_X || \
     49     SWIZZLE_Y != VSF_IN_COMPONENT_Y || \
     50     SWIZZLE_Z != VSF_IN_COMPONENT_Z || \
     51     SWIZZLE_W != VSF_IN_COMPONENT_W || \
     52     SWIZZLE_ZERO != VSF_IN_COMPONENT_ZERO || \
     53     SWIZZLE_ONE != VSF_IN_COMPONENT_ONE || \
     54     WRITEMASK_X != VSF_FLAG_X || \
     55     WRITEMASK_Y != VSF_FLAG_Y || \
     56     WRITEMASK_Z != VSF_FLAG_Z || \
     57     WRITEMASK_W != VSF_FLAG_W
     58 #error Cannot change these!
     59 #endif
     60 
     61 #define SCALAR_FLAG (1<<31)
     62 #define FLAG_MASK (1<<31)
     63 #define OP_MASK (0xf)  /* we are unlikely to have more than 15 */
     64 #define OPN(operator, ip) {#operator, OPCODE_##operator, ip}
     65 
     66 static struct{
     67    char *name;
     68    int opcode;
     69    unsigned long ip; /* number of input operands and flags */
     70 }op_names[]={
     71    OPN(ABS, 1),
     72    OPN(ADD, 2),
     73    OPN(ARL, 1|SCALAR_FLAG),
     74    OPN(DP3, 2),
     75    OPN(DP4, 2),
     76    OPN(DPH, 2),
     77    OPN(DST, 2),
     78    OPN(EX2, 1|SCALAR_FLAG),
     79    OPN(EXP, 1|SCALAR_FLAG),
     80    OPN(FLR, 1),
     81    OPN(FRC, 1),
     82    OPN(LG2, 1|SCALAR_FLAG),
     83    OPN(LIT, 1),
     84    OPN(LOG, 1|SCALAR_FLAG),
     85    OPN(MAD, 3),
     86    OPN(MAX, 2),
     87    OPN(MIN, 2),
     88    OPN(MOV, 1),
     89    OPN(MUL, 2),
     90    OPN(POW, 2|SCALAR_FLAG),
     91    OPN(RCP, 1|SCALAR_FLAG),
     92    OPN(RSQ, 1|SCALAR_FLAG),
     93    OPN(SGE, 2),
     94    OPN(SLT, 2),
     95    OPN(SUB, 2),
     96    OPN(SWZ, 1),
     97    OPN(XPD, 2),
     98    OPN(END, 0),
     99 };
    100 #undef OPN
    101 
    102 static GLboolean r200VertexProgUpdateParams(struct gl_context *ctx, struct r200_vertex_program *vp)
    103 {
    104    r200ContextPtr rmesa = R200_CONTEXT( ctx );
    105    GLfloat *fcmd = (GLfloat *)&rmesa->hw.vpp[0].cmd[VPP_CMD_0 + 1];
    106    int pi;
    107    struct gl_program *mesa_vp = &vp->mesa_program;
    108    struct gl_program_parameter_list *paramList;
    109    drm_radeon_cmd_header_t tmp;
    110 
    111    R200_STATECHANGE( rmesa, vpp[0] );
    112    R200_STATECHANGE( rmesa, vpp[1] );
    113    assert(mesa_vp->Parameters);
    114    _mesa_load_state_parameters(ctx, mesa_vp->Parameters);
    115    paramList = mesa_vp->Parameters;
    116 
    117    if(paramList->NumParameters > R200_VSF_MAX_PARAM){
    118       fprintf(stderr, "%s:Params exhausted\n", __func__);
    119       return GL_FALSE;
    120    }
    121 
    122    for(pi = 0; pi < paramList->NumParameters; pi++) {
    123       switch(paramList->Parameters[pi].Type) {
    124       case PROGRAM_STATE_VAR:
    125       //fprintf(stderr, "%s", vp->Parameters->Parameters[pi].Name);
    126       case PROGRAM_CONSTANT:
    127 	 *fcmd++ = paramList->ParameterValues[pi][0].f;
    128 	 *fcmd++ = paramList->ParameterValues[pi][1].f;
    129 	 *fcmd++ = paramList->ParameterValues[pi][2].f;
    130 	 *fcmd++ = paramList->ParameterValues[pi][3].f;
    131 	 break;
    132       default:
    133 	 _mesa_problem(NULL, "Bad param type in %s", __func__);
    134 	 break;
    135       }
    136       if (pi == 95) {
    137 	 fcmd = (GLfloat *)&rmesa->hw.vpp[1].cmd[VPP_CMD_0 + 1];
    138       }
    139    }
    140    /* hack up the cmd_size so not the whole state atom is emitted always. */
    141    rmesa->hw.vpp[0].cmd_size =
    142       1 + 4 * ((paramList->NumParameters > 96) ? 96 : paramList->NumParameters);
    143    tmp.i = rmesa->hw.vpp[0].cmd[VPP_CMD_0];
    144    tmp.veclinear.count = (paramList->NumParameters > 96) ? 96 : paramList->NumParameters;
    145    rmesa->hw.vpp[0].cmd[VPP_CMD_0] = tmp.i;
    146    if (paramList->NumParameters > 96) {
    147       rmesa->hw.vpp[1].cmd_size = 1 + 4 * (paramList->NumParameters - 96);
    148       tmp.i = rmesa->hw.vpp[1].cmd[VPP_CMD_0];
    149       tmp.veclinear.count = paramList->NumParameters - 96;
    150       rmesa->hw.vpp[1].cmd[VPP_CMD_0] = tmp.i;
    151    }
    152    return GL_TRUE;
    153 }
    154 
    155 static inline unsigned long t_dst_mask(GLuint mask)
    156 {
    157    /* WRITEMASK_* is equivalent to VSF_FLAG_* */
    158    return mask & VSF_FLAG_ALL;
    159 }
    160 
    161 static unsigned long t_dst(struct prog_dst_register *dst)
    162 {
    163    switch(dst->File) {
    164    case PROGRAM_TEMPORARY:
    165       return ((dst->Index << R200_VPI_OUT_REG_INDEX_SHIFT)
    166 	 | R200_VSF_OUT_CLASS_TMP);
    167    case PROGRAM_OUTPUT:
    168       switch (dst->Index) {
    169       case VARYING_SLOT_POS:
    170 	 return R200_VSF_OUT_CLASS_RESULT_POS;
    171       case VARYING_SLOT_COL0:
    172 	 return R200_VSF_OUT_CLASS_RESULT_COLOR;
    173       case VARYING_SLOT_COL1:
    174 	 return ((1 << R200_VPI_OUT_REG_INDEX_SHIFT)
    175 	    | R200_VSF_OUT_CLASS_RESULT_COLOR);
    176       case VARYING_SLOT_FOGC:
    177 	 return R200_VSF_OUT_CLASS_RESULT_FOGC;
    178       case VARYING_SLOT_TEX0:
    179       case VARYING_SLOT_TEX1:
    180       case VARYING_SLOT_TEX2:
    181       case VARYING_SLOT_TEX3:
    182       case VARYING_SLOT_TEX4:
    183       case VARYING_SLOT_TEX5:
    184 	 return (((dst->Index - VARYING_SLOT_TEX0) << R200_VPI_OUT_REG_INDEX_SHIFT)
    185 	    | R200_VSF_OUT_CLASS_RESULT_TEXC);
    186       case VARYING_SLOT_PSIZ:
    187 	 return R200_VSF_OUT_CLASS_RESULT_POINTSIZE;
    188       default:
    189 	 fprintf(stderr, "problem in %s, unknown dst output reg %d\n", __func__, dst->Index);
    190 	 exit(0);
    191 	 return 0;
    192       }
    193    case PROGRAM_ADDRESS:
    194       assert (dst->Index == 0);
    195       return R200_VSF_OUT_CLASS_ADDR;
    196    default:
    197       fprintf(stderr, "problem in %s, unknown register type %d\n", __func__, dst->File);
    198       exit(0);
    199       return 0;
    200    }
    201 }
    202 
    203 static unsigned long t_src_class(gl_register_file file)
    204 {
    205 
    206    switch(file){
    207    case PROGRAM_TEMPORARY:
    208       return VSF_IN_CLASS_TMP;
    209 
    210    case PROGRAM_INPUT:
    211       return VSF_IN_CLASS_ATTR;
    212 
    213    case PROGRAM_CONSTANT:
    214    case PROGRAM_STATE_VAR:
    215       return VSF_IN_CLASS_PARAM;
    216    /*
    217    case PROGRAM_OUTPUT:
    218    case PROGRAM_ADDRESS:
    219    */
    220    default:
    221       fprintf(stderr, "problem in %s", __func__);
    222       exit(0);
    223    }
    224 }
    225 
    226 static inline unsigned long t_swizzle(GLubyte swizzle)
    227 {
    228 /* this is in fact a NOP as the Mesa SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
    229    return swizzle;
    230 }
    231 
    232 #if 0
    233 static void vp_dump_inputs(struct r200_vertex_program *vp, char *caller)
    234 {
    235    int i;
    236 
    237    if(vp == NULL){
    238       fprintf(stderr, "vp null in call to %s from %s\n", __func__, caller);
    239       return ;
    240    }
    241 
    242    fprintf(stderr, "%s:<", caller);
    243    for(i=0; i < VERT_ATTRIB_MAX; i++)
    244    fprintf(stderr, "%d ", vp->inputs[i]);
    245    fprintf(stderr, ">\n");
    246 
    247 }
    248 #endif
    249 
    250 static unsigned long t_src_index(struct r200_vertex_program *vp, struct prog_src_register *src)
    251 {
    252 /*
    253    int i;
    254    int max_reg = -1;
    255 */
    256    if(src->File == PROGRAM_INPUT){
    257 /*      if(vp->inputs[src->Index] != -1)
    258 	 return vp->inputs[src->Index];
    259 
    260       for(i=0; i < VERT_ATTRIB_MAX; i++)
    261 	 if(vp->inputs[i] > max_reg)
    262 	    max_reg = vp->inputs[i];
    263 
    264       vp->inputs[src->Index] = max_reg+1;*/
    265 
    266       //vp_dump_inputs(vp, __func__);
    267       assert(vp->inputs[src->Index] != -1);
    268       return vp->inputs[src->Index];
    269    } else {
    270       if (src->Index < 0) {
    271 	 fprintf(stderr, "WARNING negative offsets for indirect addressing do not work\n");
    272 	 return 0;
    273       }
    274       return src->Index;
    275    }
    276 }
    277 
    278 static unsigned long t_src(struct r200_vertex_program *vp, struct prog_src_register *src)
    279 {
    280 
    281    return MAKE_VSF_SOURCE(t_src_index(vp, src),
    282 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    283 			t_swizzle(GET_SWZ(src->Swizzle, 1)),
    284 			t_swizzle(GET_SWZ(src->Swizzle, 2)),
    285 			t_swizzle(GET_SWZ(src->Swizzle, 3)),
    286 			t_src_class(src->File),
    287 			src->Negate) | (src->RelAddr << 4);
    288 }
    289 
    290 static unsigned long t_src_scalar(struct r200_vertex_program *vp, struct prog_src_register *src)
    291 {
    292 
    293    return MAKE_VSF_SOURCE(t_src_index(vp, src),
    294 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    295 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    296 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    297 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    298 			t_src_class(src->File),
    299 			src->Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src->RelAddr << 4);
    300 }
    301 
    302 static unsigned long t_opcode(enum prog_opcode opcode)
    303 {
    304 
    305    switch(opcode){
    306    case OPCODE_ADD: return R200_VPI_OUT_OP_ADD;
    307    /* FIXME: ARL works fine, but negative offsets won't work - fglrx just
    308     * seems to ignore neg offsets which isn't quite correct...
    309     */
    310    case OPCODE_ARL: return R200_VPI_OUT_OP_ARL;
    311    case OPCODE_DP4: return R200_VPI_OUT_OP_DOT;
    312    case OPCODE_DST: return R200_VPI_OUT_OP_DST;
    313    case OPCODE_EX2: return R200_VPI_OUT_OP_EX2;
    314    case OPCODE_EXP: return R200_VPI_OUT_OP_EXP;
    315    case OPCODE_FRC: return R200_VPI_OUT_OP_FRC;
    316    case OPCODE_LG2: return R200_VPI_OUT_OP_LG2;
    317    case OPCODE_LIT: return R200_VPI_OUT_OP_LIT;
    318    case OPCODE_LOG: return R200_VPI_OUT_OP_LOG;
    319    case OPCODE_MAX: return R200_VPI_OUT_OP_MAX;
    320    case OPCODE_MIN: return R200_VPI_OUT_OP_MIN;
    321    case OPCODE_MUL: return R200_VPI_OUT_OP_MUL;
    322    case OPCODE_RCP: return R200_VPI_OUT_OP_RCP;
    323    case OPCODE_RSQ: return R200_VPI_OUT_OP_RSQ;
    324    case OPCODE_SGE: return R200_VPI_OUT_OP_SGE;
    325    case OPCODE_SLT: return R200_VPI_OUT_OP_SLT;
    326 
    327    default:
    328       fprintf(stderr, "%s: Should not be called with opcode %d!", __func__, opcode);
    329    }
    330    exit(-1);
    331    return 0;
    332 }
    333 
    334 static unsigned long op_operands(enum prog_opcode opcode)
    335 {
    336    int i;
    337 
    338    /* Can we trust mesas opcodes to be in order ? */
    339    for(i=0; i < sizeof(op_names) / sizeof(*op_names); i++)
    340       if(op_names[i].opcode == opcode)
    341 	 return op_names[i].ip;
    342 
    343    fprintf(stderr, "op %d not found in op_names\n", opcode);
    344    exit(-1);
    345    return 0;
    346 }
    347 
    348 /* TODO: Get rid of t_src_class call */
    349 #define CMP_SRCS(a, b) (((a.RelAddr != b.RelAddr) || (a.Index != b.Index)) && \
    350 		       ((t_src_class(a.File) == VSF_IN_CLASS_PARAM && \
    351 			 t_src_class(b.File) == VSF_IN_CLASS_PARAM) || \
    352 			(t_src_class(a.File) == VSF_IN_CLASS_ATTR && \
    353 			 t_src_class(b.File) == VSF_IN_CLASS_ATTR))) \
    354 
    355 /* fglrx on rv250 codes up unused sources as follows:
    356    unused but necessary sources are same as previous source, zero-ed out.
    357    unnecessary sources are same as previous source but with VSF_IN_CLASS_NONE set.
    358    i.e. an add (2 args) has its 2nd arg (if you use it as mov) zero-ed out, and 3rd arg
    359    set to VSF_IN_CLASS_NONE. Not sure if strictly necessary. */
    360 
    361 /* use these simpler definitions. Must obviously not be used with not yet set up regs.
    362    Those are NOT semantically equivalent to the r300 ones, requires code changes */
    363 #define ZERO_SRC_0 (((o_inst->src0 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
    364 				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
    365 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
    366 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
    367 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
    368 
    369 #define ZERO_SRC_1 (((o_inst->src1 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
    370 				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
    371 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
    372 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
    373 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
    374 
    375 #define ZERO_SRC_2 (((o_inst->src2 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
    376 				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
    377 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
    378 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
    379 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
    380 
    381 #define UNUSED_SRC_0 ((o_inst->src0 & ~15) | 9)
    382 
    383 #define UNUSED_SRC_1 ((o_inst->src1 & ~15) | 9)
    384 
    385 #define UNUSED_SRC_2 ((o_inst->src2 & ~15) | 9)
    386 
    387 
    388 /**
    389  * Generate an R200 vertex program from Mesa's internal representation.
    390  *
    391  * \return  GL_TRUE for success, GL_FALSE for failure.
    392  */
    393 static GLboolean r200_translate_vertex_program(struct gl_context *ctx, struct r200_vertex_program *vp)
    394 {
    395    struct gl_program *mesa_vp = &vp->mesa_program;
    396    struct prog_instruction *vpi;
    397    int i;
    398    VERTEX_SHADER_INSTRUCTION *o_inst;
    399    unsigned long operands;
    400    int are_srcs_scalar;
    401    unsigned long hw_op;
    402    int dofogfix = 0;
    403    int fog_temp_i = 0;
    404    int free_inputs;
    405    int array_count = 0;
    406    int u_temp_used;
    407 
    408    vp->native = GL_FALSE;
    409    vp->translated = GL_TRUE;
    410    vp->fogmode = ctx->Fog.Mode;
    411 
    412    if (mesa_vp->arb.NumInstructions == 0)
    413       return GL_FALSE;
    414 
    415 #if 0
    416    if ((mesa_vp->info.inputs_read &
    417       ~(VERT_BIT_POS | VERT_BIT_NORMAL | VERT_BIT_COLOR0 | VERT_BIT_COLOR1 |
    418       VERT_BIT_FOG | VERT_BIT_TEX0 | VERT_BIT_TEX1 | VERT_BIT_TEX2 |
    419       VERT_BIT_TEX3 | VERT_BIT_TEX4 | VERT_BIT_TEX5)) != 0) {
    420       if (R200_DEBUG & RADEON_FALLBACKS) {
    421 	 fprintf(stderr, "can't handle vert prog inputs 0x%x\n",
    422 	    mesa_vp->info.inputs_read);
    423       }
    424       return GL_FALSE;
    425    }
    426 #endif
    427 
    428    if ((mesa_vp->info.outputs_written &
    429       ~((1 << VARYING_SLOT_POS) | (1 << VARYING_SLOT_COL0) | (1 << VARYING_SLOT_COL1) |
    430       (1 << VARYING_SLOT_FOGC) | (1 << VARYING_SLOT_TEX0) | (1 << VARYING_SLOT_TEX1) |
    431       (1 << VARYING_SLOT_TEX2) | (1 << VARYING_SLOT_TEX3) | (1 << VARYING_SLOT_TEX4) |
    432       (1 << VARYING_SLOT_TEX5) | (1 << VARYING_SLOT_PSIZ))) != 0) {
    433       if (R200_DEBUG & RADEON_FALLBACKS) {
    434 	 fprintf(stderr, "can't handle vert prog outputs 0x%llx\n",
    435                  (unsigned long long) mesa_vp->info.outputs_written);
    436       }
    437       return GL_FALSE;
    438    }
    439 
    440    /* Initial value should be last tmp reg that hw supports.
    441       Strangely enough r300 doesnt mind even though these would be out of range.
    442       Smart enough to realize that it doesnt need it? */
    443    int u_temp_i = R200_VSF_MAX_TEMPS - 1;
    444    struct prog_src_register src[3];
    445    struct prog_dst_register dst;
    446 
    447 /* FIXME: is changing the prog safe to do here? */
    448    if (mesa_vp->arb.IsPositionInvariant &&
    449       /* make sure we only do this once */
    450        !(mesa_vp->info.outputs_written & (1 << VARYING_SLOT_POS))) {
    451 	 _mesa_insert_mvp_code(ctx, mesa_vp);
    452       }
    453 
    454    /* for fogc, can't change mesa_vp, as it would hose swtnl, and exp with
    455       base e isn't directly available neither. */
    456    if ((mesa_vp->info.outputs_written & (1 << VARYING_SLOT_FOGC)) &&
    457        !vp->fogpidx) {
    458       struct gl_program_parameter_list *paramList;
    459       gl_state_index tokens[STATE_LENGTH] = { STATE_FOG_PARAMS, 0, 0, 0, 0 };
    460       paramList = mesa_vp->Parameters;
    461       vp->fogpidx = _mesa_add_state_reference(paramList, tokens);
    462    }
    463 
    464    vp->pos_end = 0;
    465    mesa_vp->arb.NumNativeInstructions = 0;
    466    if (mesa_vp->Parameters)
    467       mesa_vp->arb.NumNativeParameters = mesa_vp->Parameters->NumParameters;
    468    else
    469       mesa_vp->arb.NumNativeParameters = 0;
    470 
    471    for(i = 0; i < VERT_ATTRIB_MAX; i++)
    472       vp->inputs[i] = -1;
    473    for(i = 0; i < 15; i++)
    474       vp->inputmap_rev[i] = 255;
    475    free_inputs = 0x2ffd;
    476 
    477 /* fglrx uses fixed inputs as follows for conventional attribs.
    478    generic attribs use non-fixed assignment, fglrx will always use the
    479    lowest attrib values available. We'll just do the same.
    480    There are 12 generic attribs possible, corresponding to attrib 0, 2-11
    481    and 13 in a hw vertex prog.
    482    attr 1 and 12 aren't used for generic attribs as those cannot be made vec4
    483    (correspond to vertex normal/weight - maybe weight actually could be made vec4).
    484    Additionally, not more than 12 arrays in total are possible I think.
    485    attr 0 is pos, R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0
    486    attr 2-5 use colors 0-3 (R200_VTX_FP_RGBA << R200_VTX_COLOR_0/1/2/3_SHIFT in R200_SE_VTX_FMT_0)
    487    attr 6-11 use tex 0-5 (4 << R200_VTX_TEX0/1/2/3/4/5_COMP_CNT_SHIFT in R200_SE_VTX_FMT_1)
    488    attr 13 uses vtx1 pos (R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0)
    489 */
    490 
/* attr 4,5 and 13 are only used with generic attribs.
   Haven't seen attr 14 used, maybe that's for the hw pointsize vec1 (which is
   not possible to use with vertex progs as it is lacking in vert prog specification) */
    494 /* may look different when using idx buf / input_route instead of se_vtx_fmt? */
    495    if (mesa_vp->info.inputs_read & VERT_BIT_POS) {
    496       vp->inputs[VERT_ATTRIB_POS] = 0;
    497       vp->inputmap_rev[0] = VERT_ATTRIB_POS;
    498       free_inputs &= ~(1 << 0);
    499       array_count++;
    500    }
    501    if (mesa_vp->info.inputs_read & VERT_BIT_WEIGHT) {
    502       vp->inputs[VERT_ATTRIB_WEIGHT] = 12;
    503       vp->inputmap_rev[1] = VERT_ATTRIB_WEIGHT;
    504       array_count++;
    505    }
    506    if (mesa_vp->info.inputs_read & VERT_BIT_NORMAL) {
    507       vp->inputs[VERT_ATTRIB_NORMAL] = 1;
    508       vp->inputmap_rev[2] = VERT_ATTRIB_NORMAL;
    509       array_count++;
    510    }
    511    if (mesa_vp->info.inputs_read & VERT_BIT_COLOR0) {
    512       vp->inputs[VERT_ATTRIB_COLOR0] = 2;
    513       vp->inputmap_rev[4] = VERT_ATTRIB_COLOR0;
    514       free_inputs &= ~(1 << 2);
    515       array_count++;
    516    }
    517    if (mesa_vp->info.inputs_read & VERT_BIT_COLOR1) {
    518       vp->inputs[VERT_ATTRIB_COLOR1] = 3;
    519       vp->inputmap_rev[5] = VERT_ATTRIB_COLOR1;
    520       free_inputs &= ~(1 << 3);
    521       array_count++;
    522    }
    523    if (mesa_vp->info.inputs_read & VERT_BIT_FOG) {
    524       vp->inputs[VERT_ATTRIB_FOG] = 15; array_count++;
    525       vp->inputmap_rev[3] = VERT_ATTRIB_FOG;
    526       array_count++;
    527    }
    528    /* VERT_ATTRIB_TEX0-5 */
    529    for (i = 0; i <= 5; i++) {
    530       if (mesa_vp->info.inputs_read & VERT_BIT_TEX(i)) {
    531 	 vp->inputs[VERT_ATTRIB_TEX(i)] = i + 6;
    532 	 vp->inputmap_rev[8 + i] = VERT_ATTRIB_TEX(i);
    533 	 free_inputs &= ~(1 << (i + 6));
    534 	 array_count++;
    535       }
    536    }
    537    /* using VERT_ATTRIB_TEX6/7 would be illegal */
    538    for (; i < VERT_ATTRIB_TEX_MAX; i++) {
    539       if (mesa_vp->info.inputs_read & VERT_BIT_TEX(i)) {
    540           if (R200_DEBUG & RADEON_FALLBACKS) {
    541               fprintf(stderr, "texture attribute %d in vert prog\n", i);
    542           }
    543           return GL_FALSE;
    544       }
    545    }
    546    /* completely ignore aliasing? */
    547    for (i = 0; i < VERT_ATTRIB_GENERIC_MAX; i++) {
    548       int j;
    549    /* completely ignore aliasing? */
    550       if (mesa_vp->info.inputs_read & VERT_BIT_GENERIC(i)) {
    551 	 array_count++;
    552 	 if (array_count > 12) {
    553 	    if (R200_DEBUG & RADEON_FALLBACKS) {
    554 	       fprintf(stderr, "more than 12 attribs used in vert prog\n");
    555 	    }
    556 	    return GL_FALSE;
    557 	 }
    558 	 for (j = 0; j < 14; j++) {
    559 	    /* will always find one due to limited array_count */
    560 	    if (free_inputs & (1 << j)) {
    561 	       free_inputs &= ~(1 << j);
    562 	       vp->inputs[VERT_ATTRIB_GENERIC(i)] = j;
    563 	       if (j == 0) {
    564                   /* mapped to pos */
    565                   vp->inputmap_rev[j] = VERT_ATTRIB_GENERIC(i);
    566 	       } else if (j < 12) {
    567                   /* mapped to col/tex */
    568                   vp->inputmap_rev[j + 2] = VERT_ATTRIB_GENERIC(i);
    569 	       } else {
    570                   /* mapped to pos1 */
    571                   vp->inputmap_rev[j + 1] = VERT_ATTRIB_GENERIC(i);
    572                }
    573 	       break;
    574 	    }
    575 	 }
    576       }
    577    }
    578 
    579    if (!(mesa_vp->info.outputs_written & (1 << VARYING_SLOT_POS))) {
    580       if (R200_DEBUG & RADEON_FALLBACKS) {
    581 	 fprintf(stderr, "can't handle vert prog without position output\n");
    582       }
    583       return GL_FALSE;
    584    }
    585    if (free_inputs & 1) {
    586       if (R200_DEBUG & RADEON_FALLBACKS) {
    587 	 fprintf(stderr, "can't handle vert prog without position input\n");
    588       }
    589       return GL_FALSE;
    590    }
    591 
    592    o_inst = vp->instr;
    593    for (vpi = mesa_vp->arb.Instructions; vpi->Opcode != OPCODE_END; vpi++, o_inst++){
    594       operands = op_operands(vpi->Opcode);
    595       are_srcs_scalar = operands & SCALAR_FLAG;
    596       operands &= OP_MASK;
    597 
    598       for(i = 0; i < operands; i++) {
    599 	 src[i] = vpi->SrcReg[i];
    600 	 /* hack up default attrib values as per spec as swizzling.
    601 	    normal, fog, secondary color. Crazy?
    602 	    May need more if we don't submit vec4 elements? */
    603 	 if (src[i].File == PROGRAM_INPUT) {
    604 	    if (src[i].Index == VERT_ATTRIB_NORMAL) {
    605 	       int j;
    606 	       for (j = 0; j < 4; j++) {
    607 		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
    608 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    609 		     src[i].Swizzle |= SWIZZLE_ONE << (j*3);
    610 		  }
    611 	       }
    612 	    }
    613 	    else if (src[i].Index == VERT_ATTRIB_COLOR1) {
    614 	       int j;
    615 	       for (j = 0; j < 4; j++) {
    616 		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
    617 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    618 		     src[i].Swizzle |= SWIZZLE_ZERO << (j*3);
    619 		  }
    620 	       }
    621 	    }
    622 	    else if (src[i].Index == VERT_ATTRIB_FOG) {
    623 	       int j;
    624 	       for (j = 0; j < 4; j++) {
    625 		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
    626 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    627 		     src[i].Swizzle |= SWIZZLE_ONE << (j*3);
    628 		  }
    629 		  else if ((GET_SWZ(src[i].Swizzle, j) == SWIZZLE_Y) ||
    630 			    GET_SWZ(src[i].Swizzle, j) == SWIZZLE_Z) {
    631 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    632 		     src[i].Swizzle |= SWIZZLE_ZERO << (j*3);
    633 		  }
    634 	       }
    635 	    }
    636 	 }
    637       }
    638 
    639       if(operands == 3){
    640 	 if( CMP_SRCS(src[1], src[2]) || CMP_SRCS(src[0], src[2]) ){
    641 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
    642 		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    643 		VSF_FLAG_ALL);
    644 
    645 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[2]),
    646 		  SWIZZLE_X, SWIZZLE_Y,
    647 		  SWIZZLE_Z, SWIZZLE_W,
    648 		  t_src_class(src[2].File), VSF_FLAG_NONE) | (src[2].RelAddr << 4);
    649 
    650 	    o_inst->src1 = ZERO_SRC_0;
    651 	    o_inst->src2 = UNUSED_SRC_1;
    652 	    o_inst++;
    653 
    654 	    src[2].File = PROGRAM_TEMPORARY;
    655 	    src[2].Index = u_temp_i;
    656 	    src[2].RelAddr = 0;
    657 	    u_temp_i--;
    658 	 }
    659       }
    660 
    661       if(operands >= 2){
    662 	 if( CMP_SRCS(src[1], src[0]) ){
    663 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
    664 		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    665 		VSF_FLAG_ALL);
    666 
    667 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    668 		  SWIZZLE_X, SWIZZLE_Y,
    669 		  SWIZZLE_Z, SWIZZLE_W,
    670 		  t_src_class(src[0].File), VSF_FLAG_NONE) | (src[0].RelAddr << 4);
    671 
    672 	    o_inst->src1 = ZERO_SRC_0;
    673 	    o_inst->src2 = UNUSED_SRC_1;
    674 	    o_inst++;
    675 
    676 	    src[0].File = PROGRAM_TEMPORARY;
    677 	    src[0].Index = u_temp_i;
    678 	    src[0].RelAddr = 0;
    679 	    u_temp_i--;
    680 	 }
    681       }
    682 
    683       dst = vpi->DstReg;
    684       if (dst.File == PROGRAM_OUTPUT &&
    685 	  dst.Index == VARYING_SLOT_FOGC &&
    686 	  dst.WriteMask & WRITEMASK_X) {
    687 	  fog_temp_i = u_temp_i;
    688 	  dst.File = PROGRAM_TEMPORARY;
    689 	  dst.Index = fog_temp_i;
    690 	  dofogfix = 1;
    691 	  u_temp_i--;
    692       }
    693 
    694       /* These ops need special handling. */
    695       switch(vpi->Opcode){
    696       case OPCODE_POW:
    697 /* pow takes only one argument, first scalar is in slot x, 2nd in slot z (other slots don't matter).
    698    So may need to insert additional instruction */
    699 	 if ((src[0].File == src[1].File) &&
    700 	     (src[0].Index == src[1].Index)) {
    701 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&dst),
    702 		   t_dst_mask(dst.WriteMask));
    703 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    704 		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    705 		   SWIZZLE_ZERO,
    706 		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
    707 		   SWIZZLE_ZERO,
    708 		   t_src_class(src[0].File),
    709 		   src[0].Negate) | (src[0].RelAddr << 4);
    710 	    o_inst->src1 = UNUSED_SRC_0;
    711 	    o_inst->src2 = UNUSED_SRC_0;
    712 	 }
    713 	 else {
    714 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
    715 		   (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    716 		   VSF_FLAG_ALL);
    717 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    718 		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    719 		   SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO,
    720 		   t_src_class(src[0].File),
    721 		   src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
    722 	    o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    723 		   SWIZZLE_ZERO, SWIZZLE_ZERO,
    724 		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)), SWIZZLE_ZERO,
    725 		   t_src_class(src[1].File),
    726 		   src[1].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
    727 	    o_inst->src2 = UNUSED_SRC_1;
    728 	    o_inst++;
    729 
    730 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&dst),
    731 		   t_dst_mask(dst.WriteMask));
    732 	    o_inst->src0 = MAKE_VSF_SOURCE(u_temp_i,
    733 		   VSF_IN_COMPONENT_X,
    734 		   VSF_IN_COMPONENT_Y,
    735 		   VSF_IN_COMPONENT_Z,
    736 		   VSF_IN_COMPONENT_W,
    737 		   VSF_IN_CLASS_TMP,
    738 		   VSF_FLAG_NONE);
    739 	    o_inst->src1 = UNUSED_SRC_0;
    740 	    o_inst->src2 = UNUSED_SRC_0;
    741 	    u_temp_i--;
    742 	 }
    743 	 goto next;
    744 
    745       case OPCODE_MOV://ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO}
    746       case OPCODE_SWZ:
    747 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
    748 		t_dst_mask(dst.WriteMask));
    749 	 o_inst->src0 = t_src(vp, &src[0]);
    750 	 o_inst->src1 = ZERO_SRC_0;
    751 	 o_inst->src2 = UNUSED_SRC_1;
    752 	 goto next;
    753 
    754       case OPCODE_MAD:
    755 	 /* only 2 read ports into temp memory thus may need the macro op MAD_2
    756 	    instead (requiring 2 clocks) if all inputs are in temp memory
    757 	    (and, only if they actually reference 3 distinct temps) */
    758 	 hw_op=(src[0].File == PROGRAM_TEMPORARY &&
    759 	    src[1].File == PROGRAM_TEMPORARY &&
    760 	    src[2].File == PROGRAM_TEMPORARY &&
    761 	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[1].RelAddr << 8) | src[1].Index)) &&
    762 	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[2].RelAddr << 8) | src[2].Index)) &&
    763 	    (((src[1].RelAddr << 8) | src[1].Index) != ((src[2].RelAddr << 8) | src[2].Index))) ?
    764 	    R200_VPI_OUT_OP_MAD_2 : R200_VPI_OUT_OP_MAD;
    765 
    766 	 o_inst->op = MAKE_VSF_OP(hw_op, t_dst(&dst),
    767 	    t_dst_mask(dst.WriteMask));
    768 	 o_inst->src0 = t_src(vp, &src[0]);
    769 #if 0
    770 if ((o_inst - vp->instr) == 31) {
    771 /* fix up the broken vertex program of quake4 demo... */
    772 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    773 			SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X,
    774 			t_src_class(src[1].File),
    775 			src[1].Negate) | (src[1].RelAddr << 4);
    776 o_inst->src2 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    777 			SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y,
    778 			t_src_class(src[1].File),
    779 			src[1].Negate) | (src[1].RelAddr << 4);
    780 }
    781 else {
    782 	 o_inst->src1 = t_src(vp, &src[1]);
    783 	 o_inst->src2 = t_src(vp, &src[2]);
    784 }
    785 #else
    786 	 o_inst->src1 = t_src(vp, &src[1]);
    787 	 o_inst->src2 = t_src(vp, &src[2]);
    788 #endif
    789 	 goto next;
    790 
    791       case OPCODE_DP3://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ZERO} PARAM 0{} {X Y Z ZERO}
    792 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&dst),
    793 		t_dst_mask(dst.WriteMask));
    794 
    795 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    796 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    797 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
    798 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
    799 		SWIZZLE_ZERO,
    800 		t_src_class(src[0].File),
    801 		src[0].Negate) | (src[0].RelAddr << 4);
    802 
    803 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    804 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
    805 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
    806 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
    807 		SWIZZLE_ZERO,
    808 		t_src_class(src[1].File),
    809 		src[1].Negate) | (src[1].RelAddr << 4);
    810 
    811 	 o_inst->src2 = UNUSED_SRC_1;
    812 	 goto next;
    813 
    814       case OPCODE_DPH://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W}
    815 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&dst),
    816 		t_dst_mask(dst.WriteMask));
    817 
    818 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    819 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    820 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
    821 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
    822 		VSF_IN_COMPONENT_ONE,
    823 		t_src_class(src[0].File),
    824 		src[0].Negate) | (src[0].RelAddr << 4);
    825 	 o_inst->src1 = t_src(vp, &src[1]);
    826 	 o_inst->src2 = UNUSED_SRC_1;
    827 	 goto next;
    828 
    829       case OPCODE_SUB://ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
    830 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
    831 		t_dst_mask(dst.WriteMask));
    832 
    833 	 o_inst->src0 = t_src(vp, &src[0]);
    834 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    835 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
    836 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
    837 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
    838 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
    839 		t_src_class(src[1].File),
    840 		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
    841 	 o_inst->src2 = UNUSED_SRC_1;
    842 	 goto next;
    843 
    844       case OPCODE_ABS://MAX RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
    845 	 o_inst->op=MAKE_VSF_OP(R200_VPI_OUT_OP_MAX, t_dst(&dst),
    846 		t_dst_mask(dst.WriteMask));
    847 
    848 	 o_inst->src0=t_src(vp, &src[0]);
    849 	 o_inst->src1=MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    850 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    851 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
    852 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
    853 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)),
    854 		t_src_class(src[0].File),
    855 		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
    856 	 o_inst->src2 = UNUSED_SRC_1;
    857 	 goto next;
    858 
    859       case OPCODE_FLR:
    860       /* FRC TMP 0.X Y Z W PARAM 0{} {X Y Z W}
    861          ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} TMP 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W */
    862 
    863 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_FRC,
    864 	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    865 	    t_dst_mask(dst.WriteMask));
    866 
    867 	 o_inst->src0 = t_src(vp, &src[0]);
    868 	 o_inst->src1 = UNUSED_SRC_0;
    869 	 o_inst->src2 = UNUSED_SRC_1;
    870 	 o_inst++;
    871 
    872 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
    873 		t_dst_mask(dst.WriteMask));
    874 
    875 	 o_inst->src0 = t_src(vp, &src[0]);
    876 	 o_inst->src1 = MAKE_VSF_SOURCE(u_temp_i,
    877 		VSF_IN_COMPONENT_X,
    878 		VSF_IN_COMPONENT_Y,
    879 		VSF_IN_COMPONENT_Z,
    880 		VSF_IN_COMPONENT_W,
    881 		VSF_IN_CLASS_TMP,
    882 		/* Not 100% sure about this */
    883 		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE/*VSF_FLAG_ALL*/);
    884 
    885 	 o_inst->src2 = UNUSED_SRC_0;
    886 	 u_temp_i--;
    887 	 goto next;
    888 
    889       case OPCODE_XPD:
    890 	 /* mul r0, r1.yzxw, r2.zxyw
    891 	    mad r0, -r2.yzxw, r1.zxyw, r0
    892 	  */
    893 	 hw_op=(src[0].File == PROGRAM_TEMPORARY &&
    894 	    src[1].File == PROGRAM_TEMPORARY &&
    895 	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[1].RelAddr << 8) | src[1].Index))) ?
    896 	    R200_VPI_OUT_OP_MAD_2 : R200_VPI_OUT_OP_MAD;
    897 
    898 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
    899 	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    900 	    t_dst_mask(dst.WriteMask));
    901 
    902 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    903 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
    904 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
    905 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
    906 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
    907 		t_src_class(src[0].File),
    908 		src[0].Negate) | (src[0].RelAddr << 4);
    909 
    910 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    911 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
    912 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
    913 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
    914 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
    915 		t_src_class(src[1].File),
    916 		src[1].Negate) | (src[1].RelAddr << 4);
    917 
    918 	 o_inst->src2 = UNUSED_SRC_1;
    919 	 o_inst++;
    920 	 u_temp_i--;
    921 
    922 	 o_inst->op = MAKE_VSF_OP(hw_op, t_dst(&dst),
    923 		t_dst_mask(dst.WriteMask));
    924 
    925 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    926 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
    927 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
    928 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
    929 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
    930 		t_src_class(src[1].File),
    931 		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
    932 
    933 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    934 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
    935 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
    936 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
    937 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
    938 		t_src_class(src[0].File),
    939 		src[0].Negate) | (src[0].RelAddr << 4);
    940 
    941 	 o_inst->src2 = MAKE_VSF_SOURCE(u_temp_i+1,
    942 		VSF_IN_COMPONENT_X,
    943 		VSF_IN_COMPONENT_Y,
    944 		VSF_IN_COMPONENT_Z,
    945 		VSF_IN_COMPONENT_W,
    946 		VSF_IN_CLASS_TMP,
    947 		VSF_FLAG_NONE);
    948 	 goto next;
    949 
    950       case OPCODE_END:
    951 	 assert(0);
    952       default:
    953 	 break;
    954       }
    955 
    956       o_inst->op = MAKE_VSF_OP(t_opcode(vpi->Opcode), t_dst(&dst),
    957 	    t_dst_mask(dst.WriteMask));
    958 
    959       if(are_srcs_scalar){
    960 	 switch(operands){
    961 	    case 1:
    962 		o_inst->src0 = t_src_scalar(vp, &src[0]);
    963 		o_inst->src1 = UNUSED_SRC_0;
    964 		o_inst->src2 = UNUSED_SRC_1;
    965 	    break;
    966 
    967 	    case 2:
    968 		o_inst->src0 = t_src_scalar(vp, &src[0]);
    969 		o_inst->src1 = t_src_scalar(vp, &src[1]);
    970 		o_inst->src2 = UNUSED_SRC_1;
    971 	    break;
    972 
    973 	    case 3:
    974 		o_inst->src0 = t_src_scalar(vp, &src[0]);
    975 		o_inst->src1 = t_src_scalar(vp, &src[1]);
    976 		o_inst->src2 = t_src_scalar(vp, &src[2]);
    977 	    break;
    978 
    979 	    default:
    980 		fprintf(stderr, "illegal number of operands %lu\n", operands);
    981 		exit(-1);
    982 	    break;
    983 	 }
    984       } else {
    985 	 switch(operands){
    986 	    case 1:
    987 		o_inst->src0 = t_src(vp, &src[0]);
    988 		o_inst->src1 = UNUSED_SRC_0;
    989 		o_inst->src2 = UNUSED_SRC_1;
    990 	    break;
    991 
    992 	    case 2:
    993 		o_inst->src0 = t_src(vp, &src[0]);
    994 		o_inst->src1 = t_src(vp, &src[1]);
    995 		o_inst->src2 = UNUSED_SRC_1;
    996 	    break;
    997 
    998 	    case 3:
    999 		o_inst->src0 = t_src(vp, &src[0]);
   1000 		o_inst->src1 = t_src(vp, &src[1]);
   1001 		o_inst->src2 = t_src(vp, &src[2]);
   1002 	    break;
   1003 
   1004 	    default:
   1005 		fprintf(stderr, "illegal number of operands %lu\n", operands);
   1006 		exit(-1);
   1007 	    break;
   1008 	 }
   1009       }
   1010       next:
   1011 
   1012       if (dofogfix) {
   1013 	 o_inst++;
   1014 	 if (vp->fogmode == GL_EXP) {
   1015 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1016 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1017 		VSF_FLAG_X);
   1018 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1019 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, X, X, X, X, PARAM, NONE);
   1020 	    o_inst->src2 = UNUSED_SRC_1;
   1021 	    o_inst++;
   1022 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_EXP_E,
   1023 		R200_VSF_OUT_CLASS_RESULT_FOGC,
   1024 		VSF_FLAG_X);
   1025 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
   1026 	    o_inst->src1 = UNUSED_SRC_0;
   1027 	    o_inst->src2 = UNUSED_SRC_1;
   1028 	 }
   1029 	 else if (vp->fogmode == GL_EXP2) {
   1030 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1031 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1032 		VSF_FLAG_X);
   1033 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1034 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, X, X, X, X, PARAM, NONE);
   1035 	    o_inst->src2 = UNUSED_SRC_1;
   1036 	    o_inst++;
   1037 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1038 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1039 		VSF_FLAG_X);
   1040 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1041 	    o_inst->src1 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1042 	    o_inst->src2 = UNUSED_SRC_1;
   1043 	    o_inst++;
   1044 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_EXP_E,
   1045 		R200_VSF_OUT_CLASS_RESULT_FOGC,
   1046 		VSF_FLAG_X);
   1047 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
   1048 	    o_inst->src1 = UNUSED_SRC_0;
   1049 	    o_inst->src2 = UNUSED_SRC_1;
   1050 	 }
   1051 	 else { /* fogmode == GL_LINEAR */
   1052 		/* could do that with single op (dot) if using params like
   1053 		   with fixed function pipeline fog */
   1054 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
   1055 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1056 		VSF_FLAG_X);
   1057 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
   1058 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, Z, Z, Z, Z, PARAM, NONE);
   1059 	    o_inst->src2 = UNUSED_SRC_1;
   1060 	    o_inst++;
   1061 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1062 		R200_VSF_OUT_CLASS_RESULT_FOGC,
   1063 		VSF_FLAG_X);
   1064 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1065 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, W, W, W, W, PARAM, NONE);
   1066 	    o_inst->src2 = UNUSED_SRC_1;
   1067 
   1068 	 }
   1069          dofogfix = 0;
   1070       }
   1071 
   1072       u_temp_used = (R200_VSF_MAX_TEMPS - 1) - u_temp_i;
   1073       if (mesa_vp->arb.NumNativeTemporaries <
   1074           (mesa_vp->arb.NumTemporaries + u_temp_used)) {
   1075          mesa_vp->arb.NumNativeTemporaries =
   1076             mesa_vp->arb.NumTemporaries + u_temp_used;
   1077       }
   1078       if ((mesa_vp->arb.NumTemporaries + u_temp_used) > R200_VSF_MAX_TEMPS) {
   1079 	 if (R200_DEBUG & RADEON_FALLBACKS) {
   1080             fprintf(stderr, "Ran out of temps, num temps %d, us %d\n", mesa_vp->arb.NumTemporaries, u_temp_used);
   1081 	 }
   1082 	 return GL_FALSE;
   1083       }
   1084       u_temp_i = R200_VSF_MAX_TEMPS - 1;
   1085       if(o_inst - vp->instr >= R200_VSF_MAX_INST) {
   1086          mesa_vp->arb.NumNativeInstructions = 129;
   1087 	 if (R200_DEBUG & RADEON_FALLBACKS) {
   1088 	    fprintf(stderr, "more than 128 native instructions\n");
   1089 	 }
   1090 	 return GL_FALSE;
   1091       }
   1092       if ((o_inst->op & R200_VSF_OUT_CLASS_MASK) == R200_VSF_OUT_CLASS_RESULT_POS) {
   1093 	 vp->pos_end = (o_inst - vp->instr);
   1094       }
   1095    }
   1096 
   1097    vp->native = GL_TRUE;
   1098    mesa_vp->arb.NumNativeInstructions = (o_inst - vp->instr);
   1099 #if 0
   1100    fprintf(stderr, "hw program:\n");
   1101    for(i=0; i < vp->program.length; i++)
   1102       fprintf(stderr, "%08x\n", vp->instr[i]);
   1103 #endif
   1104    return GL_TRUE;
   1105 }
   1106 
   1107 void r200SetupVertexProg( struct gl_context *ctx ) {
   1108    r200ContextPtr rmesa = R200_CONTEXT(ctx);
   1109    struct r200_vertex_program *vp = (struct r200_vertex_program *)ctx->VertexProgram.Current;
   1110    GLboolean fallback;
   1111    GLint i;
   1112 
   1113    if (!vp->translated || (ctx->Fog.Enabled && ctx->Fog.Mode != vp->fogmode)) {
   1114       rmesa->curr_vp_hw = NULL;
   1115       r200_translate_vertex_program(ctx, vp);
   1116    }
   1117    /* could optimize setting up vertex progs away for non-tcl hw */
   1118    fallback = !(vp->native && r200VertexProgUpdateParams(ctx, vp));
   1119    TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, fallback);
   1120    if (rmesa->radeon.TclFallback) return;
   1121 
   1122    R200_STATECHANGE( rmesa, vap );
   1123    /* FIXME: fglrx sets R200_VAP_SINGLE_BUF_STATE_ENABLE too. Do we need it?
   1124              maybe only when using more than 64 inst / 96 param? */
   1125    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_PROG_VTX_SHADER_ENABLE /*| R200_VAP_SINGLE_BUF_STATE_ENABLE*/;
   1126 
   1127    R200_STATECHANGE( rmesa, pvs );
   1128 
   1129    rmesa->hw.pvs.cmd[PVS_CNTL_1] = (0 << R200_PVS_CNTL_1_PROGRAM_START_SHIFT) |
   1130       ((vp->mesa_program.arb.NumNativeInstructions - 1) << R200_PVS_CNTL_1_PROGRAM_END_SHIFT) |
   1131       (vp->pos_end << R200_PVS_CNTL_1_POS_END_SHIFT);
   1132    rmesa->hw.pvs.cmd[PVS_CNTL_2] = (0 << R200_PVS_CNTL_2_PARAM_OFFSET_SHIFT) |
   1133       (vp->mesa_program.arb.NumNativeParameters << R200_PVS_CNTL_2_PARAM_COUNT_SHIFT);
   1134 
   1135    /* maybe user clip planes just work with vertex progs... untested */
   1136    if (ctx->Transform.ClipPlanesEnabled) {
   1137       R200_STATECHANGE( rmesa, tcl );
   1138       if (vp->mesa_program.arb.IsPositionInvariant) {
   1139 	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= (ctx->Transform.ClipPlanesEnabled << 2);
   1140       }
   1141       else {
   1142 	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~(0xfc);
   1143       }
   1144    }
   1145 
   1146    if (vp != rmesa->curr_vp_hw) {
   1147       GLuint count = vp->mesa_program.arb.NumNativeInstructions;
   1148       drm_radeon_cmd_header_t tmp;
   1149 
   1150       R200_STATECHANGE( rmesa, vpi[0] );
   1151       R200_STATECHANGE( rmesa, vpi[1] );
   1152 
   1153       /* FIXME: what about using a memcopy... */
   1154       for (i = 0; (i < 64) && i < count; i++) {
   1155 	 rmesa->hw.vpi[0].cmd[VPI_OPDST_0 + 4 * i] = vp->instr[i].op;
   1156 	 rmesa->hw.vpi[0].cmd[VPI_SRC0_0 + 4 * i] = vp->instr[i].src0;
   1157 	 rmesa->hw.vpi[0].cmd[VPI_SRC1_0 + 4 * i] = vp->instr[i].src1;
   1158 	 rmesa->hw.vpi[0].cmd[VPI_SRC2_0 + 4 * i] = vp->instr[i].src2;
   1159       }
   1160       /* hack up the cmd_size so not the whole state atom is emitted always.
   1161          This may require some more thought, we may emit half progs on lost state, but
   1162          hopefully it won't matter?
   1163          WARNING: must not use R200_DB_STATECHANGE, this will produce bogus (and rejected)
   1164          packet emits (due to the mismatched cmd_size and count in cmd/last_cmd) */
   1165       rmesa->hw.vpi[0].cmd_size = 1 + 4 * ((count > 64) ? 64 : count);
   1166       tmp.i = rmesa->hw.vpi[0].cmd[VPI_CMD_0];
   1167       tmp.veclinear.count = (count > 64) ? 64 : count;
   1168       rmesa->hw.vpi[0].cmd[VPI_CMD_0] = tmp.i;
   1169       if (count > 64) {
   1170 	 for (i = 0; i < (count - 64); i++) {
   1171 	    rmesa->hw.vpi[1].cmd[VPI_OPDST_0 + 4 * i] = vp->instr[i + 64].op;
   1172 	    rmesa->hw.vpi[1].cmd[VPI_SRC0_0 + 4 * i] = vp->instr[i + 64].src0;
   1173 	    rmesa->hw.vpi[1].cmd[VPI_SRC1_0 + 4 * i] = vp->instr[i + 64].src1;
   1174 	    rmesa->hw.vpi[1].cmd[VPI_SRC2_0 + 4 * i] = vp->instr[i + 64].src2;
   1175 	 }
   1176 	 rmesa->hw.vpi[1].cmd_size = 1 + 4 * (count - 64);
   1177 	 tmp.i = rmesa->hw.vpi[1].cmd[VPI_CMD_0];
   1178 	 tmp.veclinear.count = count - 64;
   1179 	 rmesa->hw.vpi[1].cmd[VPI_CMD_0] = tmp.i;
   1180       }
   1181       rmesa->curr_vp_hw = vp;
   1182    }
   1183 }
   1184 
   1185 
   1186 static void
   1187 r200BindProgram(struct gl_context *ctx, GLenum target, struct gl_program *prog)
   1188 {
   1189    r200ContextPtr rmesa = R200_CONTEXT(ctx);
   1190 
   1191    switch(target){
   1192    case GL_VERTEX_PROGRAM_ARB:
   1193       rmesa->curr_vp_hw = NULL;
   1194       break;
   1195    default:
   1196       _mesa_problem(ctx, "Target not supported yet!");
   1197       break;
   1198    }
   1199 }
   1200 
   1201 static struct gl_program *
   1202 r200NewProgram(struct gl_context *ctx, GLenum target, GLuint id,
   1203                bool is_arb_asm)
   1204 {
   1205    switch(target){
   1206    case GL_VERTEX_PROGRAM_ARB: {
   1207       struct r200_vertex_program *vp = rzalloc(NULL,
   1208                                                struct r200_vertex_program);
   1209       return _mesa_init_gl_program(&vp->mesa_program, target, id, is_arb_asm);
   1210    }
   1211    case GL_FRAGMENT_PROGRAM_ARB: {
   1212       struct gl_program *prog = rzalloc(NULL, struct gl_program);
   1213       return _mesa_init_gl_program(prog, target, id, is_arb_asm);
   1214    }
   1215    default:
   1216       _mesa_problem(ctx, "Bad target in r200NewProgram");
   1217       return NULL;
   1218    }
   1219 }
   1220 
   1221 
   1222 static void
   1223 r200DeleteProgram(struct gl_context *ctx, struct gl_program *prog)
   1224 {
   1225    _mesa_delete_program(ctx, prog);
   1226 }
   1227 
   1228 static GLboolean
   1229 r200ProgramStringNotify(struct gl_context *ctx, GLenum target, struct gl_program *prog)
   1230 {
   1231    struct r200_vertex_program *vp = (void *)prog;
   1232    r200ContextPtr rmesa = R200_CONTEXT(ctx);
   1233 
   1234    switch(target) {
   1235    case GL_VERTEX_PROGRAM_ARB:
   1236       vp->translated = GL_FALSE;
   1237       vp->fogpidx = 0;
   1238 /*      memset(&vp->translated, 0, sizeof(struct r200_vertex_program) - sizeof(struct gl_program));*/
   1239       r200_translate_vertex_program(ctx, vp);
   1240       rmesa->curr_vp_hw = NULL;
   1241       break;
   1242    case GL_FRAGMENT_SHADER_ATI:
   1243       rmesa->afs_loaded = NULL;
   1244       break;
   1245    }
   1246    /* need this for tcl fallbacks */
   1247    (void) _tnl_program_string(ctx, target, prog);
   1248 
   1249    /* XXX check if program is legal, within limits */
   1250    return GL_TRUE;
   1251 }
   1252 
   1253 static GLboolean
   1254 r200IsProgramNative(struct gl_context *ctx, GLenum target, struct gl_program *prog)
   1255 {
   1256    struct r200_vertex_program *vp = (void *)prog;
   1257 
   1258    switch(target){
   1259    case GL_VERTEX_PROGRAM_ARB:
   1260       if (!vp->translated) {
   1261 	 r200_translate_vertex_program(ctx, vp);
   1262       }
   1263      /* does not take parameters etc. into account */
   1264       return vp->native;
   1265    default:
   1266       _mesa_problem(ctx, "Bad target in r200NewProgram");
   1267    }
   1268    return 0;
   1269 }
   1270 
   1271 void r200InitShaderFuncs(struct dd_function_table *functions)
   1272 {
   1273    functions->NewProgram = r200NewProgram;
   1274    functions->BindProgram = r200BindProgram;
   1275    functions->DeleteProgram = r200DeleteProgram;
   1276    functions->ProgramStringNotify = r200ProgramStringNotify;
   1277    functions->IsProgramNative = r200IsProgramNative;
   1278 }
   1279