Home | History | Annotate | Download | only in r200
      1 /**************************************************************************
      2 
      3 Copyright (C) 2005 Aapo Tahkola.
      4 
      5 All Rights Reserved.
      6 
      7 Permission is hereby granted, free of charge, to any person obtaining a
      8 copy of this software and associated documentation files (the "Software"),
      9 to deal in the Software without restriction, including without limitation
     10 on the rights to use, copy, modify, merge, publish, distribute, sub
     11 license, and/or sell copies of the Software, and to permit persons to whom
     12 the Software is furnished to do so, subject to the following conditions:
     13 
     14 The above copyright notice and this permission notice (including the next
     15 paragraph) shall be included in all copies or substantial portions of the
     16 Software.
     17 
     18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     20 FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     21 THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
     22 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     23 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     24 USE OR OTHER DEALINGS IN THE SOFTWARE.
     25 
     26 **************************************************************************/
     27 
     28 /*
     29  * Authors:
     30  *   Aapo Tahkola <aet (at) rasterburn.org>
     31  *   Roland Scheidegger <rscheidegger_lists (at) hispeed.ch>
     32  */
     33 #include "main/glheader.h"
     34 #include "main/macros.h"
     35 #include "main/enums.h"
     36 #include "program/program.h"
     37 #include "program/prog_instruction.h"
     38 #include "program/prog_parameter.h"
     39 #include "program/prog_statevars.h"
     40 #include "program/programopt.h"
     41 #include "tnl/tnl.h"
     42 
     43 #include "r200_context.h"
     44 #include "r200_vertprog.h"
     45 #include "r200_ioctl.h"
     46 #include "r200_tcl.h"
     47 
     48 #if SWIZZLE_X != VSF_IN_COMPONENT_X || \
     49     SWIZZLE_Y != VSF_IN_COMPONENT_Y || \
     50     SWIZZLE_Z != VSF_IN_COMPONENT_Z || \
     51     SWIZZLE_W != VSF_IN_COMPONENT_W || \
     52     SWIZZLE_ZERO != VSF_IN_COMPONENT_ZERO || \
     53     SWIZZLE_ONE != VSF_IN_COMPONENT_ONE || \
     54     WRITEMASK_X != VSF_FLAG_X || \
     55     WRITEMASK_Y != VSF_FLAG_Y || \
     56     WRITEMASK_Z != VSF_FLAG_Z || \
     57     WRITEMASK_W != VSF_FLAG_W
     58 #error Cannot change these!
     59 #endif
     60 
     61 #define SCALAR_FLAG (1<<31)
     62 #define FLAG_MASK (1<<31)
     63 #define OP_MASK (0xf)  /* we are unlikely to have more than 15 */
     64 #define OPN(operator, ip) {#operator, OPCODE_##operator, ip}
     65 
     66 static struct{
     67    char *name;
     68    int opcode;
     69    unsigned long ip; /* number of input operands and flags */
     70 }op_names[]={
     71    OPN(ABS, 1),
     72    OPN(ADD, 2),
     73    OPN(ARL, 1|SCALAR_FLAG),
     74    OPN(DP3, 2),
     75    OPN(DP4, 2),
     76    OPN(DPH, 2),
     77    OPN(DST, 2),
     78    OPN(EX2, 1|SCALAR_FLAG),
     79    OPN(EXP, 1|SCALAR_FLAG),
     80    OPN(FLR, 1),
     81    OPN(FRC, 1),
     82    OPN(LG2, 1|SCALAR_FLAG),
     83    OPN(LIT, 1),
     84    OPN(LOG, 1|SCALAR_FLAG),
     85    OPN(MAD, 3),
     86    OPN(MAX, 2),
     87    OPN(MIN, 2),
     88    OPN(MOV, 1),
     89    OPN(MUL, 2),
     90    OPN(POW, 2|SCALAR_FLAG),
     91    OPN(RCP, 1|SCALAR_FLAG),
     92    OPN(RSQ, 1|SCALAR_FLAG),
     93    OPN(SGE, 2),
     94    OPN(SLT, 2),
     95    OPN(SUB, 2),
     96    OPN(SWZ, 1),
     97    OPN(XPD, 2),
     98    OPN(PRINT, 0),
     99    OPN(END, 0),
    100 };
    101 #undef OPN
    102 
    103 static GLboolean r200VertexProgUpdateParams(struct gl_context *ctx, struct r200_vertex_program *vp)
    104 {
    105    r200ContextPtr rmesa = R200_CONTEXT( ctx );
    106    GLfloat *fcmd = (GLfloat *)&rmesa->hw.vpp[0].cmd[VPP_CMD_0 + 1];
    107    int pi;
    108    struct gl_vertex_program *mesa_vp = &vp->mesa_program;
    109    struct gl_program_parameter_list *paramList;
    110    drm_radeon_cmd_header_t tmp;
    111 
    112    R200_STATECHANGE( rmesa, vpp[0] );
    113    R200_STATECHANGE( rmesa, vpp[1] );
    114    assert(mesa_vp->Base.Parameters);
    115    _mesa_load_state_parameters(ctx, mesa_vp->Base.Parameters);
    116    paramList = mesa_vp->Base.Parameters;
    117 
    118    if(paramList->NumParameters > R200_VSF_MAX_PARAM){
    119       fprintf(stderr, "%s:Params exhausted\n", __FUNCTION__);
    120       return GL_FALSE;
    121    }
    122 
    123    for(pi = 0; pi < paramList->NumParameters; pi++) {
    124       switch(paramList->Parameters[pi].Type) {
    125       case PROGRAM_STATE_VAR:
    126       case PROGRAM_NAMED_PARAM:
    127       //fprintf(stderr, "%s", vp->Parameters->Parameters[pi].Name);
    128       case PROGRAM_CONSTANT:
    129 	 *fcmd++ = paramList->ParameterValues[pi][0].f;
    130 	 *fcmd++ = paramList->ParameterValues[pi][1].f;
    131 	 *fcmd++ = paramList->ParameterValues[pi][2].f;
    132 	 *fcmd++ = paramList->ParameterValues[pi][3].f;
    133 	 break;
    134       default:
    135 	 _mesa_problem(NULL, "Bad param type in %s", __FUNCTION__);
    136 	 break;
    137       }
    138       if (pi == 95) {
    139 	 fcmd = (GLfloat *)&rmesa->hw.vpp[1].cmd[VPP_CMD_0 + 1];
    140       }
    141    }
    142    /* hack up the cmd_size so not the whole state atom is emitted always. */
    143    rmesa->hw.vpp[0].cmd_size =
    144       1 + 4 * ((paramList->NumParameters > 96) ? 96 : paramList->NumParameters);
    145    tmp.i = rmesa->hw.vpp[0].cmd[VPP_CMD_0];
    146    tmp.veclinear.count = (paramList->NumParameters > 96) ? 96 : paramList->NumParameters;
    147    rmesa->hw.vpp[0].cmd[VPP_CMD_0] = tmp.i;
    148    if (paramList->NumParameters > 96) {
    149       rmesa->hw.vpp[1].cmd_size = 1 + 4 * (paramList->NumParameters - 96);
    150       tmp.i = rmesa->hw.vpp[1].cmd[VPP_CMD_0];
    151       tmp.veclinear.count = paramList->NumParameters - 96;
    152       rmesa->hw.vpp[1].cmd[VPP_CMD_0] = tmp.i;
    153    }
    154    return GL_TRUE;
    155 }
    156 
    157 static INLINE unsigned long t_dst_mask(GLuint mask)
    158 {
    159    /* WRITEMASK_* is equivalent to VSF_FLAG_* */
    160    return mask & VSF_FLAG_ALL;
    161 }
    162 
    163 static unsigned long t_dst(struct prog_dst_register *dst)
    164 {
    165    switch(dst->File) {
    166    case PROGRAM_TEMPORARY:
    167       return ((dst->Index << R200_VPI_OUT_REG_INDEX_SHIFT)
    168 	 | R200_VSF_OUT_CLASS_TMP);
    169    case PROGRAM_OUTPUT:
    170       switch (dst->Index) {
    171       case VERT_RESULT_HPOS:
    172 	 return R200_VSF_OUT_CLASS_RESULT_POS;
    173       case VERT_RESULT_COL0:
    174 	 return R200_VSF_OUT_CLASS_RESULT_COLOR;
    175       case VERT_RESULT_COL1:
    176 	 return ((1 << R200_VPI_OUT_REG_INDEX_SHIFT)
    177 	    | R200_VSF_OUT_CLASS_RESULT_COLOR);
    178       case VERT_RESULT_FOGC:
    179 	 return R200_VSF_OUT_CLASS_RESULT_FOGC;
    180       case VERT_RESULT_TEX0:
    181       case VERT_RESULT_TEX1:
    182       case VERT_RESULT_TEX2:
    183       case VERT_RESULT_TEX3:
    184       case VERT_RESULT_TEX4:
    185       case VERT_RESULT_TEX5:
    186 	 return (((dst->Index - VERT_RESULT_TEX0) << R200_VPI_OUT_REG_INDEX_SHIFT)
    187 	    | R200_VSF_OUT_CLASS_RESULT_TEXC);
    188       case VERT_RESULT_PSIZ:
    189 	 return R200_VSF_OUT_CLASS_RESULT_POINTSIZE;
    190       default:
    191 	 fprintf(stderr, "problem in %s, unknown dst output reg %d\n", __FUNCTION__, dst->Index);
    192 	 exit(0);
    193 	 return 0;
    194       }
    195    case PROGRAM_ADDRESS:
    196       assert (dst->Index == 0);
    197       return R200_VSF_OUT_CLASS_ADDR;
    198    default:
    199       fprintf(stderr, "problem in %s, unknown register type %d\n", __FUNCTION__, dst->File);
    200       exit(0);
    201       return 0;
    202    }
    203 }
    204 
    205 static unsigned long t_src_class(gl_register_file file)
    206 {
    207 
    208    switch(file){
    209    case PROGRAM_TEMPORARY:
    210       return VSF_IN_CLASS_TMP;
    211 
    212    case PROGRAM_INPUT:
    213       return VSF_IN_CLASS_ATTR;
    214 
    215    case PROGRAM_LOCAL_PARAM:
    216    case PROGRAM_ENV_PARAM:
    217    case PROGRAM_NAMED_PARAM:
    218    case PROGRAM_CONSTANT:
    219    case PROGRAM_STATE_VAR:
    220       return VSF_IN_CLASS_PARAM;
    221    /*
    222    case PROGRAM_OUTPUT:
    223    case PROGRAM_WRITE_ONLY:
    224    case PROGRAM_ADDRESS:
    225    */
    226    default:
    227       fprintf(stderr, "problem in %s", __FUNCTION__);
    228       exit(0);
    229    }
    230 }
    231 
    232 static INLINE unsigned long t_swizzle(GLubyte swizzle)
    233 {
    234 /* this is in fact a NOP as the Mesa SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
    235    return swizzle;
    236 }
    237 
    #if 0
    /* Debug helper (compiled out): print the input mapping table vp->inputs[]
     * to stderr, prefixed with the name of the calling function.  "caller" is
     * const so that __FUNCTION__ can be passed without a qualifier warning. */
    static void vp_dump_inputs(struct r200_vertex_program *vp, const char *caller)
    {
       int i;

       if (vp == NULL) {
          fprintf(stderr, "vp null in call to %s from %s\n", __FUNCTION__, caller);
          return;
       }

       fprintf(stderr, "%s:<", caller);
       for (i = 0; i < VERT_ATTRIB_MAX; i++) {
          fprintf(stderr, "%d ", vp->inputs[i]);
       }
       fprintf(stderr, ">\n");
    }
    #endif
    255 
    256 static unsigned long t_src_index(struct r200_vertex_program *vp, struct prog_src_register *src)
    257 {
    258 /*
    259    int i;
    260    int max_reg = -1;
    261 */
    262    if(src->File == PROGRAM_INPUT){
    263 /*      if(vp->inputs[src->Index] != -1)
    264 	 return vp->inputs[src->Index];
    265 
    266       for(i=0; i < VERT_ATTRIB_MAX; i++)
    267 	 if(vp->inputs[i] > max_reg)
    268 	    max_reg = vp->inputs[i];
    269 
    270       vp->inputs[src->Index] = max_reg+1;*/
    271 
    272       //vp_dump_inputs(vp, __FUNCTION__);
    273       assert(vp->inputs[src->Index] != -1);
    274       return vp->inputs[src->Index];
    275    } else {
    276       if (src->Index < 0) {
    277 	 fprintf(stderr, "WARNING negative offsets for indirect addressing do not work\n");
    278 	 return 0;
    279       }
    280       return src->Index;
    281    }
    282 }
    283 
    284 static unsigned long t_src(struct r200_vertex_program *vp, struct prog_src_register *src)
    285 {
    286 
    287    return MAKE_VSF_SOURCE(t_src_index(vp, src),
    288 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    289 			t_swizzle(GET_SWZ(src->Swizzle, 1)),
    290 			t_swizzle(GET_SWZ(src->Swizzle, 2)),
    291 			t_swizzle(GET_SWZ(src->Swizzle, 3)),
    292 			t_src_class(src->File),
    293 			src->Negate) | (src->RelAddr << 4);
    294 }
    295 
    296 static unsigned long t_src_scalar(struct r200_vertex_program *vp, struct prog_src_register *src)
    297 {
    298 
    299    return MAKE_VSF_SOURCE(t_src_index(vp, src),
    300 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    301 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    302 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    303 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    304 			t_src_class(src->File),
    305 			src->Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src->RelAddr << 4);
    306 }
    307 
    308 static unsigned long t_opcode(enum prog_opcode opcode)
    309 {
    310 
    311    switch(opcode){
    312    case OPCODE_ADD: return R200_VPI_OUT_OP_ADD;
    313    /* FIXME: ARL works fine, but negative offsets won't work - fglrx just
    314     * seems to ignore neg offsets which isn't quite correct...
    315     */
    316    case OPCODE_ARL: return R200_VPI_OUT_OP_ARL;
    317    case OPCODE_DP4: return R200_VPI_OUT_OP_DOT;
    318    case OPCODE_DST: return R200_VPI_OUT_OP_DST;
    319    case OPCODE_EX2: return R200_VPI_OUT_OP_EX2;
    320    case OPCODE_EXP: return R200_VPI_OUT_OP_EXP;
    321    case OPCODE_FRC: return R200_VPI_OUT_OP_FRC;
    322    case OPCODE_LG2: return R200_VPI_OUT_OP_LG2;
    323    case OPCODE_LIT: return R200_VPI_OUT_OP_LIT;
    324    case OPCODE_LOG: return R200_VPI_OUT_OP_LOG;
    325    case OPCODE_MAX: return R200_VPI_OUT_OP_MAX;
    326    case OPCODE_MIN: return R200_VPI_OUT_OP_MIN;
    327    case OPCODE_MUL: return R200_VPI_OUT_OP_MUL;
    328    case OPCODE_RCP: return R200_VPI_OUT_OP_RCP;
    329    case OPCODE_RSQ: return R200_VPI_OUT_OP_RSQ;
    330    case OPCODE_SGE: return R200_VPI_OUT_OP_SGE;
    331    case OPCODE_SLT: return R200_VPI_OUT_OP_SLT;
    332 
    333    default:
    334       fprintf(stderr, "%s: Should not be called with opcode %d!", __FUNCTION__, opcode);
    335    }
    336    exit(-1);
    337    return 0;
    338 }
    339 
    340 static unsigned long op_operands(enum prog_opcode opcode)
    341 {
    342    int i;
    343 
    344    /* Can we trust mesas opcodes to be in order ? */
    345    for(i=0; i < sizeof(op_names) / sizeof(*op_names); i++)
    346       if(op_names[i].opcode == opcode)
    347 	 return op_names[i].ip;
    348 
    349    fprintf(stderr, "op %d not found in op_names\n", opcode);
    350    exit(-1);
    351    return 0;
    352 }
    353 
    /* TODO: Get rid of t_src_class call */
    /* True when sources a and b differ in RelAddr or Index and are either both
     * of class PARAM or both of class ATTR.  The translator uses this to decide
     * that one of the two has to be copied into a temporary first (presumably
     * because one instruction cannot read two different registers of those
     * classes -- confirm against hardware documentation).
     * Arguments are parenthesised so any lvalue expression can be passed. */
    #define CMP_SRCS(a, b) ((((a).RelAddr != (b).RelAddr) || ((a).Index != (b).Index)) && \
                           ((t_src_class((a).File) == VSF_IN_CLASS_PARAM && \
                             t_src_class((b).File) == VSF_IN_CLASS_PARAM) || \
                            (t_src_class((a).File) == VSF_IN_CLASS_ATTR && \
                             t_src_class((b).File) == VSF_IN_CLASS_ATTR)))
    360 
    /* fglrx on rv250 encodes unused sources as follows:
     *  - unused but necessary sources repeat the previous source with all four
     *    swizzle selectors forced to ZERO;
     *  - unnecessary sources repeat the previous source with the class set to
     *    VSF_IN_CLASS_NONE.
     * E.g. an ADD (2 args) used as a MOV has its 2nd arg zeroed out and its
     * 3rd arg set to VSF_IN_CLASS_NONE.  Not sure if strictly necessary.
     *
     * These are the simpler definitions.  They read the already encoded source
     * words of the instruction pointed to by the local "o_inst", so they must
     * not be used before that source has been set up.  They are NOT
     * semantically equivalent to the r300 ones; porting requires code changes.
     */

    /* Encoded source word "reg" with its swizzle field replaced by four ZERO selectors. */
    #define VP_SRC_ZEROED(reg) ((((reg) & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
                                | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
                                | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
                                | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
                                | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))

    /* Encoded source word "reg" with its low four bits replaced by the value 9. */
    #define VP_SRC_UNUSED(reg) (((reg) & ~15) | 9)

    #define ZERO_SRC_0 VP_SRC_ZEROED(o_inst->src0)

    #define ZERO_SRC_1 VP_SRC_ZEROED(o_inst->src1)

    #define ZERO_SRC_2 VP_SRC_ZEROED(o_inst->src2)

    #define UNUSED_SRC_0 VP_SRC_UNUSED(o_inst->src0)

    #define UNUSED_SRC_1 VP_SRC_UNUSED(o_inst->src1)

    #define UNUSED_SRC_2 VP_SRC_UNUSED(o_inst->src2)
    392 
    393 
    394 /**
    395  * Generate an R200 vertex program from Mesa's internal representation.
    396  *
    397  * \return  GL_TRUE for success, GL_FALSE for failure.
    398  */
    399 static GLboolean r200_translate_vertex_program(struct gl_context *ctx, struct r200_vertex_program *vp)
    400 {
    401    struct gl_vertex_program *mesa_vp = &vp->mesa_program;
    402    struct prog_instruction *vpi;
    403    int i;
    404    VERTEX_SHADER_INSTRUCTION *o_inst;
    405    unsigned long operands;
    406    int are_srcs_scalar;
    407    unsigned long hw_op;
    408    int dofogfix = 0;
    409    int fog_temp_i = 0;
    410    int free_inputs;
    411    int array_count = 0;
    412    int u_temp_used;
    413 
    414    vp->native = GL_FALSE;
    415    vp->translated = GL_TRUE;
    416    vp->fogmode = ctx->Fog.Mode;
    417 
    418    if (mesa_vp->Base.NumInstructions == 0)
    419       return GL_FALSE;
    420 
    421 #if 0
    422    if ((mesa_vp->Base.InputsRead &
    423       ~(VERT_BIT_POS | VERT_BIT_NORMAL | VERT_BIT_COLOR0 | VERT_BIT_COLOR1 |
    424       VERT_BIT_FOG | VERT_BIT_TEX0 | VERT_BIT_TEX1 | VERT_BIT_TEX2 |
    425       VERT_BIT_TEX3 | VERT_BIT_TEX4 | VERT_BIT_TEX5)) != 0) {
    426       if (R200_DEBUG & RADEON_FALLBACKS) {
    427 	 fprintf(stderr, "can't handle vert prog inputs 0x%x\n",
    428 	    mesa_vp->Base.InputsRead);
    429       }
    430       return GL_FALSE;
    431    }
    432 #endif
    433 
    434    if ((mesa_vp->Base.OutputsWritten &
    435       ~((1 << VERT_RESULT_HPOS) | (1 << VERT_RESULT_COL0) | (1 << VERT_RESULT_COL1) |
    436       (1 << VERT_RESULT_FOGC) | (1 << VERT_RESULT_TEX0) | (1 << VERT_RESULT_TEX1) |
    437       (1 << VERT_RESULT_TEX2) | (1 << VERT_RESULT_TEX3) | (1 << VERT_RESULT_TEX4) |
    438       (1 << VERT_RESULT_TEX5) | (1 << VERT_RESULT_PSIZ))) != 0) {
    439       if (R200_DEBUG & RADEON_FALLBACKS) {
    440 	 fprintf(stderr, "can't handle vert prog outputs 0x%llx\n",
    441                  (unsigned long long) mesa_vp->Base.OutputsWritten);
    442       }
    443       return GL_FALSE;
    444    }
    445 
    446    if (mesa_vp->IsNVProgram) {
    447    /* subtle differences in spec like guaranteed initialized regs could cause
    448       headaches. Might want to remove the driconf option to enable it completely */
    449       return GL_FALSE;
    450    }
    451    /* Initial value should be last tmp reg that hw supports.
    452       Strangely enough r300 doesnt mind even though these would be out of range.
    453       Smart enough to realize that it doesnt need it? */
    454    int u_temp_i = R200_VSF_MAX_TEMPS - 1;
    455    struct prog_src_register src[3];
    456    struct prog_dst_register dst;
    457 
    458 /* FIXME: is changing the prog safe to do here? */
    459    if (mesa_vp->IsPositionInvariant &&
    460       /* make sure we only do this once */
    461        !(mesa_vp->Base.OutputsWritten & (1 << VERT_RESULT_HPOS))) {
    462 	 _mesa_insert_mvp_code(ctx, mesa_vp);
    463       }
    464 
    465    /* for fogc, can't change mesa_vp, as it would hose swtnl, and exp with
    466       base e isn't directly available neither. */
    467    if ((mesa_vp->Base.OutputsWritten & (1 << VERT_RESULT_FOGC)) && !vp->fogpidx) {
    468       struct gl_program_parameter_list *paramList;
    469       gl_state_index tokens[STATE_LENGTH] = { STATE_FOG_PARAMS, 0, 0, 0, 0 };
    470       paramList = mesa_vp->Base.Parameters;
    471       vp->fogpidx = _mesa_add_state_reference(paramList, tokens);
    472    }
    473 
    474    vp->pos_end = 0;
    475    mesa_vp->Base.NumNativeInstructions = 0;
    476    if (mesa_vp->Base.Parameters)
    477       mesa_vp->Base.NumNativeParameters = mesa_vp->Base.Parameters->NumParameters;
    478    else
    479       mesa_vp->Base.NumNativeParameters = 0;
    480 
    481    for(i = 0; i < VERT_ATTRIB_MAX; i++)
    482       vp->inputs[i] = -1;
    483    for(i = 0; i < 15; i++)
    484       vp->inputmap_rev[i] = 255;
    485    free_inputs = 0x2ffd;
    486 
    487 /* fglrx uses fixed inputs as follows for conventional attribs.
    488    generic attribs use non-fixed assignment, fglrx will always use the
    489    lowest attrib values available. We'll just do the same.
    490    There are 12 generic attribs possible, corresponding to attrib 0, 2-11
    491    and 13 in a hw vertex prog.
    492    attr 1 and 12 aren't used for generic attribs as those cannot be made vec4
    493    (correspond to vertex normal/weight - maybe weight actually could be made vec4).
    494    Additionally, not more than 12 arrays in total are possible I think.
    495    attr 0 is pos, R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0
    496    attr 2-5 use colors 0-3 (R200_VTX_FP_RGBA << R200_VTX_COLOR_0/1/2/3_SHIFT in R200_SE_VTX_FMT_0)
    497    attr 6-11 use tex 0-5 (4 << R200_VTX_TEX0/1/2/3/4/5_COMP_CNT_SHIFT in R200_SE_VTX_FMT_1)
    498    attr 13 uses vtx1 pos (R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0)
    499 */
    500 
    501 /* attr 4,5 and 13 are only used with generic attribs.
    502    Haven't seen attr 14 used, maybe that's for the hw pointsize vec1 (which is
    503    not possibe to use with vertex progs as it is lacking in vert prog specification) */
    504 /* may look different when using idx buf / input_route instead of se_vtx_fmt? */
    505    if (mesa_vp->Base.InputsRead & VERT_BIT_POS) {
    506       vp->inputs[VERT_ATTRIB_POS] = 0;
    507       vp->inputmap_rev[0] = VERT_ATTRIB_POS;
    508       free_inputs &= ~(1 << 0);
    509       array_count++;
    510    }
    511    if (mesa_vp->Base.InputsRead & VERT_BIT_WEIGHT) {
    512       vp->inputs[VERT_ATTRIB_WEIGHT] = 12;
    513       vp->inputmap_rev[1] = VERT_ATTRIB_WEIGHT;
    514       array_count++;
    515    }
    516    if (mesa_vp->Base.InputsRead & VERT_BIT_NORMAL) {
    517       vp->inputs[VERT_ATTRIB_NORMAL] = 1;
    518       vp->inputmap_rev[2] = VERT_ATTRIB_NORMAL;
    519       array_count++;
    520    }
    521    if (mesa_vp->Base.InputsRead & VERT_BIT_COLOR0) {
    522       vp->inputs[VERT_ATTRIB_COLOR0] = 2;
    523       vp->inputmap_rev[4] = VERT_ATTRIB_COLOR0;
    524       free_inputs &= ~(1 << 2);
    525       array_count++;
    526    }
    527    if (mesa_vp->Base.InputsRead & VERT_BIT_COLOR1) {
    528       vp->inputs[VERT_ATTRIB_COLOR1] = 3;
    529       vp->inputmap_rev[5] = VERT_ATTRIB_COLOR1;
    530       free_inputs &= ~(1 << 3);
    531       array_count++;
    532    }
    533    if (mesa_vp->Base.InputsRead & VERT_BIT_FOG) {
    534       vp->inputs[VERT_ATTRIB_FOG] = 15; array_count++;
    535       vp->inputmap_rev[3] = VERT_ATTRIB_FOG;
    536       array_count++;
    537    }
    538    /* VERT_ATTRIB_TEX0-5 */
    539    for (i = 0; i <= 5; i++) {
    540       if (mesa_vp->Base.InputsRead & VERT_BIT_TEX(i)) {
    541 	 vp->inputs[VERT_ATTRIB_TEX(i)] = i + 6;
    542 	 vp->inputmap_rev[8 + i] = VERT_ATTRIB_TEX(i);
    543 	 free_inputs &= ~(1 << (i + 6));
    544 	 array_count++;
    545       }
    546    }
    547    /* using VERT_ATTRIB_TEX6/7 would be illegal */
    548    for (; i < VERT_ATTRIB_TEX_MAX; i++) {
    549       if (mesa_vp->Base.InputsRead & VERT_BIT_TEX(i)) {
    550           if (R200_DEBUG & RADEON_FALLBACKS) {
    551               fprintf(stderr, "texture attribute %d in vert prog\n", i);
    552           }
    553           return GL_FALSE;
    554       }
    555    }
    556    /* completely ignore aliasing? */
    557    for (i = 0; i < VERT_ATTRIB_GENERIC_MAX; i++) {
    558       int j;
    559    /* completely ignore aliasing? */
    560       if (mesa_vp->Base.InputsRead & VERT_BIT_GENERIC(i)) {
    561 	 array_count++;
    562 	 if (array_count > 12) {
    563 	    if (R200_DEBUG & RADEON_FALLBACKS) {
    564 	       fprintf(stderr, "more than 12 attribs used in vert prog\n");
    565 	    }
    566 	    return GL_FALSE;
    567 	 }
    568 	 for (j = 0; j < 14; j++) {
    569 	    /* will always find one due to limited array_count */
    570 	    if (free_inputs & (1 << j)) {
    571 	       free_inputs &= ~(1 << j);
    572 	       vp->inputs[VERT_ATTRIB_GENERIC(i)] = j;
    573 	       if (j == 0) {
    574                   /* mapped to pos */
    575                   vp->inputmap_rev[j] = VERT_ATTRIB_GENERIC(i);
    576 	       } else if (j < 12) {
    577                   /* mapped to col/tex */
    578                   vp->inputmap_rev[j + 2] = VERT_ATTRIB_GENERIC(i);
    579 	       } else {
    580                   /* mapped to pos1 */
    581                   vp->inputmap_rev[j + 1] = VERT_ATTRIB_GENERIC(i);
    582                }
    583 	       break;
    584 	    }
    585 	 }
    586       }
    587    }
    588 
    589    if (!(mesa_vp->Base.OutputsWritten & (1 << VERT_RESULT_HPOS))) {
    590       if (R200_DEBUG & RADEON_FALLBACKS) {
    591 	 fprintf(stderr, "can't handle vert prog without position output\n");
    592       }
    593       return GL_FALSE;
    594    }
    595    if (free_inputs & 1) {
    596       if (R200_DEBUG & RADEON_FALLBACKS) {
    597 	 fprintf(stderr, "can't handle vert prog without position input\n");
    598       }
    599       return GL_FALSE;
    600    }
    601 
    602    o_inst = vp->instr;
    603    for (vpi = mesa_vp->Base.Instructions; vpi->Opcode != OPCODE_END; vpi++, o_inst++){
    604       operands = op_operands(vpi->Opcode);
    605       are_srcs_scalar = operands & SCALAR_FLAG;
    606       operands &= OP_MASK;
    607 
    608       for(i = 0; i < operands; i++) {
    609 	 src[i] = vpi->SrcReg[i];
    610 	 /* hack up default attrib values as per spec as swizzling.
    611 	    normal, fog, secondary color. Crazy?
    612 	    May need more if we don't submit vec4 elements? */
    613 	 if (src[i].File == PROGRAM_INPUT) {
    614 	    if (src[i].Index == VERT_ATTRIB_NORMAL) {
    615 	       int j;
    616 	       for (j = 0; j < 4; j++) {
    617 		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
    618 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    619 		     src[i].Swizzle |= SWIZZLE_ONE << (j*3);
    620 		  }
    621 	       }
    622 	    }
    623 	    else if (src[i].Index == VERT_ATTRIB_COLOR1) {
    624 	       int j;
    625 	       for (j = 0; j < 4; j++) {
    626 		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
    627 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    628 		     src[i].Swizzle |= SWIZZLE_ZERO << (j*3);
    629 		  }
    630 	       }
    631 	    }
    632 	    else if (src[i].Index == VERT_ATTRIB_FOG) {
    633 	       int j;
    634 	       for (j = 0; j < 4; j++) {
    635 		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
    636 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    637 		     src[i].Swizzle |= SWIZZLE_ONE << (j*3);
    638 		  }
    639 		  else if ((GET_SWZ(src[i].Swizzle, j) == SWIZZLE_Y) ||
    640 			    GET_SWZ(src[i].Swizzle, j) == SWIZZLE_Z) {
    641 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    642 		     src[i].Swizzle |= SWIZZLE_ZERO << (j*3);
    643 		  }
    644 	       }
    645 	    }
    646 	 }
    647       }
    648 
    649       if(operands == 3){
    650 	 if( CMP_SRCS(src[1], src[2]) || CMP_SRCS(src[0], src[2]) ){
    651 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
    652 		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    653 		VSF_FLAG_ALL);
    654 
    655 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[2]),
    656 		  SWIZZLE_X, SWIZZLE_Y,
    657 		  SWIZZLE_Z, SWIZZLE_W,
    658 		  t_src_class(src[2].File), VSF_FLAG_NONE) | (src[2].RelAddr << 4);
    659 
    660 	    o_inst->src1 = ZERO_SRC_0;
    661 	    o_inst->src2 = UNUSED_SRC_1;
    662 	    o_inst++;
    663 
    664 	    src[2].File = PROGRAM_TEMPORARY;
    665 	    src[2].Index = u_temp_i;
    666 	    src[2].RelAddr = 0;
    667 	    u_temp_i--;
    668 	 }
    669       }
    670 
    671       if(operands >= 2){
    672 	 if( CMP_SRCS(src[1], src[0]) ){
    673 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
    674 		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    675 		VSF_FLAG_ALL);
    676 
    677 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    678 		  SWIZZLE_X, SWIZZLE_Y,
    679 		  SWIZZLE_Z, SWIZZLE_W,
    680 		  t_src_class(src[0].File), VSF_FLAG_NONE) | (src[0].RelAddr << 4);
    681 
    682 	    o_inst->src1 = ZERO_SRC_0;
    683 	    o_inst->src2 = UNUSED_SRC_1;
    684 	    o_inst++;
    685 
    686 	    src[0].File = PROGRAM_TEMPORARY;
    687 	    src[0].Index = u_temp_i;
    688 	    src[0].RelAddr = 0;
    689 	    u_temp_i--;
    690 	 }
    691       }
    692 
    693       dst = vpi->DstReg;
    694       if (dst.File == PROGRAM_OUTPUT &&
    695 	  dst.Index == VERT_RESULT_FOGC &&
    696 	  dst.WriteMask & WRITEMASK_X) {
    697 	  fog_temp_i = u_temp_i;
    698 	  dst.File = PROGRAM_TEMPORARY;
    699 	  dst.Index = fog_temp_i;
    700 	  dofogfix = 1;
    701 	  u_temp_i--;
    702       }
    703 
    704       /* These ops need special handling. */
    705       switch(vpi->Opcode){
    706       case OPCODE_POW:
    707 /* pow takes only one argument, first scalar is in slot x, 2nd in slot z (other slots don't matter).
    708    So may need to insert additional instruction */
    709 	 if ((src[0].File == src[1].File) &&
    710 	     (src[0].Index == src[1].Index)) {
    711 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&dst),
    712 		   t_dst_mask(dst.WriteMask));
    713 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    714 		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    715 		   SWIZZLE_ZERO,
    716 		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
    717 		   SWIZZLE_ZERO,
    718 		   t_src_class(src[0].File),
    719 		   src[0].Negate) | (src[0].RelAddr << 4);
    720 	    o_inst->src1 = UNUSED_SRC_0;
    721 	    o_inst->src2 = UNUSED_SRC_0;
    722 	 }
    723 	 else {
    724 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
    725 		   (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    726 		   VSF_FLAG_ALL);
    727 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    728 		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    729 		   SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO,
    730 		   t_src_class(src[0].File),
    731 		   src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
    732 	    o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    733 		   SWIZZLE_ZERO, SWIZZLE_ZERO,
    734 		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)), SWIZZLE_ZERO,
    735 		   t_src_class(src[1].File),
    736 		   src[1].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
    737 	    o_inst->src2 = UNUSED_SRC_1;
    738 	    o_inst++;
    739 
    740 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&dst),
    741 		   t_dst_mask(dst.WriteMask));
    742 	    o_inst->src0 = MAKE_VSF_SOURCE(u_temp_i,
    743 		   VSF_IN_COMPONENT_X,
    744 		   VSF_IN_COMPONENT_Y,
    745 		   VSF_IN_COMPONENT_Z,
    746 		   VSF_IN_COMPONENT_W,
    747 		   VSF_IN_CLASS_TMP,
    748 		   VSF_FLAG_NONE);
    749 	    o_inst->src1 = UNUSED_SRC_0;
    750 	    o_inst->src2 = UNUSED_SRC_0;
    751 	    u_temp_i--;
    752 	 }
    753 	 goto next;
    754 
    755       case OPCODE_MOV://ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO}
    756       case OPCODE_SWZ:
    757 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
    758 		t_dst_mask(dst.WriteMask));
    759 	 o_inst->src0 = t_src(vp, &src[0]);
    760 	 o_inst->src1 = ZERO_SRC_0;
    761 	 o_inst->src2 = UNUSED_SRC_1;
    762 	 goto next;
    763 
    764       case OPCODE_MAD:
    765 	 /* only 2 read ports into temp memory thus may need the macro op MAD_2
    766 	    instead (requiring 2 clocks) if all inputs are in temp memory
    767 	    (and, only if they actually reference 3 distinct temps) */
    768 	 hw_op=(src[0].File == PROGRAM_TEMPORARY &&
    769 	    src[1].File == PROGRAM_TEMPORARY &&
    770 	    src[2].File == PROGRAM_TEMPORARY &&
    771 	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[1].RelAddr << 8) | src[1].Index)) &&
    772 	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[2].RelAddr << 8) | src[2].Index)) &&
    773 	    (((src[1].RelAddr << 8) | src[1].Index) != ((src[2].RelAddr << 8) | src[2].Index))) ?
    774 	    R200_VPI_OUT_OP_MAD_2 : R200_VPI_OUT_OP_MAD;
    775 
    776 	 o_inst->op = MAKE_VSF_OP(hw_op, t_dst(&dst),
    777 	    t_dst_mask(dst.WriteMask));
    778 	 o_inst->src0 = t_src(vp, &src[0]);
    779 #if 0
    780 if ((o_inst - vp->instr) == 31) {
    781 /* fix up the broken vertex program of quake4 demo... */
    782 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    783 			SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X,
    784 			t_src_class(src[1].File),
    785 			src[1].Negate) | (src[1].RelAddr << 4);
    786 o_inst->src2 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    787 			SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y,
    788 			t_src_class(src[1].File),
    789 			src[1].Negate) | (src[1].RelAddr << 4);
    790 }
    791 else {
    792 	 o_inst->src1 = t_src(vp, &src[1]);
    793 	 o_inst->src2 = t_src(vp, &src[2]);
    794 }
    795 #else
    796 	 o_inst->src1 = t_src(vp, &src[1]);
    797 	 o_inst->src2 = t_src(vp, &src[2]);
    798 #endif
    799 	 goto next;
    800 
    801       case OPCODE_DP3://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ZERO} PARAM 0{} {X Y Z ZERO}
    802 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&dst),
    803 		t_dst_mask(dst.WriteMask));
    804 
    805 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    806 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    807 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
    808 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
    809 		SWIZZLE_ZERO,
    810 		t_src_class(src[0].File),
    811 		src[0].Negate) | (src[0].RelAddr << 4);
    812 
    813 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    814 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
    815 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
    816 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
    817 		SWIZZLE_ZERO,
    818 		t_src_class(src[1].File),
    819 		src[1].Negate) | (src[1].RelAddr << 4);
    820 
    821 	 o_inst->src2 = UNUSED_SRC_1;
    822 	 goto next;
    823 
    824       case OPCODE_DPH://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W}
    825 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&dst),
    826 		t_dst_mask(dst.WriteMask));
    827 
    828 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    829 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    830 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
    831 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
    832 		VSF_IN_COMPONENT_ONE,
    833 		t_src_class(src[0].File),
    834 		src[0].Negate) | (src[0].RelAddr << 4);
    835 	 o_inst->src1 = t_src(vp, &src[1]);
    836 	 o_inst->src2 = UNUSED_SRC_1;
    837 	 goto next;
    838 
    839       case OPCODE_SUB://ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
    840 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
    841 		t_dst_mask(dst.WriteMask));
    842 
    843 	 o_inst->src0 = t_src(vp, &src[0]);
    844 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    845 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
    846 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
    847 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
    848 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
    849 		t_src_class(src[1].File),
    850 		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
    851 	 o_inst->src2 = UNUSED_SRC_1;
    852 	 goto next;
    853 
    854       case OPCODE_ABS://MAX RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
    855 	 o_inst->op=MAKE_VSF_OP(R200_VPI_OUT_OP_MAX, t_dst(&dst),
    856 		t_dst_mask(dst.WriteMask));
    857 
    858 	 o_inst->src0=t_src(vp, &src[0]);
    859 	 o_inst->src1=MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    860 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    861 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
    862 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
    863 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)),
    864 		t_src_class(src[0].File),
    865 		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
    866 	 o_inst->src2 = UNUSED_SRC_1;
    867 	 goto next;
    868 
    869       case OPCODE_FLR:
    870       /* FRC TMP 0.X Y Z W PARAM 0{} {X Y Z W}
    871          ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} TMP 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W */
    872 
    873 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_FRC,
    874 	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    875 	    t_dst_mask(dst.WriteMask));
    876 
    877 	 o_inst->src0 = t_src(vp, &src[0]);
    878 	 o_inst->src1 = UNUSED_SRC_0;
    879 	 o_inst->src2 = UNUSED_SRC_1;
    880 	 o_inst++;
    881 
    882 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
    883 		t_dst_mask(dst.WriteMask));
    884 
    885 	 o_inst->src0 = t_src(vp, &src[0]);
    886 	 o_inst->src1 = MAKE_VSF_SOURCE(u_temp_i,
    887 		VSF_IN_COMPONENT_X,
    888 		VSF_IN_COMPONENT_Y,
    889 		VSF_IN_COMPONENT_Z,
    890 		VSF_IN_COMPONENT_W,
    891 		VSF_IN_CLASS_TMP,
    892 		/* Not 100% sure about this */
    893 		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE/*VSF_FLAG_ALL*/);
    894 
    895 	 o_inst->src2 = UNUSED_SRC_0;
    896 	 u_temp_i--;
    897 	 goto next;
    898 
    899       case OPCODE_XPD:
    900 	 /* mul r0, r1.yzxw, r2.zxyw
    901 	    mad r0, -r2.yzxw, r1.zxyw, r0
    902 	  */
    903 	 hw_op=(src[0].File == PROGRAM_TEMPORARY &&
    904 	    src[1].File == PROGRAM_TEMPORARY &&
    905 	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[1].RelAddr << 8) | src[1].Index))) ?
    906 	    R200_VPI_OUT_OP_MAD_2 : R200_VPI_OUT_OP_MAD;
    907 
    908 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
    909 	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    910 	    t_dst_mask(dst.WriteMask));
    911 
    912 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    913 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
    914 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
    915 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
    916 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
    917 		t_src_class(src[0].File),
    918 		src[0].Negate) | (src[0].RelAddr << 4);
    919 
    920 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    921 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
    922 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
    923 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
    924 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
    925 		t_src_class(src[1].File),
    926 		src[1].Negate) | (src[1].RelAddr << 4);
    927 
    928 	 o_inst->src2 = UNUSED_SRC_1;
    929 	 o_inst++;
    930 	 u_temp_i--;
    931 
    932 	 o_inst->op = MAKE_VSF_OP(hw_op, t_dst(&dst),
    933 		t_dst_mask(dst.WriteMask));
    934 
    935 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    936 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
    937 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
    938 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
    939 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
    940 		t_src_class(src[1].File),
    941 		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
    942 
    943 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    944 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
    945 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
    946 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
    947 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
    948 		t_src_class(src[0].File),
    949 		src[0].Negate) | (src[0].RelAddr << 4);
    950 
    951 	 o_inst->src2 = MAKE_VSF_SOURCE(u_temp_i+1,
    952 		VSF_IN_COMPONENT_X,
    953 		VSF_IN_COMPONENT_Y,
    954 		VSF_IN_COMPONENT_Z,
    955 		VSF_IN_COMPONENT_W,
    956 		VSF_IN_CLASS_TMP,
    957 		VSF_FLAG_NONE);
    958 	 goto next;
    959 
    960       case OPCODE_END:
    961 	 assert(0);
    962       default:
    963 	 break;
    964       }
    965 
    966       o_inst->op = MAKE_VSF_OP(t_opcode(vpi->Opcode), t_dst(&dst),
    967 	    t_dst_mask(dst.WriteMask));
    968 
    969       if(are_srcs_scalar){
    970 	 switch(operands){
    971 	    case 1:
    972 		o_inst->src0 = t_src_scalar(vp, &src[0]);
    973 		o_inst->src1 = UNUSED_SRC_0;
    974 		o_inst->src2 = UNUSED_SRC_1;
    975 	    break;
    976 
    977 	    case 2:
    978 		o_inst->src0 = t_src_scalar(vp, &src[0]);
    979 		o_inst->src1 = t_src_scalar(vp, &src[1]);
    980 		o_inst->src2 = UNUSED_SRC_1;
    981 	    break;
    982 
    983 	    case 3:
    984 		o_inst->src0 = t_src_scalar(vp, &src[0]);
    985 		o_inst->src1 = t_src_scalar(vp, &src[1]);
    986 		o_inst->src2 = t_src_scalar(vp, &src[2]);
    987 	    break;
    988 
    989 	    default:
    990 		fprintf(stderr, "illegal number of operands %lu\n", operands);
    991 		exit(-1);
    992 	    break;
    993 	 }
    994       } else {
    995 	 switch(operands){
    996 	    case 1:
    997 		o_inst->src0 = t_src(vp, &src[0]);
    998 		o_inst->src1 = UNUSED_SRC_0;
    999 		o_inst->src2 = UNUSED_SRC_1;
   1000 	    break;
   1001 
   1002 	    case 2:
   1003 		o_inst->src0 = t_src(vp, &src[0]);
   1004 		o_inst->src1 = t_src(vp, &src[1]);
   1005 		o_inst->src2 = UNUSED_SRC_1;
   1006 	    break;
   1007 
   1008 	    case 3:
   1009 		o_inst->src0 = t_src(vp, &src[0]);
   1010 		o_inst->src1 = t_src(vp, &src[1]);
   1011 		o_inst->src2 = t_src(vp, &src[2]);
   1012 	    break;
   1013 
   1014 	    default:
   1015 		fprintf(stderr, "illegal number of operands %lu\n", operands);
   1016 		exit(-1);
   1017 	    break;
   1018 	 }
   1019       }
   1020       next:
   1021 
   1022       if (dofogfix) {
   1023 	 o_inst++;
   1024 	 if (vp->fogmode == GL_EXP) {
   1025 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1026 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1027 		VSF_FLAG_X);
   1028 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1029 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, X, X, X, X, PARAM, NONE);
   1030 	    o_inst->src2 = UNUSED_SRC_1;
   1031 	    o_inst++;
   1032 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_EXP_E,
   1033 		R200_VSF_OUT_CLASS_RESULT_FOGC,
   1034 		VSF_FLAG_X);
   1035 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
   1036 	    o_inst->src1 = UNUSED_SRC_0;
   1037 	    o_inst->src2 = UNUSED_SRC_1;
   1038 	 }
   1039 	 else if (vp->fogmode == GL_EXP2) {
   1040 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1041 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1042 		VSF_FLAG_X);
   1043 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1044 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, X, X, X, X, PARAM, NONE);
   1045 	    o_inst->src2 = UNUSED_SRC_1;
   1046 	    o_inst++;
   1047 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1048 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1049 		VSF_FLAG_X);
   1050 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1051 	    o_inst->src1 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1052 	    o_inst->src2 = UNUSED_SRC_1;
   1053 	    o_inst++;
   1054 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_EXP_E,
   1055 		R200_VSF_OUT_CLASS_RESULT_FOGC,
   1056 		VSF_FLAG_X);
   1057 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
   1058 	    o_inst->src1 = UNUSED_SRC_0;
   1059 	    o_inst->src2 = UNUSED_SRC_1;
   1060 	 }
   1061 	 else { /* fogmode == GL_LINEAR */
   1062 		/* could do that with single op (dot) if using params like
   1063 		   with fixed function pipeline fog */
   1064 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
   1065 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1066 		VSF_FLAG_X);
   1067 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
   1068 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, Z, Z, Z, Z, PARAM, NONE);
   1069 	    o_inst->src2 = UNUSED_SRC_1;
   1070 	    o_inst++;
   1071 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1072 		R200_VSF_OUT_CLASS_RESULT_FOGC,
   1073 		VSF_FLAG_X);
   1074 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1075 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, W, W, W, W, PARAM, NONE);
   1076 	    o_inst->src2 = UNUSED_SRC_1;
   1077 
   1078 	 }
   1079          dofogfix = 0;
   1080       }
   1081 
   1082       u_temp_used = (R200_VSF_MAX_TEMPS - 1) - u_temp_i;
   1083       if (mesa_vp->Base.NumNativeTemporaries <
   1084 	 (mesa_vp->Base.NumTemporaries + u_temp_used)) {
   1085 	 mesa_vp->Base.NumNativeTemporaries =
   1086 	    mesa_vp->Base.NumTemporaries + u_temp_used;
   1087       }
   1088       if ((mesa_vp->Base.NumTemporaries + u_temp_used) > R200_VSF_MAX_TEMPS) {
   1089 	 if (R200_DEBUG & RADEON_FALLBACKS) {
   1090 	    fprintf(stderr, "Ran out of temps, num temps %d, us %d\n", mesa_vp->Base.NumTemporaries, u_temp_used);
   1091 	 }
   1092 	 return GL_FALSE;
   1093       }
   1094       u_temp_i = R200_VSF_MAX_TEMPS - 1;
   1095       if(o_inst - vp->instr >= R200_VSF_MAX_INST) {
   1096 	 mesa_vp->Base.NumNativeInstructions = 129;
   1097 	 if (R200_DEBUG & RADEON_FALLBACKS) {
   1098 	    fprintf(stderr, "more than 128 native instructions\n");
   1099 	 }
   1100 	 return GL_FALSE;
   1101       }
   1102       if ((o_inst->op & R200_VSF_OUT_CLASS_MASK) == R200_VSF_OUT_CLASS_RESULT_POS) {
   1103 	 vp->pos_end = (o_inst - vp->instr);
   1104       }
   1105    }
   1106 
   1107    vp->native = GL_TRUE;
   1108    mesa_vp->Base.NumNativeInstructions = (o_inst - vp->instr);
   1109 #if 0
   1110    fprintf(stderr, "hw program:\n");
   1111    for(i=0; i < vp->program.length; i++)
   1112       fprintf(stderr, "%08x\n", vp->instr[i]);
   1113 #endif
   1114    return GL_TRUE;
   1115 }
   1116 
   1117 void r200SetupVertexProg( struct gl_context *ctx ) {
   1118    r200ContextPtr rmesa = R200_CONTEXT(ctx);
   1119    struct r200_vertex_program *vp = (struct r200_vertex_program *)ctx->VertexProgram.Current;
   1120    GLboolean fallback;
   1121    GLint i;
   1122 
   1123    if (!vp->translated || (ctx->Fog.Enabled && ctx->Fog.Mode != vp->fogmode)) {
   1124       rmesa->curr_vp_hw = NULL;
   1125       r200_translate_vertex_program(ctx, vp);
   1126    }
   1127    /* could optimize setting up vertex progs away for non-tcl hw */
   1128    fallback = !(vp->native && r200VertexProgUpdateParams(ctx, vp));
   1129    TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, fallback);
   1130    if (rmesa->radeon.TclFallback) return;
   1131 
   1132    R200_STATECHANGE( rmesa, vap );
   1133    /* FIXME: fglrx sets R200_VAP_SINGLE_BUF_STATE_ENABLE too. Do we need it?
   1134              maybe only when using more than 64 inst / 96 param? */
   1135    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_PROG_VTX_SHADER_ENABLE /*| R200_VAP_SINGLE_BUF_STATE_ENABLE*/;
   1136 
   1137    R200_STATECHANGE( rmesa, pvs );
   1138 
   1139    rmesa->hw.pvs.cmd[PVS_CNTL_1] = (0 << R200_PVS_CNTL_1_PROGRAM_START_SHIFT) |
   1140       ((vp->mesa_program.Base.NumNativeInstructions - 1) << R200_PVS_CNTL_1_PROGRAM_END_SHIFT) |
   1141       (vp->pos_end << R200_PVS_CNTL_1_POS_END_SHIFT);
   1142    rmesa->hw.pvs.cmd[PVS_CNTL_2] = (0 << R200_PVS_CNTL_2_PARAM_OFFSET_SHIFT) |
   1143       (vp->mesa_program.Base.NumNativeParameters << R200_PVS_CNTL_2_PARAM_COUNT_SHIFT);
   1144 
   1145    /* maybe user clip planes just work with vertex progs... untested */
   1146    if (ctx->Transform.ClipPlanesEnabled) {
   1147       R200_STATECHANGE( rmesa, tcl );
   1148       if (vp->mesa_program.IsPositionInvariant) {
   1149 	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= (ctx->Transform.ClipPlanesEnabled << 2);
   1150       }
   1151       else {
   1152 	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~(0xfc);
   1153       }
   1154    }
   1155 
   1156    if (vp != rmesa->curr_vp_hw) {
   1157       GLuint count = vp->mesa_program.Base.NumNativeInstructions;
   1158       drm_radeon_cmd_header_t tmp;
   1159 
   1160       R200_STATECHANGE( rmesa, vpi[0] );
   1161       R200_STATECHANGE( rmesa, vpi[1] );
   1162 
   1163       /* FIXME: what about using a memcopy... */
   1164       for (i = 0; (i < 64) && i < count; i++) {
   1165 	 rmesa->hw.vpi[0].cmd[VPI_OPDST_0 + 4 * i] = vp->instr[i].op;
   1166 	 rmesa->hw.vpi[0].cmd[VPI_SRC0_0 + 4 * i] = vp->instr[i].src0;
   1167 	 rmesa->hw.vpi[0].cmd[VPI_SRC1_0 + 4 * i] = vp->instr[i].src1;
   1168 	 rmesa->hw.vpi[0].cmd[VPI_SRC2_0 + 4 * i] = vp->instr[i].src2;
   1169       }
   1170       /* hack up the cmd_size so not the whole state atom is emitted always.
   1171          This may require some more thought, we may emit half progs on lost state, but
   1172          hopefully it won't matter?
   1173          WARNING: must not use R200_DB_STATECHANGE, this will produce bogus (and rejected)
   1174          packet emits (due to the mismatched cmd_size and count in cmd/last_cmd) */
   1175       rmesa->hw.vpi[0].cmd_size = 1 + 4 * ((count > 64) ? 64 : count);
   1176       tmp.i = rmesa->hw.vpi[0].cmd[VPI_CMD_0];
   1177       tmp.veclinear.count = (count > 64) ? 64 : count;
   1178       rmesa->hw.vpi[0].cmd[VPI_CMD_0] = tmp.i;
   1179       if (count > 64) {
   1180 	 for (i = 0; i < (count - 64); i++) {
   1181 	    rmesa->hw.vpi[1].cmd[VPI_OPDST_0 + 4 * i] = vp->instr[i + 64].op;
   1182 	    rmesa->hw.vpi[1].cmd[VPI_SRC0_0 + 4 * i] = vp->instr[i + 64].src0;
   1183 	    rmesa->hw.vpi[1].cmd[VPI_SRC1_0 + 4 * i] = vp->instr[i + 64].src1;
   1184 	    rmesa->hw.vpi[1].cmd[VPI_SRC2_0 + 4 * i] = vp->instr[i + 64].src2;
   1185 	 }
   1186 	 rmesa->hw.vpi[1].cmd_size = 1 + 4 * (count - 64);
   1187 	 tmp.i = rmesa->hw.vpi[1].cmd[VPI_CMD_0];
   1188 	 tmp.veclinear.count = count - 64;
   1189 	 rmesa->hw.vpi[1].cmd[VPI_CMD_0] = tmp.i;
   1190       }
   1191       rmesa->curr_vp_hw = vp;
   1192    }
   1193 }
   1194 
   1195 
   1196 static void
   1197 r200BindProgram(struct gl_context *ctx, GLenum target, struct gl_program *prog)
   1198 {
   1199    r200ContextPtr rmesa = R200_CONTEXT(ctx);
   1200 
   1201    switch(target){
   1202    case GL_VERTEX_PROGRAM_ARB:
   1203       rmesa->curr_vp_hw = NULL;
   1204       break;
   1205    default:
   1206       _mesa_problem(ctx, "Target not supported yet!");
   1207       break;
   1208    }
   1209 }
   1210 
   1211 static struct gl_program *
   1212 r200NewProgram(struct gl_context *ctx, GLenum target, GLuint id)
   1213 {
   1214    struct r200_vertex_program *vp;
   1215 
   1216    switch(target){
   1217    case GL_VERTEX_PROGRAM_ARB:
   1218       vp = CALLOC_STRUCT(r200_vertex_program);
   1219       return _mesa_init_vertex_program(ctx, &vp->mesa_program, target, id);
   1220    case GL_FRAGMENT_PROGRAM_ARB:
   1221    case GL_FRAGMENT_PROGRAM_NV:
   1222       return _mesa_init_fragment_program( ctx, CALLOC_STRUCT(gl_fragment_program), target, id );
   1223    default:
   1224       _mesa_problem(ctx, "Bad target in r200NewProgram");
   1225    }
   1226    return NULL;
   1227 }
   1228 
   1229 
   1230 static void
   1231 r200DeleteProgram(struct gl_context *ctx, struct gl_program *prog)
   1232 {
   1233    _mesa_delete_program(ctx, prog);
   1234 }
   1235 
   1236 static GLboolean
   1237 r200ProgramStringNotify(struct gl_context *ctx, GLenum target, struct gl_program *prog)
   1238 {
   1239    struct r200_vertex_program *vp = (void *)prog;
   1240    r200ContextPtr rmesa = R200_CONTEXT(ctx);
   1241 
   1242    switch(target) {
   1243    case GL_VERTEX_PROGRAM_ARB:
   1244       vp->translated = GL_FALSE;
   1245       vp->fogpidx = 0;
   1246 /*      memset(&vp->translated, 0, sizeof(struct r200_vertex_program) - sizeof(struct gl_vertex_program));*/
   1247       r200_translate_vertex_program(ctx, vp);
   1248       rmesa->curr_vp_hw = NULL;
   1249       break;
   1250    case GL_FRAGMENT_SHADER_ATI:
   1251       rmesa->afs_loaded = NULL;
   1252       break;
   1253    }
   1254    /* need this for tcl fallbacks */
   1255    (void) _tnl_program_string(ctx, target, prog);
   1256 
   1257    /* XXX check if program is legal, within limits */
   1258    return GL_TRUE;
   1259 }
   1260 
   1261 static GLboolean
   1262 r200IsProgramNative(struct gl_context *ctx, GLenum target, struct gl_program *prog)
   1263 {
   1264    struct r200_vertex_program *vp = (void *)prog;
   1265 
   1266    switch(target){
   1267    case GL_VERTEX_STATE_PROGRAM_NV:
   1268    case GL_VERTEX_PROGRAM_ARB:
   1269       if (!vp->translated) {
   1270 	 r200_translate_vertex_program(ctx, vp);
   1271       }
   1272      /* does not take parameters etc. into account */
   1273       return vp->native;
   1274    default:
   1275       _mesa_problem(ctx, "Bad target in r200NewProgram");
   1276    }
   1277    return 0;
   1278 }
   1279 
   1280 void r200InitShaderFuncs(struct dd_function_table *functions)
   1281 {
   1282    functions->NewProgram = r200NewProgram;
   1283    functions->BindProgram = r200BindProgram;
   1284    functions->DeleteProgram = r200DeleteProgram;
   1285    functions->ProgramStringNotify = r200ProgramStringNotify;
   1286    functions->IsProgramNative = r200IsProgramNative;
   1287 }
   1288