Home | History | Annotate | Download | only in r200
      1 /**************************************************************************
      2 
      3 Copyright (C) 2005 Aapo Tahkola.
      4 
      5 All Rights Reserved.
      6 
      7 Permission is hereby granted, free of charge, to any person obtaining a
      8 copy of this software and associated documentation files (the "Software"),
      9 to deal in the Software without restriction, including without limitation
     10 on the rights to use, copy, modify, merge, publish, distribute, sub
     11 license, and/or sell copies of the Software, and to permit persons to whom
     12 the Software is furnished to do so, subject to the following conditions:
     13 
     14 The above copyright notice and this permission notice (including the next
     15 paragraph) shall be included in all copies or substantial portions of the
     16 Software.
     17 
     18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     20 FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     21 THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
     22 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     23 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     24 USE OR OTHER DEALINGS IN THE SOFTWARE.
     25 
     26 **************************************************************************/
     27 
     28 /*
     29  * Authors:
     30  *   Aapo Tahkola <aet (at) rasterburn.org>
     31  *   Roland Scheidegger <rscheidegger_lists (at) hispeed.ch>
     32  */
     33 #include "main/glheader.h"
     34 #include "main/macros.h"
     35 #include "main/enums.h"
     36 #include "program/program.h"
     37 #include "program/prog_instruction.h"
     38 #include "program/prog_parameter.h"
     39 #include "program/prog_statevars.h"
     40 #include "program/programopt.h"
     41 #include "tnl/tnl.h"
     42 
     43 #include "r200_context.h"
     44 #include "r200_vertprog.h"
     45 #include "r200_ioctl.h"
     46 #include "r200_tcl.h"
     47 
     48 #if SWIZZLE_X != VSF_IN_COMPONENT_X || \
     49     SWIZZLE_Y != VSF_IN_COMPONENT_Y || \
     50     SWIZZLE_Z != VSF_IN_COMPONENT_Z || \
     51     SWIZZLE_W != VSF_IN_COMPONENT_W || \
     52     SWIZZLE_ZERO != VSF_IN_COMPONENT_ZERO || \
     53     SWIZZLE_ONE != VSF_IN_COMPONENT_ONE || \
     54     WRITEMASK_X != VSF_FLAG_X || \
     55     WRITEMASK_Y != VSF_FLAG_Y || \
     56     WRITEMASK_Z != VSF_FLAG_Z || \
     57     WRITEMASK_W != VSF_FLAG_W
     58 #error Cannot change these!
     59 #endif
     60 
     61 #define SCALAR_FLAG (1<<31)
     62 #define FLAG_MASK (1<<31)
     63 #define OP_MASK (0xf)  /* we are unlikely to have more than 15 */
     64 #define OPN(operator, ip) {#operator, OPCODE_##operator, ip}
     65 
     66 static struct{
     67    char *name;
     68    int opcode;
     69    unsigned long ip; /* number of input operands and flags */
     70 }op_names[]={
     71    OPN(ABS, 1),
     72    OPN(ADD, 2),
     73    OPN(ARL, 1|SCALAR_FLAG),
     74    OPN(DP3, 2),
     75    OPN(DP4, 2),
     76    OPN(DPH, 2),
     77    OPN(DST, 2),
     78    OPN(EX2, 1|SCALAR_FLAG),
     79    OPN(EXP, 1|SCALAR_FLAG),
     80    OPN(FLR, 1),
     81    OPN(FRC, 1),
     82    OPN(LG2, 1|SCALAR_FLAG),
     83    OPN(LIT, 1),
     84    OPN(LOG, 1|SCALAR_FLAG),
     85    OPN(MAD, 3),
     86    OPN(MAX, 2),
     87    OPN(MIN, 2),
     88    OPN(MOV, 1),
     89    OPN(MUL, 2),
     90    OPN(POW, 2|SCALAR_FLAG),
     91    OPN(RCP, 1|SCALAR_FLAG),
     92    OPN(RSQ, 1|SCALAR_FLAG),
     93    OPN(SGE, 2),
     94    OPN(SLT, 2),
     95    OPN(SUB, 2),
     96    OPN(SWZ, 1),
     97    OPN(XPD, 2),
     98    OPN(END, 0),
     99 };
    100 #undef OPN
    101 
    102 static GLboolean r200VertexProgUpdateParams(struct gl_context *ctx, struct r200_vertex_program *vp)
    103 {
    104    r200ContextPtr rmesa = R200_CONTEXT( ctx );
    105    GLfloat *fcmd = (GLfloat *)&rmesa->hw.vpp[0].cmd[VPP_CMD_0 + 1];
    106    int pi;
    107    struct gl_program *mesa_vp = &vp->mesa_program;
    108    struct gl_program_parameter_list *paramList;
    109    drm_radeon_cmd_header_t tmp;
    110 
    111    R200_STATECHANGE( rmesa, vpp[0] );
    112    R200_STATECHANGE( rmesa, vpp[1] );
    113    assert(mesa_vp->Parameters);
    114    _mesa_load_state_parameters(ctx, mesa_vp->Parameters);
    115    paramList = mesa_vp->Parameters;
    116 
    117    if(paramList->NumParameters > R200_VSF_MAX_PARAM){
    118       fprintf(stderr, "%s:Params exhausted\n", __func__);
    119       return GL_FALSE;
    120    }
    121 
    122    for(pi = 0; pi < paramList->NumParameters; pi++) {
    123       switch(paramList->Parameters[pi].Type) {
    124       case PROGRAM_STATE_VAR:
    125       //fprintf(stderr, "%s", vp->Parameters->Parameters[pi].Name);
    126       case PROGRAM_CONSTANT:
    127 	 *fcmd++ = paramList->ParameterValues[pi][0].f;
    128 	 *fcmd++ = paramList->ParameterValues[pi][1].f;
    129 	 *fcmd++ = paramList->ParameterValues[pi][2].f;
    130 	 *fcmd++ = paramList->ParameterValues[pi][3].f;
    131 	 break;
    132       default:
    133 	 _mesa_problem(NULL, "Bad param type in %s", __func__);
    134 	 break;
    135       }
    136       if (pi == 95) {
    137 	 fcmd = (GLfloat *)&rmesa->hw.vpp[1].cmd[VPP_CMD_0 + 1];
    138       }
    139    }
    140    /* hack up the cmd_size so not the whole state atom is emitted always. */
    141    rmesa->hw.vpp[0].cmd_size =
    142       1 + 4 * ((paramList->NumParameters > 96) ? 96 : paramList->NumParameters);
    143    tmp.i = rmesa->hw.vpp[0].cmd[VPP_CMD_0];
    144    tmp.veclinear.count = (paramList->NumParameters > 96) ? 96 : paramList->NumParameters;
    145    rmesa->hw.vpp[0].cmd[VPP_CMD_0] = tmp.i;
    146    if (paramList->NumParameters > 96) {
    147       rmesa->hw.vpp[1].cmd_size = 1 + 4 * (paramList->NumParameters - 96);
    148       tmp.i = rmesa->hw.vpp[1].cmd[VPP_CMD_0];
    149       tmp.veclinear.count = paramList->NumParameters - 96;
    150       rmesa->hw.vpp[1].cmd[VPP_CMD_0] = tmp.i;
    151    }
    152    return GL_TRUE;
    153 }
    154 
    155 static inline unsigned long t_dst_mask(GLuint mask)
    156 {
    157    /* WRITEMASK_* is equivalent to VSF_FLAG_* */
    158    return mask & VSF_FLAG_ALL;
    159 }
    160 
    161 static unsigned long t_dst(struct prog_dst_register *dst)
    162 {
    163    switch(dst->File) {
    164    case PROGRAM_TEMPORARY:
    165       return ((dst->Index << R200_VPI_OUT_REG_INDEX_SHIFT)
    166 	 | R200_VSF_OUT_CLASS_TMP);
    167    case PROGRAM_OUTPUT:
    168       switch (dst->Index) {
    169       case VARYING_SLOT_POS:
    170 	 return R200_VSF_OUT_CLASS_RESULT_POS;
    171       case VARYING_SLOT_COL0:
    172 	 return R200_VSF_OUT_CLASS_RESULT_COLOR;
    173       case VARYING_SLOT_COL1:
    174 	 return ((1 << R200_VPI_OUT_REG_INDEX_SHIFT)
    175 	    | R200_VSF_OUT_CLASS_RESULT_COLOR);
    176       case VARYING_SLOT_FOGC:
    177 	 return R200_VSF_OUT_CLASS_RESULT_FOGC;
    178       case VARYING_SLOT_TEX0:
    179       case VARYING_SLOT_TEX1:
    180       case VARYING_SLOT_TEX2:
    181       case VARYING_SLOT_TEX3:
    182       case VARYING_SLOT_TEX4:
    183       case VARYING_SLOT_TEX5:
    184 	 return (((dst->Index - VARYING_SLOT_TEX0) << R200_VPI_OUT_REG_INDEX_SHIFT)
    185 	    | R200_VSF_OUT_CLASS_RESULT_TEXC);
    186       case VARYING_SLOT_PSIZ:
    187 	 return R200_VSF_OUT_CLASS_RESULT_POINTSIZE;
    188       default:
    189 	 fprintf(stderr, "problem in %s, unknown dst output reg %d\n", __func__, dst->Index);
    190 	 exit(0);
    191 	 return 0;
    192       }
    193    case PROGRAM_ADDRESS:
    194       assert (dst->Index == 0);
    195       return R200_VSF_OUT_CLASS_ADDR;
    196    default:
    197       fprintf(stderr, "problem in %s, unknown register type %d\n", __func__, dst->File);
    198       exit(0);
    199       return 0;
    200    }
    201 }
    202 
    203 static unsigned long t_src_class(gl_register_file file)
    204 {
    205 
    206    switch(file){
    207    case PROGRAM_TEMPORARY:
    208       return VSF_IN_CLASS_TMP;
    209 
    210    case PROGRAM_INPUT:
    211       return VSF_IN_CLASS_ATTR;
    212 
    213    case PROGRAM_CONSTANT:
    214    case PROGRAM_STATE_VAR:
    215       return VSF_IN_CLASS_PARAM;
    216    /*
    217    case PROGRAM_OUTPUT:
    218    case PROGRAM_ADDRESS:
    219    */
    220    default:
    221       fprintf(stderr, "problem in %s", __func__);
    222       exit(0);
    223    }
    224 }
    225 
    226 static inline unsigned long t_swizzle(GLubyte swizzle)
    227 {
    228 /* this is in fact a NOP as the Mesa SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
    229    return swizzle;
    230 }
    231 
    232 #if 0
    233 static void vp_dump_inputs(struct r200_vertex_program *vp, char *caller)
    234 {
    235    int i;
    236 
    237    if(vp == NULL){
    238       fprintf(stderr, "vp null in call to %s from %s\n", __func__, caller);
    239       return ;
    240    }
    241 
    242    fprintf(stderr, "%s:<", caller);
    243    for(i=0; i < VERT_ATTRIB_MAX; i++)
    244    fprintf(stderr, "%d ", vp->inputs[i]);
    245    fprintf(stderr, ">\n");
    246 
    247 }
    248 #endif
    249 
    250 static unsigned long t_src_index(struct r200_vertex_program *vp, struct prog_src_register *src)
    251 {
    252 /*
    253    int i;
    254    int max_reg = -1;
    255 */
    256    if(src->File == PROGRAM_INPUT){
    257 /*      if(vp->inputs[src->Index] != -1)
    258 	 return vp->inputs[src->Index];
    259 
    260       for(i=0; i < VERT_ATTRIB_MAX; i++)
    261 	 if(vp->inputs[i] > max_reg)
    262 	    max_reg = vp->inputs[i];
    263 
    264       vp->inputs[src->Index] = max_reg+1;*/
    265 
    266       //vp_dump_inputs(vp, __func__);
    267       assert(vp->inputs[src->Index] != -1);
    268       return vp->inputs[src->Index];
    269    } else {
    270       if (src->Index < 0) {
    271 	 fprintf(stderr, "WARNING negative offsets for indirect addressing do not work\n");
    272 	 return 0;
    273       }
    274       return src->Index;
    275    }
    276 }
    277 
    278 static unsigned long t_src(struct r200_vertex_program *vp, struct prog_src_register *src)
    279 {
    280 
    281    return MAKE_VSF_SOURCE(t_src_index(vp, src),
    282 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    283 			t_swizzle(GET_SWZ(src->Swizzle, 1)),
    284 			t_swizzle(GET_SWZ(src->Swizzle, 2)),
    285 			t_swizzle(GET_SWZ(src->Swizzle, 3)),
    286 			t_src_class(src->File),
    287 			src->Negate) | (src->RelAddr << 4);
    288 }
    289 
    290 static unsigned long t_src_scalar(struct r200_vertex_program *vp, struct prog_src_register *src)
    291 {
    292 
    293    return MAKE_VSF_SOURCE(t_src_index(vp, src),
    294 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    295 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    296 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    297 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
    298 			t_src_class(src->File),
    299 			src->Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src->RelAddr << 4);
    300 }
    301 
    302 static unsigned long t_opcode(enum prog_opcode opcode)
    303 {
    304 
    305    switch(opcode){
    306    case OPCODE_ADD: return R200_VPI_OUT_OP_ADD;
    307    /* FIXME: ARL works fine, but negative offsets won't work - fglrx just
    308     * seems to ignore neg offsets which isn't quite correct...
    309     */
    310    case OPCODE_ARL: return R200_VPI_OUT_OP_ARL;
    311    case OPCODE_DP4: return R200_VPI_OUT_OP_DOT;
    312    case OPCODE_DST: return R200_VPI_OUT_OP_DST;
    313    case OPCODE_EX2: return R200_VPI_OUT_OP_EX2;
    314    case OPCODE_EXP: return R200_VPI_OUT_OP_EXP;
    315    case OPCODE_FRC: return R200_VPI_OUT_OP_FRC;
    316    case OPCODE_LG2: return R200_VPI_OUT_OP_LG2;
    317    case OPCODE_LIT: return R200_VPI_OUT_OP_LIT;
    318    case OPCODE_LOG: return R200_VPI_OUT_OP_LOG;
    319    case OPCODE_MAX: return R200_VPI_OUT_OP_MAX;
    320    case OPCODE_MIN: return R200_VPI_OUT_OP_MIN;
    321    case OPCODE_MUL: return R200_VPI_OUT_OP_MUL;
    322    case OPCODE_RCP: return R200_VPI_OUT_OP_RCP;
    323    case OPCODE_RSQ: return R200_VPI_OUT_OP_RSQ;
    324    case OPCODE_SGE: return R200_VPI_OUT_OP_SGE;
    325    case OPCODE_SLT: return R200_VPI_OUT_OP_SLT;
    326 
    327    default:
    328       fprintf(stderr, "%s: Should not be called with opcode %d!", __func__, opcode);
    329    }
    330    exit(-1);
    331    return 0;
    332 }
    333 
    334 static unsigned long op_operands(enum prog_opcode opcode)
    335 {
    336    int i;
    337 
    338    /* Can we trust mesas opcodes to be in order ? */
    339    for(i=0; i < sizeof(op_names) / sizeof(*op_names); i++)
    340       if(op_names[i].opcode == opcode)
    341 	 return op_names[i].ip;
    342 
    343    fprintf(stderr, "op %d not found in op_names\n", opcode);
    344    exit(-1);
    345    return 0;
    346 }
    347 
    348 /* TODO: Get rid of t_src_class call */
    349 #define CMP_SRCS(a, b) (((a.RelAddr != b.RelAddr) || (a.Index != b.Index)) && \
    350 		       ((t_src_class(a.File) == VSF_IN_CLASS_PARAM && \
    351 			 t_src_class(b.File) == VSF_IN_CLASS_PARAM) || \
    352 			(t_src_class(a.File) == VSF_IN_CLASS_ATTR && \
    353 			 t_src_class(b.File) == VSF_IN_CLASS_ATTR)))
    354 
    355 /* fglrx on rv250 codes up unused sources as follows:
    356    unused but necessary sources are same as previous source, zero-ed out.
    357    unnecessary sources are same as previous source but with VSF_IN_CLASS_NONE set.
    358    i.e. an add (2 args) has its 2nd arg (if you use it as mov) zero-ed out, and 3rd arg
    359    set to VSF_IN_CLASS_NONE. Not sure if strictly necessary. */
    360 
    361 /* use these simpler definitions. Must obviously not be used with not yet set up regs.
    362    Those are NOT semantically equivalent to the r300 ones, requires code changes */
    363 #define ZERO_SRC_0 (((o_inst->src0 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
    364 				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
    365 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
    366 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
    367 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
    368 
    369 #define ZERO_SRC_1 (((o_inst->src1 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
    370 				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
    371 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
    372 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
    373 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
    374 
    375 #define ZERO_SRC_2 (((o_inst->src2 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
    376 				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
    377 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
    378 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
    379 				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
    380 
    381 #define UNUSED_SRC_0 ((o_inst->src0 & ~15) | 9)
    382 
    383 #define UNUSED_SRC_1 ((o_inst->src1 & ~15) | 9)
    384 
    385 #define UNUSED_SRC_2 ((o_inst->src2 & ~15) | 9)
    386 
    387 
    388 /**
    389  * Generate an R200 vertex program from Mesa's internal representation.
    390  *
    391  * \return  GL_TRUE for success, GL_FALSE for failure.
    392  */
    393 static GLboolean r200_translate_vertex_program(struct gl_context *ctx, struct r200_vertex_program *vp)
    394 {
    395    struct gl_program *mesa_vp = &vp->mesa_program;
    396    struct prog_instruction *vpi;
    397    int i;
    398    VERTEX_SHADER_INSTRUCTION *o_inst;
    399    unsigned long operands;
    400    int are_srcs_scalar;
    401    unsigned long hw_op;
    402    int dofogfix = 0;
    403    int fog_temp_i = 0;
    404    int free_inputs;
    405    int array_count = 0;
    406    int u_temp_used;
    407 
    408    vp->native = GL_FALSE;
    409    vp->translated = GL_TRUE;
    410    vp->fogmode = ctx->Fog.Mode;
    411 
    412    if (mesa_vp->arb.NumInstructions == 0)
    413       return GL_FALSE;
    414 
    415 #if 0
    416    if ((mesa_vp->info.inputs_read &
    417       ~(VERT_BIT_POS | VERT_BIT_NORMAL | VERT_BIT_COLOR0 | VERT_BIT_COLOR1 |
    418       VERT_BIT_FOG | VERT_BIT_TEX0 | VERT_BIT_TEX1 | VERT_BIT_TEX2 |
    419       VERT_BIT_TEX3 | VERT_BIT_TEX4 | VERT_BIT_TEX5)) != 0) {
    420       if (R200_DEBUG & RADEON_FALLBACKS) {
    421 	 fprintf(stderr, "can't handle vert prog inputs 0x%x\n",
    422 	    mesa_vp->info.inputs_read);
    423       }
    424       return GL_FALSE;
    425    }
    426 #endif
    427 
    428    if ((mesa_vp->info.outputs_written &
    429       ~((1 << VARYING_SLOT_POS) | (1 << VARYING_SLOT_COL0) | (1 << VARYING_SLOT_COL1) |
    430       (1 << VARYING_SLOT_FOGC) | (1 << VARYING_SLOT_TEX0) | (1 << VARYING_SLOT_TEX1) |
    431       (1 << VARYING_SLOT_TEX2) | (1 << VARYING_SLOT_TEX3) | (1 << VARYING_SLOT_TEX4) |
    432       (1 << VARYING_SLOT_TEX5) | (1 << VARYING_SLOT_PSIZ))) != 0) {
    433       if (R200_DEBUG & RADEON_FALLBACKS) {
    434 	 fprintf(stderr, "can't handle vert prog outputs 0x%llx\n",
    435                  (unsigned long long) mesa_vp->info.outputs_written);
    436       }
    437       return GL_FALSE;
    438    }
    439 
    440    /* Initial value should be last tmp reg that hw supports.
    441       Strangely enough r300 doesnt mind even though these would be out of range.
    442       Smart enough to realize that it doesnt need it? */
    443    int u_temp_i = R200_VSF_MAX_TEMPS - 1;
    444    struct prog_src_register src[3];
    445    struct prog_dst_register dst;
    446 
    447 /* FIXME: is changing the prog safe to do here? */
    448    if (mesa_vp->arb.IsPositionInvariant &&
    449       /* make sure we only do this once */
    450        !(mesa_vp->info.outputs_written & (1 << VARYING_SLOT_POS))) {
    451 	 _mesa_insert_mvp_code(ctx, mesa_vp);
    452       }
    453 
    454    /* for fogc, can't change mesa_vp, as it would hose swtnl, and exp with
    455       base e isn't directly available neither. */
    456    if ((mesa_vp->info.outputs_written & (1 << VARYING_SLOT_FOGC)) &&
    457        !vp->fogpidx) {
    458       struct gl_program_parameter_list *paramList;
    459       gl_state_index tokens[STATE_LENGTH] = { STATE_FOG_PARAMS, 0, 0, 0, 0 };
    460       paramList = mesa_vp->Parameters;
    461       vp->fogpidx = _mesa_add_state_reference(paramList, tokens);
    462    }
    463 
    464    vp->pos_end = 0;
    465    mesa_vp->arb.NumNativeInstructions = 0;
    466    if (mesa_vp->Parameters)
    467       mesa_vp->arb.NumNativeParameters = mesa_vp->Parameters->NumParameters;
    468    else
    469       mesa_vp->arb.NumNativeParameters = 0;
    470 
    471    for(i = 0; i < VERT_ATTRIB_MAX; i++)
    472       vp->inputs[i] = -1;
    473    for(i = 0; i < 15; i++)
    474       vp->inputmap_rev[i] = 255;
    475    free_inputs = 0x2ffd;
    476 
    477 /* fglrx uses fixed inputs as follows for conventional attribs.
    478    generic attribs use non-fixed assignment, fglrx will always use the
    479    lowest attrib values available. We'll just do the same.
    480    There are 12 generic attribs possible, corresponding to attrib 0, 2-11
    481    and 13 in a hw vertex prog.
    482    attr 1 and 12 aren't used for generic attribs as those cannot be made vec4
    483    (correspond to vertex normal/weight - maybe weight actually could be made vec4).
    484    Additionally, not more than 12 arrays in total are possible I think.
    485    attr 0 is pos, R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0
    486    attr 2-5 use colors 0-3 (R200_VTX_FP_RGBA << R200_VTX_COLOR_0/1/2/3_SHIFT in R200_SE_VTX_FMT_0)
    487    attr 6-11 use tex 0-5 (4 << R200_VTX_TEX0/1/2/3/4/5_COMP_CNT_SHIFT in R200_SE_VTX_FMT_1)
    488    attr 13 uses vtx1 pos (R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0)
    489 */
    490 
    491 /* attr 4,5 and 13 are only used with generic attribs.
    492    Haven't seen attr 14 used, maybe that's for the hw pointsize vec1 (which is
    493    not possibe to use with vertex progs as it is lacking in vert prog specification) */
    494 /* may look different when using idx buf / input_route instead of se_vtx_fmt? */
    495    if (mesa_vp->info.inputs_read & VERT_BIT_POS) {
    496       vp->inputs[VERT_ATTRIB_POS] = 0;
    497       vp->inputmap_rev[0] = VERT_ATTRIB_POS;
    498       free_inputs &= ~(1 << 0);
    499       array_count++;
    500    }
    501    if (mesa_vp->info.inputs_read & VERT_BIT_NORMAL) {
    502       vp->inputs[VERT_ATTRIB_NORMAL] = 1;
    503       vp->inputmap_rev[2] = VERT_ATTRIB_NORMAL;
    504       array_count++;
    505    }
    506    if (mesa_vp->info.inputs_read & VERT_BIT_COLOR0) {
    507       vp->inputs[VERT_ATTRIB_COLOR0] = 2;
    508       vp->inputmap_rev[4] = VERT_ATTRIB_COLOR0;
    509       free_inputs &= ~(1 << 2);
    510       array_count++;
    511    }
    512    if (mesa_vp->info.inputs_read & VERT_BIT_COLOR1) {
    513       vp->inputs[VERT_ATTRIB_COLOR1] = 3;
    514       vp->inputmap_rev[5] = VERT_ATTRIB_COLOR1;
    515       free_inputs &= ~(1 << 3);
    516       array_count++;
    517    }
    518    if (mesa_vp->info.inputs_read & VERT_BIT_FOG) {
    519       vp->inputs[VERT_ATTRIB_FOG] = 15; array_count++;
    520       vp->inputmap_rev[3] = VERT_ATTRIB_FOG;
    521       array_count++;
    522    }
    523    /* VERT_ATTRIB_TEX0-5 */
    524    for (i = 0; i <= 5; i++) {
    525       if (mesa_vp->info.inputs_read & VERT_BIT_TEX(i)) {
    526 	 vp->inputs[VERT_ATTRIB_TEX(i)] = i + 6;
    527 	 vp->inputmap_rev[8 + i] = VERT_ATTRIB_TEX(i);
    528 	 free_inputs &= ~(1 << (i + 6));
    529 	 array_count++;
    530       }
    531    }
    532    /* using VERT_ATTRIB_TEX6/7 would be illegal */
    533    for (; i < VERT_ATTRIB_TEX_MAX; i++) {
    534       if (mesa_vp->info.inputs_read & VERT_BIT_TEX(i)) {
    535           if (R200_DEBUG & RADEON_FALLBACKS) {
    536               fprintf(stderr, "texture attribute %d in vert prog\n", i);
    537           }
    538           return GL_FALSE;
    539       }
    540    }
    541    /* completely ignore aliasing? */
    542    for (i = 0; i < VERT_ATTRIB_GENERIC_MAX; i++) {
    543       int j;
    544    /* completely ignore aliasing? */
    545       if (mesa_vp->info.inputs_read & VERT_BIT_GENERIC(i)) {
    546 	 array_count++;
    547 	 if (array_count > 12) {
    548 	    if (R200_DEBUG & RADEON_FALLBACKS) {
    549 	       fprintf(stderr, "more than 12 attribs used in vert prog\n");
    550 	    }
    551 	    return GL_FALSE;
    552 	 }
    553 	 for (j = 0; j < 14; j++) {
    554 	    /* will always find one due to limited array_count */
    555 	    if (free_inputs & (1 << j)) {
    556 	       free_inputs &= ~(1 << j);
    557 	       vp->inputs[VERT_ATTRIB_GENERIC(i)] = j;
    558 	       if (j == 0) {
    559                   /* mapped to pos */
    560                   vp->inputmap_rev[j] = VERT_ATTRIB_GENERIC(i);
    561 	       } else if (j < 12) {
    562                   /* mapped to col/tex */
    563                   vp->inputmap_rev[j + 2] = VERT_ATTRIB_GENERIC(i);
    564 	       } else {
    565                   /* mapped to pos1 */
    566                   vp->inputmap_rev[j + 1] = VERT_ATTRIB_GENERIC(i);
    567                }
    568 	       break;
    569 	    }
    570 	 }
    571       }
    572    }
    573 
    574    if (!(mesa_vp->info.outputs_written & (1 << VARYING_SLOT_POS))) {
    575       if (R200_DEBUG & RADEON_FALLBACKS) {
    576 	 fprintf(stderr, "can't handle vert prog without position output\n");
    577       }
    578       return GL_FALSE;
    579    }
    580    if (free_inputs & 1) {
    581       if (R200_DEBUG & RADEON_FALLBACKS) {
    582 	 fprintf(stderr, "can't handle vert prog without position input\n");
    583       }
    584       return GL_FALSE;
    585    }
    586 
    587    o_inst = vp->instr;
    588    for (vpi = mesa_vp->arb.Instructions; vpi->Opcode != OPCODE_END; vpi++, o_inst++){
    589       operands = op_operands(vpi->Opcode);
    590       are_srcs_scalar = operands & SCALAR_FLAG;
    591       operands &= OP_MASK;
    592 
    593       for(i = 0; i < operands; i++) {
    594 	 src[i] = vpi->SrcReg[i];
    595 	 /* hack up default attrib values as per spec as swizzling.
    596 	    normal, fog, secondary color. Crazy?
    597 	    May need more if we don't submit vec4 elements? */
    598 	 if (src[i].File == PROGRAM_INPUT) {
    599 	    if (src[i].Index == VERT_ATTRIB_NORMAL) {
    600 	       int j;
    601 	       for (j = 0; j < 4; j++) {
    602 		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
    603 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    604 		     src[i].Swizzle |= SWIZZLE_ONE << (j*3);
    605 		  }
    606 	       }
    607 	    }
    608 	    else if (src[i].Index == VERT_ATTRIB_COLOR1) {
    609 	       int j;
    610 	       for (j = 0; j < 4; j++) {
    611 		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
    612 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    613 		     src[i].Swizzle |= SWIZZLE_ZERO << (j*3);
    614 		  }
    615 	       }
    616 	    }
    617 	    else if (src[i].Index == VERT_ATTRIB_FOG) {
    618 	       int j;
    619 	       for (j = 0; j < 4; j++) {
    620 		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
    621 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    622 		     src[i].Swizzle |= SWIZZLE_ONE << (j*3);
    623 		  }
    624 		  else if ((GET_SWZ(src[i].Swizzle, j) == SWIZZLE_Y) ||
    625 			    GET_SWZ(src[i].Swizzle, j) == SWIZZLE_Z) {
    626 		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
    627 		     src[i].Swizzle |= SWIZZLE_ZERO << (j*3);
    628 		  }
    629 	       }
    630 	    }
    631 	 }
    632       }
    633 
    634       if(operands == 3){
    635 	 if( CMP_SRCS(src[1], src[2]) || CMP_SRCS(src[0], src[2]) ){
    636 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
    637 		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    638 		VSF_FLAG_ALL);
    639 
    640 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[2]),
    641 		  SWIZZLE_X, SWIZZLE_Y,
    642 		  SWIZZLE_Z, SWIZZLE_W,
    643 		  t_src_class(src[2].File), VSF_FLAG_NONE) | (src[2].RelAddr << 4);
    644 
    645 	    o_inst->src1 = ZERO_SRC_0;
    646 	    o_inst->src2 = UNUSED_SRC_1;
    647 	    o_inst++;
    648 
    649 	    src[2].File = PROGRAM_TEMPORARY;
    650 	    src[2].Index = u_temp_i;
    651 	    src[2].RelAddr = 0;
    652 	    u_temp_i--;
    653 	 }
    654       }
    655 
    656       if(operands >= 2){
    657 	 if( CMP_SRCS(src[1], src[0]) ){
    658 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
    659 		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    660 		VSF_FLAG_ALL);
    661 
    662 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    663 		  SWIZZLE_X, SWIZZLE_Y,
    664 		  SWIZZLE_Z, SWIZZLE_W,
    665 		  t_src_class(src[0].File), VSF_FLAG_NONE) | (src[0].RelAddr << 4);
    666 
    667 	    o_inst->src1 = ZERO_SRC_0;
    668 	    o_inst->src2 = UNUSED_SRC_1;
    669 	    o_inst++;
    670 
    671 	    src[0].File = PROGRAM_TEMPORARY;
    672 	    src[0].Index = u_temp_i;
    673 	    src[0].RelAddr = 0;
    674 	    u_temp_i--;
    675 	 }
    676       }
    677 
    678       dst = vpi->DstReg;
    679       if (dst.File == PROGRAM_OUTPUT &&
    680 	  dst.Index == VARYING_SLOT_FOGC &&
    681 	  dst.WriteMask & WRITEMASK_X) {
    682 	  fog_temp_i = u_temp_i;
    683 	  dst.File = PROGRAM_TEMPORARY;
    684 	  dst.Index = fog_temp_i;
    685 	  dofogfix = 1;
    686 	  u_temp_i--;
    687       }
    688 
    689       /* These ops need special handling. */
    690       switch(vpi->Opcode){
    691       case OPCODE_POW:
    692 /* pow takes only one argument, first scalar is in slot x, 2nd in slot z (other slots don't matter).
    693    So may need to insert additional instruction */
    694 	 if ((src[0].File == src[1].File) &&
    695 	     (src[0].Index == src[1].Index)) {
    696 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&dst),
    697 		   t_dst_mask(dst.WriteMask));
    698 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    699 		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    700 		   SWIZZLE_ZERO,
    701 		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
    702 		   SWIZZLE_ZERO,
    703 		   t_src_class(src[0].File),
    704 		   src[0].Negate) | (src[0].RelAddr << 4);
    705 	    o_inst->src1 = UNUSED_SRC_0;
    706 	    o_inst->src2 = UNUSED_SRC_0;
    707 	 }
    708 	 else {
    709 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
    710 		   (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    711 		   VSF_FLAG_ALL);
    712 	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    713 		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    714 		   SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO,
    715 		   t_src_class(src[0].File),
    716 		   src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
    717 	    o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    718 		   SWIZZLE_ZERO, SWIZZLE_ZERO,
    719 		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)), SWIZZLE_ZERO,
    720 		   t_src_class(src[1].File),
    721 		   src[1].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
    722 	    o_inst->src2 = UNUSED_SRC_1;
    723 	    o_inst++;
    724 
    725 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&dst),
    726 		   t_dst_mask(dst.WriteMask));
    727 	    o_inst->src0 = MAKE_VSF_SOURCE(u_temp_i,
    728 		   VSF_IN_COMPONENT_X,
    729 		   VSF_IN_COMPONENT_Y,
    730 		   VSF_IN_COMPONENT_Z,
    731 		   VSF_IN_COMPONENT_W,
    732 		   VSF_IN_CLASS_TMP,
    733 		   VSF_FLAG_NONE);
    734 	    o_inst->src1 = UNUSED_SRC_0;
    735 	    o_inst->src2 = UNUSED_SRC_0;
    736 	    u_temp_i--;
    737 	 }
    738 	 goto next;
    739 
    740       case OPCODE_MOV://ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO}
    741       case OPCODE_SWZ:
    742 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
    743 		t_dst_mask(dst.WriteMask));
    744 	 o_inst->src0 = t_src(vp, &src[0]);
    745 	 o_inst->src1 = ZERO_SRC_0;
    746 	 o_inst->src2 = UNUSED_SRC_1;
    747 	 goto next;
    748 
    749       case OPCODE_MAD:
    750 	 /* only 2 read ports into temp memory thus may need the macro op MAD_2
    751 	    instead (requiring 2 clocks) if all inputs are in temp memory
    752 	    (and, only if they actually reference 3 distinct temps) */
    753 	 hw_op=(src[0].File == PROGRAM_TEMPORARY &&
    754 	    src[1].File == PROGRAM_TEMPORARY &&
    755 	    src[2].File == PROGRAM_TEMPORARY &&
    756 	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[1].RelAddr << 8) | src[1].Index)) &&
    757 	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[2].RelAddr << 8) | src[2].Index)) &&
    758 	    (((src[1].RelAddr << 8) | src[1].Index) != ((src[2].RelAddr << 8) | src[2].Index))) ?
    759 	    R200_VPI_OUT_OP_MAD_2 : R200_VPI_OUT_OP_MAD;
    760 
    761 	 o_inst->op = MAKE_VSF_OP(hw_op, t_dst(&dst),
    762 	    t_dst_mask(dst.WriteMask));
    763 	 o_inst->src0 = t_src(vp, &src[0]);
    764 #if 0
    765 if ((o_inst - vp->instr) == 31) {
    766 /* fix up the broken vertex program of quake4 demo... */
    767 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    768 			SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X,
    769 			t_src_class(src[1].File),
    770 			src[1].Negate) | (src[1].RelAddr << 4);
    771 o_inst->src2 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    772 			SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y,
    773 			t_src_class(src[1].File),
    774 			src[1].Negate) | (src[1].RelAddr << 4);
    775 }
    776 else {
    777 	 o_inst->src1 = t_src(vp, &src[1]);
    778 	 o_inst->src2 = t_src(vp, &src[2]);
    779 }
    780 #else
    781 	 o_inst->src1 = t_src(vp, &src[1]);
    782 	 o_inst->src2 = t_src(vp, &src[2]);
    783 #endif
    784 	 goto next;
    785 
    786       case OPCODE_DP3://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ZERO} PARAM 0{} {X Y Z ZERO}
    787 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&dst),
    788 		t_dst_mask(dst.WriteMask));
    789 
    790 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    791 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    792 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
    793 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
    794 		SWIZZLE_ZERO,
    795 		t_src_class(src[0].File),
    796 		src[0].Negate) | (src[0].RelAddr << 4);
    797 
    798 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    799 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
    800 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
    801 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
    802 		SWIZZLE_ZERO,
    803 		t_src_class(src[1].File),
    804 		src[1].Negate) | (src[1].RelAddr << 4);
    805 
    806 	 o_inst->src2 = UNUSED_SRC_1;
    807 	 goto next;
    808 
    809       case OPCODE_DPH://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W}
    810 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&dst),
    811 		t_dst_mask(dst.WriteMask));
    812 
    813 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    814 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    815 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
    816 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
    817 		VSF_IN_COMPONENT_ONE,
    818 		t_src_class(src[0].File),
    819 		src[0].Negate) | (src[0].RelAddr << 4);
    820 	 o_inst->src1 = t_src(vp, &src[1]);
    821 	 o_inst->src2 = UNUSED_SRC_1;
    822 	 goto next;
    823 
    824       case OPCODE_SUB://ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
    825 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
    826 		t_dst_mask(dst.WriteMask));
    827 
    828 	 o_inst->src0 = t_src(vp, &src[0]);
    829 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    830 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
    831 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
    832 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
    833 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
    834 		t_src_class(src[1].File),
    835 		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
    836 	 o_inst->src2 = UNUSED_SRC_1;
    837 	 goto next;
    838 
    839       case OPCODE_ABS://MAX RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
    840 	 o_inst->op=MAKE_VSF_OP(R200_VPI_OUT_OP_MAX, t_dst(&dst),
    841 		t_dst_mask(dst.WriteMask));
    842 
    843 	 o_inst->src0=t_src(vp, &src[0]);
    844 	 o_inst->src1=MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    845 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
    846 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
    847 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
    848 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)),
    849 		t_src_class(src[0].File),
    850 		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
    851 	 o_inst->src2 = UNUSED_SRC_1;
    852 	 goto next;
    853 
    854       case OPCODE_FLR:
    855       /* FRC TMP 0.X Y Z W PARAM 0{} {X Y Z W}
    856          ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} TMP 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W */
    857 
    858 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_FRC,
    859 	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    860 	    t_dst_mask(dst.WriteMask));
    861 
    862 	 o_inst->src0 = t_src(vp, &src[0]);
    863 	 o_inst->src1 = UNUSED_SRC_0;
    864 	 o_inst->src2 = UNUSED_SRC_1;
    865 	 o_inst++;
    866 
    867 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
    868 		t_dst_mask(dst.WriteMask));
    869 
    870 	 o_inst->src0 = t_src(vp, &src[0]);
    871 	 o_inst->src1 = MAKE_VSF_SOURCE(u_temp_i,
    872 		VSF_IN_COMPONENT_X,
    873 		VSF_IN_COMPONENT_Y,
    874 		VSF_IN_COMPONENT_Z,
    875 		VSF_IN_COMPONENT_W,
    876 		VSF_IN_CLASS_TMP,
    877 		/* Not 100% sure about this */
    878 		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE/*VSF_FLAG_ALL*/);
    879 
    880 	 o_inst->src2 = UNUSED_SRC_0;
    881 	 u_temp_i--;
    882 	 goto next;
    883 
    884       case OPCODE_XPD:
    885 	 /* mul r0, r1.yzxw, r2.zxyw
    886 	    mad r0, -r2.yzxw, r1.zxyw, r0
    887 	  */
    888 	 hw_op=(src[0].File == PROGRAM_TEMPORARY &&
    889 	    src[1].File == PROGRAM_TEMPORARY &&
    890 	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[1].RelAddr << 8) | src[1].Index))) ?
    891 	    R200_VPI_OUT_OP_MAD_2 : R200_VPI_OUT_OP_MAD;
    892 
    893 	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
    894 	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
    895 	    t_dst_mask(dst.WriteMask));
    896 
    897 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    898 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
    899 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
    900 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
    901 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
    902 		t_src_class(src[0].File),
    903 		src[0].Negate) | (src[0].RelAddr << 4);
    904 
    905 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    906 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
    907 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
    908 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
    909 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
    910 		t_src_class(src[1].File),
    911 		src[1].Negate) | (src[1].RelAddr << 4);
    912 
    913 	 o_inst->src2 = UNUSED_SRC_1;
    914 	 o_inst++;
    915 	 u_temp_i--;
    916 
    917 	 o_inst->op = MAKE_VSF_OP(hw_op, t_dst(&dst),
    918 		t_dst_mask(dst.WriteMask));
    919 
    920 	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
    921 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
    922 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
    923 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
    924 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
    925 		t_src_class(src[1].File),
    926 		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
    927 
    928 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
    929 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
    930 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
    931 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
    932 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
    933 		t_src_class(src[0].File),
    934 		src[0].Negate) | (src[0].RelAddr << 4);
    935 
    936 	 o_inst->src2 = MAKE_VSF_SOURCE(u_temp_i+1,
    937 		VSF_IN_COMPONENT_X,
    938 		VSF_IN_COMPONENT_Y,
    939 		VSF_IN_COMPONENT_Z,
    940 		VSF_IN_COMPONENT_W,
    941 		VSF_IN_CLASS_TMP,
    942 		VSF_FLAG_NONE);
    943 	 goto next;
    944 
    945       case OPCODE_END:
    946 	 assert(0);
    947       default:
    948 	 break;
    949       }
    950 
    951       o_inst->op = MAKE_VSF_OP(t_opcode(vpi->Opcode), t_dst(&dst),
    952 	    t_dst_mask(dst.WriteMask));
    953 
    954       if(are_srcs_scalar){
    955 	 switch(operands){
    956 	    case 1:
    957 		o_inst->src0 = t_src_scalar(vp, &src[0]);
    958 		o_inst->src1 = UNUSED_SRC_0;
    959 		o_inst->src2 = UNUSED_SRC_1;
    960 	    break;
    961 
    962 	    case 2:
    963 		o_inst->src0 = t_src_scalar(vp, &src[0]);
    964 		o_inst->src1 = t_src_scalar(vp, &src[1]);
    965 		o_inst->src2 = UNUSED_SRC_1;
    966 	    break;
    967 
    968 	    case 3:
    969 		o_inst->src0 = t_src_scalar(vp, &src[0]);
    970 		o_inst->src1 = t_src_scalar(vp, &src[1]);
    971 		o_inst->src2 = t_src_scalar(vp, &src[2]);
    972 	    break;
    973 
    974 	    default:
    975 		fprintf(stderr, "illegal number of operands %lu\n", operands);
    976 		exit(-1);
    977 	    break;
    978 	 }
    979       } else {
    980 	 switch(operands){
    981 	    case 1:
    982 		o_inst->src0 = t_src(vp, &src[0]);
    983 		o_inst->src1 = UNUSED_SRC_0;
    984 		o_inst->src2 = UNUSED_SRC_1;
    985 	    break;
    986 
    987 	    case 2:
    988 		o_inst->src0 = t_src(vp, &src[0]);
    989 		o_inst->src1 = t_src(vp, &src[1]);
    990 		o_inst->src2 = UNUSED_SRC_1;
    991 	    break;
    992 
    993 	    case 3:
    994 		o_inst->src0 = t_src(vp, &src[0]);
    995 		o_inst->src1 = t_src(vp, &src[1]);
    996 		o_inst->src2 = t_src(vp, &src[2]);
    997 	    break;
    998 
    999 	    default:
   1000 		fprintf(stderr, "illegal number of operands %lu\n", operands);
   1001 		exit(-1);
   1002 	    break;
   1003 	 }
   1004       }
   1005       next:
   1006 
   1007       if (dofogfix) {
   1008 	 o_inst++;
   1009 	 if (vp->fogmode == GL_EXP) {
   1010 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1011 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1012 		VSF_FLAG_X);
   1013 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1014 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, X, X, X, X, PARAM, NONE);
   1015 	    o_inst->src2 = UNUSED_SRC_1;
   1016 	    o_inst++;
   1017 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_EXP_E,
   1018 		R200_VSF_OUT_CLASS_RESULT_FOGC,
   1019 		VSF_FLAG_X);
   1020 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
   1021 	    o_inst->src1 = UNUSED_SRC_0;
   1022 	    o_inst->src2 = UNUSED_SRC_1;
   1023 	 }
   1024 	 else if (vp->fogmode == GL_EXP2) {
   1025 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1026 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1027 		VSF_FLAG_X);
   1028 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1029 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, X, X, X, X, PARAM, NONE);
   1030 	    o_inst->src2 = UNUSED_SRC_1;
   1031 	    o_inst++;
   1032 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1033 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1034 		VSF_FLAG_X);
   1035 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1036 	    o_inst->src1 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1037 	    o_inst->src2 = UNUSED_SRC_1;
   1038 	    o_inst++;
   1039 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_EXP_E,
   1040 		R200_VSF_OUT_CLASS_RESULT_FOGC,
   1041 		VSF_FLAG_X);
   1042 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
   1043 	    o_inst->src1 = UNUSED_SRC_0;
   1044 	    o_inst->src2 = UNUSED_SRC_1;
   1045 	 }
   1046 	 else { /* fogmode == GL_LINEAR */
   1047 		/* could do that with single op (dot) if using params like
   1048 		   with fixed function pipeline fog */
   1049 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
   1050 		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
   1051 		VSF_FLAG_X);
   1052 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
   1053 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, Z, Z, Z, Z, PARAM, NONE);
   1054 	    o_inst->src2 = UNUSED_SRC_1;
   1055 	    o_inst++;
   1056 	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
   1057 		R200_VSF_OUT_CLASS_RESULT_FOGC,
   1058 		VSF_FLAG_X);
   1059 	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
   1060 	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, W, W, W, W, PARAM, NONE);
   1061 	    o_inst->src2 = UNUSED_SRC_1;
   1062 
   1063 	 }
   1064          dofogfix = 0;
   1065       }
   1066 
   1067       u_temp_used = (R200_VSF_MAX_TEMPS - 1) - u_temp_i;
   1068       if (mesa_vp->arb.NumNativeTemporaries <
   1069           (mesa_vp->arb.NumTemporaries + u_temp_used)) {
   1070          mesa_vp->arb.NumNativeTemporaries =
   1071             mesa_vp->arb.NumTemporaries + u_temp_used;
   1072       }
   1073       if ((mesa_vp->arb.NumTemporaries + u_temp_used) > R200_VSF_MAX_TEMPS) {
   1074 	 if (R200_DEBUG & RADEON_FALLBACKS) {
   1075             fprintf(stderr, "Ran out of temps, num temps %d, us %d\n", mesa_vp->arb.NumTemporaries, u_temp_used);
   1076 	 }
   1077 	 return GL_FALSE;
   1078       }
   1079       u_temp_i = R200_VSF_MAX_TEMPS - 1;
   1080       if(o_inst - vp->instr >= R200_VSF_MAX_INST) {
   1081          mesa_vp->arb.NumNativeInstructions = 129;
   1082 	 if (R200_DEBUG & RADEON_FALLBACKS) {
   1083 	    fprintf(stderr, "more than 128 native instructions\n");
   1084 	 }
   1085 	 return GL_FALSE;
   1086       }
   1087       if ((o_inst->op & R200_VSF_OUT_CLASS_MASK) == R200_VSF_OUT_CLASS_RESULT_POS) {
   1088 	 vp->pos_end = (o_inst - vp->instr);
   1089       }
   1090    }
   1091 
   1092    vp->native = GL_TRUE;
   1093    mesa_vp->arb.NumNativeInstructions = (o_inst - vp->instr);
   1094 #if 0
   1095    fprintf(stderr, "hw program:\n");
   1096    for(i=0; i < vp->program.length; i++)
   1097       fprintf(stderr, "%08x\n", vp->instr[i]);
   1098 #endif
   1099    return GL_TRUE;
   1100 }
   1101 
   1102 void r200SetupVertexProg( struct gl_context *ctx ) {
   1103    r200ContextPtr rmesa = R200_CONTEXT(ctx);
   1104    struct r200_vertex_program *vp = (struct r200_vertex_program *)ctx->VertexProgram.Current;
   1105    GLboolean fallback;
   1106    GLint i;
   1107 
   1108    if (!vp->translated || (ctx->Fog.Enabled && ctx->Fog.Mode != vp->fogmode)) {
   1109       rmesa->curr_vp_hw = NULL;
   1110       r200_translate_vertex_program(ctx, vp);
   1111    }
   1112    /* could optimize setting up vertex progs away for non-tcl hw */
   1113    fallback = !(vp->native && r200VertexProgUpdateParams(ctx, vp));
   1114    TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, fallback);
   1115    if (rmesa->radeon.TclFallback) return;
   1116 
   1117    R200_STATECHANGE( rmesa, vap );
   1118    /* FIXME: fglrx sets R200_VAP_SINGLE_BUF_STATE_ENABLE too. Do we need it?
   1119              maybe only when using more than 64 inst / 96 param? */
   1120    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_PROG_VTX_SHADER_ENABLE /*| R200_VAP_SINGLE_BUF_STATE_ENABLE*/;
   1121 
   1122    R200_STATECHANGE( rmesa, pvs );
   1123 
   1124    rmesa->hw.pvs.cmd[PVS_CNTL_1] = (0 << R200_PVS_CNTL_1_PROGRAM_START_SHIFT) |
   1125       ((vp->mesa_program.arb.NumNativeInstructions - 1) << R200_PVS_CNTL_1_PROGRAM_END_SHIFT) |
   1126       (vp->pos_end << R200_PVS_CNTL_1_POS_END_SHIFT);
   1127    rmesa->hw.pvs.cmd[PVS_CNTL_2] = (0 << R200_PVS_CNTL_2_PARAM_OFFSET_SHIFT) |
   1128       (vp->mesa_program.arb.NumNativeParameters << R200_PVS_CNTL_2_PARAM_COUNT_SHIFT);
   1129 
   1130    /* maybe user clip planes just work with vertex progs... untested */
   1131    if (ctx->Transform.ClipPlanesEnabled) {
   1132       R200_STATECHANGE( rmesa, tcl );
   1133       if (vp->mesa_program.arb.IsPositionInvariant) {
   1134 	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= (ctx->Transform.ClipPlanesEnabled << 2);
   1135       }
   1136       else {
   1137 	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~(0xfc);
   1138       }
   1139    }
   1140 
   1141    if (vp != rmesa->curr_vp_hw) {
   1142       GLuint count = vp->mesa_program.arb.NumNativeInstructions;
   1143       drm_radeon_cmd_header_t tmp;
   1144 
   1145       R200_STATECHANGE( rmesa, vpi[0] );
   1146       R200_STATECHANGE( rmesa, vpi[1] );
   1147 
   1148       /* FIXME: what about using a memcopy... */
   1149       for (i = 0; (i < 64) && i < count; i++) {
   1150 	 rmesa->hw.vpi[0].cmd[VPI_OPDST_0 + 4 * i] = vp->instr[i].op;
   1151 	 rmesa->hw.vpi[0].cmd[VPI_SRC0_0 + 4 * i] = vp->instr[i].src0;
   1152 	 rmesa->hw.vpi[0].cmd[VPI_SRC1_0 + 4 * i] = vp->instr[i].src1;
   1153 	 rmesa->hw.vpi[0].cmd[VPI_SRC2_0 + 4 * i] = vp->instr[i].src2;
   1154       }
   1155       /* hack up the cmd_size so not the whole state atom is emitted always.
   1156          This may require some more thought, we may emit half progs on lost state, but
   1157          hopefully it won't matter?
   1158          WARNING: must not use R200_DB_STATECHANGE, this will produce bogus (and rejected)
   1159          packet emits (due to the mismatched cmd_size and count in cmd/last_cmd) */
   1160       rmesa->hw.vpi[0].cmd_size = 1 + 4 * ((count > 64) ? 64 : count);
   1161       tmp.i = rmesa->hw.vpi[0].cmd[VPI_CMD_0];
   1162       tmp.veclinear.count = (count > 64) ? 64 : count;
   1163       rmesa->hw.vpi[0].cmd[VPI_CMD_0] = tmp.i;
   1164       if (count > 64) {
   1165 	 for (i = 0; i < (count - 64); i++) {
   1166 	    rmesa->hw.vpi[1].cmd[VPI_OPDST_0 + 4 * i] = vp->instr[i + 64].op;
   1167 	    rmesa->hw.vpi[1].cmd[VPI_SRC0_0 + 4 * i] = vp->instr[i + 64].src0;
   1168 	    rmesa->hw.vpi[1].cmd[VPI_SRC1_0 + 4 * i] = vp->instr[i + 64].src1;
   1169 	    rmesa->hw.vpi[1].cmd[VPI_SRC2_0 + 4 * i] = vp->instr[i + 64].src2;
   1170 	 }
   1171 	 rmesa->hw.vpi[1].cmd_size = 1 + 4 * (count - 64);
   1172 	 tmp.i = rmesa->hw.vpi[1].cmd[VPI_CMD_0];
   1173 	 tmp.veclinear.count = count - 64;
   1174 	 rmesa->hw.vpi[1].cmd[VPI_CMD_0] = tmp.i;
   1175       }
   1176       rmesa->curr_vp_hw = vp;
   1177    }
   1178 }
   1179 
   1180 
   1181 static struct gl_program *
   1182 r200NewProgram(struct gl_context *ctx, GLenum target, GLuint id,
   1183                bool is_arb_asm)
   1184 {
   1185    switch(target){
   1186    case GL_VERTEX_PROGRAM_ARB: {
   1187       struct r200_vertex_program *vp = rzalloc(NULL,
   1188                                                struct r200_vertex_program);
   1189       return _mesa_init_gl_program(&vp->mesa_program, target, id, is_arb_asm);
   1190    }
   1191    case GL_FRAGMENT_PROGRAM_ARB: {
   1192       struct gl_program *prog = rzalloc(NULL, struct gl_program);
   1193       return _mesa_init_gl_program(prog, target, id, is_arb_asm);
   1194    }
   1195    default:
   1196       _mesa_problem(ctx, "Bad target in r200NewProgram");
   1197       return NULL;
   1198    }
   1199 }
   1200 
   1201 
   1202 static void
   1203 r200DeleteProgram(struct gl_context *ctx, struct gl_program *prog)
   1204 {
   1205    _mesa_delete_program(ctx, prog);
   1206 }
   1207 
   1208 static GLboolean
   1209 r200ProgramStringNotify(struct gl_context *ctx, GLenum target, struct gl_program *prog)
   1210 {
   1211    struct r200_vertex_program *vp = (void *)prog;
   1212    r200ContextPtr rmesa = R200_CONTEXT(ctx);
   1213 
   1214    switch(target) {
   1215    case GL_VERTEX_PROGRAM_ARB:
   1216       vp->translated = GL_FALSE;
   1217       vp->fogpidx = 0;
   1218 /*      memset(&vp->translated, 0, sizeof(struct r200_vertex_program) - sizeof(struct gl_program));*/
   1219       r200_translate_vertex_program(ctx, vp);
   1220       rmesa->curr_vp_hw = NULL;
   1221       break;
   1222    case GL_FRAGMENT_SHADER_ATI:
   1223       rmesa->afs_loaded = NULL;
   1224       break;
   1225    }
   1226    /* need this for tcl fallbacks */
   1227    (void) _tnl_program_string(ctx, target, prog);
   1228 
   1229    /* XXX check if program is legal, within limits */
   1230    return GL_TRUE;
   1231 }
   1232 
   1233 static GLboolean
   1234 r200IsProgramNative(struct gl_context *ctx, GLenum target, struct gl_program *prog)
   1235 {
   1236    struct r200_vertex_program *vp = (void *)prog;
   1237 
   1238    switch(target){
   1239    case GL_VERTEX_PROGRAM_ARB:
   1240       if (!vp->translated) {
   1241 	 r200_translate_vertex_program(ctx, vp);
   1242       }
   1243      /* does not take parameters etc. into account */
   1244       return vp->native;
   1245    default:
   1246       _mesa_problem(ctx, "Bad target in r200NewProgram");
   1247    }
   1248    return 0;
   1249 }
   1250 
   1251 void r200InitShaderFuncs(struct dd_function_table *functions)
   1252 {
   1253    functions->NewProgram = r200NewProgram;
   1254    functions->DeleteProgram = r200DeleteProgram;
   1255    functions->ProgramStringNotify = r200ProgramStringNotify;
   1256    functions->IsProgramNative = r200IsProgramNative;
   1257 }
   1258