Home | History | Annotate | Download | only in nine
      1 
      2 /* FF is big and ugly so feel free to write lines as long as you like.
      3  * Aieeeeeeeee !
      4  *
      5  * Let me make that clearer:
      6  * Aieeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee ! !! !!!
      7  */
      8 
      9 #include "device9.h"
     10 #include "basetexture9.h"
     11 #include "vertexdeclaration9.h"
     12 #include "vertexshader9.h"
     13 #include "pixelshader9.h"
     14 #include "nine_ff.h"
     15 #include "nine_defines.h"
     16 #include "nine_helpers.h"
     17 #include "nine_pipe.h"
     18 #include "nine_dump.h"
     19 
     20 #include "pipe/p_context.h"
     21 #include "tgsi/tgsi_ureg.h"
     22 #include "tgsi/tgsi_dump.h"
     23 #include "util/u_box.h"
     24 #include "util/u_hash_table.h"
     25 #include "util/u_upload_mgr.h"
     26 
/* Debug channel used by this file. */
#define DBG_CHANNEL DBG_FF

/* Number of vec4 constants used by the fixed-function vertex shader.
 * 196 matches the VS constant layout documented further down, whose last
 * slot is CONST[195] (end of the D3DTS_WORLDMATRIX[8] * D3DTS_VIEW block). */
#define NINE_FF_NUM_VS_CONST 196
/* Number of vec4 constants used by the fixed-function pixel shader. */
#define NINE_FF_NUM_PS_CONST 24
     31 
/* Four-component float vector (x, y, z, w). */
struct fvec4
{
    float x, y, z, w;
};
     36 
/* Key describing one fixed-function vertex shader variant.
 *
 * The anonymous bitfield struct is overlaid with value32[] / value64[] so the
 * key can be hashed one 32-bit word at a time and compared with memcmp().
 * The pad* members round every group of bitfields up to 32 bits, giving six
 * 32-bit words in total.
 * NOTE(review): for hashing/memcmp to be reliable the pad bits presumably
 * have to be zeroed by whoever fills in the key - confirm at the call site.
 */
struct nine_ff_vs_key
{
    union {
        struct {
            uint32_t position_t : 1; /* position input is POSITIONT instead of POSITION */
            uint32_t lighting   : 1;
            uint32_t darkness   : 1; /* lighting enabled but no active lights */
            uint32_t localviewer : 1;
            uint32_t vertexpointsize : 1; /* a PSIZE vertex input is declared */
            uint32_t pointscale : 1;
            uint32_t vertexblend : 3; /* number of world matrices blended per vertex */
            uint32_t vertexblend_indexed : 1; /* matrices selected through BLENDINDICES */
            uint32_t vertextween : 1; /* tween POSITION0/1 (and NORMAL0/1) */
            uint32_t mtl_diffuse : 2; /* 0 = material, 1 = color1, 2 = color2 */
            uint32_t mtl_ambient : 2; /* same encoding as mtl_diffuse */
            uint32_t mtl_specular : 2; /* same encoding as mtl_diffuse */
            uint32_t mtl_emissive : 2; /* same encoding as mtl_diffuse */
            uint32_t fog_mode : 2;
            uint32_t fog_range : 1;
            uint32_t color0in_one : 1; /* no COLOR0 input: 1.0 is used instead */
            uint32_t color1in_zero : 1; /* no COLOR1 input: 0.0 is used instead */
            uint32_t has_normal : 1; /* a NORMAL input may be declared */
            uint32_t fog : 1;
            uint32_t normalizenormals : 1;
            uint32_t ucp : 1;
            uint32_t pad1 : 4; /* completes the first 32-bit word */
            uint32_t tc_dim_input: 16; /* 8 * 2 bits, each holding (input dimension - 1) */
            uint32_t pad2 : 16;
            uint32_t tc_dim_output: 24; /* 8 * 3 bits, 0 = coordinates are not transformed */
            uint32_t pad3 : 8;
            uint32_t tc_gen : 24; /* 8 * 3 bits, one NINED3DTSS_TCI_* value per stage */
            uint32_t pad4 : 8;
            uint32_t tc_idx : 24; /* 8 * 3 bits, texcoord index per stage */
            uint32_t pad5 : 8;
            uint32_t passthrough; /* bitmask tested with (1 << NINE_DECLUSAGE_*) */
        };
        uint64_t value64[3]; /* don't forget to resize VertexShader9.ff_key */
        uint32_t value32[6];
    };
};
     77 
/* Texture stage state, as packed into nine_ff_ps_key::ts[]:
 *
 * COLOROP       D3DTOP 5 bit
 * ALPHAOP       D3DTOP 5 bit
 * COLORARG0     D3DTA  3 bit
 * COLORARG1     D3DTA  3 bit
 * COLORARG2     D3DTA  3 bit
 * ALPHAARG0     D3DTA  3 bit
 * ALPHAARG1     D3DTA  3 bit
 * ALPHAARG2     D3DTA  3 bit
 * RESULTARG     D3DTA  1 bit (CURRENT:0 or TEMP:1)
 * TEXTARGET     1D/2D/3D/CUBE 2 bit
 * (padding)            1 bit
 * ===========================
 *                     32 bit per stage
 *
 * Key describing one fixed-function pixel shader variant. The bitfields are
 * overlaid with value32[] / value64[] so the key can be hashed word by word
 * and compared with memcmp(): 8 stage words + 1 word of global flags +
 * 12 bytes of per-argument bytes and padding = twelve 32-bit words.
 */
struct nine_ff_ps_key
{
    union {
        struct {
            struct {
                uint32_t colorop   : 5;
                uint32_t alphaop   : 5;
                uint32_t colorarg0 : 3;
                uint32_t colorarg1 : 3;
                uint32_t colorarg2 : 3;
                uint32_t alphaarg0 : 3;
                uint32_t alphaarg1 : 3;
                uint32_t alphaarg2 : 3;
                uint32_t resultarg : 1; /* CURRENT:0 or TEMP:1 */
                uint32_t textarget : 2; /* 1D/2D/3D/CUBE */
                uint32_t pad       : 1;
                /* that's 32 bit exactly */
            } ts[8]; /* one packed word per texture stage */
            uint32_t projected : 16;
            uint32_t fog : 1; /* for vFog coming from VS */
            uint32_t fog_mode : 2;
            uint32_t fog_source : 1; /* 0: Z, 1: W */
            uint32_t specular : 1;
            uint32_t pad1 : 11; /* 9 32-bit words with this */
            uint8_t colorarg_b4[3];
            uint8_t colorarg_b5[3];
            uint8_t alphaarg_b4[3]; /* 11 32-bit words plus a byte */
            uint8_t pad2[3]; /* rounds the key up to 12 32-bit words */
        };
        uint64_t value64[6]; /* don't forget to resize PixelShader9.ff_key */
        uint32_t value32[12];
    };
};
    126 
    127 static unsigned nine_ff_vs_key_hash(void *key)
    128 {
    129     struct nine_ff_vs_key *vs = key;
    130     unsigned i;
    131     uint32_t hash = vs->value32[0];
    132     for (i = 1; i < ARRAY_SIZE(vs->value32); ++i)
    133         hash ^= vs->value32[i];
    134     return hash;
    135 }
    136 static int nine_ff_vs_key_comp(void *key1, void *key2)
    137 {
    138     struct nine_ff_vs_key *a = (struct nine_ff_vs_key *)key1;
    139     struct nine_ff_vs_key *b = (struct nine_ff_vs_key *)key2;
    140 
    141     return memcmp(a->value64, b->value64, sizeof(a->value64));
    142 }
    143 static unsigned nine_ff_ps_key_hash(void *key)
    144 {
    145     struct nine_ff_ps_key *ps = key;
    146     unsigned i;
    147     uint32_t hash = ps->value32[0];
    148     for (i = 1; i < ARRAY_SIZE(ps->value32); ++i)
    149         hash ^= ps->value32[i];
    150     return hash;
    151 }
    152 static int nine_ff_ps_key_comp(void *key1, void *key2)
    153 {
    154     struct nine_ff_ps_key *a = (struct nine_ff_ps_key *)key1;
    155     struct nine_ff_ps_key *b = (struct nine_ff_ps_key *)key2;
    156 
    157     return memcmp(a->value64, b->value64, sizeof(a->value64));
    158 }
    159 static unsigned nine_ff_fvf_key_hash(void *key)
    160 {
    161     return *(DWORD *)key;
    162 }
    163 static int nine_ff_fvf_key_comp(void *key1, void *key2)
    164 {
    165     return *(DWORD *)key1 != *(DWORD *)key2;
    166 }
    167 
/* Forward declarations: these two helpers are file-local and are declared
 * here so code appearing before their definitions can call them. */
static void nine_ff_prune_vs(struct NineDevice9 *);
static void nine_ff_prune_ps(struct NineDevice9 *);
    170 
    171 static void nine_ureg_tgsi_dump(struct ureg_program *ureg, boolean override)
    172 {
    173     if (debug_get_bool_option("NINE_FF_DUMP", FALSE) || override) {
    174         const struct tgsi_token *toks = ureg_get_tokens(ureg, NULL);
    175         tgsi_dump(toks, 0);
    176         ureg_free_tokens(toks);
    177     }
    178 }
    179 
/* Single-component read of a register: ureg_src() is applied to `r` first,
 * so these are used on destination registers such as temporaries
 * (e.g. _X(tmp)), and the selected component is replicated. */
#define _X(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_X)
#define _Y(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_Y)
#define _Z(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_Z)
#define _W(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_W)

/* Same component replication, but `r` is passed to ureg_scalar() unchanged,
 * i.e. it must already be a source operand. */
#define _XXXX(r) ureg_scalar(r, TGSI_SWIZZLE_X)
#define _YYYY(r) ureg_scalar(r, TGSI_SWIZZLE_Y)
#define _ZZZZ(r) ureg_scalar(r, TGSI_SWIZZLE_Z)
#define _WWWW(r) ureg_scalar(r, TGSI_SWIZZLE_W)

/* Identity swizzle, for symmetry with the macros above. */
#define _XYZW(r) (r)

/* AL should contain base address of lights table. */
/* Constant `i` addressed indirectly through AL.x. Both `ureg` and `AL` must
 * be in scope where the macro is expanded. */
#define LIGHT_CONST(i)                                                \
    ureg_src_indirect(ureg_DECL_constant(ureg, i), _X(AL))

/* Material constants start at CONST[19] (see the VS constant layout);
 * requires `ureg` in scope. */
#define MATERIAL_CONST(i) \
    ureg_DECL_constant(ureg, 19 + (i))

/* Shorthand for declaring/referencing constant `n`; requires `ureg` in scope. */
#define _CONST(n) ureg_DECL_constant(ureg, n)
    200 
    201 /* VS FF constants layout:
    202  *
    203  * CONST[ 0.. 3] D3DTS_WORLD * D3DTS_VIEW * D3DTS_PROJECTION
    204  * CONST[ 4.. 7] D3DTS_WORLD * D3DTS_VIEW
    205  * CONST[ 8..11] D3DTS_PROJECTION
    206  * CONST[12..15] D3DTS_VIEW^(-1)
    207  * CONST[16..18] Normal matrix
    208  *
    209  * CONST[19].xyz  MATERIAL.Emissive + Material.Ambient * RS.Ambient
    210  * CONST[20]      MATERIAL.Diffuse
    211  * CONST[21]      MATERIAL.Ambient
    212  * CONST[22]      MATERIAL.Specular
    213  * CONST[23].x___ MATERIAL.Power
    214  * CONST[24]      MATERIAL.Emissive
    215  * CONST[25]      RS.Ambient
    216  *
    217  * CONST[26].x___ RS.PointSizeMin
    218  * CONST[26]._y__ RS.PointSizeMax
    219  * CONST[26].__z_ RS.PointSize
    220  * CONST[26].___w RS.PointScaleA
    221  * CONST[27].x___ RS.PointScaleB
    222  * CONST[27]._y__ RS.PointScaleC
    223  *
    224  * CONST[28].x___ RS.FogEnd
    225  * CONST[28]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
    226  * CONST[28].__z_ RS.FogDensity
    227 
    228  * CONST[30].x___ TWEENFACTOR
    229  *
    230  * CONST[32].x___ LIGHT[0].Type
    231  * CONST[32]._yzw LIGHT[0].Attenuation0,1,2
    232  * CONST[33]      LIGHT[0].Diffuse
    233  * CONST[34]      LIGHT[0].Specular
    234  * CONST[35]      LIGHT[0].Ambient
    235  * CONST[36].xyz_ LIGHT[0].Position
    236  * CONST[36].___w LIGHT[0].Range
    237  * CONST[37].xyz_ LIGHT[0].Direction
    238  * CONST[37].___w LIGHT[0].Falloff
    239  * CONST[38].x___ cos(LIGHT[0].Theta / 2)
    240  * CONST[38]._y__ cos(LIGHT[0].Phi / 2)
    241  * CONST[38].__z_ 1.0f / (cos(LIGHT[0].Theta / 2) - cos(Light[0].Phi / 2))
    242  * CONST[39].xyz_ LIGHT[0].HalfVector (for directional lights)
    243  * CONST[39].___w 1 if this is the last active light, 0 if not
    244  * CONST[40]      LIGHT[1]
    245  * CONST[48]      LIGHT[2]
    246  * CONST[56]      LIGHT[3]
    247  * CONST[64]      LIGHT[4]
    248  * CONST[72]      LIGHT[5]
    249  * CONST[80]      LIGHT[6]
    250  * CONST[88]      LIGHT[7]
    251  * NOTE: no lighting code is generated if there are no active lights
    252  *
    253  * CONST[100].x___ Viewport 2/width
    254  * CONST[100]._y__ Viewport 2/height
    255  * CONST[100].__z_ Viewport 1/(zmax - zmin)
    256  * CONST[100].___w Viewport width
    257  * CONST[101].x___ Viewport x0
    258  * CONST[101]._y__ Viewport y0
    259  * CONST[101].__z_ Viewport z0
    260  *
    261  * CONST[128..131] D3DTS_TEXTURE0
    262  * CONST[132..135] D3DTS_TEXTURE1
    263  * CONST[136..139] D3DTS_TEXTURE2
    264  * CONST[140..143] D3DTS_TEXTURE3
    265  * CONST[144..147] D3DTS_TEXTURE4
    266  * CONST[148..151] D3DTS_TEXTURE5
    267  * CONST[152..155] D3DTS_TEXTURE6
    268  * CONST[156..159] D3DTS_TEXTURE7
    269  *
    270  * CONST[160] D3DTS_WORLDMATRIX[0] * D3DTS_VIEW
    271  * CONST[164] D3DTS_WORLDMATRIX[1] * D3DTS_VIEW
    272  * ...
    273  * CONST[192] D3DTS_WORLDMATRIX[8] * D3DTS_VIEW
    274  */
/* State shared while building one fixed-function vertex shader
 * (see nine_ff_build_vs). */
struct vs_build_ctx
{
    struct ureg_program *ureg; /* program being built; set by nine_ff_build_vs */
    const struct nine_ff_vs_key *key; /* variant to generate */

    /* Declaration usage recorded for every VS input, indexed by the input
     * slot handed out by build_vs_add_input(). */
    uint16_t input[PIPE_MAX_ATTRIBS];
    unsigned num_inputs; /* number of valid entries in input[] */

    struct ureg_src aVtx; /* position (later replaced by the transformed one when needed) */
    struct ureg_src aNrm; /* normal, or immediate 0.0 when no normal input is used */
    struct ureg_src aCol[2]; /* COLOR0 / COLOR1, default to immediates 1.0 / 0.0 */
    struct ureg_src aTex[8]; /* texture coordinate inputs */
    struct ureg_src aPsz; /* per-vertex point size */
    struct ureg_src aInd; /* blend indices */
    struct ureg_src aWgt; /* blend weights */

    struct ureg_src aVtx1; /* tweening */
    struct ureg_src aNrm1; /* second normal stream, also for tweening */

    /* Material sources: either a material constant or one of aCol[],
     * depending on key->mtl_ambient/diffuse/specular/emissive. */
    struct ureg_src mtlA;
    struct ureg_src mtlD;
    struct ureg_src mtlS;
    struct ureg_src mtlE;
};
    299 
    300 static inline unsigned
    301 get_texcoord_sn(struct pipe_screen *screen)
    302 {
    303     if (screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD))
    304         return TGSI_SEMANTIC_TEXCOORD;
    305     return TGSI_SEMANTIC_GENERIC;
    306 }
    307 
    308 static inline struct ureg_src
    309 build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
    310 {
    311     const unsigned i = vs->num_inputs++;
    312     assert(i < PIPE_MAX_ATTRIBS);
    313     vs->input[i] = ndecl;
    314     return ureg_DECL_vs_input(vs->ureg, i);
    315 }
    316 
    317 /* NOTE: dst may alias src */
    318 static inline void
    319 ureg_normalize3(struct ureg_program *ureg,
    320                 struct ureg_dst dst, struct ureg_src src)
    321 {
    322     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    323     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
    324 
    325     ureg_DP3(ureg, tmp_x, src, src);
    326     ureg_RSQ(ureg, tmp_x, _X(tmp));
    327     ureg_MUL(ureg, dst, src, _X(tmp));
    328     ureg_release_temporary(ureg, tmp);
    329 }
    330 
    331 static void *
    332 nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
    333 {
    334     const struct nine_ff_vs_key *key = vs->key;
    335     struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
    336     struct ureg_dst oPos, oCol[2], oPsz, oFog;
    337     struct ureg_dst AR;
    338     unsigned i, c;
    339     unsigned label[32], l = 0;
    340     boolean need_aNrm = key->lighting || key->passthrough & (1 << NINE_DECLUSAGE_NORMAL);
    341     boolean has_aNrm = need_aNrm && key->has_normal;
    342     boolean need_aVtx = key->lighting || key->fog_mode || key->pointscale || key->ucp;
    343     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
    344 
    345     vs->ureg = ureg;
    346 
    347     /* Check which inputs we should transform. */
    348     for (i = 0; i < 8 * 3; i += 3) {
    349         switch ((key->tc_gen >> i) & 0x7) {
    350         case NINED3DTSS_TCI_CAMERASPACENORMAL:
    351             need_aNrm = TRUE;
    352             break;
    353         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
    354             need_aVtx = TRUE;
    355             break;
    356         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
    357             need_aVtx = need_aNrm = TRUE;
    358             break;
    359         case NINED3DTSS_TCI_SPHEREMAP:
    360             need_aVtx = need_aNrm = TRUE;
    361             break;
    362         default:
    363             break;
    364         }
    365     }
    366 
    367     /* Declare and record used inputs (needed for linkage with vertex format):
    368      * (texture coordinates handled later)
    369      */
    370     vs->aVtx = build_vs_add_input(vs,
    371         key->position_t ? NINE_DECLUSAGE_POSITIONT : NINE_DECLUSAGE_POSITION);
    372 
    373     vs->aNrm = ureg_imm1f(ureg, 0.0f);
    374     if (has_aNrm)
    375         vs->aNrm = build_vs_add_input(vs, NINE_DECLUSAGE_NORMAL);
    376 
    377     vs->aCol[0] = ureg_imm1f(ureg, 1.0f);
    378     vs->aCol[1] = ureg_imm1f(ureg, 0.0f);
    379 
    380     if (key->lighting || key->darkness) {
    381         const unsigned mask = key->mtl_diffuse | key->mtl_specular |
    382                               key->mtl_ambient | key->mtl_emissive;
    383         if ((mask & 0x1) && !key->color0in_one)
    384             vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
    385         if ((mask & 0x2) && !key->color1in_zero)
    386             vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
    387 
    388         vs->mtlD = MATERIAL_CONST(1);
    389         vs->mtlA = MATERIAL_CONST(2);
    390         vs->mtlS = MATERIAL_CONST(3);
    391         vs->mtlE = MATERIAL_CONST(5);
    392         if (key->mtl_diffuse  == 1) vs->mtlD = vs->aCol[0]; else
    393         if (key->mtl_diffuse  == 2) vs->mtlD = vs->aCol[1];
    394         if (key->mtl_ambient  == 1) vs->mtlA = vs->aCol[0]; else
    395         if (key->mtl_ambient  == 2) vs->mtlA = vs->aCol[1];
    396         if (key->mtl_specular == 1) vs->mtlS = vs->aCol[0]; else
    397         if (key->mtl_specular == 2) vs->mtlS = vs->aCol[1];
    398         if (key->mtl_emissive == 1) vs->mtlE = vs->aCol[0]; else
    399         if (key->mtl_emissive == 2) vs->mtlE = vs->aCol[1];
    400     } else {
    401         if (!key->color0in_one) vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
    402         if (!key->color1in_zero) vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
    403     }
    404 
    405     if (key->vertexpointsize)
    406         vs->aPsz = build_vs_add_input(vs, NINE_DECLUSAGE_PSIZE);
    407 
    408     if (key->vertexblend_indexed || key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES))
    409         vs->aInd = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDINDICES);
    410     if (key->vertexblend || key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT))
    411         vs->aWgt = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDWEIGHT);
    412     if (key->vertextween) {
    413         vs->aVtx1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(POSITION,1));
    414         vs->aNrm1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(NORMAL,1));
    415     }
    416 
    417     /* Declare outputs:
    418      */
    419     oPos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); /* HPOS */
    420     oCol[0] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0));
    421     oCol[1] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1));
    422     if (key->fog || key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
    423         oFog = ureg_DECL_output(ureg, TGSI_SEMANTIC_FOG, 0);
    424         oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
    425     }
    426 
    427     if (key->vertexpointsize || key->pointscale) {
    428         oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0,
    429                                        TGSI_WRITEMASK_X, 0, 1);
    430         oPsz = ureg_writemask(oPsz, TGSI_WRITEMASK_X);
    431     }
    432 
    433     if (key->lighting || key->vertexblend)
    434         AR = ureg_DECL_address(ureg);
    435 
    436     /* === Vertex transformation / vertex blending:
    437      */
    438 
    439     if (key->position_t) {
    440         if (device->driver_caps.window_space_position_support) {
    441             ureg_MOV(ureg, oPos, vs->aVtx);
    442         } else {
    443             struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    444             /* vs->aVtx contains the coordinates buffer wise.
    445             * later in the pipeline, clipping, viewport and division
    446             * by w (rhw = 1/w) are going to be applied, so do the reverse
    447             * of these transformations (except clipping) to have the good
    448             * position at the end.*/
    449             ureg_MOV(ureg, tmp, vs->aVtx);
    450             /* X from [X_min, X_min + width] to [-1, 1], same for Y. Z to [0, 1] */
    451             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), ureg_negate(_CONST(101)));
    452             ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _CONST(100));
    453             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, -1.0f));
    454             /* Y needs to be reversed */
    455             ureg_MOV(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_negate(ureg_src(tmp)));
    456             /* inverse rhw */
    457             ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W), _W(tmp));
    458             /* multiply X, Y, Z by w */
    459             ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _W(tmp));
    460             ureg_MOV(ureg, oPos, ureg_src(tmp));
    461             ureg_release_temporary(ureg, tmp);
    462         }
    463     } else if (key->vertexblend) {
    464         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    465         struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
    466         struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
    467         struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
    468         struct ureg_dst sum_blendweights = ureg_DECL_temporary(ureg);
    469         struct ureg_src cWM[4];
    470 
    471         for (i = 160; i <= 195; ++i)
    472             ureg_DECL_constant(ureg, i);
    473 
    474         /* translate world matrix index to constant file index */
    475         if (key->vertexblend_indexed) {
    476             ureg_MAD(ureg, tmp, vs->aInd, ureg_imm1f(ureg, 4.0f), ureg_imm1f(ureg, 160.0f));
    477             ureg_ARL(ureg, AR, ureg_src(tmp));
    478         }
    479 
    480         ureg_MOV(ureg, aVtx_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
    481         ureg_MOV(ureg, aNrm_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
    482         ureg_MOV(ureg, sum_blendweights, ureg_imm4f(ureg, 1.0f, 1.0f, 1.0f, 1.0f));
    483 
    484         for (i = 0; i < key->vertexblend; ++i) {
    485             for (c = 0; c < 4; ++c) {
    486                 cWM[c] = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, (160 + i * 4) * !key->vertexblend_indexed + c), 0);
    487                 if (key->vertexblend_indexed)
    488                     cWM[c] = ureg_src_indirect(cWM[c], ureg_scalar(ureg_src(AR), i));
    489             }
    490 
    491             /* multiply by WORLD(index) */
    492             ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), cWM[0]);
    493             ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), cWM[1], ureg_src(tmp));
    494             ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), cWM[2], ureg_src(tmp));
    495             ureg_MAD(ureg, tmp, _WWWW(vs->aVtx), cWM[3], ureg_src(tmp));
    496 
    497             if (has_aNrm) {
    498                 /* Note: the spec says the transpose of the inverse of the
    499                  * WorldView matrices should be used, but all tests show
    500                  * otherwise.
    501                  * Only case unknown: D3DVBF_0WEIGHTS */
    502                 ureg_MUL(ureg, tmp2, _XXXX(vs->aNrm), cWM[0]);
    503                 ureg_MAD(ureg, tmp2, _YYYY(vs->aNrm), cWM[1], ureg_src(tmp2));
    504                 ureg_MAD(ureg, tmp2, _ZZZZ(vs->aNrm), cWM[2], ureg_src(tmp2));
    505             }
    506 
    507             if (i < (key->vertexblend - 1)) {
    508                 /* accumulate weighted position value */
    509                 ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(vs->aWgt, i), ureg_src(aVtx_dst));
    510                 if (has_aNrm)
    511                     ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(vs->aWgt, i), ureg_src(aNrm_dst));
    512                 /* subtract weighted position value for last value */
    513                 ureg_ADD(ureg, sum_blendweights, ureg_src(sum_blendweights), ureg_negate(ureg_scalar(vs->aWgt, i)));
    514             }
    515         }
    516 
    517         /* the last weighted position is always 1 - sum_of_previous_weights */
    518         ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aVtx_dst));
    519         if (has_aNrm)
    520             ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aNrm_dst));
    521 
    522         /* multiply by VIEW_PROJ */
    523         ureg_MUL(ureg, tmp, _X(aVtx_dst), _CONST(8));
    524         ureg_MAD(ureg, tmp, _Y(aVtx_dst), _CONST(9),  ureg_src(tmp));
    525         ureg_MAD(ureg, tmp, _Z(aVtx_dst), _CONST(10), ureg_src(tmp));
    526         ureg_MAD(ureg, oPos, _W(aVtx_dst), _CONST(11), ureg_src(tmp));
    527 
    528         if (need_aVtx)
    529             vs->aVtx = ureg_src(aVtx_dst);
    530 
    531         ureg_release_temporary(ureg, tmp);
    532         ureg_release_temporary(ureg, tmp2);
    533         ureg_release_temporary(ureg, sum_blendweights);
    534         if (!need_aVtx)
    535             ureg_release_temporary(ureg, aVtx_dst);
    536 
    537         if (has_aNrm) {
    538             if (key->normalizenormals)
    539                ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
    540             vs->aNrm = ureg_src(aNrm_dst);
    541         } else
    542             ureg_release_temporary(ureg, aNrm_dst);
    543     } else {
    544         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    545 
    546         if (key->vertextween) {
    547             struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
    548             ureg_LRP(ureg, aVtx_dst, _XXXX(_CONST(30)), vs->aVtx1, vs->aVtx);
    549             vs->aVtx = ureg_src(aVtx_dst);
    550             if (has_aNrm) {
    551                 struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
    552                 ureg_LRP(ureg, aNrm_dst, _XXXX(_CONST(30)), vs->aNrm1, vs->aNrm);
    553                 vs->aNrm = ureg_src(aNrm_dst);
    554             }
    555         }
    556 
    557         /* position = vertex * WORLD_VIEW_PROJ */
    558         ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(0));
    559         ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(1), ureg_src(tmp));
    560         ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(2), ureg_src(tmp));
    561         ureg_MAD(ureg, oPos, _WWWW(vs->aVtx), _CONST(3), ureg_src(tmp));
    562         ureg_release_temporary(ureg, tmp);
    563 
    564         if (need_aVtx) {
    565             struct ureg_dst aVtx_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    566             ureg_MUL(ureg, aVtx_dst, _XXXX(vs->aVtx), _CONST(4));
    567             ureg_MAD(ureg, aVtx_dst, _YYYY(vs->aVtx), _CONST(5), ureg_src(aVtx_dst));
    568             ureg_MAD(ureg, aVtx_dst, _ZZZZ(vs->aVtx), _CONST(6), ureg_src(aVtx_dst));
    569             ureg_MAD(ureg, aVtx_dst, _WWWW(vs->aVtx), _CONST(7), ureg_src(aVtx_dst));
    570             vs->aVtx = ureg_src(aVtx_dst);
    571         }
    572         if (has_aNrm) {
    573             struct ureg_dst aNrm_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    574             ureg_MUL(ureg, aNrm_dst, _XXXX(vs->aNrm), _CONST(16));
    575             ureg_MAD(ureg, aNrm_dst, _YYYY(vs->aNrm), _CONST(17), ureg_src(aNrm_dst));
    576             ureg_MAD(ureg, aNrm_dst, _ZZZZ(vs->aNrm), _CONST(18), ureg_src(aNrm_dst));
    577             if (key->normalizenormals)
    578                ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
    579             vs->aNrm = ureg_src(aNrm_dst);
    580         }
    581     }
    582 
    583     /* === Process point size:
    584      */
    585     if (key->vertexpointsize || key->pointscale) {
    586         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    587         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
    588         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
    589         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
    590         if (key->vertexpointsize) {
    591             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
    592             ureg_MAX(ureg, tmp_z, _XXXX(vs->aPsz), _XXXX(cPsz1));
    593             ureg_MIN(ureg, tmp_z, _Z(tmp), _YYYY(cPsz1));
    594         } else {
    595             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
    596             ureg_MOV(ureg, tmp_z, _ZZZZ(cPsz1));
    597         }
    598 
    599         if (key->pointscale) {
    600             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
    601             struct ureg_src cPsz2 = ureg_DECL_constant(ureg, 27);
    602 
    603             ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
    604             ureg_RSQ(ureg, tmp_y, _X(tmp));
    605             ureg_MUL(ureg, tmp_y, _Y(tmp), _X(tmp));
    606             ureg_CMP(ureg, tmp_y, ureg_negate(_Y(tmp)), _Y(tmp), ureg_imm1f(ureg, 0.0f));
    607             ureg_MAD(ureg, tmp_x, _Y(tmp), _YYYY(cPsz2), _XXXX(cPsz2));
    608             ureg_MAD(ureg, tmp_x, _Y(tmp), _X(tmp), _WWWW(cPsz1));
    609             ureg_RSQ(ureg, tmp_x, _X(tmp));
    610             ureg_MUL(ureg, tmp_x, _X(tmp), _Z(tmp));
    611             ureg_MUL(ureg, tmp_x, _X(tmp), _WWWW(_CONST(100)));
    612             ureg_MAX(ureg, tmp_x, _X(tmp), _XXXX(cPsz1));
    613             ureg_MIN(ureg, tmp_z, _X(tmp), _YYYY(cPsz1));
    614         }
    615 
    616         ureg_MOV(ureg, oPsz, _Z(tmp));
    617         ureg_release_temporary(ureg, tmp);
    618     }
    619 
    620     for (i = 0; i < 8; ++i) {
    621         struct ureg_dst tmp, tmp_x, tmp2;
    622         struct ureg_dst oTex, input_coord, transformed, t, aVtx_normed;
    623         unsigned c, writemask;
    624         const unsigned tci = (key->tc_gen >> (i * 3)) & 0x7;
    625         const unsigned idx = (key->tc_idx >> (i * 3)) & 0x7;
    626         unsigned dim_input = 1 + ((key->tc_dim_input >> (i * 2)) & 0x3);
    627         const unsigned dim_output = (key->tc_dim_output >> (i * 3)) & 0x7;
    628 
    629         /* No texture output of index s */
    630         if (tci == NINED3DTSS_TCI_DISABLE)
    631             continue;
    632         oTex = ureg_DECL_output(ureg, texcoord_sn, i);
    633         tmp = ureg_DECL_temporary(ureg);
    634         tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
    635         input_coord = ureg_DECL_temporary(ureg);
    636         transformed = ureg_DECL_temporary(ureg);
    637 
    638         /* Get the coordinate */
    639         switch (tci) {
    640         case NINED3DTSS_TCI_PASSTHRU:
    641             /* NINED3DTSS_TCI_PASSTHRU => Use texcoord coming from index idx *
    642              * Else the idx is used only to determine wrapping mode. */
    643             vs->aTex[idx] = build_vs_add_input(vs, NINE_DECLUSAGE_i(TEXCOORD,idx));
    644             ureg_MOV(ureg, input_coord, vs->aTex[idx]);
    645             break;
    646         case NINED3DTSS_TCI_CAMERASPACENORMAL:
    647             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aNrm);
    648             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
    649             dim_input = 4;
    650             break;
    651         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
    652             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aVtx);
    653             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
    654             dim_input = 4;
    655             break;
    656         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
    657             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
    658             aVtx_normed = ureg_DECL_temporary(ureg);
    659             ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
    660             ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
    661             ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
    662             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
    663             ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
    664             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
    665             ureg_release_temporary(ureg, aVtx_normed);
    666             dim_input = 4;
    667             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
    668             break;
    669         case NINED3DTSS_TCI_SPHEREMAP:
    670             /* Implement the formula of GL_SPHERE_MAP */
    671             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
    672             aVtx_normed = ureg_DECL_temporary(ureg);
    673             tmp2 = ureg_DECL_temporary(ureg);
    674             ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
    675             ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
    676             ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
    677             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
    678             ureg_ADD(ureg, tmp, ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
    679             /* now tmp = normed(Vtx) - 2 dot3(normed(Vtx), Nrm) Nrm */
    680             ureg_MOV(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_XYZ), ureg_src(tmp));
    681             ureg_MUL(ureg, tmp2, ureg_src(tmp2), ureg_src(tmp2));
    682             ureg_DP3(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_src(tmp2));
    683             ureg_RSQ(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2));
    684             ureg_MUL(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_imm1f(ureg, 0.5f));
    685             /* tmp2 = 0.5 / sqrt(tmp.x^2 + tmp.y^2 + (tmp.z+1)^2)
    686              * TODO: z coordinates are a bit different gl vs d3d, should the formula be adapted ? */
    687             ureg_MUL(ureg, tmp, ureg_src(tmp), _X(tmp2));
    688             ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, 0.5f));
    689             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_ZW), ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
    690             ureg_release_temporary(ureg, aVtx_normed);
    691             ureg_release_temporary(ureg, tmp2);
    692             dim_input = 4;
    693             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
    694             break;
    695         default:
    696             assert(0);
    697             break;
    698         }
    699 
    700         /* Apply the transformation */
    701         /* dim_output == 0 => do not transform the components.
    702          * XYZRHW also disables transformation */
    703         if (!dim_output || key->position_t) {
    704             ureg_release_temporary(ureg, transformed);
    705             transformed = input_coord;
    706             writemask = TGSI_WRITEMASK_XYZW;
    707         } else {
    708             for (c = 0; c < dim_output; c++) {
    709                 t = ureg_writemask(transformed, 1 << c);
    710                 switch (dim_input) {
    711                 /* dim_input = 1 2 3: -> we add trailing 1 to input*/
    712                 case 1: ureg_MAD(ureg, t, _X(input_coord), _XXXX(_CONST(128 + i * 4 + c)), _YYYY(_CONST(128 + i * 4 + c)));
    713                         break;
    714                 case 2: ureg_DP2(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
    715                         ureg_ADD(ureg, t, ureg_src(transformed), _ZZZZ(_CONST(128 + i * 4 + c)));
    716                         break;
    717                 case 3: ureg_DP3(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
    718                         ureg_ADD(ureg, t, ureg_src(transformed), _WWWW(_CONST(128 + i * 4 + c)));
    719                         break;
    720                 case 4: ureg_DP4(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c)); break;
    721                 default:
    722                     assert(0);
    723                 }
    724             }
    725             writemask = (1 << dim_output) - 1;
    726             ureg_release_temporary(ureg, input_coord);
    727         }
    728 
    729         ureg_MOV(ureg, ureg_writemask(oTex, writemask), ureg_src(transformed));
    730         ureg_release_temporary(ureg, transformed);
    731         ureg_release_temporary(ureg, tmp);
    732     }
    733 
    734     /* === Lighting:
    735      *
    736      * DIRECTIONAL:  Light at infinite distance, parallel rays, no attenuation.
    737      * POINT: Finite distance to scene, divergent rays, isotropic, attenuation.
    738      * SPOT: Finite distance, divergent rays, angular dependence, attenuation.
    739      *
    740      * vec3 normal = normalize(in.Normal * NormalMatrix);
    741      * vec3 hitDir = light.direction;
    742      * float atten = 1.0;
    743      *
    744      * if (light.type != DIRECTIONAL)
    745      * {
    746      *     vec3 hitVec = light.position - eyeVertex;
    747      *     float d = length(hitVec);
    748      *     hitDir = hitVec / d;
    749      *     atten = 1 / ((light.atten2 * d + light.atten1) * d + light.atten0);
    750      * }
    751      *
    752      * if (light.type == SPOTLIGHT)
    753      * {
    754      *     float rho = dp3(-hitDir, light.direction);
    755      *     if (rho < cos(light.phi / 2))
    756      *         atten = 0;
    757      *     if (rho < cos(light.theta / 2))
    758      *         atten *= pow(some_func(rho), light.falloff);
    759      * }
    760      *
    761      * float nDotHit = dp3_sat(normal, hitDir);
    762      * float powFact = 0.0;
    763      *
    764      * if (nDotHit > 0.0)
    765      * {
    766      *     vec3 midVec = normalize(hitDir + eye);
    767      *     float nDotMid = dp3_sat(normal, midVec);
    768      *     powFact = pow(nDotMid, material.power);
    769      * }
    770      *
    771      * ambient += light.ambient * atten;
    772      * diffuse += light.diffuse * atten * nDotHit;
    773      * specular += light.specular * atten * powFact;
    774      */
    775     if (key->lighting) {
    776         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    777         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
    778         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
    779         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
    780         struct ureg_dst rAtt = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
    781         struct ureg_dst rHit = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    782         struct ureg_dst rMid = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    783 
    784         struct ureg_dst rCtr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
    785 
    786         struct ureg_dst AL = ureg_writemask(AR, TGSI_WRITEMASK_X);
    787 
    788         /* Light.*.Alpha is not used. */
    789         struct ureg_dst rD = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    790         struct ureg_dst rA = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    791         struct ureg_dst rS = ureg_DECL_temporary(ureg);
    792 
    793         struct ureg_src mtlP = _XXXX(MATERIAL_CONST(4));
    794 
    795         struct ureg_src cLKind = _XXXX(LIGHT_CONST(0));
    796         struct ureg_src cLAtt0 = _YYYY(LIGHT_CONST(0));
    797         struct ureg_src cLAtt1 = _ZZZZ(LIGHT_CONST(0));
    798         struct ureg_src cLAtt2 = _WWWW(LIGHT_CONST(0));
    799         struct ureg_src cLColD = _XYZW(LIGHT_CONST(1));
    800         struct ureg_src cLColS = _XYZW(LIGHT_CONST(2));
    801         struct ureg_src cLColA = _XYZW(LIGHT_CONST(3));
    802         struct ureg_src cLPos  = _XYZW(LIGHT_CONST(4));
    803         struct ureg_src cLRng  = _WWWW(LIGHT_CONST(4));
    804         struct ureg_src cLDir  = _XYZW(LIGHT_CONST(5));
    805         struct ureg_src cLFOff = _WWWW(LIGHT_CONST(5));
    806         struct ureg_src cLTht  = _XXXX(LIGHT_CONST(6));
    807         struct ureg_src cLPhi  = _YYYY(LIGHT_CONST(6));
    808         struct ureg_src cLSDiv = _ZZZZ(LIGHT_CONST(6));
    809         struct ureg_src cLLast = _WWWW(LIGHT_CONST(7));
    810 
    811         const unsigned loop_label = l++;
    812 
    813         /* Declare all light constants to allow indirect addressing */
    814         for (i = 32; i < 96; i++)
    815             ureg_DECL_constant(ureg, i);
    816 
    817         ureg_MOV(ureg, rCtr, ureg_imm1f(ureg, 32.0f)); /* &lightconst(0) */
    818         ureg_MOV(ureg, rD, ureg_imm1f(ureg, 0.0f));
    819         ureg_MOV(ureg, rA, ureg_imm1f(ureg, 0.0f));
    820         ureg_MOV(ureg, rS, ureg_imm1f(ureg, 0.0f));
    821 
    822         /* loop management */
    823         ureg_BGNLOOP(ureg, &label[loop_label]);
    824         ureg_ARL(ureg, AL, _W(rCtr));
    825 
    826         /* if (not DIRECTIONAL light): */
    827         ureg_SNE(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_DIRECTIONAL));
    828         ureg_MOV(ureg, rHit, ureg_negate(cLDir));
    829         ureg_MOV(ureg, rAtt, ureg_imm1f(ureg, 1.0f));
    830         ureg_IF(ureg, _X(tmp), &label[l++]);
    831         {
    832             /* hitDir = light.position - eyeVtx
    833              * d = length(hitDir)
    834              */
    835             ureg_ADD(ureg, rHit, cLPos, ureg_negate(vs->aVtx));
    836             ureg_DP3(ureg, tmp_x, ureg_src(rHit), ureg_src(rHit));
    837             ureg_RSQ(ureg, tmp_y, _X(tmp));
    838             ureg_MUL(ureg, tmp_x, _X(tmp), _Y(tmp)); /* length */
    839 
    840             /* att = 1.0 / (light.att0 + (light.att1 + light.att2 * d) * d) */
    841             ureg_MAD(ureg, rAtt, _X(tmp), cLAtt2, cLAtt1);
    842             ureg_MAD(ureg, rAtt, _X(tmp), _W(rAtt), cLAtt0);
    843             ureg_RCP(ureg, rAtt, _W(rAtt));
    844             /* cut-off if distance exceeds Light.Range */
    845             ureg_SLT(ureg, tmp_x, _X(tmp), cLRng);
    846             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
    847         }
    848         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
    849         ureg_ENDIF(ureg);
    850 
    851         /* normalize hitDir */
    852         ureg_normalize3(ureg, rHit, ureg_src(rHit));
    853 
    854         /* if (SPOT light) */
    855         ureg_SEQ(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_SPOT));
    856         ureg_IF(ureg, _X(tmp), &label[l++]);
    857         {
    858             /* rho = dp3(-hitDir, light.spotDir)
    859              *
    860              * if (rho  > light.ctht2) NOTE: 0 <= phi <= pi, 0 <= theta <= phi
    861              *     spotAtt = 1
    862              * else
    863              * if (rho <= light.cphi2)
    864              *     spotAtt = 0
    865              * else
    866              *     spotAtt = (rho - light.cphi2) / (light.ctht2 - light.cphi2) ^ light.falloff
    867              */
    868             ureg_DP3(ureg, tmp_y, ureg_negate(ureg_src(rHit)), cLDir); /* rho */
    869             ureg_ADD(ureg, tmp_x, _Y(tmp), ureg_negate(cLPhi));
    870             ureg_MUL(ureg, tmp_x, _X(tmp), cLSDiv);
    871             ureg_POW(ureg, tmp_x, _X(tmp), cLFOff); /* spotAtten */
    872             ureg_SGE(ureg, tmp_z, _Y(tmp), cLTht); /* if inside theta && phi */
    873             ureg_SGE(ureg, tmp_y, _Y(tmp), cLPhi); /* if inside phi */
    874             ureg_MAD(ureg, ureg_saturate(tmp_x), _X(tmp), _Y(tmp), _Z(tmp));
    875             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
    876         }
    877         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
    878         ureg_ENDIF(ureg);
    879 
    880         /* directional factors, let's not use LIT because of clarity */
    881 
    882         if (has_aNrm) {
    883             if (key->localviewer) {
    884                 ureg_normalize3(ureg, rMid, vs->aVtx);
    885                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_negate(ureg_src(rMid)));
    886             } else {
    887                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_imm3f(ureg, 0.0f, 0.0f, -1.0f));
    888             }
    889             ureg_normalize3(ureg, rMid, ureg_src(rMid));
    890             ureg_DP3(ureg, ureg_saturate(tmp_x), vs->aNrm, ureg_src(rHit));
    891             ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
    892             ureg_MUL(ureg, tmp_z, _X(tmp), _Y(tmp));
    893             /* Tests show that specular is computed only if (dp3(normal,hitDir) > 0).
    894              * For front facing, it is more restrictive than test (dp3(normal,mid) > 0).
    895              * No tests were made for backfacing, so add the two conditions */
    896             ureg_IF(ureg, _Z(tmp), &label[l++]);
    897             {
    898                 ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
    899                 ureg_POW(ureg, tmp_y, _Y(tmp), mtlP);
    900                 ureg_MUL(ureg, tmp_y, _W(rAtt), _Y(tmp)); /* power factor * att */
    901                 ureg_MAD(ureg, rS, cLColS, _Y(tmp), ureg_src(rS)); /* accumulate specular */
    902             }
    903             ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
    904             ureg_ENDIF(ureg);
    905 
    906             ureg_MUL(ureg, tmp_x, _W(rAtt), _X(tmp)); /* dp3(normal,hitDir) * att */
    907             ureg_MAD(ureg, rD, cLColD, _X(tmp), ureg_src(rD)); /* accumulate diffuse */
    908         }
    909 
    910         ureg_MAD(ureg, rA, cLColA, _W(rAtt), ureg_src(rA)); /* accumulate ambient */
    911 
    912         /* break if this was the last light */
    913         ureg_IF(ureg, cLLast, &label[l++]);
    914         ureg_BRK(ureg);
    915         ureg_ENDIF(ureg);
    916         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
    917 
    918         ureg_ADD(ureg, rCtr, _W(rCtr), ureg_imm1f(ureg, 8.0f));
    919         ureg_fixup_label(ureg, label[loop_label], ureg_get_instruction_number(ureg));
    920         ureg_ENDLOOP(ureg, &label[loop_label]);
    921 
    922         /* Apply to material:
    923          *
    924          * oCol[0] = (material.emissive + material.ambient * rs.ambient) +
    925          *           material.ambient * ambient +
    926          *           material.diffuse * diffuse;
    927          * oCol[1] = material.specular * specular;
    928          */
    929         if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
    930             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), vs->mtlA, _CONST(19));
    931         else {
    932             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), _CONST(25));
    933             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), vs->mtlA, ureg_src(tmp), vs->mtlE);
    934         }
    935 
    936         ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), ureg_src(rD), vs->mtlD, ureg_src(tmp));
    937         ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
    938         ureg_MUL(ureg, oCol[1], ureg_src(rS), vs->mtlS);
    939         ureg_release_temporary(ureg, rAtt);
    940         ureg_release_temporary(ureg, rHit);
    941         ureg_release_temporary(ureg, rMid);
    942         ureg_release_temporary(ureg, rCtr);
    943         ureg_release_temporary(ureg, rD);
    944         ureg_release_temporary(ureg, rA);
    945         ureg_release_temporary(ureg, rS);
    946         ureg_release_temporary(ureg, rAtt);
    947         ureg_release_temporary(ureg, tmp);
    948     } else
    949     /* COLOR */
    950     if (key->darkness) {
    951         if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
    952             ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), _CONST(19));
    953         else
    954             ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), vs->mtlA, _CONST(25), vs->mtlE);
    955         ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
    956         ureg_MOV(ureg, oCol[1], ureg_imm1f(ureg, 0.0f));
    957     } else {
    958         ureg_MOV(ureg, oCol[0], vs->aCol[0]);
    959         ureg_MOV(ureg, oCol[1], vs->aCol[1]);
    960     }
    961 
    962     /* === Process fog.
    963      *
    964      * exp(x) = ex2(log2(e) * x)
    965      */
    966     if (key->fog_mode) {
    967         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    968         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
    969         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
    970         if (key->fog_range) {
    971             ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
    972             ureg_RSQ(ureg, tmp_z, _X(tmp));
    973             ureg_MUL(ureg, tmp_z, _Z(tmp), _X(tmp));
    974         } else {
    975             ureg_MOV(ureg, tmp_z, ureg_abs(_ZZZZ(vs->aVtx)));
    976         }
    977 
    978         if (key->fog_mode == D3DFOG_EXP) {
    979             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
    980             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
    981             ureg_EX2(ureg, tmp_x, _X(tmp));
    982         } else
    983         if (key->fog_mode == D3DFOG_EXP2) {
    984             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
    985             ureg_MUL(ureg, tmp_x, _X(tmp), _X(tmp));
    986             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
    987             ureg_EX2(ureg, tmp_x, _X(tmp));
    988         } else
    989         if (key->fog_mode == D3DFOG_LINEAR) {
    990             ureg_ADD(ureg, tmp_x, _XXXX(_CONST(28)), ureg_negate(_Z(tmp)));
    991             ureg_MUL(ureg, ureg_saturate(tmp_x), _X(tmp), _YYYY(_CONST(28)));
    992         }
    993         ureg_MOV(ureg, oFog, _X(tmp));
    994         ureg_release_temporary(ureg, tmp);
    995     } else if (key->fog && !(key->passthrough & (1 << NINE_DECLUSAGE_FOG))) {
    996         ureg_MOV(ureg, oFog, ureg_scalar(vs->aCol[1], TGSI_SWIZZLE_W));
    997     }
    998 
    999     if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT)) {
   1000         struct ureg_src input;
   1001         struct ureg_dst output;
   1002         input = vs->aWgt;
   1003         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 18);
   1004         ureg_MOV(ureg, output, input);
   1005     }
   1006     if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES)) {
   1007         struct ureg_src input;
   1008         struct ureg_dst output;
   1009         input = vs->aInd;
   1010         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 19);
   1011         ureg_MOV(ureg, output, input);
   1012     }
   1013     if (key->passthrough & (1 << NINE_DECLUSAGE_NORMAL)) {
   1014         struct ureg_src input;
   1015         struct ureg_dst output;
   1016         input = vs->aNrm;
   1017         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 20);
   1018         ureg_MOV(ureg, output, input);
   1019     }
   1020     if (key->passthrough & (1 << NINE_DECLUSAGE_TANGENT)) {
   1021         struct ureg_src input;
   1022         struct ureg_dst output;
   1023         input = build_vs_add_input(vs, NINE_DECLUSAGE_TANGENT);
   1024         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 21);
   1025         ureg_MOV(ureg, output, input);
   1026     }
   1027     if (key->passthrough & (1 << NINE_DECLUSAGE_BINORMAL)) {
   1028         struct ureg_src input;
   1029         struct ureg_dst output;
   1030         input = build_vs_add_input(vs, NINE_DECLUSAGE_BINORMAL);
   1031         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 22);
   1032         ureg_MOV(ureg, output, input);
   1033     }
   1034     if (key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
   1035         struct ureg_src input;
   1036         struct ureg_dst output;
   1037         input = build_vs_add_input(vs, NINE_DECLUSAGE_FOG);
   1038         input = ureg_scalar(input, TGSI_SWIZZLE_X);
   1039         output = oFog;
   1040         ureg_MOV(ureg, output, input);
   1041     }
   1042     if (key->passthrough & (1 << NINE_DECLUSAGE_DEPTH)) {
   1043         (void) 0; /* TODO: replace z of position output ? */
   1044     }
   1045 
   1046     /* ucp for ff applies on world coordinates.
   1047      * aVtx is in worldview coordinates. */
   1048     if (key->ucp) {
   1049         struct ureg_dst clipVect = ureg_DECL_output(ureg, TGSI_SEMANTIC_CLIPVERTEX, 0);
   1050         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
   1051         ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(12));
   1052         ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(13),  ureg_src(tmp));
   1053         ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(14), ureg_src(tmp));
   1054         ureg_ADD(ureg, clipVect, _CONST(15), ureg_src(tmp));
   1055         ureg_release_temporary(ureg, tmp);
   1056     }
   1057 
   1058     if (key->position_t && device->driver_caps.window_space_position_support)
   1059         ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, TRUE);
   1060 
   1061     ureg_END(ureg);
   1062     nine_ureg_tgsi_dump(ureg, FALSE);
   1063     return ureg_create_shader_and_destroy(ureg, device->context.pipe);
   1064 }
   1065 
   1066 /* PS FF constants layout:
   1067  *
   1068  * CONST[ 0.. 7]      stage[i].D3DTSS_CONSTANT
   1069  * CONST[ 8..15].x___ stage[i].D3DTSS_BUMPENVMAT00
   1070  * CONST[ 8..15]._y__ stage[i].D3DTSS_BUMPENVMAT01
   1071  * CONST[ 8..15].__z_ stage[i].D3DTSS_BUMPENVMAT10
   1072  * CONST[ 8..15].___w stage[i].D3DTSS_BUMPENVMAT11
   1073  * CONST[16..19].x_z_ stage[i].D3DTSS_BUMPENVLSCALE
   1074  * CONST[16..19]._y_w stage[i].D3DTSS_BUMPENVLOFFSET
   1075  *
   1076  * CONST[20] D3DRS_TEXTUREFACTOR
   1077  * CONST[21] D3DRS_FOGCOLOR
   1078  * CONST[22].x___ RS.FogEnd
   1079  * CONST[22]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
   1080  * CONST[22].__z_ RS.FogDensity
   1081  */
   1082 struct ps_build_ctx /* Working state shared by the helpers that emit the fixed-function pixel shader */
   1083 {
   1084     struct ureg_program *ureg; /* TGSI program the instructions are emitted into */
   1085 
   1086     struct ureg_src vC[2]; /* DIFFUSE, SPECULAR: COLOR 0 / COLOR 1 fragment inputs */
   1087     struct ureg_src vT[8]; /* TEXCOORD[i]: per-stage texture coordinate input */
   1088     struct ureg_dst rCur; /* D3DTA_CURRENT: destination register */
   1089     struct ureg_dst rMod; /* read in place of rCur as D3DTA_CURRENT when stage.index == stage.index_pre_mod */
   1090     struct ureg_src rCurSrc; /* ureg_src(rCur) */
   1091     struct ureg_dst rTmp; /* D3DTA_TEMP: destination register */
   1092     struct ureg_src rTmpSrc; /* ureg_src(rTmp) */
   1093     struct ureg_dst rTex; /* temporary whose source view backs D3DTA_TEXTURE */
   1094     struct ureg_src rTexSrc; /* ureg_src(rTex) */
   1095     struct ureg_src cBEM[8]; /* NOTE(review): presumably the per-stage bump-env matrix constants - confirm against the users */
   1096     struct ureg_src s[8]; /* per-stage sampler, ureg_src_undef() when the stage declares none */
   1097 
   1098     struct {
   1099         unsigned index; /* also the constant register index used for D3DTA_CONSTANT; presumably the stage being emitted */
   1100         unsigned index_pre_mod; /* set to index + 1 by D3DTOP_PREMODULATE; initialised to -1 by the builder */
   1101     } stage;
   1102 };
   1103 
   1104 static struct ureg_src
   1105 ps_get_ts_arg(struct ps_build_ctx *ps, unsigned ta)
   1106 {
   1107     struct ureg_src reg;
   1108 
   1109     switch (ta & D3DTA_SELECTMASK) {
   1110     case D3DTA_CONSTANT:
   1111         reg = ureg_DECL_constant(ps->ureg, ps->stage.index);
   1112         break;
   1113     case D3DTA_CURRENT:
   1114         reg = (ps->stage.index == ps->stage.index_pre_mod) ? ureg_src(ps->rMod) : ps->rCurSrc;
   1115         break;
   1116     case D3DTA_DIFFUSE:
   1117         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
   1118         break;
   1119     case D3DTA_SPECULAR:
   1120         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
   1121         break;
   1122     case D3DTA_TEMP:
   1123         reg = ps->rTmpSrc;
   1124         break;
   1125     case D3DTA_TEXTURE:
   1126         reg = ps->rTexSrc;
   1127         break;
   1128     case D3DTA_TFACTOR:
   1129         reg = ureg_DECL_constant(ps->ureg, 20);
   1130         break;
   1131     default:
   1132         assert(0);
   1133         reg = ureg_src_undef();
   1134         break;
   1135     }
   1136     if (ta & D3DTA_COMPLEMENT) {
   1137         struct ureg_dst dst = ureg_DECL_temporary(ps->ureg);
   1138         ureg_ADD(ps->ureg, dst, ureg_imm1f(ps->ureg, 1.0f), ureg_negate(reg));
   1139         reg = ureg_src(dst);
   1140     }
   1141     if (ta & D3DTA_ALPHAREPLICATE)
   1142         reg = _WWWW(reg);
   1143     return reg;
   1144 }
   1145 
   1146 static struct ureg_dst
   1147 ps_get_ts_dst(struct ps_build_ctx *ps, unsigned ta)
   1148 {
   1149     assert(!(ta & (D3DTA_COMPLEMENT | D3DTA_ALPHAREPLICATE)));
   1150 
   1151     switch (ta & D3DTA_SELECTMASK) {
   1152     case D3DTA_CURRENT:
   1153         return ps->rCur;
   1154     case D3DTA_TEMP:
   1155         return ps->rTmp;
   1156     default:
   1157         assert(0);
   1158         return ureg_dst_undef();
   1159     }
   1160 }
   1161 
   1162 static uint8_t ps_d3dtop_args_mask(D3DTEXTUREOP top)
   1163 {
   1164     switch (top) {
   1165     case D3DTOP_DISABLE:
   1166         return 0x0;
   1167     case D3DTOP_SELECTARG1:
   1168     case D3DTOP_PREMODULATE:
   1169         return 0x2;
   1170     case D3DTOP_SELECTARG2:
   1171         return 0x4;
   1172     case D3DTOP_MULTIPLYADD:
   1173     case D3DTOP_LERP:
   1174         return 0x7;
   1175     default:
   1176         return 0x6;
   1177     }
   1178 }
   1179 
   1180 static inline boolean
   1181 is_MOV_no_op(struct ureg_dst dst, struct ureg_src src)
   1182 {
   1183     return !dst.WriteMask ||
   1184         (dst.File == src.File &&
   1185          dst.Index == src.Index &&
   1186          !dst.Indirect &&
   1187          !dst.Saturate &&
   1188          !src.Indirect &&
   1189          !src.Negate &&
   1190          !src.Absolute &&
   1191          (!(dst.WriteMask & TGSI_WRITEMASK_X) || (src.SwizzleX == TGSI_SWIZZLE_X)) &&
   1192          (!(dst.WriteMask & TGSI_WRITEMASK_Y) || (src.SwizzleY == TGSI_SWIZZLE_Y)) &&
   1193          (!(dst.WriteMask & TGSI_WRITEMASK_Z) || (src.SwizzleZ == TGSI_SWIZZLE_Z)) &&
   1194          (!(dst.WriteMask & TGSI_WRITEMASK_W) || (src.SwizzleW == TGSI_SWIZZLE_W)));
   1195 
   1196 }
   1197 
   1198 static void
   1199 ps_do_ts_op(struct ps_build_ctx *ps, unsigned top, struct ureg_dst dst, struct ureg_src *arg)
   1200 {
   1201     struct ureg_program *ureg = ps->ureg;
   1202     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
   1203     struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
   1204     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
   1205 
   1206     tmp.WriteMask = dst.WriteMask;
   1207 
   1208     if (top != D3DTOP_SELECTARG1 && top != D3DTOP_SELECTARG2 &&
   1209         top != D3DTOP_MODULATE && top != D3DTOP_PREMODULATE &&
   1210         top != D3DTOP_BLENDDIFFUSEALPHA && top != D3DTOP_BLENDTEXTUREALPHA &&
   1211         top != D3DTOP_BLENDFACTORALPHA && top != D3DTOP_BLENDCURRENTALPHA &&
   1212         top != D3DTOP_BUMPENVMAP && top != D3DTOP_BUMPENVMAPLUMINANCE &&
   1213         top != D3DTOP_LERP)
   1214         dst = ureg_saturate(dst);
   1215 
   1216     switch (top) {
   1217     case D3DTOP_SELECTARG1:
   1218         if (!is_MOV_no_op(dst, arg[1]))
   1219             ureg_MOV(ureg, dst, arg[1]);
   1220         break;
   1221     case D3DTOP_SELECTARG2:
   1222         if (!is_MOV_no_op(dst, arg[2]))
   1223             ureg_MOV(ureg, dst, arg[2]);
   1224         break;
   1225     case D3DTOP_MODULATE:
   1226         ureg_MUL(ureg, dst, arg[1], arg[2]);
   1227         break;
   1228     case D3DTOP_MODULATE2X:
   1229         ureg_MUL(ureg, tmp, arg[1], arg[2]);
   1230         ureg_ADD(ureg, dst, ureg_src(tmp), ureg_src(tmp));
   1231         break;
   1232     case D3DTOP_MODULATE4X:
   1233         ureg_MUL(ureg, tmp, arg[1], arg[2]);
   1234         ureg_MUL(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 4.0f));
   1235         break;
   1236     case D3DTOP_ADD:
   1237         ureg_ADD(ureg, dst, arg[1], arg[2]);
   1238         break;
   1239     case D3DTOP_ADDSIGNED:
   1240         ureg_ADD(ureg, tmp, arg[1], arg[2]);
   1241         ureg_ADD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, -0.5f));
   1242         break;
   1243     case D3DTOP_ADDSIGNED2X:
   1244         ureg_ADD(ureg, tmp, arg[1], arg[2]);
   1245         ureg_MAD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 2.0f), ureg_imm1f(ureg, -1.0f));
   1246         break;
   1247     case D3DTOP_SUBTRACT:
   1248         ureg_ADD(ureg, dst, arg[1], ureg_negate(arg[2]));
   1249         break;
   1250     case D3DTOP_ADDSMOOTH:
   1251         ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
   1252         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], arg[1]);
   1253         break;
   1254     case D3DTOP_BLENDDIFFUSEALPHA:
   1255         ureg_LRP(ureg, dst, _WWWW(ps->vC[0]), arg[1], arg[2]);
   1256         break;
   1257     case D3DTOP_BLENDTEXTUREALPHA:
   1258         /* XXX: alpha taken from previous stage, texture or result ? */
   1259         ureg_LRP(ureg, dst, _W(ps->rTex), arg[1], arg[2]);
   1260         break;
   1261     case D3DTOP_BLENDFACTORALPHA:
   1262         ureg_LRP(ureg, dst, _WWWW(_CONST(20)), arg[1], arg[2]);
   1263         break;
   1264     case D3DTOP_BLENDTEXTUREALPHAPM:
   1265         ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_W(ps->rTex)));
   1266         ureg_MAD(ureg, dst, arg[2], _X(tmp), arg[1]);
   1267         break;
   1268     case D3DTOP_BLENDCURRENTALPHA:
   1269         ureg_LRP(ureg, dst, _WWWW(ps->rCurSrc), arg[1], arg[2]);
   1270         break;
   1271     case D3DTOP_PREMODULATE:
   1272         ureg_MOV(ureg, dst, arg[1]);
   1273         ps->stage.index_pre_mod = ps->stage.index + 1;
   1274         break;
   1275     case D3DTOP_MODULATEALPHA_ADDCOLOR:
   1276         ureg_MAD(ureg, dst, _WWWW(arg[1]), arg[2], arg[1]);
   1277         break;
   1278     case D3DTOP_MODULATECOLOR_ADDALPHA:
   1279         ureg_MAD(ureg, dst, arg[1], arg[2], _WWWW(arg[1]));
   1280         break;
   1281     case D3DTOP_MODULATEINVALPHA_ADDCOLOR:
   1282         ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_WWWW(arg[1])));
   1283         ureg_MAD(ureg, dst, _X(tmp), arg[2], arg[1]);
   1284         break;
   1285     case D3DTOP_MODULATEINVCOLOR_ADDALPHA:
   1286         ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
   1287         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], _WWWW(arg[1]));
   1288         break;
   1289     case D3DTOP_BUMPENVMAP:
   1290         break;
   1291     case D3DTOP_BUMPENVMAPLUMINANCE:
   1292         break;
   1293     case D3DTOP_DOTPRODUCT3:
   1294         ureg_ADD(ureg, tmp, arg[1], ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
   1295         ureg_ADD(ureg, tmp2, arg[2] , ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
   1296         ureg_DP3(ureg, tmp, ureg_src(tmp), ureg_src(tmp2));
   1297         ureg_MUL(ureg, ureg_saturate(dst), ureg_src(tmp), ureg_imm4f(ureg,4.0,4.0,4.0,4.0));
   1298         break;
   1299     case D3DTOP_MULTIPLYADD:
   1300         ureg_MAD(ureg, dst, arg[1], arg[2], arg[0]);
   1301         break;
   1302     case D3DTOP_LERP:
   1303         ureg_LRP(ureg, dst, arg[0], arg[1], arg[2]);
   1304         break;
   1305     case D3DTOP_DISABLE:
   1306         /* no-op ? */
   1307         break;
   1308     default:
   1309         assert(!"invalid D3DTOP");
   1310         break;
   1311     }
   1312     ureg_release_temporary(ureg, tmp);
   1313     ureg_release_temporary(ureg, tmp2);
   1314 }
   1315 
   1316 static void *
   1317 nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
   1318 {
   1319     struct ps_build_ctx ps;
   1320     struct ureg_program *ureg = ureg_create(PIPE_SHADER_FRAGMENT);
   1321     struct ureg_dst oCol;
   1322     unsigned s;
   1323     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
   1324 
   1325     memset(&ps, 0, sizeof(ps));
   1326     ps.ureg = ureg;
   1327     ps.stage.index_pre_mod = -1;
   1328 
   1329     ps.vC[0] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
   1330 
   1331     ps.rCur = ureg_DECL_temporary(ureg);
   1332     ps.rTmp = ureg_DECL_temporary(ureg);
   1333     ps.rTex = ureg_DECL_temporary(ureg);
   1334     ps.rCurSrc = ureg_src(ps.rCur);
   1335     ps.rTmpSrc = ureg_src(ps.rTmp);
   1336     ps.rTexSrc = ureg_src(ps.rTex);
   1337 
   1338     /* Initial values */
   1339     ureg_MOV(ureg, ps.rCur, ps.vC[0]);
   1340     ureg_MOV(ureg, ps.rTmp, ureg_imm1f(ureg, 0.0f));
   1341     ureg_MOV(ureg, ps.rTex, ureg_imm1f(ureg, 0.0f));
   1342 
   1343     for (s = 0; s < 8; ++s) {
   1344         ps.s[s] = ureg_src_undef();
   1345 
   1346         if (key->ts[s].colorop != D3DTOP_DISABLE) {
   1347             if (key->ts[s].colorarg0 == D3DTA_SPECULAR ||
   1348                 key->ts[s].colorarg1 == D3DTA_SPECULAR ||
   1349                 key->ts[s].colorarg2 == D3DTA_SPECULAR)
   1350                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
   1351 
   1352             if (key->ts[s].colorarg0 == D3DTA_TEXTURE ||
   1353                 key->ts[s].colorarg1 == D3DTA_TEXTURE ||
   1354                 key->ts[s].colorarg2 == D3DTA_TEXTURE) {
   1355                 ps.s[s] = ureg_DECL_sampler(ureg, s);
   1356                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
   1357             }
   1358             if (s && (key->ts[s - 1].colorop == D3DTOP_PREMODULATE ||
   1359                       key->ts[s - 1].alphaop == D3DTOP_PREMODULATE))
   1360                 ps.s[s] = ureg_DECL_sampler(ureg, s);
   1361         }
   1362 
   1363         if (key->ts[s].alphaop != D3DTOP_DISABLE) {
   1364             if (key->ts[s].alphaarg0 == D3DTA_SPECULAR ||
   1365                 key->ts[s].alphaarg1 == D3DTA_SPECULAR ||
   1366                 key->ts[s].alphaarg2 == D3DTA_SPECULAR)
   1367                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
   1368 
   1369             if (key->ts[s].alphaarg0 == D3DTA_TEXTURE ||
   1370                 key->ts[s].alphaarg1 == D3DTA_TEXTURE ||
   1371                 key->ts[s].alphaarg2 == D3DTA_TEXTURE) {
   1372                 ps.s[s] = ureg_DECL_sampler(ureg, s);
   1373                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
   1374             }
   1375         }
   1376     }
   1377     if (key->specular)
   1378         ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
   1379 
   1380     oCol = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
   1381 
   1382     /* Run stages.
   1383      */
   1384     for (s = 0; s < 8; ++s) {
   1385         unsigned colorarg[3];
   1386         unsigned alphaarg[3];
   1387         const uint8_t used_c = ps_d3dtop_args_mask(key->ts[s].colorop);
   1388         const uint8_t used_a = ps_d3dtop_args_mask(key->ts[s].alphaop);
   1389         struct ureg_dst dst;
   1390         struct ureg_src arg[3];
   1391 
   1392         if (key->ts[s].colorop == D3DTOP_DISABLE) {
   1393             assert (key->ts[s].alphaop == D3DTOP_DISABLE);
   1394             continue;
   1395         }
   1396         ps.stage.index = s;
   1397 
   1398         DBG("STAGE[%u]: colorop=%s alphaop=%s\n", s,
   1399             nine_D3DTOP_to_str(key->ts[s].colorop),
   1400             nine_D3DTOP_to_str(key->ts[s].alphaop));
   1401 
   1402         if (!ureg_src_is_undef(ps.s[s])) {
   1403             unsigned target;
   1404             struct ureg_src texture_coord = ps.vT[s];
   1405             struct ureg_dst delta;
   1406             switch (key->ts[s].textarget) {
   1407             case 0: target = TGSI_TEXTURE_1D; break;
   1408             case 1: target = TGSI_TEXTURE_2D; break;
   1409             case 2: target = TGSI_TEXTURE_3D; break;
   1410             case 3: target = TGSI_TEXTURE_CUBE; break;
   1411             /* this is a 2 bit bitfield, do I really need a default case ? */
   1412             }
   1413 
   1414             /* Modify coordinates */
   1415             if (s >= 1 &&
   1416                 (key->ts[s-1].colorop == D3DTOP_BUMPENVMAP ||
   1417                  key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)) {
   1418                 delta = ureg_DECL_temporary(ureg);
   1419                 /* Du' = D3DTSS_BUMPENVMAT00(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT10(stage s-1)*t(s-1)G */
   1420                 ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _X(ps.rTex), _XXXX(_CONST(8 + s - 1)));
   1421                 ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _Y(ps.rTex), _ZZZZ(_CONST(8 + s - 1)), ureg_src(delta));
   1422                 /* Dv' = D3DTSS_BUMPENVMAT01(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT11(stage s-1)*t(s-1)G */
   1423                 ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _X(ps.rTex), _YYYY(_CONST(8 + s - 1)));
   1424                 ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _Y(ps.rTex), _WWWW(_CONST(8 + s - 1)), ureg_src(delta));
   1425                 texture_coord = ureg_src(ureg_DECL_temporary(ureg));
   1426                 ureg_MOV(ureg, ureg_writemask(ureg_dst(texture_coord), ureg_dst(ps.vT[s]).WriteMask), ps.vT[s]);
   1427                 ureg_ADD(ureg, ureg_writemask(ureg_dst(texture_coord), TGSI_WRITEMASK_XY), texture_coord, ureg_src(delta));
   1428                 /* Prepare luminance multiplier
   1429                  * t(s)RGBA = t(s)RGBA * clamp[(t(s-1)B * D3DTSS_BUMPENVLSCALE(stage s-1)) + D3DTSS_BUMPENVLOFFSET(stage s-1)] */
   1430                 if (key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE) {
   1431                     struct ureg_src bumpenvlscale = ((s-1) & 1) ? _ZZZZ(_CONST(16 + (s-1) / 2)) : _XXXX(_CONST(16 + (s-1) / 2));
   1432                     struct ureg_src bumpenvloffset = ((s-1) & 1) ? _WWWW(_CONST(16 + (s-1) / 2)) : _YYYY(_CONST(16 + (s-1) / 2));
   1433 
   1434                     ureg_MAD(ureg, ureg_saturate(ureg_writemask(delta, TGSI_WRITEMASK_X)), _Z(ps.rTex), bumpenvlscale, bumpenvloffset);
   1435                 }
   1436             }
   1437             if (key->projected & (3 << (s *2))) {
   1438                 unsigned dim = 1 + ((key->projected >> (2 * s)) & 3);
   1439                 if (dim == 4)
   1440                     ureg_TXP(ureg, ps.rTex, target, texture_coord, ps.s[s]);
   1441                 else {
   1442                     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
   1443                     ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(texture_coord, dim-1));
   1444                     ureg_MUL(ureg, ps.rTmp, _X(tmp), texture_coord);
   1445                     ureg_TEX(ureg, ps.rTex, target, ps.rTmpSrc, ps.s[s]);
   1446                     ureg_release_temporary(ureg, tmp);
   1447                 }
   1448             } else {
   1449                 ureg_TEX(ureg, ps.rTex, target, texture_coord, ps.s[s]);
   1450             }
   1451             if (s >= 1 && key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
   1452                 ureg_MUL(ureg, ps.rTex, ureg_src(ps.rTex), _X(delta));
   1453         }
   1454 
   1455         if (key->ts[s].colorop == D3DTOP_BUMPENVMAP ||
   1456             key->ts[s].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
   1457             continue;
   1458 
   1459         dst = ps_get_ts_dst(&ps, key->ts[s].resultarg ? D3DTA_TEMP : D3DTA_CURRENT);
   1460 
   1461         if (ps.stage.index_pre_mod == ps.stage.index) {
   1462             ps.rMod = ureg_DECL_temporary(ureg);
   1463             ureg_MUL(ureg, ps.rMod, ps.rCurSrc, ps.rTexSrc);
   1464         }
   1465 
   1466         colorarg[0] = (key->ts[s].colorarg0 | ((key->colorarg_b4[0] >> s) << 4) | ((key->colorarg_b5[0] >> s) << 5)) & 0x3f;
   1467         colorarg[1] = (key->ts[s].colorarg1 | ((key->colorarg_b4[1] >> s) << 4) | ((key->colorarg_b5[1] >> s) << 5)) & 0x3f;
   1468         colorarg[2] = (key->ts[s].colorarg2 | ((key->colorarg_b4[2] >> s) << 4) | ((key->colorarg_b5[2] >> s) << 5)) & 0x3f;
   1469         alphaarg[0] = (key->ts[s].alphaarg0 | ((key->alphaarg_b4[0] >> s) << 4)) & 0x1f;
   1470         alphaarg[1] = (key->ts[s].alphaarg1 | ((key->alphaarg_b4[1] >> s) << 4)) & 0x1f;
   1471         alphaarg[2] = (key->ts[s].alphaarg2 | ((key->alphaarg_b4[2] >> s) << 4)) & 0x1f;
   1472 
   1473         if (key->ts[s].colorop != key->ts[s].alphaop ||
   1474             colorarg[0] != alphaarg[0] ||
   1475             colorarg[1] != alphaarg[1] ||
   1476             colorarg[2] != alphaarg[2])
   1477             dst.WriteMask = TGSI_WRITEMASK_XYZ;
   1478 
   1479         /* Special DOTPRODUCT behaviour (see wine tests) */
   1480         if (key->ts[s].colorop == D3DTOP_DOTPRODUCT3)
   1481             dst.WriteMask = TGSI_WRITEMASK_XYZW;
   1482 
   1483         if (used_c & 0x1) arg[0] = ps_get_ts_arg(&ps, colorarg[0]);
   1484         if (used_c & 0x2) arg[1] = ps_get_ts_arg(&ps, colorarg[1]);
   1485         if (used_c & 0x4) arg[2] = ps_get_ts_arg(&ps, colorarg[2]);
   1486         ps_do_ts_op(&ps, key->ts[s].colorop, dst, arg);
   1487 
   1488         if (dst.WriteMask != TGSI_WRITEMASK_XYZW) {
   1489             dst.WriteMask = TGSI_WRITEMASK_W;
   1490 
   1491             if (used_a & 0x1) arg[0] = ps_get_ts_arg(&ps, alphaarg[0]);
   1492             if (used_a & 0x2) arg[1] = ps_get_ts_arg(&ps, alphaarg[1]);
   1493             if (used_a & 0x4) arg[2] = ps_get_ts_arg(&ps, alphaarg[2]);
   1494             ps_do_ts_op(&ps, key->ts[s].alphaop, dst, arg);
   1495         }
   1496     }
   1497 
   1498     if (key->specular)
   1499         ureg_ADD(ureg, ureg_writemask(ps.rCur, TGSI_WRITEMASK_XYZ), ps.rCurSrc, ps.vC[1]);
   1500 
   1501     /* Fog.
   1502      */
   1503     if (key->fog_mode) {
   1504         struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
   1505         struct ureg_src vPos;
   1506         if (device->screen->get_param(device->screen,
   1507                                       PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL)) {
   1508             vPos = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
   1509         } else {
   1510             vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
   1511                                       TGSI_INTERPOLATE_LINEAR);
   1512         }
   1513 
   1514         /* Source is either W or Z.
   1515          * When we use vs ff,
   1516          * Z is when an orthogonal projection matrix is detected,
   1517          * W (WFOG) else.
   1518          * Z is used for programmable vs.
   1519          * Note: Tests indicate that the projection matrix coefficients do
   1520          * actually affect pixel fog (and not vertex fog) when vs ff is used,
   1521          * which justifies taking the position's w instead of taking the z coordinate
   1522          * before the projection in the vs shader.
   1523          */
   1524         if (!key->fog_source)
   1525             ureg_MOV(ureg, rFog, _ZZZZ(vPos));
   1526         else
   1527             /* Position's w is 1/w */
   1528             ureg_RCP(ureg, rFog, _WWWW(vPos));
   1529 
   1530         if (key->fog_mode == D3DFOG_EXP) {
   1531             ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
   1532             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
   1533             ureg_EX2(ureg, rFog, _X(rFog));
   1534         } else
   1535         if (key->fog_mode == D3DFOG_EXP2) {
   1536             ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
   1537             ureg_MUL(ureg, rFog, _X(rFog), _X(rFog));
   1538             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
   1539             ureg_EX2(ureg, rFog, _X(rFog));
   1540         } else
   1541         if (key->fog_mode == D3DFOG_LINEAR) {
   1542             ureg_ADD(ureg, rFog, _XXXX(_CONST(22)), ureg_negate(_X(rFog)));
   1543             ureg_MUL(ureg, ureg_saturate(rFog), _X(rFog), _YYYY(_CONST(22)));
   1544         }
   1545         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _X(rFog), ps.rCurSrc, _CONST(21));
   1546         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
   1547     } else
   1548     if (key->fog) {
   1549         struct ureg_src vFog = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_FOG, 0, TGSI_INTERPOLATE_PERSPECTIVE);
   1550         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _XXXX(vFog), ps.rCurSrc, _CONST(21));
   1551         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
   1552     } else {
   1553         ureg_MOV(ureg, oCol, ps.rCurSrc);
   1554     }
   1555 
   1556     ureg_END(ureg);
   1557     nine_ureg_tgsi_dump(ureg, FALSE);
   1558     return ureg_create_shader_and_destroy(ureg, device->context.pipe);
   1559 }
   1560 
   1561 static struct NineVertexShader9 *
   1562 nine_ff_get_vs(struct NineDevice9 *device)
   1563 {
   1564     const struct nine_context *context = &device->context;
   1565     struct NineVertexShader9 *vs;
   1566     enum pipe_error err;
   1567     struct vs_build_ctx bld;
   1568     struct nine_ff_vs_key key;
   1569     unsigned s, i;
   1570     boolean has_indexes = false;
   1571     boolean has_weights = false;
   1572     char input_texture_coord[8];
   1573 
   1574     assert(sizeof(key) <= sizeof(key.value32));
   1575 
   1576     memset(&key, 0, sizeof(key));
   1577     memset(&bld, 0, sizeof(bld));
   1578     memset(&input_texture_coord, 0, sizeof(input_texture_coord));
   1579 
   1580     bld.key = &key;
   1581 
   1582     /* FIXME: this shouldn't be NULL, but it is on init */
   1583     if (context->vdecl) {
   1584         key.color0in_one = 1;
   1585         key.color1in_zero = 1;
   1586         for (i = 0; i < context->vdecl->nelems; i++) {
   1587             uint16_t usage = context->vdecl->usage_map[i];
   1588             if (usage == NINE_DECLUSAGE_POSITIONT)
   1589                 key.position_t = 1;
   1590             else if (usage == NINE_DECLUSAGE_i(COLOR, 0))
   1591                 key.color0in_one = 0;
   1592             else if (usage == NINE_DECLUSAGE_i(COLOR, 1))
   1593                 key.color1in_zero = 0;
   1594             else if (usage == NINE_DECLUSAGE_i(BLENDINDICES, 0)) {
   1595                 has_indexes = true;
   1596                 key.passthrough |= 1 << usage;
   1597             } else if (usage == NINE_DECLUSAGE_i(BLENDWEIGHT, 0)) {
   1598                 has_weights = true;
   1599                 key.passthrough |= 1 << usage;
   1600             } else if (usage == NINE_DECLUSAGE_i(NORMAL, 0)) {
   1601                 key.has_normal = 1;
   1602                 key.passthrough |= 1 << usage;
   1603             } else if (usage == NINE_DECLUSAGE_PSIZE)
   1604                 key.vertexpointsize = 1;
   1605             else if (usage % NINE_DECLUSAGE_COUNT == NINE_DECLUSAGE_TEXCOORD) {
   1606                 s = usage / NINE_DECLUSAGE_COUNT;
   1607                 if (s < 8)
   1608                     input_texture_coord[s] = nine_decltype_get_dim(context->vdecl->decls[i].Type);
   1609                 else
   1610                     DBG("FF given texture coordinate >= 8. Ignoring\n");
   1611             } else if (usage < NINE_DECLUSAGE_NONE)
   1612                 key.passthrough |= 1 << usage;
   1613         }
   1614     }
   1615     /* ff vs + ps 3.0: some elements are passed to the ps (wine test).
   1616      * We do restrict to indices 0 */
   1617     key.passthrough &= ~((1 << NINE_DECLUSAGE_POSITION) | (1 << NINE_DECLUSAGE_PSIZE) |
   1618                          (1 << NINE_DECLUSAGE_TEXCOORD) | (1 << NINE_DECLUSAGE_POSITIONT) |
   1619                          (1 << NINE_DECLUSAGE_TESSFACTOR) | (1 << NINE_DECLUSAGE_SAMPLE));
   1620     if (!key.position_t)
   1621         key.passthrough = 0;
   1622     key.pointscale = !!context->rs[D3DRS_POINTSCALEENABLE];
   1623 
   1624     key.lighting = !!context->rs[D3DRS_LIGHTING] &&  context->ff.num_lights_active;
   1625     key.darkness = !!context->rs[D3DRS_LIGHTING] && !context->ff.num_lights_active;
   1626     if (key.position_t) {
   1627         key.darkness = 0; /* |= key.lighting; */ /* XXX ? */
   1628         key.lighting = 0;
   1629     }
   1630     if ((key.lighting | key.darkness) && context->rs[D3DRS_COLORVERTEX]) {
   1631         uint32_t mask = (key.color0in_one ? 0 : 1) | (key.color1in_zero ? 0 : 2);
   1632         key.mtl_diffuse = context->rs[D3DRS_DIFFUSEMATERIALSOURCE] & mask;
   1633         key.mtl_ambient = context->rs[D3DRS_AMBIENTMATERIALSOURCE] & mask;
   1634         key.mtl_specular = context->rs[D3DRS_SPECULARMATERIALSOURCE] & mask;
   1635         key.mtl_emissive = context->rs[D3DRS_EMISSIVEMATERIALSOURCE] & mask;
   1636     }
   1637     key.fog = !!context->rs[D3DRS_FOGENABLE];
   1638     key.fog_mode = (!key.position_t && context->rs[D3DRS_FOGENABLE]) ? context->rs[D3DRS_FOGVERTEXMODE] : 0;
   1639     if (key.fog_mode)
   1640         key.fog_range = context->rs[D3DRS_RANGEFOGENABLE];
   1641 
   1642     key.localviewer = !!context->rs[D3DRS_LOCALVIEWER];
   1643     key.normalizenormals = !!context->rs[D3DRS_NORMALIZENORMALS];
   1644     key.ucp = !!context->rs[D3DRS_CLIPPLANEENABLE];
   1645 
   1646     if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
   1647         key.vertexblend_indexed = !!context->rs[D3DRS_INDEXEDVERTEXBLENDENABLE] && has_indexes;
   1648 
   1649         switch (context->rs[D3DRS_VERTEXBLEND]) {
   1650         case D3DVBF_0WEIGHTS: key.vertexblend = key.vertexblend_indexed; break;
   1651         case D3DVBF_1WEIGHTS: key.vertexblend = 2; break;
   1652         case D3DVBF_2WEIGHTS: key.vertexblend = 3; break;
   1653         case D3DVBF_3WEIGHTS: key.vertexblend = 4; break;
   1654         case D3DVBF_TWEENING: key.vertextween = 1; break;
   1655         default:
   1656             assert(!"invalid D3DVBF");
   1657             break;
   1658         }
   1659         if (!has_weights && context->rs[D3DRS_VERTEXBLEND] != D3DVBF_0WEIGHTS)
   1660             key.vertexblend = 0; /* TODO: if key.vertexblend_indexed, perhaps it should use 1.0 as weight, or revert to D3DVBF_0WEIGHTS */
   1661     }
   1662 
   1663     for (s = 0; s < 8; ++s) {
   1664         unsigned gen = (context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] >> 16) + 1;
   1665         unsigned idx = context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] & 7;
   1666         unsigned dim;
   1667 
   1668         if (key.position_t && gen > NINED3DTSS_TCI_PASSTHRU)
   1669             gen = NINED3DTSS_TCI_PASSTHRU;
   1670 
   1671         if (!input_texture_coord[idx] && gen == NINED3DTSS_TCI_PASSTHRU)
   1672             gen = NINED3DTSS_TCI_DISABLE;
   1673 
   1674         key.tc_gen |= gen << (s * 3);
   1675         key.tc_idx |= idx << (s * 3);
   1676         key.tc_dim_input |= ((input_texture_coord[idx]-1) & 0x3) << (s * 2);
   1677 
   1678         dim = context->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7;
   1679         if (dim > 4)
   1680             dim = input_texture_coord[idx];
   1681         if (dim == 1) /* NV behaviour */
   1682             dim = 0;
   1683         key.tc_dim_output |= dim << (s * 3);
   1684     }
   1685 
   1686     vs = util_hash_table_get(device->ff.ht_vs, &key);
   1687     if (vs)
   1688         return vs;
   1689     NineVertexShader9_new(device, &vs, NULL, nine_ff_build_vs(device, &bld));
   1690 
   1691     nine_ff_prune_vs(device);
   1692     if (vs) {
   1693         unsigned n;
   1694 
   1695         memcpy(&vs->ff_key, &key, sizeof(vs->ff_key));
   1696 
   1697         err = util_hash_table_set(device->ff.ht_vs, &vs->ff_key, vs);
   1698         (void)err;
   1699         assert(err == PIPE_OK);
   1700         device->ff.num_vs++;
   1701         NineUnknown_ConvertRefToBind(NineUnknown(vs));
   1702 
   1703         vs->num_inputs = bld.num_inputs;
   1704         for (n = 0; n < bld.num_inputs; ++n)
   1705             vs->input_map[n].ndecl = bld.input[n];
   1706 
   1707         vs->position_t = key.position_t;
   1708         vs->point_size = key.vertexpointsize | key.pointscale;
   1709     }
   1710     return vs;
   1711 }
   1712 
/* Fetch the D3DTS_<n> transform matrix from the fixed-function state.
 * Expands to nine_state_access_transform(&context->ff, D3DTS_<n>, FALSE), so a
 * variable named `context` must be in scope where the macro is used. */
#define GET_D3DTS(n) nine_state_access_transform(&context->ff, D3DTS_##n, FALSE)
/* Non-zero when the bit for D3DTS_<n> is set in (s)->ff.changed.transform[],
 * which is indexed as one bit per transform, 32 bits per array element. */
#define IS_D3DTS_DIRTY(s,n) ((s)->ff.changed.transform[(D3DTS_##n) / 32] & (1 << ((D3DTS_##n) % 32)))
   1715 
   1716 static struct NinePixelShader9 *
   1717 nine_ff_get_ps(struct NineDevice9 *device)
   1718 {
   1719     struct nine_context *context = &device->context;
   1720     D3DMATRIX *projection_matrix = GET_D3DTS(PROJECTION);
   1721     struct NinePixelShader9 *ps;
   1722     enum pipe_error err;
   1723     struct nine_ff_ps_key key;
   1724     unsigned s;
   1725     uint8_t sampler_mask = 0;
   1726 
   1727     assert(sizeof(key) <= sizeof(key.value32));
   1728 
   1729     memset(&key, 0, sizeof(key));
   1730     for (s = 0; s < 8; ++s) {
   1731         key.ts[s].colorop = context->ff.tex_stage[s][D3DTSS_COLOROP];
   1732         key.ts[s].alphaop = context->ff.tex_stage[s][D3DTSS_ALPHAOP];
   1733         const uint8_t used_c = ps_d3dtop_args_mask(key.ts[s].colorop);
   1734         const uint8_t used_a = ps_d3dtop_args_mask(key.ts[s].alphaop);
   1735         /* MSDN says D3DTOP_DISABLE disables this and all subsequent stages.
   1736          * ALPHAOP cannot be enabled if COLOROP is disabled.
   1737          * Verified on Windows. */
   1738         if (key.ts[s].colorop == D3DTOP_DISABLE) {
   1739             key.ts[s].alphaop = D3DTOP_DISABLE; /* DISABLE == 1, avoid degenerate keys */
   1740             break;
   1741         }
   1742 
   1743         if (!context->texture[s].enabled &&
   1744             ((context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE &&
   1745               used_c & 0x1) ||
   1746              (context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE &&
   1747               used_c & 0x2) ||
   1748              (context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE &&
   1749               used_c & 0x4))) {
   1750             /* Tested on Windows: Invalid texture read disables the stage
   1751              * and the subsequent ones, but only for colorop. For alpha,
   1752              * it's as if the texture had alpha of 1.0, which is what
   1753              * has our dummy texture in that case. Invalid color also
   1754              * disabled the following alpha stages. */
   1755             key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
   1756             break;
   1757         }
   1758 
   1759         if (context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE ||
   1760             context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE ||
   1761             context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE ||
   1762             context->ff.tex_stage[s][D3DTSS_ALPHAARG0] == D3DTA_TEXTURE ||
   1763             context->ff.tex_stage[s][D3DTSS_ALPHAARG1] == D3DTA_TEXTURE ||
   1764             context->ff.tex_stage[s][D3DTSS_ALPHAARG2] == D3DTA_TEXTURE)
   1765             sampler_mask |= (1 << s);
   1766 
   1767         if (key.ts[s].colorop != D3DTOP_DISABLE) {
   1768             if (used_c & 0x1) key.ts[s].colorarg0 = context->ff.tex_stage[s][D3DTSS_COLORARG0];
   1769             if (used_c & 0x2) key.ts[s].colorarg1 = context->ff.tex_stage[s][D3DTSS_COLORARG1];
   1770             if (used_c & 0x4) key.ts[s].colorarg2 = context->ff.tex_stage[s][D3DTSS_COLORARG2];
   1771             if (used_c & 0x1) key.colorarg_b4[0] |= (context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 4) << s;
   1772             if (used_c & 0x1) key.colorarg_b5[0] |= (context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 5) << s;
   1773             if (used_c & 0x2) key.colorarg_b4[1] |= (context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 4) << s;
   1774             if (used_c & 0x2) key.colorarg_b5[1] |= (context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 5) << s;
   1775             if (used_c & 0x4) key.colorarg_b4[2] |= (context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 4) << s;
   1776             if (used_c & 0x4) key.colorarg_b5[2] |= (context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 5) << s;
   1777         }
   1778         if (key.ts[s].alphaop != D3DTOP_DISABLE) {
   1779             if (used_a & 0x1) key.ts[s].alphaarg0 = context->ff.tex_stage[s][D3DTSS_ALPHAARG0];
   1780             if (used_a & 0x2) key.ts[s].alphaarg1 = context->ff.tex_stage[s][D3DTSS_ALPHAARG1];
   1781             if (used_a & 0x4) key.ts[s].alphaarg2 = context->ff.tex_stage[s][D3DTSS_ALPHAARG2];
   1782             if (used_a & 0x1) key.alphaarg_b4[0] |= (context->ff.tex_stage[s][D3DTSS_ALPHAARG0] >> 4) << s;
   1783             if (used_a & 0x2) key.alphaarg_b4[1] |= (context->ff.tex_stage[s][D3DTSS_ALPHAARG1] >> 4) << s;
   1784             if (used_a & 0x4) key.alphaarg_b4[2] |= (context->ff.tex_stage[s][D3DTSS_ALPHAARG2] >> 4) << s;
   1785         }
   1786         key.ts[s].resultarg = context->ff.tex_stage[s][D3DTSS_RESULTARG] == D3DTA_TEMP;
   1787 
   1788         if (context->texture[s].enabled) {
   1789             switch (context->texture[s].type) {
   1790             case D3DRTYPE_TEXTURE:       key.ts[s].textarget = 1; break;
   1791             case D3DRTYPE_VOLUMETEXTURE: key.ts[s].textarget = 2; break;
   1792             case D3DRTYPE_CUBETEXTURE:   key.ts[s].textarget = 3; break;
   1793             default:
   1794                 assert(!"unexpected texture type");
   1795                 break;
   1796             }
   1797         } else {
   1798             key.ts[s].textarget = 1;
   1799         }
   1800     }
   1801 
   1802     /* Note: If colorop is D3DTOP_DISABLE for the first stage
   1803      * (which implies alphaop is too), nothing particular happens,
   1804      * that is, current is equal to diffuse (which is the case anyway,
   1805      * because it is how it is initialized).
   1806      * Special case seems if alphaop is D3DTOP_DISABLE and not colorop,
   1807      * because then if the resultarg is TEMP, then diffuse alpha is written
   1808      * to it. */
   1809     if (key.ts[0].colorop != D3DTOP_DISABLE &&
   1810         key.ts[0].alphaop == D3DTOP_DISABLE &&
   1811         key.ts[0].resultarg != 0) {
   1812         key.ts[0].alphaop = D3DTOP_SELECTARG1;
   1813         key.ts[0].alphaarg1 = D3DTA_DIFFUSE;
   1814     }
   1815     /* When no alpha stage writes to current, diffuse alpha is taken.
   1816      * Since we initialize current to diffuse, we have the behaviour. */
   1817 
   1818     /* Last stage always writes to Current */
   1819     if (s >= 1)
   1820         key.ts[s-1].resultarg = 0;
   1821 
   1822     key.projected = nine_ff_get_projected_key(context);
   1823     key.specular = !!context->rs[D3DRS_SPECULARENABLE];
   1824 
   1825     for (; s < 8; ++s)
   1826         key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
   1827     if (context->rs[D3DRS_FOGENABLE])
   1828         key.fog_mode = context->rs[D3DRS_FOGTABLEMODE];
   1829     key.fog = !!context->rs[D3DRS_FOGENABLE];
   1830     /* Pixel fog (with WFOG advertised): source is either Z or W.
   1831      * W is the source if vs ff is used, and the
   1832      * projection matrix is not orthogonal.
   1833      * Tests on Win 10 seem to indicate _34
   1834      * and _33 are checked against 0, 1. */
   1835     if (key.fog_mode && key.fog)
   1836         key.fog_source = !context->programmable_vs &&
   1837             !(projection_matrix->_34 == 0.0f &&
   1838               projection_matrix->_44 == 1.0f);
   1839 
   1840     ps = util_hash_table_get(device->ff.ht_ps, &key);
   1841     if (ps)
   1842         return ps;
   1843     NinePixelShader9_new(device, &ps, NULL, nine_ff_build_ps(device, &key));
   1844 
   1845     nine_ff_prune_ps(device);
   1846     if (ps) {
   1847         memcpy(&ps->ff_key, &key, sizeof(ps->ff_key));
   1848 
   1849         err = util_hash_table_set(device->ff.ht_ps, &ps->ff_key, ps);
   1850         (void)err;
   1851         assert(err == PIPE_OK);
   1852         device->ff.num_ps++;
   1853         NineUnknown_ConvertRefToBind(NineUnknown(ps));
   1854 
   1855         ps->rt_mask = 0x1;
   1856         ps->sampler_mask = sampler_mask;
   1857     }
   1858     return ps;
   1859 }
   1860 
   1861 static void
   1862 nine_ff_load_vs_transforms(struct NineDevice9 *device)
   1863 {
   1864     struct nine_context *context = &device->context;
   1865     D3DMATRIX T;
   1866     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
   1867     unsigned i;
   1868 
   1869     /* TODO: make this nicer, and only upload the ones we need */
   1870     /* TODO: use ff.vs_const as storage of W, V, P matrices */
   1871 
   1872     if (IS_D3DTS_DIRTY(context, WORLD) ||
   1873         IS_D3DTS_DIRTY(context, VIEW) ||
   1874         IS_D3DTS_DIRTY(context, PROJECTION)) {
   1875         /* WVP, WV matrices */
   1876         nine_d3d_matrix_matrix_mul(&M[1], GET_D3DTS(WORLD), GET_D3DTS(VIEW));
   1877         nine_d3d_matrix_matrix_mul(&M[0], &M[1], GET_D3DTS(PROJECTION));
   1878 
   1879         /* normal matrix == transpose(inverse(WV)) */
   1880         nine_d3d_matrix_inverse(&T, &M[1]);
   1881         nine_d3d_matrix_transpose(&M[4], &T);
   1882 
   1883         /* P matrix */
   1884         M[2] = *GET_D3DTS(PROJECTION);
   1885 
   1886         /* V and W matrix */
   1887         nine_d3d_matrix_inverse(&M[3], GET_D3DTS(VIEW));
   1888         M[40] = M[1];
   1889     }
   1890 
   1891     if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
   1892         /* load other world matrices */
   1893         for (i = 1; i <= 8; ++i) {
   1894             nine_d3d_matrix_matrix_mul(&M[40 + i], GET_D3DTS(WORLDMATRIX(i)), GET_D3DTS(VIEW));
   1895         }
   1896     }
   1897 
   1898     device->ff.vs_const[30 * 4] = asfloat(context->rs[D3DRS_TWEENFACTOR]);
   1899 }
   1900 
   1901 static void
   1902 nine_ff_load_lights(struct NineDevice9 *device)
   1903 {
   1904     struct nine_context *context = &device->context;
   1905     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
   1906     unsigned l;
   1907 
   1908     if (context->changed.group & NINE_STATE_FF_MATERIAL) {
   1909         const D3DMATERIAL9 *mtl = &context->ff.material;
   1910 
   1911         memcpy(&dst[20], &mtl->Diffuse, 4 * sizeof(float));
   1912         memcpy(&dst[21], &mtl->Ambient, 4 * sizeof(float));
   1913         memcpy(&dst[22], &mtl->Specular, 4 * sizeof(float));
   1914         dst[23].x = mtl->Power;
   1915         memcpy(&dst[24], &mtl->Emissive, 4 * sizeof(float));
   1916         d3dcolor_to_rgba(&dst[25].x, context->rs[D3DRS_AMBIENT]);
   1917         dst[19].x = dst[25].x * mtl->Ambient.r + mtl->Emissive.r;
   1918         dst[19].y = dst[25].y * mtl->Ambient.g + mtl->Emissive.g;
   1919         dst[19].z = dst[25].z * mtl->Ambient.b + mtl->Emissive.b;
   1920     }
   1921 
   1922     if (!(context->changed.group & NINE_STATE_FF_LIGHTING))
   1923         return;
   1924 
   1925     for (l = 0; l < context->ff.num_lights_active; ++l) {
   1926         const D3DLIGHT9 *light = &context->ff.light[context->ff.active_light[l]];
   1927 
   1928         dst[32 + l * 8].x = light->Type;
   1929         dst[32 + l * 8].y = light->Attenuation0;
   1930         dst[32 + l * 8].z = light->Attenuation1;
   1931         dst[32 + l * 8].w = light->Attenuation2;
   1932         memcpy(&dst[33 + l * 8].x, &light->Diffuse, sizeof(light->Diffuse));
   1933         memcpy(&dst[34 + l * 8].x, &light->Specular, sizeof(light->Specular));
   1934         memcpy(&dst[35 + l * 8].x, &light->Ambient, sizeof(light->Ambient));
   1935         nine_d3d_vector4_matrix_mul((D3DVECTOR *)&dst[36 + l * 8].x, &light->Position, GET_D3DTS(VIEW));
   1936         nine_d3d_vector3_matrix_mul((D3DVECTOR *)&dst[37 + l * 8].x, &light->Direction, GET_D3DTS(VIEW));
   1937         dst[36 + l * 8].w = light->Type == D3DLIGHT_DIRECTIONAL ? 1e9f : light->Range;
   1938         dst[37 + l * 8].w = light->Falloff;
   1939         dst[38 + l * 8].x = cosf(light->Theta * 0.5f);
   1940         dst[38 + l * 8].y = cosf(light->Phi * 0.5f);
   1941         dst[38 + l * 8].z = 1.0f / (dst[38 + l * 8].x - dst[38 + l * 8].y);
   1942         dst[39 + l * 8].w = (float)((l + 1) == context->ff.num_lights_active);
   1943     }
   1944 }
   1945 
   1946 static void
   1947 nine_ff_load_point_and_fog_params(struct NineDevice9 *device)
   1948 {
   1949     struct nine_context *context = &device->context;
   1950     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
   1951 
   1952     if (!(context->changed.group & NINE_STATE_FF_OTHER))
   1953         return;
   1954     dst[26].x = asfloat(context->rs[D3DRS_POINTSIZE_MIN]);
   1955     dst[26].y = asfloat(context->rs[D3DRS_POINTSIZE_MAX]);
   1956     dst[26].z = asfloat(context->rs[D3DRS_POINTSIZE]);
   1957     dst[26].w = asfloat(context->rs[D3DRS_POINTSCALE_A]);
   1958     dst[27].x = asfloat(context->rs[D3DRS_POINTSCALE_B]);
   1959     dst[27].y = asfloat(context->rs[D3DRS_POINTSCALE_C]);
   1960     dst[28].x = asfloat(context->rs[D3DRS_FOGEND]);
   1961     dst[28].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
   1962     if (isinf(dst[28].y))
   1963         dst[28].y = 0.0f;
   1964     dst[28].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
   1965 }
   1966 
   1967 static void
   1968 nine_ff_load_tex_matrices(struct NineDevice9 *device)
   1969 {
   1970     struct nine_context *context = &device->context;
   1971     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
   1972     unsigned s;
   1973 
   1974     if (!(context->ff.changed.transform[0] & 0xff0000))
   1975         return;
   1976     for (s = 0; s < 8; ++s) {
   1977         if (IS_D3DTS_DIRTY(context, TEXTURE0 + s))
   1978             nine_d3d_matrix_transpose(&M[32 + s], nine_state_access_transform(&context->ff, D3DTS_TEXTURE0 + s, FALSE));
   1979     }
   1980 }
   1981 
   1982 static void
   1983 nine_ff_load_ps_params(struct NineDevice9 *device)
   1984 {
   1985     struct nine_context *context = &device->context;
   1986     struct fvec4 *dst = (struct fvec4 *)device->ff.ps_const;
   1987     unsigned s;
   1988 
   1989     if (!(context->changed.group & (NINE_STATE_FF_PSSTAGES | NINE_STATE_FF_OTHER)))
   1990         return;
   1991 
   1992     for (s = 0; s < 8; ++s)
   1993         d3dcolor_to_rgba(&dst[s].x, context->ff.tex_stage[s][D3DTSS_CONSTANT]);
   1994 
   1995     for (s = 0; s < 8; ++s) {
   1996         dst[8 + s].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT00]);
   1997         dst[8 + s].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT01]);
   1998         dst[8 + s].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT10]);
   1999         dst[8 + s].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT11]);
   2000         if (s & 1) {
   2001             dst[16 + s / 2].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
   2002             dst[16 + s / 2].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
   2003         } else {
   2004             dst[16 + s / 2].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
   2005             dst[16 + s / 2].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
   2006         }
   2007     }
   2008 
   2009     d3dcolor_to_rgba(&dst[20].x, context->rs[D3DRS_TEXTUREFACTOR]);
   2010     d3dcolor_to_rgba(&dst[21].x, context->rs[D3DRS_FOGCOLOR]);
   2011     dst[22].x = asfloat(context->rs[D3DRS_FOGEND]);
   2012     dst[22].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
   2013     dst[22].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
   2014 }
   2015 
   2016 static void
   2017 nine_ff_load_viewport_info(struct NineDevice9 *device)
   2018 {
   2019     D3DVIEWPORT9 *viewport = &device->context.viewport;
   2020     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
   2021     float diffZ = viewport->MaxZ - viewport->MinZ;
   2022 
   2023     /* Note: the other functions avoids to fill the const again if nothing changed.
   2024      * But we don't have much to fill, and adding code to allow that may be complex
   2025      * so just fill it always */
   2026     dst[100].x = 2.0f / (float)(viewport->Width);
   2027     dst[100].y = 2.0f / (float)(viewport->Height);
   2028     dst[100].z = (diffZ == 0.0f) ? 0.0f : (1.0f / diffZ);
   2029     dst[100].w = (float)(viewport->Width);
   2030     dst[101].x = (float)(viewport->X);
   2031     dst[101].y = (float)(viewport->Y);
   2032     dst[101].z = (float)(viewport->MinZ);
   2033 }
   2034 
   2035 void
   2036 nine_ff_update(struct NineDevice9 *device)
   2037 {
   2038     struct nine_context *context = &device->context;
   2039     struct pipe_constant_buffer cb;
   2040 
   2041     DBG("vs=%p ps=%p\n", context->vs, context->ps);
   2042 
   2043     /* NOTE: the only reference belongs to the hash table */
   2044     if (!context->programmable_vs) {
   2045         device->ff.vs = nine_ff_get_vs(device);
   2046         context->changed.group |= NINE_STATE_VS;
   2047     }
   2048     if (!context->ps) {
   2049         device->ff.ps = nine_ff_get_ps(device);
   2050         context->changed.group |= NINE_STATE_PS;
   2051     }
   2052 
   2053     if (!context->programmable_vs) {
   2054         nine_ff_load_vs_transforms(device);
   2055         nine_ff_load_tex_matrices(device);
   2056         nine_ff_load_lights(device);
   2057         nine_ff_load_point_and_fog_params(device);
   2058         nine_ff_load_viewport_info(device);
   2059 
   2060         memset(context->ff.changed.transform, 0, sizeof(context->ff.changed.transform));
   2061 
   2062         cb.buffer_offset = 0;
   2063         cb.buffer = NULL;
   2064         cb.user_buffer = device->ff.vs_const;
   2065         cb.buffer_size = NINE_FF_NUM_VS_CONST * 4 * sizeof(float);
   2066 
   2067         context->pipe_data.cb_vs_ff = cb;
   2068         context->commit |= NINE_STATE_COMMIT_CONST_VS;
   2069     }
   2070 
   2071     if (!context->ps) {
   2072         nine_ff_load_ps_params(device);
   2073 
   2074         cb.buffer_offset = 0;
   2075         cb.buffer = NULL;
   2076         cb.user_buffer = device->ff.ps_const;
   2077         cb.buffer_size = NINE_FF_NUM_PS_CONST * 4 * sizeof(float);
   2078 
   2079         context->pipe_data.cb_ps_ff = cb;
   2080         context->commit |= NINE_STATE_COMMIT_CONST_PS;
   2081     }
   2082 
   2083     context->changed.group &= ~NINE_STATE_FF;
   2084 }
   2085 
   2086 
   2087 boolean
   2088 nine_ff_init(struct NineDevice9 *device)
   2089 {
   2090     device->ff.ht_vs = util_hash_table_create(nine_ff_vs_key_hash,
   2091                                               nine_ff_vs_key_comp);
   2092     device->ff.ht_ps = util_hash_table_create(nine_ff_ps_key_hash,
   2093                                               nine_ff_ps_key_comp);
   2094 
   2095     device->ff.ht_fvf = util_hash_table_create(nine_ff_fvf_key_hash,
   2096                                                nine_ff_fvf_key_comp);
   2097 
   2098     device->ff.vs_const = CALLOC(NINE_FF_NUM_VS_CONST, 4 * sizeof(float));
   2099     device->ff.ps_const = CALLOC(NINE_FF_NUM_PS_CONST, 4 * sizeof(float));
   2100 
   2101     return device->ff.ht_vs && device->ff.ht_ps &&
   2102         device->ff.ht_fvf &&
   2103         device->ff.vs_const && device->ff.ps_const;
   2104 }
   2105 
   2106 static enum pipe_error nine_ff_ht_delete_cb(void *key, void *value, void *data)
   2107 {
   2108     NineUnknown_Unbind(NineUnknown(value));
   2109     return PIPE_OK;
   2110 }
   2111 
   2112 void
   2113 nine_ff_fini(struct NineDevice9 *device)
   2114 {
   2115     if (device->ff.ht_vs) {
   2116         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
   2117         util_hash_table_destroy(device->ff.ht_vs);
   2118     }
   2119     if (device->ff.ht_ps) {
   2120         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
   2121         util_hash_table_destroy(device->ff.ht_ps);
   2122     }
   2123     if (device->ff.ht_fvf) {
   2124         util_hash_table_foreach(device->ff.ht_fvf, nine_ff_ht_delete_cb, NULL);
   2125         util_hash_table_destroy(device->ff.ht_fvf);
   2126     }
   2127     device->ff.vs = NULL; /* destroyed by unbinding from hash table */
   2128     device->ff.ps = NULL;
   2129 
   2130     FREE(device->ff.vs_const);
   2131     FREE(device->ff.ps_const);
   2132 }
   2133 
   2134 static void
   2135 nine_ff_prune_vs(struct NineDevice9 *device)
   2136 {
   2137     struct nine_context *context = &device->context;
   2138 
   2139     if (device->ff.num_vs > 100) {
   2140         /* could destroy the bound one here, so unbind */
   2141         context->pipe->bind_vs_state(context->pipe, NULL);
   2142         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
   2143         util_hash_table_clear(device->ff.ht_vs);
   2144         device->ff.num_vs = 0;
   2145         context->changed.group |= NINE_STATE_VS;
   2146     }
   2147 }
   2148 static void
   2149 nine_ff_prune_ps(struct NineDevice9 *device)
   2150 {
   2151     struct nine_context *context = &device->context;
   2152 
   2153     if (device->ff.num_ps > 100) {
   2154         /* could destroy the bound one here, so unbind */
   2155         context->pipe->bind_fs_state(context->pipe, NULL);
   2156         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
   2157         util_hash_table_clear(device->ff.ht_ps);
   2158         device->ff.num_ps = 0;
   2159         context->changed.group |= NINE_STATE_PS;
   2160     }
   2161 }
   2162 
   2163 /* ========================================================================== */
   2164 
   2165 /* Matrix multiplication:
   2166  *
   2167  * in memory: 0 1 2 3 (row major)
   2168  *            4 5 6 7
   2169  *            8 9 a b
   2170  *            c d e f
   2171  *
   2172  *    cA cB cC cD
   2173  * r0             = (r0 * cA) (r0 * cB) . .
   2174  * r1             = (r1 * cA) (r1 * cB)
   2175  * r2             = (r2 * cA) .
   2176  * r3             = (r3 * cA) .
   2177  *
   2178  *               r: (11) (12) (13) (14)
   2179  *                  (21) (22) (23) (24)
   2180  *                  (31) (32) (33) (34)
   2181  *                  (41) (42) (43) (44)
   2182  * l: (11 12 13 14)
   2183  *    (21 22 23 24)
   2184  *    (31 32 33 34)
   2185  *    (41 42 43 44)
   2186  *
   2187  * v: (x  y  z  1 )
   2188  *
   2189  * t.xyzw = MUL(v.xxxx, r[0]);
   2190  * t.xyzw = MAD(v.yyyy, r[1], t.xyzw);
   2191  * t.xyzw = MAD(v.zzzz, r[2], t.xyzw);
   2192  * v.xyzw = MAD(v.wwww, r[3], t.xyzw);
   2193  *
   2194  * v.x = DP4(v, c[0]);
   2195  * v.y = DP4(v, c[1]);
   2196  * v.z = DP4(v, c[2]);
   2197  * v.w = DP4(v, c[3]) = 1
   2198  */
   2199 
   2200 /*
   2201 static void
   2202 nine_D3DMATRIX_print(const D3DMATRIX *M)
   2203 {
   2204     DBG("\n(%f %f %f %f)\n"
   2205         "(%f %f %f %f)\n"
   2206         "(%f %f %f %f)\n"
   2207         "(%f %f %f %f)\n",
   2208         M->m[0][0], M->m[0][1], M->m[0][2], M->m[0][3],
   2209         M->m[1][0], M->m[1][1], M->m[1][2], M->m[1][3],
   2210         M->m[2][0], M->m[2][1], M->m[2][2], M->m[2][3],
   2211         M->m[3][0], M->m[3][1], M->m[3][2], M->m[3][3]);
   2212 }
   2213 */
   2214 
   2215 static inline float
   2216 nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
   2217 {
   2218     return A->m[r][0] * B->m[0][c] +
   2219            A->m[r][1] * B->m[1][c] +
   2220            A->m[r][2] * B->m[2][c] +
   2221            A->m[r][3] * B->m[3][c];
   2222 }
   2223 
   2224 static inline float
   2225 nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
   2226 {
   2227     return v->x * M->m[0][c] +
   2228            v->y * M->m[1][c] +
   2229            v->z * M->m[2][c] +
   2230            1.0f * M->m[3][c];
   2231 }
   2232 
   2233 static inline float
   2234 nine_DP3_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
   2235 {
   2236     return v->x * M->m[0][c] +
   2237            v->y * M->m[1][c] +
   2238            v->z * M->m[2][c];
   2239 }
   2240 
   2241 void
   2242 nine_d3d_matrix_matrix_mul(D3DMATRIX *D, const D3DMATRIX *L, const D3DMATRIX *R)
   2243 {
   2244     D->_11 = nine_DP4_row_col(L, 0, R, 0);
   2245     D->_12 = nine_DP4_row_col(L, 0, R, 1);
   2246     D->_13 = nine_DP4_row_col(L, 0, R, 2);
   2247     D->_14 = nine_DP4_row_col(L, 0, R, 3);
   2248 
   2249     D->_21 = nine_DP4_row_col(L, 1, R, 0);
   2250     D->_22 = nine_DP4_row_col(L, 1, R, 1);
   2251     D->_23 = nine_DP4_row_col(L, 1, R, 2);
   2252     D->_24 = nine_DP4_row_col(L, 1, R, 3);
   2253 
   2254     D->_31 = nine_DP4_row_col(L, 2, R, 0);
   2255     D->_32 = nine_DP4_row_col(L, 2, R, 1);
   2256     D->_33 = nine_DP4_row_col(L, 2, R, 2);
   2257     D->_34 = nine_DP4_row_col(L, 2, R, 3);
   2258 
   2259     D->_41 = nine_DP4_row_col(L, 3, R, 0);
   2260     D->_42 = nine_DP4_row_col(L, 3, R, 1);
   2261     D->_43 = nine_DP4_row_col(L, 3, R, 2);
   2262     D->_44 = nine_DP4_row_col(L, 3, R, 3);
   2263 }
   2264 
   2265 void
   2266 nine_d3d_vector4_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
   2267 {
   2268     d->x = nine_DP4_vec_col(v, M, 0);
   2269     d->y = nine_DP4_vec_col(v, M, 1);
   2270     d->z = nine_DP4_vec_col(v, M, 2);
   2271 }
   2272 
   2273 void
   2274 nine_d3d_vector3_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
   2275 {
   2276     d->x = nine_DP3_vec_col(v, M, 0);
   2277     d->y = nine_DP3_vec_col(v, M, 1);
   2278     d->z = nine_DP3_vec_col(v, M, 2);
   2279 }
   2280 
   2281 void
   2282 nine_d3d_matrix_transpose(D3DMATRIX *D, const D3DMATRIX *M)
   2283 {
   2284     unsigned i, j;
   2285     for (i = 0; i < 4; ++i)
   2286     for (j = 0; j < 4; ++j)
   2287         D->m[i][j] = M->m[j][i];
   2288 }
   2289 
   2290 #define _M_ADD_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
   2291     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
   2292     if (t > 0.0f) pos += t; else neg += t; } while(0)
   2293 
   2294 #define _M_SUB_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
   2295     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
   2296     if (t > 0.0f) neg -= t; else pos -= t; } while(0)
   2297 float
   2298 nine_d3d_matrix_det(const D3DMATRIX *M)
   2299 {
   2300     float pos = 0.0f;
   2301     float neg = 0.0f;
   2302 
   2303     _M_ADD_PROD_1i_2j_3k_4l(1, 2, 3, 4);
   2304     _M_ADD_PROD_1i_2j_3k_4l(1, 3, 4, 2);
   2305     _M_ADD_PROD_1i_2j_3k_4l(1, 4, 2, 3);
   2306 
   2307     _M_ADD_PROD_1i_2j_3k_4l(2, 1, 4, 3);
   2308     _M_ADD_PROD_1i_2j_3k_4l(2, 3, 1, 4);
   2309     _M_ADD_PROD_1i_2j_3k_4l(2, 4, 3, 1);
   2310 
   2311     _M_ADD_PROD_1i_2j_3k_4l(3, 1, 2, 4);
   2312     _M_ADD_PROD_1i_2j_3k_4l(3, 2, 4, 1);
   2313     _M_ADD_PROD_1i_2j_3k_4l(3, 4, 1, 2);
   2314 
   2315     _M_ADD_PROD_1i_2j_3k_4l(4, 1, 3, 2);
   2316     _M_ADD_PROD_1i_2j_3k_4l(4, 2, 1, 3);
   2317     _M_ADD_PROD_1i_2j_3k_4l(4, 3, 2, 1);
   2318 
   2319     _M_SUB_PROD_1i_2j_3k_4l(1, 2, 4, 3);
   2320     _M_SUB_PROD_1i_2j_3k_4l(1, 3, 2, 4);
   2321     _M_SUB_PROD_1i_2j_3k_4l(1, 4, 3, 2);
   2322 
   2323     _M_SUB_PROD_1i_2j_3k_4l(2, 1, 3, 4);
   2324     _M_SUB_PROD_1i_2j_3k_4l(2, 3, 4, 1);
   2325     _M_SUB_PROD_1i_2j_3k_4l(2, 4, 1, 3);
   2326 
   2327     _M_SUB_PROD_1i_2j_3k_4l(3, 1, 4, 2);
   2328     _M_SUB_PROD_1i_2j_3k_4l(3, 2, 1, 4);
   2329     _M_SUB_PROD_1i_2j_3k_4l(3, 4, 2, 1);
   2330 
   2331     _M_SUB_PROD_1i_2j_3k_4l(4, 1, 2, 3);
   2332     _M_SUB_PROD_1i_2j_3k_4l(4, 2, 3, 1);
   2333     _M_SUB_PROD_1i_2j_3k_4l(4, 3, 1, 2);
   2334 
   2335     return pos + neg;
   2336 }
   2337 
   2338 /* XXX: Probably better to just use src/mesa/math/m_matrix.c because
   2339  * I have no idea where this code came from.
   2340  */
   2341 void
   2342 nine_d3d_matrix_inverse(D3DMATRIX *D, const D3DMATRIX *M)
   2343 {
   2344     int i, k;
   2345     float det;
   2346 
   2347     D->m[0][0] =
   2348         M->m[1][1] * M->m[2][2] * M->m[3][3] -
   2349         M->m[1][1] * M->m[3][2] * M->m[2][3] -
   2350         M->m[1][2] * M->m[2][1] * M->m[3][3] +
   2351         M->m[1][2] * M->m[3][1] * M->m[2][3] +
   2352         M->m[1][3] * M->m[2][1] * M->m[3][2] -
   2353         M->m[1][3] * M->m[3][1] * M->m[2][2];
   2354 
   2355     D->m[0][1] =
   2356        -M->m[0][1] * M->m[2][2] * M->m[3][3] +
   2357         M->m[0][1] * M->m[3][2] * M->m[2][3] +
   2358         M->m[0][2] * M->m[2][1] * M->m[3][3] -
   2359         M->m[0][2] * M->m[3][1] * M->m[2][3] -
   2360         M->m[0][3] * M->m[2][1] * M->m[3][2] +
   2361         M->m[0][3] * M->m[3][1] * M->m[2][2];
   2362 
   2363     D->m[0][2] =
   2364         M->m[0][1] * M->m[1][2] * M->m[3][3] -
   2365         M->m[0][1] * M->m[3][2] * M->m[1][3] -
   2366         M->m[0][2] * M->m[1][1] * M->m[3][3] +
   2367         M->m[0][2] * M->m[3][1] * M->m[1][3] +
   2368         M->m[0][3] * M->m[1][1] * M->m[3][2] -
   2369         M->m[0][3] * M->m[3][1] * M->m[1][2];
   2370 
   2371     D->m[0][3] =
   2372        -M->m[0][1] * M->m[1][2] * M->m[2][3] +
   2373         M->m[0][1] * M->m[2][2] * M->m[1][3] +
   2374         M->m[0][2] * M->m[1][1] * M->m[2][3] -
   2375         M->m[0][2] * M->m[2][1] * M->m[1][3] -
   2376         M->m[0][3] * M->m[1][1] * M->m[2][2] +
   2377         M->m[0][3] * M->m[2][1] * M->m[1][2];
   2378 
   2379     D->m[1][0] =
   2380        -M->m[1][0] * M->m[2][2] * M->m[3][3] +
   2381         M->m[1][0] * M->m[3][2] * M->m[2][3] +
   2382         M->m[1][2] * M->m[2][0] * M->m[3][3] -
   2383         M->m[1][2] * M->m[3][0] * M->m[2][3] -
   2384         M->m[1][3] * M->m[2][0] * M->m[3][2] +
   2385         M->m[1][3] * M->m[3][0] * M->m[2][2];
   2386 
   2387     D->m[1][1] =
   2388         M->m[0][0] * M->m[2][2] * M->m[3][3] -
   2389         M->m[0][0] * M->m[3][2] * M->m[2][3] -
   2390         M->m[0][2] * M->m[2][0] * M->m[3][3] +
   2391         M->m[0][2] * M->m[3][0] * M->m[2][3] +
   2392         M->m[0][3] * M->m[2][0] * M->m[3][2] -
   2393         M->m[0][3] * M->m[3][0] * M->m[2][2];
   2394 
   2395     D->m[1][2] =
   2396        -M->m[0][0] * M->m[1][2] * M->m[3][3] +
   2397         M->m[0][0] * M->m[3][2] * M->m[1][3] +
   2398         M->m[0][2] * M->m[1][0] * M->m[3][3] -
   2399         M->m[0][2] * M->m[3][0] * M->m[1][3] -
   2400         M->m[0][3] * M->m[1][0] * M->m[3][2] +
   2401         M->m[0][3] * M->m[3][0] * M->m[1][2];
   2402 
   2403     D->m[1][3] =
   2404         M->m[0][0] * M->m[1][2] * M->m[2][3] -
   2405         M->m[0][0] * M->m[2][2] * M->m[1][3] -
   2406         M->m[0][2] * M->m[1][0] * M->m[2][3] +
   2407         M->m[0][2] * M->m[2][0] * M->m[1][3] +
   2408         M->m[0][3] * M->m[1][0] * M->m[2][2] -
   2409         M->m[0][3] * M->m[2][0] * M->m[1][2];
   2410 
   2411     D->m[2][0] =
   2412         M->m[1][0] * M->m[2][1] * M->m[3][3] -
   2413         M->m[1][0] * M->m[3][1] * M->m[2][3] -
   2414         M->m[1][1] * M->m[2][0] * M->m[3][3] +
   2415         M->m[1][1] * M->m[3][0] * M->m[2][3] +
   2416         M->m[1][3] * M->m[2][0] * M->m[3][1] -
   2417         M->m[1][3] * M->m[3][0] * M->m[2][1];
   2418 
   2419     D->m[2][1] =
   2420        -M->m[0][0] * M->m[2][1] * M->m[3][3] +
   2421         M->m[0][0] * M->m[3][1] * M->m[2][3] +
   2422         M->m[0][1] * M->m[2][0] * M->m[3][3] -
   2423         M->m[0][1] * M->m[3][0] * M->m[2][3] -
   2424         M->m[0][3] * M->m[2][0] * M->m[3][1] +
   2425         M->m[0][3] * M->m[3][0] * M->m[2][1];
   2426 
   2427     D->m[2][2] =
   2428         M->m[0][0] * M->m[1][1] * M->m[3][3] -
   2429         M->m[0][0] * M->m[3][1] * M->m[1][3] -
   2430         M->m[0][1] * M->m[1][0] * M->m[3][3] +
   2431         M->m[0][1] * M->m[3][0] * M->m[1][3] +
   2432         M->m[0][3] * M->m[1][0] * M->m[3][1] -
   2433         M->m[0][3] * M->m[3][0] * M->m[1][1];
   2434 
   2435     D->m[2][3] =
   2436        -M->m[0][0] * M->m[1][1] * M->m[2][3] +
   2437         M->m[0][0] * M->m[2][1] * M->m[1][3] +
   2438         M->m[0][1] * M->m[1][0] * M->m[2][3] -
   2439         M->m[0][1] * M->m[2][0] * M->m[1][3] -
   2440         M->m[0][3] * M->m[1][0] * M->m[2][1] +
   2441         M->m[0][3] * M->m[2][0] * M->m[1][1];
   2442 
   2443     D->m[3][0] =
   2444        -M->m[1][0] * M->m[2][1] * M->m[3][2] +
   2445         M->m[1][0] * M->m[3][1] * M->m[2][2] +
   2446         M->m[1][1] * M->m[2][0] * M->m[3][2] -
   2447         M->m[1][1] * M->m[3][0] * M->m[2][2] -
   2448         M->m[1][2] * M->m[2][0] * M->m[3][1] +
   2449         M->m[1][2] * M->m[3][0] * M->m[2][1];
   2450 
   2451     D->m[3][1] =
   2452         M->m[0][0] * M->m[2][1] * M->m[3][2] -
   2453         M->m[0][0] * M->m[3][1] * M->m[2][2] -
   2454         M->m[0][1] * M->m[2][0] * M->m[3][2] +
   2455         M->m[0][1] * M->m[3][0] * M->m[2][2] +
   2456         M->m[0][2] * M->m[2][0] * M->m[3][1] -
   2457         M->m[0][2] * M->m[3][0] * M->m[2][1];
   2458 
   2459     D->m[3][2] =
   2460        -M->m[0][0] * M->m[1][1] * M->m[3][2] +
   2461         M->m[0][0] * M->m[3][1] * M->m[1][2] +
   2462         M->m[0][1] * M->m[1][0] * M->m[3][2] -
   2463         M->m[0][1] * M->m[3][0] * M->m[1][2] -
   2464         M->m[0][2] * M->m[1][0] * M->m[3][1] +
   2465         M->m[0][2] * M->m[3][0] * M->m[1][1];
   2466 
   2467     D->m[3][3] =
   2468         M->m[0][0] * M->m[1][1] * M->m[2][2] -
   2469         M->m[0][0] * M->m[2][1] * M->m[1][2] -
   2470         M->m[0][1] * M->m[1][0] * M->m[2][2] +
   2471         M->m[0][1] * M->m[2][0] * M->m[1][2] +
   2472         M->m[0][2] * M->m[1][0] * M->m[2][1] -
   2473         M->m[0][2] * M->m[2][0] * M->m[1][1];
   2474 
   2475     det =
   2476         M->m[0][0] * D->m[0][0] +
   2477         M->m[1][0] * D->m[0][1] +
   2478         M->m[2][0] * D->m[0][2] +
   2479         M->m[3][0] * D->m[0][3];
   2480 
   2481     if (fabsf(det) < 1e-30) {/* non inversible */
   2482         *D = *M; /* wine tests */
   2483         return;
   2484     }
   2485 
   2486     det = 1.0 / det;
   2487 
   2488     for (i = 0; i < 4; i++)
   2489     for (k = 0; k < 4; k++)
   2490         D->m[i][k] *= det;
   2491 
   2492 #ifdef DEBUG
   2493     {
   2494         D3DMATRIX I;
   2495 
   2496         nine_d3d_matrix_matrix_mul(&I, D, M);
   2497 
   2498         for (i = 0; i < 4; ++i)
   2499         for (k = 0; k < 4; ++k)
   2500             if (fabsf(I.m[i][k] - (float)(i == k)) > 1e-3)
   2501                 DBG("Matrix inversion check FAILED !\n");
   2502     }
   2503 #endif
   2504 }
   2505