Home | History | Annotate | Download | only in nine
      1 
      2 /* FF is big and ugly so feel free to write lines as long as you like.
      3  * Aieeeeeeeee !
      4  *
      5  * Let me make that clearer:
      6  * Aieeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee ! !! !!!
      7  */
      8 
      9 #include "device9.h"
     10 #include "basetexture9.h"
     11 #include "vertexdeclaration9.h"
     12 #include "vertexshader9.h"
     13 #include "pixelshader9.h"
     14 #include "nine_ff.h"
     15 #include "nine_defines.h"
     16 #include "nine_helpers.h"
     17 #include "nine_pipe.h"
     18 #include "nine_dump.h"
     19 
     20 #include "pipe/p_context.h"
     21 #include "tgsi/tgsi_ureg.h"
     22 #include "tgsi/tgsi_dump.h"
     23 #include "util/u_box.h"
     24 #include "util/u_hash_table.h"
     25 #include "util/u_upload_mgr.h"
     26 
     27 #define DBG_CHANNEL DBG_FF
     28 
     29 #define NINE_FF_NUM_VS_CONST 196
     30 #define NINE_FF_NUM_PS_CONST 24
     31 
     32 struct fvec4
     33 {
     34     float x, y, z, w;
     35 };
     36 
     37 struct nine_ff_vs_key
     38 {
     39     union {
     40         struct {
     41             uint32_t position_t : 1;
     42             uint32_t lighting   : 1;
     43             uint32_t darkness   : 1; /* lighting enabled but no active lights */
     44             uint32_t localviewer : 1;
     45             uint32_t vertexpointsize : 1;
     46             uint32_t pointscale : 1;
     47             uint32_t vertexblend : 3;
     48             uint32_t vertexblend_indexed : 1;
     49             uint32_t vertextween : 1;
     50             uint32_t mtl_diffuse : 2; /* 0 = material, 1 = color1, 2 = color2 */
     51             uint32_t mtl_ambient : 2;
     52             uint32_t mtl_specular : 2;
     53             uint32_t mtl_emissive : 2;
     54             uint32_t fog_mode : 2;
     55             uint32_t fog_range : 1;
     56             uint32_t color0in_one : 1;
     57             uint32_t color1in_zero : 1;
     58             uint32_t has_normal : 1;
     59             uint32_t fog : 1;
     60             uint32_t normalizenormals : 1;
     61             uint32_t ucp : 1;
     62             uint32_t pad1 : 4;
     63             uint32_t tc_dim_input: 16; /* 8 * 2 bits */
     64             uint32_t pad2 : 16;
     65             uint32_t tc_dim_output: 24; /* 8 * 3 bits */
     66             uint32_t pad3 : 8;
     67             uint32_t tc_gen : 24; /* 8 * 3 bits */
     68             uint32_t pad4 : 8;
     69             uint32_t tc_idx : 24;
     70             uint32_t pad5 : 8;
     71             uint32_t passthrough;
     72         };
     73         uint64_t value64[3]; /* don't forget to resize VertexShader9.ff_key */
     74         uint32_t value32[6];
     75     };
     76 };
     77 
     78 /* Texture stage state:
     79  *
     80  * COLOROP       D3DTOP 5 bit
     81  * ALPHAOP       D3DTOP 5 bit
     82  * COLORARG0     D3DTA  3 bit
     83  * COLORARG1     D3DTA  3 bit
     84  * COLORARG2     D3DTA  3 bit
     85  * ALPHAARG0     D3DTA  3 bit
     86  * ALPHAARG1     D3DTA  3 bit
     87  * ALPHAARG2     D3DTA  3 bit
     88  * RESULTARG     D3DTA  1 bit (CURRENT:0 or TEMP:1)
     89  * TEXCOORDINDEX 0 - 7  3 bit
     90  * ===========================
     91  *                     32 bit per stage
     92  */
     93 struct nine_ff_ps_key
     94 {
     95     union {
     96         struct {
     97             struct {
     98                 uint32_t colorop   : 5;
     99                 uint32_t alphaop   : 5;
    100                 uint32_t colorarg0 : 3;
    101                 uint32_t colorarg1 : 3;
    102                 uint32_t colorarg2 : 3;
    103                 uint32_t alphaarg0 : 3;
    104                 uint32_t alphaarg1 : 3;
    105                 uint32_t alphaarg2 : 3;
    106                 uint32_t resultarg : 1; /* CURRENT:0 or TEMP:1 */
    107                 uint32_t textarget : 2; /* 1D/2D/3D/CUBE */
    108                 uint32_t pad       : 1;
    109                 /* that's 32 bit exactly */
    110             } ts[8];
    111             uint32_t projected : 16;
    112             uint32_t fog : 1; /* for vFog coming from VS */
    113             uint32_t fog_mode : 2;
    114             uint32_t fog_source : 1; /* 0: Z, 1: W */
    115             uint32_t specular : 1;
    116             uint32_t pad1 : 11; /* 9 32-bit words with this */
    117             uint8_t colorarg_b4[3];
    118             uint8_t colorarg_b5[3];
    119             uint8_t alphaarg_b4[3]; /* 11 32-bit words plus a byte */
    120             uint8_t pad2[3];
    121         };
    122         uint64_t value64[6]; /* don't forget to resize PixelShader9.ff_key */
    123         uint32_t value32[12];
    124     };
    125 };
    126 
    127 static unsigned nine_ff_vs_key_hash(void *key)
    128 {
    129     struct nine_ff_vs_key *vs = key;
    130     unsigned i;
    131     uint32_t hash = vs->value32[0];
    132     for (i = 1; i < ARRAY_SIZE(vs->value32); ++i)
    133         hash ^= vs->value32[i];
    134     return hash;
    135 }
    136 static int nine_ff_vs_key_comp(void *key1, void *key2)
    137 {
    138     struct nine_ff_vs_key *a = (struct nine_ff_vs_key *)key1;
    139     struct nine_ff_vs_key *b = (struct nine_ff_vs_key *)key2;
    140 
    141     return memcmp(a->value64, b->value64, sizeof(a->value64));
    142 }
    143 static unsigned nine_ff_ps_key_hash(void *key)
    144 {
    145     struct nine_ff_ps_key *ps = key;
    146     unsigned i;
    147     uint32_t hash = ps->value32[0];
    148     for (i = 1; i < ARRAY_SIZE(ps->value32); ++i)
    149         hash ^= ps->value32[i];
    150     return hash;
    151 }
    152 static int nine_ff_ps_key_comp(void *key1, void *key2)
    153 {
    154     struct nine_ff_ps_key *a = (struct nine_ff_ps_key *)key1;
    155     struct nine_ff_ps_key *b = (struct nine_ff_ps_key *)key2;
    156 
    157     return memcmp(a->value64, b->value64, sizeof(a->value64));
    158 }
    159 static unsigned nine_ff_fvf_key_hash(void *key)
    160 {
    161     return *(DWORD *)key;
    162 }
    163 static int nine_ff_fvf_key_comp(void *key1, void *key2)
    164 {
    165     return *(DWORD *)key1 != *(DWORD *)key2;
    166 }
    167 
    168 static void nine_ff_prune_vs(struct NineDevice9 *);
    169 static void nine_ff_prune_ps(struct NineDevice9 *);
    170 
    171 static void nine_ureg_tgsi_dump(struct ureg_program *ureg, boolean override)
    172 {
    173     if (debug_get_bool_option("NINE_FF_DUMP", FALSE) || override) {
    174         unsigned count;
    175         const struct tgsi_token *toks = ureg_get_tokens(ureg, &count);
    176         tgsi_dump(toks, 0);
    177         ureg_free_tokens(toks);
    178     }
    179 }
    180 
    181 #define _X(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_X)
    182 #define _Y(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_Y)
    183 #define _Z(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_Z)
    184 #define _W(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_W)
    185 
    186 #define _XXXX(r) ureg_scalar(r, TGSI_SWIZZLE_X)
    187 #define _YYYY(r) ureg_scalar(r, TGSI_SWIZZLE_Y)
    188 #define _ZZZZ(r) ureg_scalar(r, TGSI_SWIZZLE_Z)
    189 #define _WWWW(r) ureg_scalar(r, TGSI_SWIZZLE_W)
    190 
    191 #define _XYZW(r) (r)
    192 
    193 /* AL should contain base address of lights table. */
    194 #define LIGHT_CONST(i)                                                \
    195     ureg_src_indirect(ureg_DECL_constant(ureg, i), _X(AL))
    196 
    197 #define MATERIAL_CONST(i) \
    198     ureg_DECL_constant(ureg, 19 + (i))
    199 
    200 #define _CONST(n) ureg_DECL_constant(ureg, n)
    201 
    202 /* VS FF constants layout:
    203  *
    204  * CONST[ 0.. 3] D3DTS_WORLD * D3DTS_VIEW * D3DTS_PROJECTION
    205  * CONST[ 4.. 7] D3DTS_WORLD * D3DTS_VIEW
    206  * CONST[ 8..11] D3DTS_PROJECTION
    207  * CONST[12..15] D3DTS_VIEW^(-1)
    208  * CONST[16..18] Normal matrix
    209  *
    210  * CONST[19].xyz  MATERIAL.Emissive + Material.Ambient * RS.Ambient
    211  * CONST[20]      MATERIAL.Diffuse
    212  * CONST[21]      MATERIAL.Ambient
    213  * CONST[22]      MATERIAL.Specular
    214  * CONST[23].x___ MATERIAL.Power
    215  * CONST[24]      MATERIAL.Emissive
    216  * CONST[25]      RS.Ambient
    217  *
    218  * CONST[26].x___ RS.PointSizeMin
    219  * CONST[26]._y__ RS.PointSizeMax
    220  * CONST[26].__z_ RS.PointSize
    221  * CONST[26].___w RS.PointScaleA
    222  * CONST[27].x___ RS.PointScaleB
    223  * CONST[27]._y__ RS.PointScaleC
    224  *
    225  * CONST[28].x___ RS.FogEnd
    226  * CONST[28]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
    227  * CONST[28].__z_ RS.FogDensity
     228  *
    229  * CONST[30].x___ TWEENFACTOR
    230  *
    231  * CONST[32].x___ LIGHT[0].Type
    232  * CONST[32]._yzw LIGHT[0].Attenuation0,1,2
    233  * CONST[33]      LIGHT[0].Diffuse
    234  * CONST[34]      LIGHT[0].Specular
    235  * CONST[35]      LIGHT[0].Ambient
    236  * CONST[36].xyz_ LIGHT[0].Position
    237  * CONST[36].___w LIGHT[0].Range
    238  * CONST[37].xyz_ LIGHT[0].Direction
    239  * CONST[37].___w LIGHT[0].Falloff
    240  * CONST[38].x___ cos(LIGHT[0].Theta / 2)
    241  * CONST[38]._y__ cos(LIGHT[0].Phi / 2)
    242  * CONST[38].__z_ 1.0f / (cos(LIGHT[0].Theta / 2) - cos(Light[0].Phi / 2))
    243  * CONST[39].xyz_ LIGHT[0].HalfVector (for directional lights)
    244  * CONST[39].___w 1 if this is the last active light, 0 if not
    245  * CONST[40]      LIGHT[1]
    246  * CONST[48]      LIGHT[2]
    247  * CONST[56]      LIGHT[3]
    248  * CONST[64]      LIGHT[4]
    249  * CONST[72]      LIGHT[5]
    250  * CONST[80]      LIGHT[6]
    251  * CONST[88]      LIGHT[7]
    252  * NOTE: no lighting code is generated if there are no active lights
    253  *
    254  * CONST[100].x___ Viewport 2/width
    255  * CONST[100]._y__ Viewport 2/height
    256  * CONST[100].__z_ Viewport 1/(zmax - zmin)
    257  * CONST[100].___w Viewport width
    258  * CONST[101].x___ Viewport x0
    259  * CONST[101]._y__ Viewport y0
    260  * CONST[101].__z_ Viewport z0
    261  *
    262  * CONST[128..131] D3DTS_TEXTURE0
    263  * CONST[132..135] D3DTS_TEXTURE1
    264  * CONST[136..139] D3DTS_TEXTURE2
    265  * CONST[140..143] D3DTS_TEXTURE3
    266  * CONST[144..147] D3DTS_TEXTURE4
    267  * CONST[148..151] D3DTS_TEXTURE5
    268  * CONST[152..155] D3DTS_TEXTURE6
    269  * CONST[156..159] D3DTS_TEXTURE7
    270  *
    271  * CONST[160] D3DTS_WORLDMATRIX[0] * D3DTS_VIEW
    272  * CONST[164] D3DTS_WORLDMATRIX[1] * D3DTS_VIEW
    273  * ...
    274  * CONST[192] D3DTS_WORLDMATRIX[8] * D3DTS_VIEW
    275  */
    276 struct vs_build_ctx
    277 {
    278     struct ureg_program *ureg;
    279     const struct nine_ff_vs_key *key;
    280 
    281     uint16_t input[PIPE_MAX_ATTRIBS];
    282     unsigned num_inputs;
    283 
    284     struct ureg_src aVtx;
    285     struct ureg_src aNrm;
    286     struct ureg_src aCol[2];
    287     struct ureg_src aTex[8];
    288     struct ureg_src aPsz;
    289     struct ureg_src aInd;
    290     struct ureg_src aWgt;
    291 
    292     struct ureg_src aVtx1; /* tweening */
    293     struct ureg_src aNrm1;
    294 
    295     struct ureg_src mtlA;
    296     struct ureg_src mtlD;
    297     struct ureg_src mtlS;
    298     struct ureg_src mtlE;
    299 };
    300 
    301 static inline unsigned
    302 get_texcoord_sn(struct pipe_screen *screen)
    303 {
    304     if (screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD))
    305         return TGSI_SEMANTIC_TEXCOORD;
    306     return TGSI_SEMANTIC_GENERIC;
    307 }
    308 
    309 static inline struct ureg_src
    310 build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
    311 {
    312     const unsigned i = vs->num_inputs++;
    313     assert(i < PIPE_MAX_ATTRIBS);
    314     vs->input[i] = ndecl;
    315     return ureg_DECL_vs_input(vs->ureg, i);
    316 }
    317 
    318 /* NOTE: dst may alias src */
    319 static inline void
    320 ureg_normalize3(struct ureg_program *ureg,
    321                 struct ureg_dst dst, struct ureg_src src)
    322 {
    323     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    324     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
    325 
    326     ureg_DP3(ureg, tmp_x, src, src);
    327     ureg_RSQ(ureg, tmp_x, _X(tmp));
    328     ureg_MUL(ureg, dst, src, _X(tmp));
    329     ureg_release_temporary(ureg, tmp);
    330 }
    331 
    332 static void *
    333 nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
    334 {
    335     const struct nine_ff_vs_key *key = vs->key;
    336     struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
    337     struct ureg_dst oPos, oCol[2], oPsz, oFog;
    338     struct ureg_dst AR;
    339     unsigned i, c;
    340     unsigned label[32], l = 0;
    341     boolean need_aNrm = key->lighting || key->passthrough & (1 << NINE_DECLUSAGE_NORMAL);
    342     boolean has_aNrm = need_aNrm && key->has_normal;
    343     boolean need_aVtx = key->lighting || key->fog_mode || key->pointscale || key->ucp;
    344     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
    345 
    346     vs->ureg = ureg;
    347 
    348     /* Check which inputs we should transform. */
    349     for (i = 0; i < 8 * 3; i += 3) {
    350         switch ((key->tc_gen >> i) & 0x7) {
    351         case NINED3DTSS_TCI_CAMERASPACENORMAL:
    352             need_aNrm = TRUE;
    353             break;
    354         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
    355             need_aVtx = TRUE;
    356             break;
    357         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
    358             need_aVtx = need_aNrm = TRUE;
    359             break;
    360         case NINED3DTSS_TCI_SPHEREMAP:
    361             need_aVtx = need_aNrm = TRUE;
    362             break;
    363         default:
    364             break;
    365         }
    366     }
    367 
    368     /* Declare and record used inputs (needed for linkage with vertex format):
    369      * (texture coordinates handled later)
    370      */
    371     vs->aVtx = build_vs_add_input(vs,
    372         key->position_t ? NINE_DECLUSAGE_POSITIONT : NINE_DECLUSAGE_POSITION);
    373 
    374     vs->aNrm = ureg_imm1f(ureg, 0.0f);
    375     if (has_aNrm)
    376         vs->aNrm = build_vs_add_input(vs, NINE_DECLUSAGE_NORMAL);
    377 
    378     vs->aCol[0] = ureg_imm1f(ureg, 1.0f);
    379     vs->aCol[1] = ureg_imm1f(ureg, 0.0f);
    380 
    381     if (key->lighting || key->darkness) {
    382         const unsigned mask = key->mtl_diffuse | key->mtl_specular |
    383                               key->mtl_ambient | key->mtl_emissive;
    384         if ((mask & 0x1) && !key->color0in_one)
    385             vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
    386         if ((mask & 0x2) && !key->color1in_zero)
    387             vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
    388 
    389         vs->mtlD = MATERIAL_CONST(1);
    390         vs->mtlA = MATERIAL_CONST(2);
    391         vs->mtlS = MATERIAL_CONST(3);
    392         vs->mtlE = MATERIAL_CONST(5);
    393         if (key->mtl_diffuse  == 1) vs->mtlD = vs->aCol[0]; else
    394         if (key->mtl_diffuse  == 2) vs->mtlD = vs->aCol[1];
    395         if (key->mtl_ambient  == 1) vs->mtlA = vs->aCol[0]; else
    396         if (key->mtl_ambient  == 2) vs->mtlA = vs->aCol[1];
    397         if (key->mtl_specular == 1) vs->mtlS = vs->aCol[0]; else
    398         if (key->mtl_specular == 2) vs->mtlS = vs->aCol[1];
    399         if (key->mtl_emissive == 1) vs->mtlE = vs->aCol[0]; else
    400         if (key->mtl_emissive == 2) vs->mtlE = vs->aCol[1];
    401     } else {
    402         if (!key->color0in_one) vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
    403         if (!key->color1in_zero) vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
    404     }
    405 
    406     if (key->vertexpointsize)
    407         vs->aPsz = build_vs_add_input(vs, NINE_DECLUSAGE_PSIZE);
    408 
    409     if (key->vertexblend_indexed || key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES))
    410         vs->aInd = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDINDICES);
    411     if (key->vertexblend || key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT))
    412         vs->aWgt = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDWEIGHT);
    413     if (key->vertextween) {
    414         vs->aVtx1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(POSITION,1));
    415         vs->aNrm1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(NORMAL,1));
    416     }
    417 
    418     /* Declare outputs:
    419      */
    420     oPos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); /* HPOS */
    421     oCol[0] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0));
    422     oCol[1] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1));
    423     if (key->fog || key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
    424         oFog = ureg_DECL_output(ureg, TGSI_SEMANTIC_FOG, 0);
    425         oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
    426     }
    427 
    428     if (key->vertexpointsize || key->pointscale) {
    429         oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0,
    430                                        TGSI_WRITEMASK_X, 0, 1);
    431         oPsz = ureg_writemask(oPsz, TGSI_WRITEMASK_X);
    432     }
    433 
    434     if (key->lighting || key->vertexblend)
    435         AR = ureg_DECL_address(ureg);
    436 
    437     /* === Vertex transformation / vertex blending:
    438      */
    439 
    440     if (key->position_t) {
    441         if (device->driver_caps.window_space_position_support) {
    442             ureg_MOV(ureg, oPos, vs->aVtx);
    443         } else {
    444             struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    445             /* vs->aVtx contains the coordinates buffer wise.
    446             * later in the pipeline, clipping, viewport and division
    447             * by w (rhw = 1/w) are going to be applied, so do the reverse
    448             * of these transformations (except clipping) to have the good
    449             * position at the end.*/
    450             ureg_MOV(ureg, tmp, vs->aVtx);
    451             /* X from [X_min, X_min + width] to [-1, 1], same for Y. Z to [0, 1] */
    452             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), ureg_negate(_CONST(101)));
    453             ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _CONST(100));
    454             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, -1.0f));
    455             /* Y needs to be reversed */
    456             ureg_MOV(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_negate(ureg_src(tmp)));
    457             /* inverse rhw */
    458             ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W), _W(tmp));
    459             /* multiply X, Y, Z by w */
    460             ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _W(tmp));
    461             ureg_MOV(ureg, oPos, ureg_src(tmp));
    462             ureg_release_temporary(ureg, tmp);
    463         }
    464     } else if (key->vertexblend) {
    465         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    466         struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
    467         struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
    468         struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
    469         struct ureg_dst sum_blendweights = ureg_DECL_temporary(ureg);
    470         struct ureg_src cWM[4];
    471 
    472         for (i = 160; i <= 195; ++i)
    473             ureg_DECL_constant(ureg, i);
    474 
    475         /* translate world matrix index to constant file index */
    476         if (key->vertexblend_indexed) {
    477             ureg_MAD(ureg, tmp, vs->aInd, ureg_imm1f(ureg, 4.0f), ureg_imm1f(ureg, 160.0f));
    478             ureg_ARL(ureg, AR, ureg_src(tmp));
    479         }
    480 
    481         ureg_MOV(ureg, aVtx_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
    482         ureg_MOV(ureg, aNrm_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
    483         ureg_MOV(ureg, sum_blendweights, ureg_imm4f(ureg, 1.0f, 1.0f, 1.0f, 1.0f));
    484 
    485         for (i = 0; i < key->vertexblend; ++i) {
    486             for (c = 0; c < 4; ++c) {
    487                 cWM[c] = ureg_src_register(TGSI_FILE_CONSTANT, (160 + i * 4) * !key->vertexblend_indexed + c);
    488                 if (key->vertexblend_indexed)
    489                     cWM[c] = ureg_src_indirect(cWM[c], ureg_scalar(ureg_src(AR), i));
    490             }
    491 
    492             /* multiply by WORLD(index) */
    493             ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), cWM[0]);
    494             ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), cWM[1], ureg_src(tmp));
    495             ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), cWM[2], ureg_src(tmp));
    496             ureg_MAD(ureg, tmp, _WWWW(vs->aVtx), cWM[3], ureg_src(tmp));
    497 
    498             if (has_aNrm) {
    499                 /* Note: the spec says the transpose of the inverse of the
    500                  * WorldView matrices should be used, but all tests show
    501                  * otherwise.
    502                  * Only case unknown: D3DVBF_0WEIGHTS */
    503                 ureg_MUL(ureg, tmp2, _XXXX(vs->aNrm), cWM[0]);
    504                 ureg_MAD(ureg, tmp2, _YYYY(vs->aNrm), cWM[1], ureg_src(tmp2));
    505                 ureg_MAD(ureg, tmp2, _ZZZZ(vs->aNrm), cWM[2], ureg_src(tmp2));
    506             }
    507 
    508             if (i < (key->vertexblend - 1)) {
    509                 /* accumulate weighted position value */
    510                 ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(vs->aWgt, i), ureg_src(aVtx_dst));
    511                 if (has_aNrm)
    512                     ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(vs->aWgt, i), ureg_src(aNrm_dst));
    513                 /* subtract weighted position value for last value */
    514                 ureg_ADD(ureg, sum_blendweights, ureg_src(sum_blendweights), ureg_negate(ureg_scalar(vs->aWgt, i)));
    515             }
    516         }
    517 
    518         /* the last weighted position is always 1 - sum_of_previous_weights */
    519         ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aVtx_dst));
    520         if (has_aNrm)
    521             ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aNrm_dst));
    522 
    523         /* multiply by VIEW_PROJ */
    524         ureg_MUL(ureg, tmp, _X(aVtx_dst), _CONST(8));
    525         ureg_MAD(ureg, tmp, _Y(aVtx_dst), _CONST(9),  ureg_src(tmp));
    526         ureg_MAD(ureg, tmp, _Z(aVtx_dst), _CONST(10), ureg_src(tmp));
    527         ureg_MAD(ureg, oPos, _W(aVtx_dst), _CONST(11), ureg_src(tmp));
    528 
    529         if (need_aVtx)
    530             vs->aVtx = ureg_src(aVtx_dst);
    531 
    532         ureg_release_temporary(ureg, tmp);
    533         ureg_release_temporary(ureg, tmp2);
    534         ureg_release_temporary(ureg, sum_blendweights);
    535         if (!need_aVtx)
    536             ureg_release_temporary(ureg, aVtx_dst);
    537 
    538         if (has_aNrm) {
    539             if (key->normalizenormals)
    540                ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
    541             vs->aNrm = ureg_src(aNrm_dst);
    542         } else
    543             ureg_release_temporary(ureg, aNrm_dst);
    544     } else {
    545         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    546 
    547         if (key->vertextween) {
    548             struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
    549             ureg_LRP(ureg, aVtx_dst, _XXXX(_CONST(30)), vs->aVtx1, vs->aVtx);
    550             vs->aVtx = ureg_src(aVtx_dst);
    551             if (has_aNrm) {
    552                 struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
    553                 ureg_LRP(ureg, aNrm_dst, _XXXX(_CONST(30)), vs->aNrm1, vs->aNrm);
    554                 vs->aNrm = ureg_src(aNrm_dst);
    555             }
    556         }
    557 
    558         /* position = vertex * WORLD_VIEW_PROJ */
    559         ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(0));
    560         ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(1), ureg_src(tmp));
    561         ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(2), ureg_src(tmp));
    562         ureg_MAD(ureg, oPos, _WWWW(vs->aVtx), _CONST(3), ureg_src(tmp));
    563         ureg_release_temporary(ureg, tmp);
    564 
    565         if (need_aVtx) {
    566             struct ureg_dst aVtx_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    567             ureg_MUL(ureg, aVtx_dst, _XXXX(vs->aVtx), _CONST(4));
    568             ureg_MAD(ureg, aVtx_dst, _YYYY(vs->aVtx), _CONST(5), ureg_src(aVtx_dst));
    569             ureg_MAD(ureg, aVtx_dst, _ZZZZ(vs->aVtx), _CONST(6), ureg_src(aVtx_dst));
    570             ureg_MAD(ureg, aVtx_dst, _WWWW(vs->aVtx), _CONST(7), ureg_src(aVtx_dst));
    571             vs->aVtx = ureg_src(aVtx_dst);
    572         }
    573         if (has_aNrm) {
    574             struct ureg_dst aNrm_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    575             ureg_MUL(ureg, aNrm_dst, _XXXX(vs->aNrm), _CONST(16));
    576             ureg_MAD(ureg, aNrm_dst, _YYYY(vs->aNrm), _CONST(17), ureg_src(aNrm_dst));
    577             ureg_MAD(ureg, aNrm_dst, _ZZZZ(vs->aNrm), _CONST(18), ureg_src(aNrm_dst));
    578             if (key->normalizenormals)
    579                ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
    580             vs->aNrm = ureg_src(aNrm_dst);
    581         }
    582     }
    583 
    584     /* === Process point size:
    585      */
    586     if (key->vertexpointsize || key->pointscale) {
    587         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    588         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
    589         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
    590         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
    591         if (key->vertexpointsize) {
    592             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
    593             ureg_MAX(ureg, tmp_z, _XXXX(vs->aPsz), _XXXX(cPsz1));
    594             ureg_MIN(ureg, tmp_z, _Z(tmp), _YYYY(cPsz1));
    595         } else {
    596             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
    597             ureg_MOV(ureg, tmp_z, _ZZZZ(cPsz1));
    598         }
    599 
    600         if (key->pointscale) {
    601             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
    602             struct ureg_src cPsz2 = ureg_DECL_constant(ureg, 27);
    603 
    604             ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
    605             ureg_RSQ(ureg, tmp_y, _X(tmp));
    606             ureg_MUL(ureg, tmp_y, _Y(tmp), _X(tmp));
    607             ureg_CMP(ureg, tmp_y, ureg_negate(_Y(tmp)), _Y(tmp), ureg_imm1f(ureg, 0.0f));
    608             ureg_MAD(ureg, tmp_x, _Y(tmp), _YYYY(cPsz2), _XXXX(cPsz2));
    609             ureg_MAD(ureg, tmp_x, _Y(tmp), _X(tmp), _WWWW(cPsz1));
    610             ureg_RSQ(ureg, tmp_x, _X(tmp));
    611             ureg_MUL(ureg, tmp_x, _X(tmp), _Z(tmp));
    612             ureg_MUL(ureg, tmp_x, _X(tmp), _WWWW(_CONST(100)));
    613             ureg_MAX(ureg, tmp_x, _X(tmp), _XXXX(cPsz1));
    614             ureg_MIN(ureg, tmp_z, _X(tmp), _YYYY(cPsz1));
    615         }
    616 
    617         ureg_MOV(ureg, oPsz, _Z(tmp));
    618         ureg_release_temporary(ureg, tmp);
    619     }
    620 
    621     for (i = 0; i < 8; ++i) {
    622         struct ureg_dst tmp, tmp_x, tmp2;
    623         struct ureg_dst oTex, input_coord, transformed, t, aVtx_normed;
    624         unsigned c, writemask;
    625         const unsigned tci = (key->tc_gen >> (i * 3)) & 0x7;
    626         const unsigned idx = (key->tc_idx >> (i * 3)) & 0x7;
    627         unsigned dim_input = 1 + ((key->tc_dim_input >> (i * 2)) & 0x3);
    628         const unsigned dim_output = (key->tc_dim_output >> (i * 3)) & 0x7;
    629 
    630         /* No texture output of index s */
    631         if (tci == NINED3DTSS_TCI_DISABLE)
    632             continue;
    633         oTex = ureg_DECL_output(ureg, texcoord_sn, i);
    634         tmp = ureg_DECL_temporary(ureg);
    635         tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
    636         input_coord = ureg_DECL_temporary(ureg);
    637         transformed = ureg_DECL_temporary(ureg);
    638 
    639         /* Get the coordinate */
    640         switch (tci) {
    641         case NINED3DTSS_TCI_PASSTHRU:
    642             /* NINED3DTSS_TCI_PASSTHRU => Use texcoord coming from index idx *
    643              * Else the idx is used only to determine wrapping mode. */
    644             vs->aTex[idx] = build_vs_add_input(vs, NINE_DECLUSAGE_i(TEXCOORD,idx));
    645             ureg_MOV(ureg, input_coord, vs->aTex[idx]);
    646             break;
    647         case NINED3DTSS_TCI_CAMERASPACENORMAL:
    648             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aNrm);
    649             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
    650             dim_input = 4;
    651             break;
    652         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
    653             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aVtx);
    654             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
    655             dim_input = 4;
    656             break;
    657         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
    658             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
    659             aVtx_normed = ureg_DECL_temporary(ureg);
    660             ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
    661             ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
    662             ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
    663             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
    664             ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
    665             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
    666             ureg_release_temporary(ureg, aVtx_normed);
    667             dim_input = 4;
    668             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
    669             break;
    670         case NINED3DTSS_TCI_SPHEREMAP:
    671             /* Implement the formula of GL_SPHERE_MAP */
    672             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
    673             aVtx_normed = ureg_DECL_temporary(ureg);
    674             tmp2 = ureg_DECL_temporary(ureg);
    675             ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
    676             ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
    677             ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
    678             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
    679             ureg_ADD(ureg, tmp, ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
    680             /* now tmp = normed(Vtx) - 2 dot3(normed(Vtx), Nrm) Nrm */
    681             ureg_MOV(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_XYZ), ureg_src(tmp));
    682             ureg_MUL(ureg, tmp2, ureg_src(tmp2), ureg_src(tmp2));
    683             ureg_DP3(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_src(tmp2));
    684             ureg_RSQ(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2));
    685             ureg_MUL(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_imm1f(ureg, 0.5f));
    686             /* tmp2 = 0.5 / sqrt(tmp.x^2 + tmp.y^2 + (tmp.z+1)^2)
    687              * TODO: z coordinates are a bit different gl vs d3d, should the formula be adapted ? */
    688             ureg_MUL(ureg, tmp, ureg_src(tmp), _X(tmp2));
    689             ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, 0.5f));
    690             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_ZW), ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
    691             ureg_release_temporary(ureg, aVtx_normed);
    692             ureg_release_temporary(ureg, tmp2);
    693             dim_input = 4;
    694             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
    695             break;
    696         default:
    697             assert(0);
    698             break;
    699         }
    700 
    701         /* Apply the transformation */
    702         /* dim_output == 0 => do not transform the components.
    703          * XYZRHW also disables transformation */
    704         if (!dim_output || key->position_t) {
    705             ureg_release_temporary(ureg, transformed);
    706             transformed = input_coord;
    707             writemask = TGSI_WRITEMASK_XYZW;
    708         } else {
    709             for (c = 0; c < dim_output; c++) {
    710                 t = ureg_writemask(transformed, 1 << c);
    711                 switch (dim_input) {
    712                 /* dim_input = 1 2 3: -> we add trailing 1 to input*/
    713                 case 1: ureg_MAD(ureg, t, _X(input_coord), _XXXX(_CONST(128 + i * 4 + c)), _YYYY(_CONST(128 + i * 4 + c)));
    714                         break;
    715                 case 2: ureg_DP2(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
    716                         ureg_ADD(ureg, t, ureg_src(transformed), _ZZZZ(_CONST(128 + i * 4 + c)));
    717                         break;
    718                 case 3: ureg_DP3(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
    719                         ureg_ADD(ureg, t, ureg_src(transformed), _WWWW(_CONST(128 + i * 4 + c)));
    720                         break;
    721                 case 4: ureg_DP4(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c)); break;
    722                 default:
    723                     assert(0);
    724                 }
    725             }
    726             writemask = (1 << dim_output) - 1;
    727             ureg_release_temporary(ureg, input_coord);
    728         }
    729 
    730         ureg_MOV(ureg, ureg_writemask(oTex, writemask), ureg_src(transformed));
    731         ureg_release_temporary(ureg, transformed);
    732         ureg_release_temporary(ureg, tmp);
    733     }
    734 
    735     /* === Lighting:
    736      *
    737      * DIRECTIONAL:  Light at infinite distance, parallel rays, no attenuation.
    738      * POINT: Finite distance to scene, divergent rays, isotropic, attenuation.
    739      * SPOT: Finite distance, divergent rays, angular dependence, attenuation.
    740      *
    741      * vec3 normal = normalize(in.Normal * NormalMatrix);
    742      * vec3 hitDir = light.direction;
    743      * float atten = 1.0;
    744      *
    745      * if (light.type != DIRECTIONAL)
    746      * {
    747      *     vec3 hitVec = light.position - eyeVertex;
    748      *     float d = length(hitVec);
    749      *     hitDir = hitVec / d;
    750      *     atten = 1 / ((light.atten2 * d + light.atten1) * d + light.atten0);
    751      * }
    752      *
    753      * if (light.type == SPOTLIGHT)
    754      * {
    755      *     float rho = dp3(-hitVec, light.direction);
    756      *     if (rho < cos(light.phi / 2))
    757      *         atten = 0;
    758      *     if (rho < cos(light.theta / 2))
    759      *         atten *= pow(some_func(rho), light.falloff);
    760      * }
    761      *
    762      * float nDotHit = dp3_sat(normal, hitVec);
    763      * float powFact = 0.0;
    764      *
    765      * if (nDotHit > 0.0)
    766      * {
    767      *     vec3 midVec = normalize(hitDir + eye);
    768      *     float nDotMid = dp3_sat(normal, midVec);
    769      *     powFact = pow(nDotMid, material.power);
    770      * }
    771      *
    772      * ambient += light.ambient * atten;
    773      * diffuse += light.diffuse * atten * nDotHit;
    774      * specular += light.specular * atten * powFact;
    775      */
    776     if (key->lighting) {
    777         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    778         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
    779         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
    780         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
    781         struct ureg_dst rAtt = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
    782         struct ureg_dst rHit = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    783         struct ureg_dst rMid = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    784 
    785         struct ureg_dst rCtr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
    786 
    787         struct ureg_dst AL = ureg_writemask(AR, TGSI_WRITEMASK_X);
    788 
    789         /* Light.*.Alpha is not used. */
    790         struct ureg_dst rD = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    791         struct ureg_dst rA = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
    792         struct ureg_dst rS = ureg_DECL_temporary(ureg);
    793 
    794         struct ureg_src mtlP = _XXXX(MATERIAL_CONST(4));
    795 
    796         struct ureg_src cLKind = _XXXX(LIGHT_CONST(0));
    797         struct ureg_src cLAtt0 = _YYYY(LIGHT_CONST(0));
    798         struct ureg_src cLAtt1 = _ZZZZ(LIGHT_CONST(0));
    799         struct ureg_src cLAtt2 = _WWWW(LIGHT_CONST(0));
    800         struct ureg_src cLColD = _XYZW(LIGHT_CONST(1));
    801         struct ureg_src cLColS = _XYZW(LIGHT_CONST(2));
    802         struct ureg_src cLColA = _XYZW(LIGHT_CONST(3));
    803         struct ureg_src cLPos  = _XYZW(LIGHT_CONST(4));
    804         struct ureg_src cLRng  = _WWWW(LIGHT_CONST(4));
    805         struct ureg_src cLDir  = _XYZW(LIGHT_CONST(5));
    806         struct ureg_src cLFOff = _WWWW(LIGHT_CONST(5));
    807         struct ureg_src cLTht  = _XXXX(LIGHT_CONST(6));
    808         struct ureg_src cLPhi  = _YYYY(LIGHT_CONST(6));
    809         struct ureg_src cLSDiv = _ZZZZ(LIGHT_CONST(6));
    810         struct ureg_src cLLast = _WWWW(LIGHT_CONST(7));
    811 
    812         const unsigned loop_label = l++;
    813 
    814         ureg_MOV(ureg, rCtr, ureg_imm1f(ureg, 32.0f)); /* &lightconst(0) */
    815         ureg_MOV(ureg, rD, ureg_imm1f(ureg, 0.0f));
    816         ureg_MOV(ureg, rA, ureg_imm1f(ureg, 0.0f));
    817         ureg_MOV(ureg, rS, ureg_imm1f(ureg, 0.0f));
    818 
    819         /* loop management */
    820         ureg_BGNLOOP(ureg, &label[loop_label]);
    821         ureg_ARL(ureg, AL, _W(rCtr));
    822 
    823         /* if (not DIRECTIONAL light): */
    824         ureg_SNE(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_DIRECTIONAL));
    825         ureg_MOV(ureg, rHit, ureg_negate(cLDir));
    826         ureg_MOV(ureg, rAtt, ureg_imm1f(ureg, 1.0f));
    827         ureg_IF(ureg, _X(tmp), &label[l++]);
    828         {
    829             /* hitDir = light.position - eyeVtx
    830              * d = length(hitDir)
    831              */
    832             ureg_ADD(ureg, rHit, cLPos, ureg_negate(vs->aVtx));
    833             ureg_DP3(ureg, tmp_x, ureg_src(rHit), ureg_src(rHit));
    834             ureg_RSQ(ureg, tmp_y, _X(tmp));
    835             ureg_MUL(ureg, tmp_x, _X(tmp), _Y(tmp)); /* length */
    836 
    837             /* att = 1.0 / (light.att0 + (light.att1 + light.att2 * d) * d) */
    838             ureg_MAD(ureg, rAtt, _X(tmp), cLAtt2, cLAtt1);
    839             ureg_MAD(ureg, rAtt, _X(tmp), _W(rAtt), cLAtt0);
    840             ureg_RCP(ureg, rAtt, _W(rAtt));
    841             /* cut-off if distance exceeds Light.Range */
    842             ureg_SLT(ureg, tmp_x, _X(tmp), cLRng);
    843             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
    844         }
    845         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
    846         ureg_ENDIF(ureg);
    847 
    848         /* normalize hitDir */
    849         ureg_normalize3(ureg, rHit, ureg_src(rHit));
    850 
    851         /* if (SPOT light) */
    852         ureg_SEQ(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_SPOT));
    853         ureg_IF(ureg, _X(tmp), &label[l++]);
    854         {
    855             /* rho = dp3(-hitDir, light.spotDir)
    856              *
    857              * if (rho  > light.ctht2) NOTE: 0 <= phi <= pi, 0 <= theta <= phi
    858              *     spotAtt = 1
    859              * else
    860              * if (rho <= light.cphi2)
    861              *     spotAtt = 0
    862              * else
    863              *     spotAtt = (rho - light.cphi2) / (light.ctht2 - light.cphi2) ^ light.falloff
    864              */
    865             ureg_DP3(ureg, tmp_y, ureg_negate(ureg_src(rHit)), cLDir); /* rho */
    866             ureg_ADD(ureg, tmp_x, _Y(tmp), ureg_negate(cLPhi));
    867             ureg_MUL(ureg, tmp_x, _X(tmp), cLSDiv);
    868             ureg_POW(ureg, tmp_x, _X(tmp), cLFOff); /* spotAtten */
    869             ureg_SGE(ureg, tmp_z, _Y(tmp), cLTht); /* if inside theta && phi */
    870             ureg_SGE(ureg, tmp_y, _Y(tmp), cLPhi); /* if inside phi */
    871             ureg_MAD(ureg, ureg_saturate(tmp_x), _X(tmp), _Y(tmp), _Z(tmp));
    872             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
    873         }
    874         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
    875         ureg_ENDIF(ureg);
    876 
    877         /* directional factors, let's not use LIT because of clarity */
    878 
    879         if (has_aNrm) {
    880             if (key->localviewer) {
    881                 ureg_normalize3(ureg, rMid, vs->aVtx);
    882                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_negate(ureg_src(rMid)));
    883             } else {
    884                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_imm3f(ureg, 0.0f, 0.0f, -1.0f));
    885             }
    886             ureg_normalize3(ureg, rMid, ureg_src(rMid));
    887             ureg_DP3(ureg, ureg_saturate(tmp_x), vs->aNrm, ureg_src(rHit));
    888             ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
    889             ureg_MUL(ureg, tmp_z, _X(tmp), _Y(tmp));
    890             /* Tests show that specular is computed only if (dp3(normal,hitDir) > 0).
    891              * For front facing, it is more restrictive than test (dp3(normal,mid) > 0).
    892              * No tests were made for backfacing, so add the two conditions */
    893             ureg_IF(ureg, _Z(tmp), &label[l++]);
    894             {
    895                 ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
    896                 ureg_POW(ureg, tmp_y, _Y(tmp), mtlP);
    897                 ureg_MUL(ureg, tmp_y, _W(rAtt), _Y(tmp)); /* power factor * att */
    898                 ureg_MAD(ureg, rS, cLColS, _Y(tmp), ureg_src(rS)); /* accumulate specular */
    899             }
    900             ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
    901             ureg_ENDIF(ureg);
    902 
    903             ureg_MUL(ureg, tmp_x, _W(rAtt), _X(tmp)); /* dp3(normal,hitDir) * att */
    904             ureg_MAD(ureg, rD, cLColD, _X(tmp), ureg_src(rD)); /* accumulate diffuse */
    905         }
    906 
    907         ureg_MAD(ureg, rA, cLColA, _W(rAtt), ureg_src(rA)); /* accumulate ambient */
    908 
    909         /* break if this was the last light */
    910         ureg_IF(ureg, cLLast, &label[l++]);
    911         ureg_BRK(ureg);
    912         ureg_ENDIF(ureg);
    913         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
    914 
    915         ureg_ADD(ureg, rCtr, _W(rCtr), ureg_imm1f(ureg, 8.0f));
    916         ureg_fixup_label(ureg, label[loop_label], ureg_get_instruction_number(ureg));
    917         ureg_ENDLOOP(ureg, &label[loop_label]);
    918 
    919         /* Apply to material:
    920          *
    921          * oCol[0] = (material.emissive + material.ambient * rs.ambient) +
    922          *           material.ambient * ambient +
    923      *           material.diffuse * diffuse;
    924          * oCol[1] = material.specular * specular;
    925          */
    926         if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
    927             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), vs->mtlA, _CONST(19));
    928         else {
    929             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), _CONST(25));
    930             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), vs->mtlA, ureg_src(tmp), vs->mtlE);
    931         }
    932 
    933         ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), ureg_src(rD), vs->mtlD, ureg_src(tmp));
    934         ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
    935         ureg_MUL(ureg, oCol[1], ureg_src(rS), vs->mtlS);
    936         ureg_release_temporary(ureg, rAtt);
    937         ureg_release_temporary(ureg, rHit);
    938         ureg_release_temporary(ureg, rMid);
    939         ureg_release_temporary(ureg, rCtr);
    940         ureg_release_temporary(ureg, rD);
    941         ureg_release_temporary(ureg, rA);
    942         ureg_release_temporary(ureg, rS);
    943         ureg_release_temporary(ureg, rAtt);
    944         ureg_release_temporary(ureg, tmp);
    945     } else
    946     /* COLOR */
    947     if (key->darkness) {
    948         if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
    949             ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), _CONST(19));
    950         else
    951             ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), vs->mtlA, _CONST(25), vs->mtlE);
    952         ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
    953         ureg_MOV(ureg, oCol[1], ureg_imm1f(ureg, 0.0f));
    954     } else {
    955         ureg_MOV(ureg, oCol[0], vs->aCol[0]);
    956         ureg_MOV(ureg, oCol[1], vs->aCol[1]);
    957     }
    958 
    959     /* === Process fog.
    960      *
    961      * exp(x) = ex2(log2(e) * x)
    962      */
    963     if (key->fog_mode) {
    964         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    965         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
    966         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
    967         if (key->fog_range) {
    968             ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
    969             ureg_RSQ(ureg, tmp_z, _X(tmp));
    970             ureg_MUL(ureg, tmp_z, _Z(tmp), _X(tmp));
    971         } else {
    972             ureg_MOV(ureg, tmp_z, ureg_abs(_ZZZZ(vs->aVtx)));
    973         }
    974 
    975         if (key->fog_mode == D3DFOG_EXP) {
    976             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
    977             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
    978             ureg_EX2(ureg, tmp_x, _X(tmp));
    979         } else
    980         if (key->fog_mode == D3DFOG_EXP2) {
    981             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
    982             ureg_MUL(ureg, tmp_x, _X(tmp), _X(tmp));
    983             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
    984             ureg_EX2(ureg, tmp_x, _X(tmp));
    985         } else
    986         if (key->fog_mode == D3DFOG_LINEAR) {
    987             ureg_ADD(ureg, tmp_x, _XXXX(_CONST(28)), ureg_negate(_Z(tmp)));
    988             ureg_MUL(ureg, ureg_saturate(tmp_x), _X(tmp), _YYYY(_CONST(28)));
    989         }
    990         ureg_MOV(ureg, oFog, _X(tmp));
    991         ureg_release_temporary(ureg, tmp);
    992     } else if (key->fog && !(key->passthrough & (1 << NINE_DECLUSAGE_FOG))) {
    993         ureg_MOV(ureg, oFog, ureg_scalar(vs->aCol[1], TGSI_SWIZZLE_W));
    994     }
    995 
    996     if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT)) {
    997         struct ureg_src input;
    998         struct ureg_dst output;
    999         input = vs->aWgt;
   1000         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 18);
   1001         ureg_MOV(ureg, output, input);
   1002     }
   1003     if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES)) {
   1004         struct ureg_src input;
   1005         struct ureg_dst output;
   1006         input = vs->aInd;
   1007         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 19);
   1008         ureg_MOV(ureg, output, input);
   1009     }
   1010     if (key->passthrough & (1 << NINE_DECLUSAGE_NORMAL)) {
   1011         struct ureg_src input;
   1012         struct ureg_dst output;
   1013         input = vs->aNrm;
   1014         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 20);
   1015         ureg_MOV(ureg, output, input);
   1016     }
   1017     if (key->passthrough & (1 << NINE_DECLUSAGE_TANGENT)) {
   1018         struct ureg_src input;
   1019         struct ureg_dst output;
   1020         input = build_vs_add_input(vs, NINE_DECLUSAGE_TANGENT);
   1021         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 21);
   1022         ureg_MOV(ureg, output, input);
   1023     }
   1024     if (key->passthrough & (1 << NINE_DECLUSAGE_BINORMAL)) {
   1025         struct ureg_src input;
   1026         struct ureg_dst output;
   1027         input = build_vs_add_input(vs, NINE_DECLUSAGE_BINORMAL);
   1028         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 22);
   1029         ureg_MOV(ureg, output, input);
   1030     }
   1031     if (key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
   1032         struct ureg_src input;
   1033         struct ureg_dst output;
   1034         input = build_vs_add_input(vs, NINE_DECLUSAGE_FOG);
   1035         input = ureg_scalar(input, TGSI_SWIZZLE_X);
   1036         output = oFog;
   1037         ureg_MOV(ureg, output, input);
   1038     }
   1039     if (key->passthrough & (1 << NINE_DECLUSAGE_DEPTH)) {
   1040         (void) 0; /* TODO: replace z of position output ? */
   1041     }
   1042 
   1043     /* ucp for ff applies on world coordinates.
   1044      * aVtx is in worldview coordinates. */
   1045     if (key->ucp) {
   1046         struct ureg_dst clipVect = ureg_DECL_output(ureg, TGSI_SEMANTIC_CLIPVERTEX, 0);
   1047         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
   1048         ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(12));
   1049         ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(13),  ureg_src(tmp));
   1050         ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(14), ureg_src(tmp));
   1051         ureg_ADD(ureg, clipVect, _CONST(15), ureg_src(tmp));
   1052         ureg_release_temporary(ureg, tmp);
   1053     }
   1054 
   1055     if (key->position_t && device->driver_caps.window_space_position_support)
   1056         ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, TRUE);
   1057 
   1058     ureg_END(ureg);
   1059     nine_ureg_tgsi_dump(ureg, FALSE);
   1060     return ureg_create_shader_and_destroy(ureg, device->context.pipe);
   1061 }
   1062 
   1063 /* PS FF constants layout:
   1064  *
   1065  * CONST[ 0.. 7]      stage[i].D3DTSS_CONSTANT
   1066  * CONST[ 8..15].x___ stage[i].D3DTSS_BUMPENVMAT00
   1067  * CONST[ 8..15]._y__ stage[i].D3DTSS_BUMPENVMAT01
   1068  * CONST[ 8..15].__z_ stage[i].D3DTSS_BUMPENVMAT10
   1069  * CONST[ 8..15].___w stage[i].D3DTSS_BUMPENVMAT11
   1070  * CONST[16..19].x_z_ stage[i].D3DTSS_BUMPENVLSCALE
   1071  * CONST[16..19]._y_w stage[i].D3DTSS_BUMPENVLOFFSET
   1072  *
   1073  * CONST[20] D3DRS_TEXTUREFACTOR
   1074  * CONST[21] D3DRS_FOGCOLOR
   1075  * CONST[22].x___ RS.FogEnd
   1076  * CONST[22]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
   1077  * CONST[22].__z_ RS.FogDensity
   1078  */
/* Working state shared by the helpers that emit the fixed-function
 * pixel shader (ps_get_ts_arg, ps_get_ts_dst, ps_do_ts_op and
 * nine_ff_build_ps).  nine_ff_build_ps zero-fills the whole structure
 * before populating it.
 */
struct ps_build_ctx
{
    /* TGSI program under construction; every helper emits into it. */
    struct ureg_program *ureg;

    /* Interpolated COLOR inputs.  vC[0] is always declared; vC[1] only when
     * a stage argument or key->specular asks for it. */
    struct ureg_src vC[2]; /* DIFFUSE, SPECULAR */
    /* Per-stage texture coordinate inputs, declared only for stages that
     * reference D3DTA_TEXTURE. */
    struct ureg_src vT[8]; /* TEXCOORD[i] */
    /* Temporary seeded with vC[0] at shader start. */
    struct ureg_dst rCur; /* D3DTA_CURRENT */
    /* Read instead of rCur for D3DTA_CURRENT when stage.index equals
     * stage.index_pre_mod.  NOTE(review): where it is written is not visible
     * in this part of the file. */
    struct ureg_dst rMod;
    /* Source view of rCur (ureg_src(rCur)). */
    struct ureg_src rCurSrc;
    /* Temporary seeded with 0.0 at shader start. */
    struct ureg_dst rTmp; /* D3DTA_TEMP */
    /* Source view of rTmp (ureg_src(rTmp)). */
    struct ureg_src rTmpSrc;
    /* Temporary seeded with 0.0; its source view is what D3DTA_TEXTURE
     * resolves to and its .w feeds the BLENDTEXTUREALPHA* ops. */
    struct ureg_dst rTex;
    /* Source view of rTex (ureg_src(rTex)). */
    struct ureg_src rTexSrc;
    /* NOTE(review): presumably the per-stage bump-environment matrix
     * constants (CONST[8..15]) -- confirm against the code that fills it. */
    struct ureg_src cBEM[8];
    /* Per-stage sampler; left as ureg_src_undef() for stages that need none. */
    struct ureg_src s[8];

    struct {
        /* Stage being processed; also selects the constant slot used for
         * D3DTA_CONSTANT. */
        unsigned index;
        /* Initialised to (unsigned)-1; D3DTOP_PREMODULATE sets it to
         * index + 1 so that the following stage reads rMod for
         * D3DTA_CURRENT. */
        unsigned index_pre_mod;
    } stage;
};
   1100 
   1101 static struct ureg_src
   1102 ps_get_ts_arg(struct ps_build_ctx *ps, unsigned ta)
   1103 {
   1104     struct ureg_src reg;
   1105 
   1106     switch (ta & D3DTA_SELECTMASK) {
   1107     case D3DTA_CONSTANT:
   1108         reg = ureg_DECL_constant(ps->ureg, ps->stage.index);
   1109         break;
   1110     case D3DTA_CURRENT:
   1111         reg = (ps->stage.index == ps->stage.index_pre_mod) ? ureg_src(ps->rMod) : ps->rCurSrc;
   1112         break;
   1113     case D3DTA_DIFFUSE:
   1114         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
   1115         break;
   1116     case D3DTA_SPECULAR:
   1117         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
   1118         break;
   1119     case D3DTA_TEMP:
   1120         reg = ps->rTmpSrc;
   1121         break;
   1122     case D3DTA_TEXTURE:
   1123         reg = ps->rTexSrc;
   1124         break;
   1125     case D3DTA_TFACTOR:
   1126         reg = ureg_DECL_constant(ps->ureg, 20);
   1127         break;
   1128     default:
   1129         assert(0);
   1130         reg = ureg_src_undef();
   1131         break;
   1132     }
   1133     if (ta & D3DTA_COMPLEMENT) {
   1134         struct ureg_dst dst = ureg_DECL_temporary(ps->ureg);
   1135         ureg_ADD(ps->ureg, dst, ureg_imm1f(ps->ureg, 1.0f), ureg_negate(reg));
   1136         reg = ureg_src(dst);
   1137     }
   1138     if (ta & D3DTA_ALPHAREPLICATE)
   1139         reg = _WWWW(reg);
   1140     return reg;
   1141 }
   1142 
   1143 static struct ureg_dst
   1144 ps_get_ts_dst(struct ps_build_ctx *ps, unsigned ta)
   1145 {
   1146     assert(!(ta & (D3DTA_COMPLEMENT | D3DTA_ALPHAREPLICATE)));
   1147 
   1148     switch (ta & D3DTA_SELECTMASK) {
   1149     case D3DTA_CURRENT:
   1150         return ps->rCur;
   1151     case D3DTA_TEMP:
   1152         return ps->rTmp;
   1153     default:
   1154         assert(0);
   1155         return ureg_dst_undef();
   1156     }
   1157 }
   1158 
   1159 static uint8_t ps_d3dtop_args_mask(D3DTEXTUREOP top)
   1160 {
   1161     switch (top) {
   1162     case D3DTOP_DISABLE:
   1163         return 0x0;
   1164     case D3DTOP_SELECTARG1:
   1165     case D3DTOP_PREMODULATE:
   1166         return 0x2;
   1167     case D3DTOP_SELECTARG2:
   1168         return 0x4;
   1169     case D3DTOP_MULTIPLYADD:
   1170     case D3DTOP_LERP:
   1171         return 0x7;
   1172     default:
   1173         return 0x6;
   1174     }
   1175 }
   1176 
   1177 static inline boolean
   1178 is_MOV_no_op(struct ureg_dst dst, struct ureg_src src)
   1179 {
   1180     return !dst.WriteMask ||
   1181         (dst.File == src.File &&
   1182          dst.Index == src.Index &&
   1183          !dst.Indirect &&
   1184          !dst.Saturate &&
   1185          !src.Indirect &&
   1186          !src.Negate &&
   1187          !src.Absolute &&
   1188          (!(dst.WriteMask & TGSI_WRITEMASK_X) || (src.SwizzleX == TGSI_SWIZZLE_X)) &&
   1189          (!(dst.WriteMask & TGSI_WRITEMASK_Y) || (src.SwizzleY == TGSI_SWIZZLE_Y)) &&
   1190          (!(dst.WriteMask & TGSI_WRITEMASK_Z) || (src.SwizzleZ == TGSI_SWIZZLE_Z)) &&
   1191          (!(dst.WriteMask & TGSI_WRITEMASK_W) || (src.SwizzleW == TGSI_SWIZZLE_W)));
   1192 
   1193 }
   1194 
   1195 static void
   1196 ps_do_ts_op(struct ps_build_ctx *ps, unsigned top, struct ureg_dst dst, struct ureg_src *arg)
   1197 {
   1198     struct ureg_program *ureg = ps->ureg;
   1199     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
   1200     struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
   1201     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
   1202 
   1203     tmp.WriteMask = dst.WriteMask;
   1204 
   1205     if (top != D3DTOP_SELECTARG1 && top != D3DTOP_SELECTARG2 &&
   1206         top != D3DTOP_MODULATE && top != D3DTOP_PREMODULATE &&
   1207         top != D3DTOP_BLENDDIFFUSEALPHA && top != D3DTOP_BLENDTEXTUREALPHA &&
   1208         top != D3DTOP_BLENDFACTORALPHA && top != D3DTOP_BLENDCURRENTALPHA &&
   1209         top != D3DTOP_BUMPENVMAP && top != D3DTOP_BUMPENVMAPLUMINANCE &&
   1210         top != D3DTOP_LERP)
   1211         dst = ureg_saturate(dst);
   1212 
   1213     switch (top) {
   1214     case D3DTOP_SELECTARG1:
   1215         if (!is_MOV_no_op(dst, arg[1]))
   1216             ureg_MOV(ureg, dst, arg[1]);
   1217         break;
   1218     case D3DTOP_SELECTARG2:
   1219         if (!is_MOV_no_op(dst, arg[2]))
   1220             ureg_MOV(ureg, dst, arg[2]);
   1221         break;
   1222     case D3DTOP_MODULATE:
   1223         ureg_MUL(ureg, dst, arg[1], arg[2]);
   1224         break;
   1225     case D3DTOP_MODULATE2X:
   1226         ureg_MUL(ureg, tmp, arg[1], arg[2]);
   1227         ureg_ADD(ureg, dst, ureg_src(tmp), ureg_src(tmp));
   1228         break;
   1229     case D3DTOP_MODULATE4X:
   1230         ureg_MUL(ureg, tmp, arg[1], arg[2]);
   1231         ureg_MUL(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 4.0f));
   1232         break;
   1233     case D3DTOP_ADD:
   1234         ureg_ADD(ureg, dst, arg[1], arg[2]);
   1235         break;
   1236     case D3DTOP_ADDSIGNED:
   1237         ureg_ADD(ureg, tmp, arg[1], arg[2]);
   1238         ureg_ADD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, -0.5f));
   1239         break;
   1240     case D3DTOP_ADDSIGNED2X:
   1241         ureg_ADD(ureg, tmp, arg[1], arg[2]);
   1242         ureg_MAD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 2.0f), ureg_imm1f(ureg, -1.0f));
   1243         break;
   1244     case D3DTOP_SUBTRACT:
   1245         ureg_ADD(ureg, dst, arg[1], ureg_negate(arg[2]));
   1246         break;
   1247     case D3DTOP_ADDSMOOTH:
   1248         ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
   1249         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], arg[1]);
   1250         break;
   1251     case D3DTOP_BLENDDIFFUSEALPHA:
   1252         ureg_LRP(ureg, dst, _WWWW(ps->vC[0]), arg[1], arg[2]);
   1253         break;
   1254     case D3DTOP_BLENDTEXTUREALPHA:
   1255         /* XXX: alpha taken from previous stage, texture or result ? */
   1256         ureg_LRP(ureg, dst, _W(ps->rTex), arg[1], arg[2]);
   1257         break;
   1258     case D3DTOP_BLENDFACTORALPHA:
   1259         ureg_LRP(ureg, dst, _WWWW(_CONST(20)), arg[1], arg[2]);
   1260         break;
   1261     case D3DTOP_BLENDTEXTUREALPHAPM:
   1262         ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_W(ps->rTex)));
   1263         ureg_MAD(ureg, dst, arg[2], _X(tmp), arg[1]);
   1264         break;
   1265     case D3DTOP_BLENDCURRENTALPHA:
   1266         ureg_LRP(ureg, dst, _WWWW(ps->rCurSrc), arg[1], arg[2]);
   1267         break;
   1268     case D3DTOP_PREMODULATE:
   1269         ureg_MOV(ureg, dst, arg[1]);
   1270         ps->stage.index_pre_mod = ps->stage.index + 1;
   1271         break;
   1272     case D3DTOP_MODULATEALPHA_ADDCOLOR:
   1273         ureg_MAD(ureg, dst, _WWWW(arg[1]), arg[2], arg[1]);
   1274         break;
   1275     case D3DTOP_MODULATECOLOR_ADDALPHA:
   1276         ureg_MAD(ureg, dst, arg[1], arg[2], _WWWW(arg[1]));
   1277         break;
   1278     case D3DTOP_MODULATEINVALPHA_ADDCOLOR:
   1279         ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_WWWW(arg[1])));
   1280         ureg_MAD(ureg, dst, _X(tmp), arg[2], arg[1]);
   1281         break;
   1282     case D3DTOP_MODULATEINVCOLOR_ADDALPHA:
   1283         ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
   1284         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], _WWWW(arg[1]));
   1285         break;
   1286     case D3DTOP_BUMPENVMAP:
   1287         break;
   1288     case D3DTOP_BUMPENVMAPLUMINANCE:
   1289         break;
   1290     case D3DTOP_DOTPRODUCT3:
   1291         ureg_ADD(ureg, tmp, arg[1], ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
   1292         ureg_ADD(ureg, tmp2, arg[2] , ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
   1293         ureg_DP3(ureg, tmp, ureg_src(tmp), ureg_src(tmp2));
   1294         ureg_MUL(ureg, ureg_saturate(dst), ureg_src(tmp), ureg_imm4f(ureg,4.0,4.0,4.0,4.0));
   1295         break;
   1296     case D3DTOP_MULTIPLYADD:
   1297         ureg_MAD(ureg, dst, arg[1], arg[2], arg[0]);
   1298         break;
   1299     case D3DTOP_LERP:
   1300         ureg_LRP(ureg, dst, arg[0], arg[1], arg[2]);
   1301         break;
   1302     case D3DTOP_DISABLE:
   1303         /* no-op ? */
   1304         break;
   1305     default:
   1306         assert(!"invalid D3DTOP");
   1307         break;
   1308     }
   1309     ureg_release_temporary(ureg, tmp);
   1310     ureg_release_temporary(ureg, tmp2);
   1311 }
   1312 
   1313 static void *
   1314 nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
   1315 {
   1316     struct ps_build_ctx ps;
   1317     struct ureg_program *ureg = ureg_create(PIPE_SHADER_FRAGMENT);
   1318     struct ureg_dst oCol;
   1319     unsigned s;
   1320     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
   1321 
   1322     memset(&ps, 0, sizeof(ps));
   1323     ps.ureg = ureg;
   1324     ps.stage.index_pre_mod = -1;
   1325 
   1326     ps.vC[0] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
   1327 
   1328     ps.rCur = ureg_DECL_temporary(ureg);
   1329     ps.rTmp = ureg_DECL_temporary(ureg);
   1330     ps.rTex = ureg_DECL_temporary(ureg);
   1331     ps.rCurSrc = ureg_src(ps.rCur);
   1332     ps.rTmpSrc = ureg_src(ps.rTmp);
   1333     ps.rTexSrc = ureg_src(ps.rTex);
   1334 
   1335     /* Initial values */
   1336     ureg_MOV(ureg, ps.rCur, ps.vC[0]);
   1337     ureg_MOV(ureg, ps.rTmp, ureg_imm1f(ureg, 0.0f));
   1338     ureg_MOV(ureg, ps.rTex, ureg_imm1f(ureg, 0.0f));
   1339 
   1340     for (s = 0; s < 8; ++s) {
   1341         ps.s[s] = ureg_src_undef();
   1342 
   1343         if (key->ts[s].colorop != D3DTOP_DISABLE) {
   1344             if (key->ts[s].colorarg0 == D3DTA_SPECULAR ||
   1345                 key->ts[s].colorarg1 == D3DTA_SPECULAR ||
   1346                 key->ts[s].colorarg2 == D3DTA_SPECULAR)
   1347                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
   1348 
   1349             if (key->ts[s].colorarg0 == D3DTA_TEXTURE ||
   1350                 key->ts[s].colorarg1 == D3DTA_TEXTURE ||
   1351                 key->ts[s].colorarg2 == D3DTA_TEXTURE) {
   1352                 ps.s[s] = ureg_DECL_sampler(ureg, s);
   1353                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
   1354             }
   1355             if (s && (key->ts[s - 1].colorop == D3DTOP_PREMODULATE ||
   1356                       key->ts[s - 1].alphaop == D3DTOP_PREMODULATE))
   1357                 ps.s[s] = ureg_DECL_sampler(ureg, s);
   1358         }
   1359 
   1360         if (key->ts[s].alphaop != D3DTOP_DISABLE) {
   1361             if (key->ts[s].alphaarg0 == D3DTA_SPECULAR ||
   1362                 key->ts[s].alphaarg1 == D3DTA_SPECULAR ||
   1363                 key->ts[s].alphaarg2 == D3DTA_SPECULAR)
   1364                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
   1365 
   1366             if (key->ts[s].alphaarg0 == D3DTA_TEXTURE ||
   1367                 key->ts[s].alphaarg1 == D3DTA_TEXTURE ||
   1368                 key->ts[s].alphaarg2 == D3DTA_TEXTURE) {
   1369                 ps.s[s] = ureg_DECL_sampler(ureg, s);
   1370                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
   1371             }
   1372         }
   1373     }
   1374     if (key->specular)
   1375         ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
   1376 
   1377     oCol = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
   1378 
   1379     /* Run stages.
   1380      */
   1381     for (s = 0; s < 8; ++s) {
   1382         unsigned colorarg[3];
   1383         unsigned alphaarg[3];
   1384         const uint8_t used_c = ps_d3dtop_args_mask(key->ts[s].colorop);
   1385         const uint8_t used_a = ps_d3dtop_args_mask(key->ts[s].alphaop);
   1386         struct ureg_dst dst;
   1387         struct ureg_src arg[3];
   1388 
   1389         if (key->ts[s].colorop == D3DTOP_DISABLE) {
   1390             assert (key->ts[s].alphaop == D3DTOP_DISABLE);
   1391             continue;
   1392         }
   1393         ps.stage.index = s;
   1394 
   1395         DBG("STAGE[%u]: colorop=%s alphaop=%s\n", s,
   1396             nine_D3DTOP_to_str(key->ts[s].colorop),
   1397             nine_D3DTOP_to_str(key->ts[s].alphaop));
   1398 
   1399         if (!ureg_src_is_undef(ps.s[s])) {
   1400             unsigned target;
   1401             struct ureg_src texture_coord = ps.vT[s];
   1402             struct ureg_dst delta;
   1403             switch (key->ts[s].textarget) {
   1404             case 0: target = TGSI_TEXTURE_1D; break;
   1405             case 1: target = TGSI_TEXTURE_2D; break;
   1406             case 2: target = TGSI_TEXTURE_3D; break;
   1407             case 3: target = TGSI_TEXTURE_CUBE; break;
   1408             /* this is a 2 bit bitfield, do I really need a default case ? */
   1409             }
   1410 
   1411             /* Modify coordinates */
   1412             if (s >= 1 &&
   1413                 (key->ts[s-1].colorop == D3DTOP_BUMPENVMAP ||
   1414                  key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)) {
   1415                 delta = ureg_DECL_temporary(ureg);
   1416                 /* Du' = D3DTSS_BUMPENVMAT00(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT10(stage s-1)*t(s-1)G */
   1417                 ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _X(ps.rTex), _XXXX(_CONST(8 + s - 1)));
   1418                 ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _Y(ps.rTex), _ZZZZ(_CONST(8 + s - 1)), ureg_src(delta));
   1419                 /* Dv' = D3DTSS_BUMPENVMAT01(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT11(stage s-1)*t(s-1)G */
   1420                 ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _X(ps.rTex), _YYYY(_CONST(8 + s - 1)));
   1421                 ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _Y(ps.rTex), _WWWW(_CONST(8 + s - 1)), ureg_src(delta));
   1422                 texture_coord = ureg_src(ureg_DECL_temporary(ureg));
   1423                 ureg_MOV(ureg, ureg_writemask(ureg_dst(texture_coord), ureg_dst(ps.vT[s]).WriteMask), ps.vT[s]);
   1424                 ureg_ADD(ureg, ureg_writemask(ureg_dst(texture_coord), TGSI_WRITEMASK_XY), texture_coord, ureg_src(delta));
   1425                 /* Prepare luminance multiplier
   1426                  * t(s)RGBA = t(s)RGBA * clamp[(t(s-1)B * D3DTSS_BUMPENVLSCALE(stage s-1)) + D3DTSS_BUMPENVLOFFSET(stage s-1)] */
   1427                 if (key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE) {
   1428                     struct ureg_src bumpenvlscale = ((s-1) & 1) ? _ZZZZ(_CONST(16 + (s-1) / 2)) : _XXXX(_CONST(16 + (s-1) / 2));
   1429                     struct ureg_src bumpenvloffset = ((s-1) & 1) ? _WWWW(_CONST(16 + (s-1) / 2)) : _YYYY(_CONST(16 + (s-1) / 2));
   1430 
   1431                     ureg_MAD(ureg, ureg_saturate(ureg_writemask(delta, TGSI_WRITEMASK_X)), _Z(ps.rTex), bumpenvlscale, bumpenvloffset);
   1432                 }
   1433             }
   1434             if (key->projected & (3 << (s *2))) {
   1435                 unsigned dim = 1 + ((key->projected >> (2 * s)) & 3);
   1436                 if (dim == 4)
   1437                     ureg_TXP(ureg, ps.rTex, target, texture_coord, ps.s[s]);
   1438                 else {
   1439                     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
   1440                     ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(texture_coord, dim-1));
   1441                     ureg_MUL(ureg, ps.rTmp, _X(tmp), texture_coord);
   1442                     ureg_TEX(ureg, ps.rTex, target, ps.rTmpSrc, ps.s[s]);
   1443                     ureg_release_temporary(ureg, tmp);
   1444                 }
   1445             } else {
   1446                 ureg_TEX(ureg, ps.rTex, target, texture_coord, ps.s[s]);
   1447             }
   1448             if (s >= 1 && key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
   1449                 ureg_MUL(ureg, ps.rTex, ureg_src(ps.rTex), _X(delta));
   1450         }
   1451 
   1452         if (key->ts[s].colorop == D3DTOP_BUMPENVMAP ||
   1453             key->ts[s].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
   1454             continue;
   1455 
   1456         dst = ps_get_ts_dst(&ps, key->ts[s].resultarg ? D3DTA_TEMP : D3DTA_CURRENT);
   1457 
   1458         if (ps.stage.index_pre_mod == ps.stage.index) {
   1459             ps.rMod = ureg_DECL_temporary(ureg);
   1460             ureg_MUL(ureg, ps.rMod, ps.rCurSrc, ps.rTexSrc);
   1461         }
   1462 
   1463         colorarg[0] = (key->ts[s].colorarg0 | ((key->colorarg_b4[0] >> s) << 4) | ((key->colorarg_b5[0] >> s) << 5)) & 0x3f;
   1464         colorarg[1] = (key->ts[s].colorarg1 | ((key->colorarg_b4[1] >> s) << 4) | ((key->colorarg_b5[1] >> s) << 5)) & 0x3f;
   1465         colorarg[2] = (key->ts[s].colorarg2 | ((key->colorarg_b4[2] >> s) << 4) | ((key->colorarg_b5[2] >> s) << 5)) & 0x3f;
   1466         alphaarg[0] = (key->ts[s].alphaarg0 | ((key->alphaarg_b4[0] >> s) << 4)) & 0x1f;
   1467         alphaarg[1] = (key->ts[s].alphaarg1 | ((key->alphaarg_b4[1] >> s) << 4)) & 0x1f;
   1468         alphaarg[2] = (key->ts[s].alphaarg2 | ((key->alphaarg_b4[2] >> s) << 4)) & 0x1f;
   1469 
   1470         if (key->ts[s].colorop != key->ts[s].alphaop ||
   1471             colorarg[0] != alphaarg[0] ||
   1472             colorarg[1] != alphaarg[1] ||
   1473             colorarg[2] != alphaarg[2])
   1474             dst.WriteMask = TGSI_WRITEMASK_XYZ;
   1475 
   1476         /* Special DOTPRODUCT behaviour (see wine tests) */
   1477         if (key->ts[s].colorop == D3DTOP_DOTPRODUCT3)
   1478             dst.WriteMask = TGSI_WRITEMASK_XYZW;
   1479 
   1480         if (used_c & 0x1) arg[0] = ps_get_ts_arg(&ps, colorarg[0]);
   1481         if (used_c & 0x2) arg[1] = ps_get_ts_arg(&ps, colorarg[1]);
   1482         if (used_c & 0x4) arg[2] = ps_get_ts_arg(&ps, colorarg[2]);
   1483         ps_do_ts_op(&ps, key->ts[s].colorop, dst, arg);
   1484 
   1485         if (dst.WriteMask != TGSI_WRITEMASK_XYZW) {
   1486             dst.WriteMask = TGSI_WRITEMASK_W;
   1487 
   1488             if (used_a & 0x1) arg[0] = ps_get_ts_arg(&ps, alphaarg[0]);
   1489             if (used_a & 0x2) arg[1] = ps_get_ts_arg(&ps, alphaarg[1]);
   1490             if (used_a & 0x4) arg[2] = ps_get_ts_arg(&ps, alphaarg[2]);
   1491             ps_do_ts_op(&ps, key->ts[s].alphaop, dst, arg);
   1492         }
   1493     }
   1494 
   1495     if (key->specular)
   1496         ureg_ADD(ureg, ureg_writemask(ps.rCur, TGSI_WRITEMASK_XYZ), ps.rCurSrc, ps.vC[1]);
   1497 
   1498     /* Fog.
   1499      */
   1500     if (key->fog_mode) {
   1501         struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
   1502         struct ureg_src vPos;
   1503         if (device->screen->get_param(device->screen,
   1504                                       PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL)) {
   1505             vPos = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
   1506         } else {
   1507             vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
   1508                                       TGSI_INTERPOLATE_LINEAR);
   1509         }
   1510 
   1511         /* Source is either W or Z.
   1512          * When we use vs ff,
   1513          * Z is when an orthogonal projection matrix is detected,
   1514          * W (WFOG) else.
   1515          * Z is used for programmable vs.
   1516          * Note: Tests indicate that the projection matrix coefficients do
   1517          * actually affect pixel fog (and not vertex fog) when vs ff is used,
   1518          * which justifies taking the position's w instead of taking the z coordinate
   1519          * before the projection in the vs shader.
   1520          */
   1521         if (!key->fog_source)
   1522             ureg_MOV(ureg, rFog, _ZZZZ(vPos));
   1523         else
   1524             /* Position's w is 1/w */
   1525             ureg_RCP(ureg, rFog, _WWWW(vPos));
   1526 
   1527         if (key->fog_mode == D3DFOG_EXP) {
   1528             ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
   1529             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
   1530             ureg_EX2(ureg, rFog, _X(rFog));
   1531         } else
   1532         if (key->fog_mode == D3DFOG_EXP2) {
   1533             ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
   1534             ureg_MUL(ureg, rFog, _X(rFog), _X(rFog));
   1535             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
   1536             ureg_EX2(ureg, rFog, _X(rFog));
   1537         } else
   1538         if (key->fog_mode == D3DFOG_LINEAR) {
   1539             ureg_ADD(ureg, rFog, _XXXX(_CONST(22)), ureg_negate(_X(rFog)));
   1540             ureg_MUL(ureg, ureg_saturate(rFog), _X(rFog), _YYYY(_CONST(22)));
   1541         }
   1542         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _X(rFog), ps.rCurSrc, _CONST(21));
   1543         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
   1544     } else
   1545     if (key->fog) {
   1546         struct ureg_src vFog = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_FOG, 0, TGSI_INTERPOLATE_PERSPECTIVE);
   1547         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _XXXX(vFog), ps.rCurSrc, _CONST(21));
   1548         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
   1549     } else {
   1550         ureg_MOV(ureg, oCol, ps.rCurSrc);
   1551     }
   1552 
   1553     ureg_END(ureg);
   1554     nine_ureg_tgsi_dump(ureg, FALSE);
   1555     return ureg_create_shader_and_destroy(ureg, device->context.pipe);
   1556 }
   1557 
   1558 static struct NineVertexShader9 *
   1559 nine_ff_get_vs(struct NineDevice9 *device)
   1560 {
   1561     const struct nine_context *context = &device->context;
   1562     struct NineVertexShader9 *vs;
   1563     enum pipe_error err;
   1564     struct vs_build_ctx bld;
   1565     struct nine_ff_vs_key key;
   1566     unsigned s, i;
   1567     boolean has_indexes = false;
   1568     boolean has_weights = false;
   1569     char input_texture_coord[8];
   1570 
   1571     assert(sizeof(key) <= sizeof(key.value32));
   1572 
   1573     memset(&key, 0, sizeof(key));
   1574     memset(&bld, 0, sizeof(bld));
   1575     memset(&input_texture_coord, 0, sizeof(input_texture_coord));
   1576 
   1577     bld.key = &key;
   1578 
   1579     /* FIXME: this shouldn't be NULL, but it is on init */
   1580     if (context->vdecl) {
   1581         key.color0in_one = 1;
   1582         key.color1in_zero = 1;
   1583         for (i = 0; i < context->vdecl->nelems; i++) {
   1584             uint16_t usage = context->vdecl->usage_map[i];
   1585             if (usage == NINE_DECLUSAGE_POSITIONT)
   1586                 key.position_t = 1;
   1587             else if (usage == NINE_DECLUSAGE_i(COLOR, 0))
   1588                 key.color0in_one = 0;
   1589             else if (usage == NINE_DECLUSAGE_i(COLOR, 1))
   1590                 key.color1in_zero = 0;
   1591             else if (usage == NINE_DECLUSAGE_i(BLENDINDICES, 0)) {
   1592                 has_indexes = true;
   1593                 key.passthrough |= 1 << usage;
   1594             } else if (usage == NINE_DECLUSAGE_i(BLENDWEIGHT, 0)) {
   1595                 has_weights = true;
   1596                 key.passthrough |= 1 << usage;
   1597             } else if (usage == NINE_DECLUSAGE_i(NORMAL, 0)) {
   1598                 key.has_normal = 1;
   1599                 key.passthrough |= 1 << usage;
   1600             } else if (usage == NINE_DECLUSAGE_PSIZE)
   1601                 key.vertexpointsize = 1;
   1602             else if (usage % NINE_DECLUSAGE_COUNT == NINE_DECLUSAGE_TEXCOORD) {
   1603                 s = usage / NINE_DECLUSAGE_COUNT;
   1604                 if (s < 8)
   1605                     input_texture_coord[s] = nine_decltype_get_dim(context->vdecl->decls[i].Type);
   1606                 else
   1607                     DBG("FF given texture coordinate >= 8. Ignoring\n");
   1608             } else if (usage < NINE_DECLUSAGE_NONE)
   1609                 key.passthrough |= 1 << usage;
   1610         }
   1611     }
   1612     /* ff vs + ps 3.0: some elements are passed to the ps (wine test).
   1613      * We do restrict to indices 0 */
   1614     key.passthrough &= ~((1 << NINE_DECLUSAGE_POSITION) | (1 << NINE_DECLUSAGE_PSIZE) |
   1615                          (1 << NINE_DECLUSAGE_TEXCOORD) | (1 << NINE_DECLUSAGE_POSITIONT) |
   1616                          (1 << NINE_DECLUSAGE_TESSFACTOR) | (1 << NINE_DECLUSAGE_SAMPLE));
   1617     if (!key.position_t)
   1618         key.passthrough = 0;
   1619     key.pointscale = !!context->rs[D3DRS_POINTSCALEENABLE];
   1620 
   1621     key.lighting = !!context->rs[D3DRS_LIGHTING] &&  context->ff.num_lights_active;
   1622     key.darkness = !!context->rs[D3DRS_LIGHTING] && !context->ff.num_lights_active;
   1623     if (key.position_t) {
   1624         key.darkness = 0; /* |= key.lighting; */ /* XXX ? */
   1625         key.lighting = 0;
   1626     }
   1627     if ((key.lighting | key.darkness) && context->rs[D3DRS_COLORVERTEX]) {
   1628         uint32_t mask = (key.color0in_one ? 0 : 1) | (key.color1in_zero ? 0 : 2);
   1629         key.mtl_diffuse = context->rs[D3DRS_DIFFUSEMATERIALSOURCE] & mask;
   1630         key.mtl_ambient = context->rs[D3DRS_AMBIENTMATERIALSOURCE] & mask;
   1631         key.mtl_specular = context->rs[D3DRS_SPECULARMATERIALSOURCE] & mask;
   1632         key.mtl_emissive = context->rs[D3DRS_EMISSIVEMATERIALSOURCE] & mask;
   1633     }
   1634     key.fog = !!context->rs[D3DRS_FOGENABLE];
   1635     key.fog_mode = (!key.position_t && context->rs[D3DRS_FOGENABLE]) ? context->rs[D3DRS_FOGVERTEXMODE] : 0;
   1636     if (key.fog_mode)
   1637         key.fog_range = context->rs[D3DRS_RANGEFOGENABLE];
   1638 
   1639     key.localviewer = !!context->rs[D3DRS_LOCALVIEWER];
   1640     key.normalizenormals = !!context->rs[D3DRS_NORMALIZENORMALS];
   1641     key.ucp = !!context->rs[D3DRS_CLIPPLANEENABLE];
   1642 
   1643     if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
   1644         key.vertexblend_indexed = !!context->rs[D3DRS_INDEXEDVERTEXBLENDENABLE] && has_indexes;
   1645 
   1646         switch (context->rs[D3DRS_VERTEXBLEND]) {
   1647         case D3DVBF_0WEIGHTS: key.vertexblend = key.vertexblend_indexed; break;
   1648         case D3DVBF_1WEIGHTS: key.vertexblend = 2; break;
   1649         case D3DVBF_2WEIGHTS: key.vertexblend = 3; break;
   1650         case D3DVBF_3WEIGHTS: key.vertexblend = 4; break;
   1651         case D3DVBF_TWEENING: key.vertextween = 1; break;
   1652         default:
   1653             assert(!"invalid D3DVBF");
   1654             break;
   1655         }
   1656         if (!has_weights && context->rs[D3DRS_VERTEXBLEND] != D3DVBF_0WEIGHTS)
   1657             key.vertexblend = 0; /* TODO: if key.vertexblend_indexed, perhaps it should use 1.0 as weight, or revert to D3DVBF_0WEIGHTS */
   1658     }
   1659 
   1660     for (s = 0; s < 8; ++s) {
   1661         unsigned gen = (context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] >> 16) + 1;
   1662         unsigned idx = context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] & 7;
   1663         unsigned dim;
   1664 
   1665         if (key.position_t && gen > NINED3DTSS_TCI_PASSTHRU)
   1666             gen = NINED3DTSS_TCI_PASSTHRU;
   1667 
   1668         if (!input_texture_coord[idx] && gen == NINED3DTSS_TCI_PASSTHRU)
   1669             gen = NINED3DTSS_TCI_DISABLE;
   1670 
   1671         key.tc_gen |= gen << (s * 3);
   1672         key.tc_idx |= idx << (s * 3);
   1673         key.tc_dim_input |= ((input_texture_coord[idx]-1) & 0x3) << (s * 2);
   1674 
   1675         dim = context->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7;
   1676         if (dim > 4)
   1677             dim = input_texture_coord[idx];
   1678         if (dim == 1) /* NV behaviour */
   1679             dim = 0;
   1680         key.tc_dim_output |= dim << (s * 3);
   1681     }
   1682 
   1683     vs = util_hash_table_get(device->ff.ht_vs, &key);
   1684     if (vs)
   1685         return vs;
   1686     NineVertexShader9_new(device, &vs, NULL, nine_ff_build_vs(device, &bld));
   1687 
   1688     nine_ff_prune_vs(device);
   1689     if (vs) {
   1690         unsigned n;
   1691 
   1692         memcpy(&vs->ff_key, &key, sizeof(vs->ff_key));
   1693 
   1694         err = util_hash_table_set(device->ff.ht_vs, &vs->ff_key, vs);
   1695         (void)err;
   1696         assert(err == PIPE_OK);
   1697         device->ff.num_vs++;
   1698         NineUnknown_ConvertRefToBind(NineUnknown(vs));
   1699 
   1700         vs->num_inputs = bld.num_inputs;
   1701         for (n = 0; n < bld.num_inputs; ++n)
   1702             vs->input_map[n].ndecl = bld.input[n];
   1703 
   1704         vs->position_t = key.position_t;
   1705         vs->point_size = key.vertexpointsize | key.pointscale;
   1706     }
   1707     return vs;
   1708 }
   1709 
   1710 #define GET_D3DTS(n) nine_state_access_transform(&context->ff, D3DTS_##n, FALSE)
   1711 #define IS_D3DTS_DIRTY(s,n) ((s)->ff.changed.transform[(D3DTS_##n) / 32] & (1 << ((D3DTS_##n) % 32)))
   1712 
   1713 static struct NinePixelShader9 *
   1714 nine_ff_get_ps(struct NineDevice9 *device)
   1715 {
   1716     struct nine_context *context = &device->context;
   1717     D3DMATRIX *projection_matrix = GET_D3DTS(PROJECTION);
   1718     struct NinePixelShader9 *ps;
   1719     enum pipe_error err;
   1720     struct nine_ff_ps_key key;
   1721     unsigned s;
   1722     uint8_t sampler_mask = 0;
   1723 
   1724     assert(sizeof(key) <= sizeof(key.value32));
   1725 
   1726     memset(&key, 0, sizeof(key));
   1727     for (s = 0; s < 8; ++s) {
   1728         key.ts[s].colorop = context->ff.tex_stage[s][D3DTSS_COLOROP];
   1729         key.ts[s].alphaop = context->ff.tex_stage[s][D3DTSS_ALPHAOP];
   1730         const uint8_t used_c = ps_d3dtop_args_mask(key.ts[s].colorop);
   1731         const uint8_t used_a = ps_d3dtop_args_mask(key.ts[s].alphaop);
   1732         /* MSDN says D3DTOP_DISABLE disables this and all subsequent stages.
   1733          * ALPHAOP cannot be enabled if COLOROP is disabled.
   1734          * Verified on Windows. */
   1735         if (key.ts[s].colorop == D3DTOP_DISABLE) {
   1736             key.ts[s].alphaop = D3DTOP_DISABLE; /* DISABLE == 1, avoid degenerate keys */
   1737             break;
   1738         }
   1739 
   1740         if (!context->texture[s].enabled &&
   1741             ((context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE &&
   1742               used_c & 0x1) ||
   1743              (context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE &&
   1744               used_c & 0x2) ||
   1745              (context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE &&
   1746               used_c & 0x4))) {
   1747             /* Tested on Windows: Invalid texture read disables the stage
   1748              * and the subsequent ones, but only for colorop. For alpha,
   1749              * it's as if the texture had alpha of 1.0, which is what
   1750              * has our dummy texture in that case. Invalid color also
   1751              * disabled the following alpha stages. */
   1752             key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
   1753             break;
   1754         }
   1755 
   1756         if (context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE ||
   1757             context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE ||
   1758             context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE ||
   1759             context->ff.tex_stage[s][D3DTSS_ALPHAARG0] == D3DTA_TEXTURE ||
   1760             context->ff.tex_stage[s][D3DTSS_ALPHAARG1] == D3DTA_TEXTURE ||
   1761             context->ff.tex_stage[s][D3DTSS_ALPHAARG2] == D3DTA_TEXTURE)
   1762             sampler_mask |= (1 << s);
   1763 
   1764         if (key.ts[s].colorop != D3DTOP_DISABLE) {
   1765             if (used_c & 0x1) key.ts[s].colorarg0 = context->ff.tex_stage[s][D3DTSS_COLORARG0];
   1766             if (used_c & 0x2) key.ts[s].colorarg1 = context->ff.tex_stage[s][D3DTSS_COLORARG1];
   1767             if (used_c & 0x4) key.ts[s].colorarg2 = context->ff.tex_stage[s][D3DTSS_COLORARG2];
   1768             if (used_c & 0x1) key.colorarg_b4[0] |= (context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 4) << s;
   1769             if (used_c & 0x1) key.colorarg_b5[0] |= (context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 5) << s;
   1770             if (used_c & 0x2) key.colorarg_b4[1] |= (context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 4) << s;
   1771             if (used_c & 0x2) key.colorarg_b5[1] |= (context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 5) << s;
   1772             if (used_c & 0x4) key.colorarg_b4[2] |= (context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 4) << s;
   1773             if (used_c & 0x4) key.colorarg_b5[2] |= (context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 5) << s;
   1774         }
   1775         if (key.ts[s].alphaop != D3DTOP_DISABLE) {
   1776             if (used_a & 0x1) key.ts[s].alphaarg0 = context->ff.tex_stage[s][D3DTSS_ALPHAARG0];
   1777             if (used_a & 0x2) key.ts[s].alphaarg1 = context->ff.tex_stage[s][D3DTSS_ALPHAARG1];
   1778             if (used_a & 0x4) key.ts[s].alphaarg2 = context->ff.tex_stage[s][D3DTSS_ALPHAARG2];
   1779             if (used_a & 0x1) key.alphaarg_b4[0] |= (context->ff.tex_stage[s][D3DTSS_ALPHAARG0] >> 4) << s;
   1780             if (used_a & 0x2) key.alphaarg_b4[1] |= (context->ff.tex_stage[s][D3DTSS_ALPHAARG1] >> 4) << s;
   1781             if (used_a & 0x4) key.alphaarg_b4[2] |= (context->ff.tex_stage[s][D3DTSS_ALPHAARG2] >> 4) << s;
   1782         }
   1783         key.ts[s].resultarg = context->ff.tex_stage[s][D3DTSS_RESULTARG] == D3DTA_TEMP;
   1784 
   1785         if (context->texture[s].enabled) {
   1786             switch (context->texture[s].type) {
   1787             case D3DRTYPE_TEXTURE:       key.ts[s].textarget = 1; break;
   1788             case D3DRTYPE_VOLUMETEXTURE: key.ts[s].textarget = 2; break;
   1789             case D3DRTYPE_CUBETEXTURE:   key.ts[s].textarget = 3; break;
   1790             default:
   1791                 assert(!"unexpected texture type");
   1792                 break;
   1793             }
   1794         } else {
   1795             key.ts[s].textarget = 1;
   1796         }
   1797     }
   1798 
   1799     /* Note: If colorop is D3DTOP_DISABLE for the first stage
   1800      * (which implies alphaop is too), nothing particular happens,
   1801      * that is, current is equal to diffuse (which is the case anyway,
   1802      * because it is how it is initialized).
   1803      * Special case seems if alphaop is D3DTOP_DISABLE and not colorop,
   1804      * because then if the resultarg is TEMP, then diffuse alpha is written
   1805      * to it. */
   1806     if (key.ts[0].colorop != D3DTOP_DISABLE &&
   1807         key.ts[0].alphaop == D3DTOP_DISABLE &&
   1808         key.ts[0].resultarg != 0) {
   1809         key.ts[0].alphaop = D3DTOP_SELECTARG1;
   1810         key.ts[0].alphaarg1 = D3DTA_DIFFUSE;
   1811     }
   1812     /* When no alpha stage writes to current, diffuse alpha is taken.
   1813      * Since we initialize current to diffuse, we have the behaviour. */
   1814 
   1815     /* Last stage always writes to Current */
   1816     if (s >= 1)
   1817         key.ts[s-1].resultarg = 0;
   1818 
   1819     key.projected = nine_ff_get_projected_key(context);
   1820     key.specular = !!context->rs[D3DRS_SPECULARENABLE];
   1821 
   1822     for (; s < 8; ++s)
   1823         key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
   1824     if (context->rs[D3DRS_FOGENABLE])
   1825         key.fog_mode = context->rs[D3DRS_FOGTABLEMODE];
   1826     key.fog = !!context->rs[D3DRS_FOGENABLE];
   1827     /* Pixel fog (with WFOG advertised): source is either Z or W.
   1828      * W is the source if vs ff is used, and the
   1829      * projection matrix is not orthogonal.
   1830      * Tests on Win 10 seem to indicate _34
   1831      * and _33 are checked against 0, 1. */
   1832     if (key.fog_mode && key.fog)
   1833         key.fog_source = !context->programmable_vs &&
   1834             !(projection_matrix->_34 == 0.0f &&
   1835               projection_matrix->_44 == 1.0f);
   1836 
   1837     ps = util_hash_table_get(device->ff.ht_ps, &key);
   1838     if (ps)
   1839         return ps;
   1840     NinePixelShader9_new(device, &ps, NULL, nine_ff_build_ps(device, &key));
   1841 
   1842     nine_ff_prune_ps(device);
   1843     if (ps) {
   1844         memcpy(&ps->ff_key, &key, sizeof(ps->ff_key));
   1845 
   1846         err = util_hash_table_set(device->ff.ht_ps, &ps->ff_key, ps);
   1847         (void)err;
   1848         assert(err == PIPE_OK);
   1849         device->ff.num_ps++;
   1850         NineUnknown_ConvertRefToBind(NineUnknown(ps));
   1851 
   1852         ps->rt_mask = 0x1;
   1853         ps->sampler_mask = sampler_mask;
   1854     }
   1855     return ps;
   1856 }
   1857 
   1858 static void
   1859 nine_ff_load_vs_transforms(struct NineDevice9 *device)
   1860 {
   1861     struct nine_context *context = &device->context;
   1862     D3DMATRIX T;
   1863     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
   1864     unsigned i;
   1865 
   1866     /* TODO: make this nicer, and only upload the ones we need */
   1867     /* TODO: use ff.vs_const as storage of W, V, P matrices */
   1868 
   1869     if (IS_D3DTS_DIRTY(context, WORLD) ||
   1870         IS_D3DTS_DIRTY(context, VIEW) ||
   1871         IS_D3DTS_DIRTY(context, PROJECTION)) {
   1872         /* WVP, WV matrices */
   1873         nine_d3d_matrix_matrix_mul(&M[1], GET_D3DTS(WORLD), GET_D3DTS(VIEW));
   1874         nine_d3d_matrix_matrix_mul(&M[0], &M[1], GET_D3DTS(PROJECTION));
   1875 
   1876         /* normal matrix == transpose(inverse(WV)) */
   1877         nine_d3d_matrix_inverse(&T, &M[1]);
   1878         nine_d3d_matrix_transpose(&M[4], &T);
   1879 
   1880         /* P matrix */
   1881         M[2] = *GET_D3DTS(PROJECTION);
   1882 
   1883         /* V and W matrix */
   1884         nine_d3d_matrix_inverse(&M[3], GET_D3DTS(VIEW));
   1885         M[40] = M[1];
   1886     }
   1887 
   1888     if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
   1889         /* load other world matrices */
   1890         for (i = 1; i <= 8; ++i) {
   1891             nine_d3d_matrix_matrix_mul(&M[40 + i], GET_D3DTS(WORLDMATRIX(i)), GET_D3DTS(VIEW));
   1892         }
   1893     }
   1894 
   1895     device->ff.vs_const[30 * 4] = asfloat(context->rs[D3DRS_TWEENFACTOR]);
   1896 }
   1897 
   1898 static void
   1899 nine_ff_load_lights(struct NineDevice9 *device)
   1900 {
   1901     struct nine_context *context = &device->context;
   1902     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
   1903     unsigned l;
   1904 
   1905     if (context->changed.group & NINE_STATE_FF_MATERIAL) {
   1906         const D3DMATERIAL9 *mtl = &context->ff.material;
   1907 
   1908         memcpy(&dst[20], &mtl->Diffuse, 4 * sizeof(float));
   1909         memcpy(&dst[21], &mtl->Ambient, 4 * sizeof(float));
   1910         memcpy(&dst[22], &mtl->Specular, 4 * sizeof(float));
   1911         dst[23].x = mtl->Power;
   1912         memcpy(&dst[24], &mtl->Emissive, 4 * sizeof(float));
   1913         d3dcolor_to_rgba(&dst[25].x, context->rs[D3DRS_AMBIENT]);
   1914         dst[19].x = dst[25].x * mtl->Ambient.r + mtl->Emissive.r;
   1915         dst[19].y = dst[25].y * mtl->Ambient.g + mtl->Emissive.g;
   1916         dst[19].z = dst[25].z * mtl->Ambient.b + mtl->Emissive.b;
   1917     }
   1918 
   1919     if (!(context->changed.group & NINE_STATE_FF_LIGHTING))
   1920         return;
   1921 
   1922     for (l = 0; l < context->ff.num_lights_active; ++l) {
   1923         const D3DLIGHT9 *light = &context->ff.light[context->ff.active_light[l]];
   1924 
   1925         dst[32 + l * 8].x = light->Type;
   1926         dst[32 + l * 8].y = light->Attenuation0;
   1927         dst[32 + l * 8].z = light->Attenuation1;
   1928         dst[32 + l * 8].w = light->Attenuation2;
   1929         memcpy(&dst[33 + l * 8].x, &light->Diffuse, sizeof(light->Diffuse));
   1930         memcpy(&dst[34 + l * 8].x, &light->Specular, sizeof(light->Specular));
   1931         memcpy(&dst[35 + l * 8].x, &light->Ambient, sizeof(light->Ambient));
   1932         nine_d3d_vector4_matrix_mul((D3DVECTOR *)&dst[36 + l * 8].x, &light->Position, GET_D3DTS(VIEW));
   1933         nine_d3d_vector3_matrix_mul((D3DVECTOR *)&dst[37 + l * 8].x, &light->Direction, GET_D3DTS(VIEW));
   1934         dst[36 + l * 8].w = light->Type == D3DLIGHT_DIRECTIONAL ? 1e9f : light->Range;
   1935         dst[37 + l * 8].w = light->Falloff;
   1936         dst[38 + l * 8].x = cosf(light->Theta * 0.5f);
   1937         dst[38 + l * 8].y = cosf(light->Phi * 0.5f);
   1938         dst[38 + l * 8].z = 1.0f / (dst[38 + l * 8].x - dst[38 + l * 8].y);
   1939         dst[39 + l * 8].w = (l + 1) == context->ff.num_lights_active;
   1940     }
   1941 }
   1942 
   1943 static void
   1944 nine_ff_load_point_and_fog_params(struct NineDevice9 *device)
   1945 {
   1946     struct nine_context *context = &device->context;
   1947     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
   1948 
   1949     if (!(context->changed.group & NINE_STATE_FF_OTHER))
   1950         return;
   1951     dst[26].x = asfloat(context->rs[D3DRS_POINTSIZE_MIN]);
   1952     dst[26].y = asfloat(context->rs[D3DRS_POINTSIZE_MAX]);
   1953     dst[26].z = asfloat(context->rs[D3DRS_POINTSIZE]);
   1954     dst[26].w = asfloat(context->rs[D3DRS_POINTSCALE_A]);
   1955     dst[27].x = asfloat(context->rs[D3DRS_POINTSCALE_B]);
   1956     dst[27].y = asfloat(context->rs[D3DRS_POINTSCALE_C]);
   1957     dst[28].x = asfloat(context->rs[D3DRS_FOGEND]);
   1958     dst[28].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
   1959     if (isinf(dst[28].y))
   1960         dst[28].y = 0.0f;
   1961     dst[28].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
   1962 }
   1963 
   1964 static void
   1965 nine_ff_load_tex_matrices(struct NineDevice9 *device)
   1966 {
   1967     struct nine_context *context = &device->context;
   1968     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
   1969     unsigned s;
   1970 
   1971     if (!(context->ff.changed.transform[0] & 0xff0000))
   1972         return;
   1973     for (s = 0; s < 8; ++s) {
   1974         if (IS_D3DTS_DIRTY(context, TEXTURE0 + s))
   1975             nine_d3d_matrix_transpose(&M[32 + s], nine_state_access_transform(&context->ff, D3DTS_TEXTURE0 + s, FALSE));
   1976     }
   1977 }
   1978 
   1979 static void
   1980 nine_ff_load_ps_params(struct NineDevice9 *device)
   1981 {
   1982     struct nine_context *context = &device->context;
   1983     struct fvec4 *dst = (struct fvec4 *)device->ff.ps_const;
   1984     unsigned s;
   1985 
   1986     if (!(context->changed.group & (NINE_STATE_FF_PSSTAGES | NINE_STATE_FF_OTHER)))
   1987         return;
   1988 
   1989     for (s = 0; s < 8; ++s)
   1990         d3dcolor_to_rgba(&dst[s].x, context->ff.tex_stage[s][D3DTSS_CONSTANT]);
   1991 
   1992     for (s = 0; s < 8; ++s) {
   1993         dst[8 + s].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT00]);
   1994         dst[8 + s].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT01]);
   1995         dst[8 + s].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT10]);
   1996         dst[8 + s].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT11]);
   1997         if (s & 1) {
   1998             dst[16 + s / 2].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
   1999             dst[16 + s / 2].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
   2000         } else {
   2001             dst[16 + s / 2].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
   2002             dst[16 + s / 2].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
   2003         }
   2004     }
   2005 
   2006     d3dcolor_to_rgba(&dst[20].x, context->rs[D3DRS_TEXTUREFACTOR]);
   2007     d3dcolor_to_rgba(&dst[21].x, context->rs[D3DRS_FOGCOLOR]);
   2008     dst[22].x = asfloat(context->rs[D3DRS_FOGEND]);
   2009     dst[22].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
   2010     dst[22].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
   2011 }
   2012 
   2013 static void
   2014 nine_ff_load_viewport_info(struct NineDevice9 *device)
   2015 {
   2016     D3DVIEWPORT9 *viewport = &device->context.viewport;
   2017     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
   2018     float diffZ = viewport->MaxZ - viewport->MinZ;
   2019 
   2020     /* Note: the other functions avoids to fill the const again if nothing changed.
   2021      * But we don't have much to fill, and adding code to allow that may be complex
   2022      * so just fill it always */
   2023     dst[100].x = 2.0f / (float)(viewport->Width);
   2024     dst[100].y = 2.0f / (float)(viewport->Height);
   2025     dst[100].z = (diffZ == 0.0f) ? 0.0f : (1.0f / diffZ);
   2026     dst[100].w = (float)(viewport->Width);
   2027     dst[101].x = (float)(viewport->X);
   2028     dst[101].y = (float)(viewport->Y);
   2029     dst[101].z = (float)(viewport->MinZ);
   2030 }
   2031 
   2032 void
   2033 nine_ff_update(struct NineDevice9 *device)
   2034 {
   2035     struct nine_context *context = &device->context;
   2036     struct pipe_constant_buffer cb;
   2037 
   2038     DBG("vs=%p ps=%p\n", context->vs, context->ps);
   2039 
   2040     /* NOTE: the only reference belongs to the hash table */
   2041     if (!context->programmable_vs) {
   2042         device->ff.vs = nine_ff_get_vs(device);
   2043         context->changed.group |= NINE_STATE_VS;
   2044     }
   2045     if (!context->ps) {
   2046         device->ff.ps = nine_ff_get_ps(device);
   2047         context->changed.group |= NINE_STATE_PS;
   2048     }
   2049 
   2050     if (!context->programmable_vs) {
   2051         nine_ff_load_vs_transforms(device);
   2052         nine_ff_load_tex_matrices(device);
   2053         nine_ff_load_lights(device);
   2054         nine_ff_load_point_and_fog_params(device);
   2055         nine_ff_load_viewport_info(device);
   2056 
   2057         memset(context->ff.changed.transform, 0, sizeof(context->ff.changed.transform));
   2058 
   2059         cb.buffer_offset = 0;
   2060         cb.buffer = NULL;
   2061         cb.user_buffer = device->ff.vs_const;
   2062         cb.buffer_size = NINE_FF_NUM_VS_CONST * 4 * sizeof(float);
   2063 
   2064         if (!device->driver_caps.user_cbufs) {
   2065             context->pipe_data.cb_vs_ff.buffer_size = cb.buffer_size;
   2066             u_upload_data(device->constbuf_uploader,
   2067                           0,
   2068                           cb.buffer_size,
   2069                           device->constbuf_alignment,
   2070                           cb.user_buffer,
   2071                           &context->pipe_data.cb_vs_ff.buffer_offset,
   2072                           &context->pipe_data.cb_vs_ff.buffer);
   2073             u_upload_unmap(device->constbuf_uploader);
   2074             context->pipe_data.cb_vs_ff.user_buffer = NULL;
   2075         } else
   2076             context->pipe_data.cb_vs_ff = cb;
   2077         context->commit |= NINE_STATE_COMMIT_CONST_VS;
   2078     }
   2079 
   2080     if (!context->ps) {
   2081         nine_ff_load_ps_params(device);
   2082 
   2083         cb.buffer_offset = 0;
   2084         cb.buffer = NULL;
   2085         cb.user_buffer = device->ff.ps_const;
   2086         cb.buffer_size = NINE_FF_NUM_PS_CONST * 4 * sizeof(float);
   2087 
   2088         if (!device->driver_caps.user_cbufs) {
   2089             context->pipe_data.cb_ps_ff.buffer_size = cb.buffer_size;
   2090             u_upload_data(device->constbuf_uploader,
   2091                           0,
   2092                           cb.buffer_size,
   2093                           device->constbuf_alignment,
   2094                           cb.user_buffer,
   2095                           &context->pipe_data.cb_ps_ff.buffer_offset,
   2096                           &context->pipe_data.cb_ps_ff.buffer);
   2097             u_upload_unmap(device->constbuf_uploader);
   2098             context->pipe_data.cb_ps_ff.user_buffer = NULL;
   2099         } else
   2100             context->pipe_data.cb_ps_ff = cb;
   2101         context->commit |= NINE_STATE_COMMIT_CONST_PS;
   2102     }
   2103 
   2104     context->changed.group &= ~NINE_STATE_FF;
   2105 }
   2106 
   2107 
   2108 boolean
   2109 nine_ff_init(struct NineDevice9 *device)
   2110 {
   2111     device->ff.ht_vs = util_hash_table_create(nine_ff_vs_key_hash,
   2112                                               nine_ff_vs_key_comp);
   2113     device->ff.ht_ps = util_hash_table_create(nine_ff_ps_key_hash,
   2114                                               nine_ff_ps_key_comp);
   2115 
   2116     device->ff.ht_fvf = util_hash_table_create(nine_ff_fvf_key_hash,
   2117                                                nine_ff_fvf_key_comp);
   2118 
   2119     device->ff.vs_const = CALLOC(NINE_FF_NUM_VS_CONST, 4 * sizeof(float));
   2120     device->ff.ps_const = CALLOC(NINE_FF_NUM_PS_CONST, 4 * sizeof(float));
   2121 
   2122     return device->ff.ht_vs && device->ff.ht_ps &&
   2123         device->ff.ht_fvf &&
   2124         device->ff.vs_const && device->ff.ps_const;
   2125 }
   2126 
   2127 static enum pipe_error nine_ff_ht_delete_cb(void *key, void *value, void *data)
   2128 {
   2129     NineUnknown_Unbind(NineUnknown(value));
   2130     return PIPE_OK;
   2131 }
   2132 
   2133 void
   2134 nine_ff_fini(struct NineDevice9 *device)
   2135 {
   2136     if (device->ff.ht_vs) {
   2137         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
   2138         util_hash_table_destroy(device->ff.ht_vs);
   2139     }
   2140     if (device->ff.ht_ps) {
   2141         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
   2142         util_hash_table_destroy(device->ff.ht_ps);
   2143     }
   2144     if (device->ff.ht_fvf) {
   2145         util_hash_table_foreach(device->ff.ht_fvf, nine_ff_ht_delete_cb, NULL);
   2146         util_hash_table_destroy(device->ff.ht_fvf);
   2147     }
   2148     device->ff.vs = NULL; /* destroyed by unbinding from hash table */
   2149     device->ff.ps = NULL;
   2150 
   2151     FREE(device->ff.vs_const);
   2152     FREE(device->ff.ps_const);
   2153 }
   2154 
   2155 static void
   2156 nine_ff_prune_vs(struct NineDevice9 *device)
   2157 {
   2158     struct nine_context *context = &device->context;
   2159 
   2160     if (device->ff.num_vs > 100) {
   2161         /* could destroy the bound one here, so unbind */
   2162         context->pipe->bind_vs_state(context->pipe, NULL);
   2163         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
   2164         util_hash_table_clear(device->ff.ht_vs);
   2165         device->ff.num_vs = 0;
   2166         context->changed.group |= NINE_STATE_VS;
   2167     }
   2168 }
   2169 static void
   2170 nine_ff_prune_ps(struct NineDevice9 *device)
   2171 {
   2172     struct nine_context *context = &device->context;
   2173 
   2174     if (device->ff.num_ps > 100) {
   2175         /* could destroy the bound one here, so unbind */
   2176         context->pipe->bind_fs_state(context->pipe, NULL);
   2177         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
   2178         util_hash_table_clear(device->ff.ht_ps);
   2179         device->ff.num_ps = 0;
   2180         context->changed.group |= NINE_STATE_PS;
   2181     }
   2182 }
   2183 
   2184 /* ========================================================================== */
   2185 
   2186 /* Matrix multiplication:
   2187  *
   2188  * in memory: 0 1 2 3 (row major)
   2189  *            4 5 6 7
   2190  *            8 9 a b
   2191  *            c d e f
   2192  *
   2193  *    cA cB cC cD
   2194  * r0             = (r0 * cA) (r0 * cB) . .
   2195  * r1             = (r1 * cA) (r1 * cB)
   2196  * r2             = (r2 * cA) .
   2197  * r3             = (r3 * cA) .
   2198  *
   2199  *               r: (11) (12) (13) (14)
   2200  *                  (21) (22) (23) (24)
   2201  *                  (31) (32) (33) (34)
   2202  *                  (41) (42) (43) (44)
   2203  * l: (11 12 13 14)
   2204  *    (21 22 23 24)
   2205  *    (31 32 33 34)
   2206  *    (41 42 43 44)
   2207  *
   2208  * v: (x  y  z  1 )
   2209  *
   2210  * t.xyzw = MUL(v.xxxx, r[0]);
   2211  * t.xyzw = MAD(v.yyyy, r[1], t.xyzw);
   2212  * t.xyzw = MAD(v.zzzz, r[2], t.xyzw);
   2213  * v.xyzw = MAD(v.wwww, r[3], t.xyzw);
   2214  *
   2215  * v.x = DP4(v, c[0]);
   2216  * v.y = DP4(v, c[1]);
   2217  * v.z = DP4(v, c[2]);
   2218  * v.w = DP4(v, c[3]) = 1
   2219  */
   2220 
   2221 /*
   2222 static void
   2223 nine_D3DMATRIX_print(const D3DMATRIX *M)
   2224 {
   2225     DBG("\n(%f %f %f %f)\n"
   2226         "(%f %f %f %f)\n"
   2227         "(%f %f %f %f)\n"
   2228         "(%f %f %f %f)\n",
   2229         M->m[0][0], M->m[0][1], M->m[0][2], M->m[0][3],
   2230         M->m[1][0], M->m[1][1], M->m[1][2], M->m[1][3],
   2231         M->m[2][0], M->m[2][1], M->m[2][2], M->m[2][3],
   2232         M->m[3][0], M->m[3][1], M->m[3][2], M->m[3][3]);
   2233 }
   2234 */
   2235 
   2236 static inline float
   2237 nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
   2238 {
   2239     return A->m[r][0] * B->m[0][c] +
   2240            A->m[r][1] * B->m[1][c] +
   2241            A->m[r][2] * B->m[2][c] +
   2242            A->m[r][3] * B->m[3][c];
   2243 }
   2244 
   2245 static inline float
   2246 nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
   2247 {
   2248     return v->x * M->m[0][c] +
   2249            v->y * M->m[1][c] +
   2250            v->z * M->m[2][c] +
   2251            1.0f * M->m[3][c];
   2252 }
   2253 
   2254 static inline float
   2255 nine_DP3_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
   2256 {
   2257     return v->x * M->m[0][c] +
   2258            v->y * M->m[1][c] +
   2259            v->z * M->m[2][c];
   2260 }
   2261 
   2262 void
   2263 nine_d3d_matrix_matrix_mul(D3DMATRIX *D, const D3DMATRIX *L, const D3DMATRIX *R)
   2264 {
   2265     D->_11 = nine_DP4_row_col(L, 0, R, 0);
   2266     D->_12 = nine_DP4_row_col(L, 0, R, 1);
   2267     D->_13 = nine_DP4_row_col(L, 0, R, 2);
   2268     D->_14 = nine_DP4_row_col(L, 0, R, 3);
   2269 
   2270     D->_21 = nine_DP4_row_col(L, 1, R, 0);
   2271     D->_22 = nine_DP4_row_col(L, 1, R, 1);
   2272     D->_23 = nine_DP4_row_col(L, 1, R, 2);
   2273     D->_24 = nine_DP4_row_col(L, 1, R, 3);
   2274 
   2275     D->_31 = nine_DP4_row_col(L, 2, R, 0);
   2276     D->_32 = nine_DP4_row_col(L, 2, R, 1);
   2277     D->_33 = nine_DP4_row_col(L, 2, R, 2);
   2278     D->_34 = nine_DP4_row_col(L, 2, R, 3);
   2279 
   2280     D->_41 = nine_DP4_row_col(L, 3, R, 0);
   2281     D->_42 = nine_DP4_row_col(L, 3, R, 1);
   2282     D->_43 = nine_DP4_row_col(L, 3, R, 2);
   2283     D->_44 = nine_DP4_row_col(L, 3, R, 3);
   2284 }
   2285 
   2286 void
   2287 nine_d3d_vector4_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
   2288 {
   2289     d->x = nine_DP4_vec_col(v, M, 0);
   2290     d->y = nine_DP4_vec_col(v, M, 1);
   2291     d->z = nine_DP4_vec_col(v, M, 2);
   2292 }
   2293 
   2294 void
   2295 nine_d3d_vector3_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
   2296 {
   2297     d->x = nine_DP3_vec_col(v, M, 0);
   2298     d->y = nine_DP3_vec_col(v, M, 1);
   2299     d->z = nine_DP3_vec_col(v, M, 2);
   2300 }
   2301 
   2302 void
   2303 nine_d3d_matrix_transpose(D3DMATRIX *D, const D3DMATRIX *M)
   2304 {
   2305     unsigned i, j;
   2306     for (i = 0; i < 4; ++i)
   2307     for (j = 0; j < 4; ++j)
   2308         D->m[i][j] = M->m[j][i];
   2309 }
   2310 
   2311 #define _M_ADD_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
   2312     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
   2313     if (t > 0.0f) pos += t; else neg += t; } while(0)
   2314 
   2315 #define _M_SUB_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
   2316     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
   2317     if (t > 0.0f) neg -= t; else pos -= t; } while(0)
   2318 float
   2319 nine_d3d_matrix_det(const D3DMATRIX *M)
   2320 {
   2321     float pos = 0.0f;
   2322     float neg = 0.0f;
   2323 
   2324     _M_ADD_PROD_1i_2j_3k_4l(1, 2, 3, 4);
   2325     _M_ADD_PROD_1i_2j_3k_4l(1, 3, 4, 2);
   2326     _M_ADD_PROD_1i_2j_3k_4l(1, 4, 2, 3);
   2327 
   2328     _M_ADD_PROD_1i_2j_3k_4l(2, 1, 4, 3);
   2329     _M_ADD_PROD_1i_2j_3k_4l(2, 3, 1, 4);
   2330     _M_ADD_PROD_1i_2j_3k_4l(2, 4, 3, 1);
   2331 
   2332     _M_ADD_PROD_1i_2j_3k_4l(3, 1, 2, 4);
   2333     _M_ADD_PROD_1i_2j_3k_4l(3, 2, 4, 1);
   2334     _M_ADD_PROD_1i_2j_3k_4l(3, 4, 1, 2);
   2335 
   2336     _M_ADD_PROD_1i_2j_3k_4l(4, 1, 3, 2);
   2337     _M_ADD_PROD_1i_2j_3k_4l(4, 2, 1, 3);
   2338     _M_ADD_PROD_1i_2j_3k_4l(4, 3, 2, 1);
   2339 
   2340     _M_SUB_PROD_1i_2j_3k_4l(1, 2, 4, 3);
   2341     _M_SUB_PROD_1i_2j_3k_4l(1, 3, 2, 4);
   2342     _M_SUB_PROD_1i_2j_3k_4l(1, 4, 3, 2);
   2343 
   2344     _M_SUB_PROD_1i_2j_3k_4l(2, 1, 3, 4);
   2345     _M_SUB_PROD_1i_2j_3k_4l(2, 3, 4, 1);
   2346     _M_SUB_PROD_1i_2j_3k_4l(2, 4, 1, 3);
   2347 
   2348     _M_SUB_PROD_1i_2j_3k_4l(3, 1, 4, 2);
   2349     _M_SUB_PROD_1i_2j_3k_4l(3, 2, 1, 4);
   2350     _M_SUB_PROD_1i_2j_3k_4l(3, 4, 2, 1);
   2351 
   2352     _M_SUB_PROD_1i_2j_3k_4l(4, 1, 2, 3);
   2353     _M_SUB_PROD_1i_2j_3k_4l(4, 2, 3, 1);
   2354     _M_SUB_PROD_1i_2j_3k_4l(4, 3, 1, 2);
   2355 
   2356     return pos + neg;
   2357 }
   2358 
   2359 /* XXX: Probably better to just use src/mesa/math/m_matrix.c because
   2360  * I have no idea where this code came from.
   2361  */
   2362 void
   2363 nine_d3d_matrix_inverse(D3DMATRIX *D, const D3DMATRIX *M)
   2364 {
   2365     int i, k;
   2366     float det;
   2367 
   2368     D->m[0][0] =
   2369         M->m[1][1] * M->m[2][2] * M->m[3][3] -
   2370         M->m[1][1] * M->m[3][2] * M->m[2][3] -
   2371         M->m[1][2] * M->m[2][1] * M->m[3][3] +
   2372         M->m[1][2] * M->m[3][1] * M->m[2][3] +
   2373         M->m[1][3] * M->m[2][1] * M->m[3][2] -
   2374         M->m[1][3] * M->m[3][1] * M->m[2][2];
   2375 
   2376     D->m[0][1] =
   2377        -M->m[0][1] * M->m[2][2] * M->m[3][3] +
   2378         M->m[0][1] * M->m[3][2] * M->m[2][3] +
   2379         M->m[0][2] * M->m[2][1] * M->m[3][3] -
   2380         M->m[0][2] * M->m[3][1] * M->m[2][3] -
   2381         M->m[0][3] * M->m[2][1] * M->m[3][2] +
   2382         M->m[0][3] * M->m[3][1] * M->m[2][2];
   2383 
   2384     D->m[0][2] =
   2385         M->m[0][1] * M->m[1][2] * M->m[3][3] -
   2386         M->m[0][1] * M->m[3][2] * M->m[1][3] -
   2387         M->m[0][2] * M->m[1][1] * M->m[3][3] +
   2388         M->m[0][2] * M->m[3][1] * M->m[1][3] +
   2389         M->m[0][3] * M->m[1][1] * M->m[3][2] -
   2390         M->m[0][3] * M->m[3][1] * M->m[1][2];
   2391 
   2392     D->m[0][3] =
   2393        -M->m[0][1] * M->m[1][2] * M->m[2][3] +
   2394         M->m[0][1] * M->m[2][2] * M->m[1][3] +
   2395         M->m[0][2] * M->m[1][1] * M->m[2][3] -
   2396         M->m[0][2] * M->m[2][1] * M->m[1][3] -
   2397         M->m[0][3] * M->m[1][1] * M->m[2][2] +
   2398         M->m[0][3] * M->m[2][1] * M->m[1][2];
   2399 
   2400     D->m[1][0] =
   2401        -M->m[1][0] * M->m[2][2] * M->m[3][3] +
   2402         M->m[1][0] * M->m[3][2] * M->m[2][3] +
   2403         M->m[1][2] * M->m[2][0] * M->m[3][3] -
   2404         M->m[1][2] * M->m[3][0] * M->m[2][3] -
   2405         M->m[1][3] * M->m[2][0] * M->m[3][2] +
   2406         M->m[1][3] * M->m[3][0] * M->m[2][2];
   2407 
   2408     D->m[1][1] =
   2409         M->m[0][0] * M->m[2][2] * M->m[3][3] -
   2410         M->m[0][0] * M->m[3][2] * M->m[2][3] -
   2411         M->m[0][2] * M->m[2][0] * M->m[3][3] +
   2412         M->m[0][2] * M->m[3][0] * M->m[2][3] +
   2413         M->m[0][3] * M->m[2][0] * M->m[3][2] -
   2414         M->m[0][3] * M->m[3][0] * M->m[2][2];
   2415 
   2416     D->m[1][2] =
   2417        -M->m[0][0] * M->m[1][2] * M->m[3][3] +
   2418         M->m[0][0] * M->m[3][2] * M->m[1][3] +
   2419         M->m[0][2] * M->m[1][0] * M->m[3][3] -
   2420         M->m[0][2] * M->m[3][0] * M->m[1][3] -
   2421         M->m[0][3] * M->m[1][0] * M->m[3][2] +
   2422         M->m[0][3] * M->m[3][0] * M->m[1][2];
   2423 
   2424     D->m[1][3] =
   2425         M->m[0][0] * M->m[1][2] * M->m[2][3] -
   2426         M->m[0][0] * M->m[2][2] * M->m[1][3] -
   2427         M->m[0][2] * M->m[1][0] * M->m[2][3] +
   2428         M->m[0][2] * M->m[2][0] * M->m[1][3] +
   2429         M->m[0][3] * M->m[1][0] * M->m[2][2] -
   2430         M->m[0][3] * M->m[2][0] * M->m[1][2];
   2431 
   2432     D->m[2][0] =
   2433         M->m[1][0] * M->m[2][1] * M->m[3][3] -
   2434         M->m[1][0] * M->m[3][1] * M->m[2][3] -
   2435         M->m[1][1] * M->m[2][0] * M->m[3][3] +
   2436         M->m[1][1] * M->m[3][0] * M->m[2][3] +
   2437         M->m[1][3] * M->m[2][0] * M->m[3][1] -
   2438         M->m[1][3] * M->m[3][0] * M->m[2][1];
   2439 
   2440     D->m[2][1] =
   2441        -M->m[0][0] * M->m[2][1] * M->m[3][3] +
   2442         M->m[0][0] * M->m[3][1] * M->m[2][3] +
   2443         M->m[0][1] * M->m[2][0] * M->m[3][3] -
   2444         M->m[0][1] * M->m[3][0] * M->m[2][3] -
   2445         M->m[0][3] * M->m[2][0] * M->m[3][1] +
   2446         M->m[0][3] * M->m[3][0] * M->m[2][1];
   2447 
   2448     D->m[2][2] =
   2449         M->m[0][0] * M->m[1][1] * M->m[3][3] -
   2450         M->m[0][0] * M->m[3][1] * M->m[1][3] -
   2451         M->m[0][1] * M->m[1][0] * M->m[3][3] +
   2452         M->m[0][1] * M->m[3][0] * M->m[1][3] +
   2453         M->m[0][3] * M->m[1][0] * M->m[3][1] -
   2454         M->m[0][3] * M->m[3][0] * M->m[1][1];
   2455 
   2456     D->m[2][3] =
   2457        -M->m[0][0] * M->m[1][1] * M->m[2][3] +
   2458         M->m[0][0] * M->m[2][1] * M->m[1][3] +
   2459         M->m[0][1] * M->m[1][0] * M->m[2][3] -
   2460         M->m[0][1] * M->m[2][0] * M->m[1][3] -
   2461         M->m[0][3] * M->m[1][0] * M->m[2][1] +
   2462         M->m[0][3] * M->m[2][0] * M->m[1][1];
   2463 
   2464     D->m[3][0] =
   2465        -M->m[1][0] * M->m[2][1] * M->m[3][2] +
   2466         M->m[1][0] * M->m[3][1] * M->m[2][2] +
   2467         M->m[1][1] * M->m[2][0] * M->m[3][2] -
   2468         M->m[1][1] * M->m[3][0] * M->m[2][2] -
   2469         M->m[1][2] * M->m[2][0] * M->m[3][1] +
   2470         M->m[1][2] * M->m[3][0] * M->m[2][1];
   2471 
   2472     D->m[3][1] =
   2473         M->m[0][0] * M->m[2][1] * M->m[3][2] -
   2474         M->m[0][0] * M->m[3][1] * M->m[2][2] -
   2475         M->m[0][1] * M->m[2][0] * M->m[3][2] +
   2476         M->m[0][1] * M->m[3][0] * M->m[2][2] +
   2477         M->m[0][2] * M->m[2][0] * M->m[3][1] -
   2478         M->m[0][2] * M->m[3][0] * M->m[2][1];
   2479 
   2480     D->m[3][2] =
   2481        -M->m[0][0] * M->m[1][1] * M->m[3][2] +
   2482         M->m[0][0] * M->m[3][1] * M->m[1][2] +
   2483         M->m[0][1] * M->m[1][0] * M->m[3][2] -
   2484         M->m[0][1] * M->m[3][0] * M->m[1][2] -
   2485         M->m[0][2] * M->m[1][0] * M->m[3][1] +
   2486         M->m[0][2] * M->m[3][0] * M->m[1][1];
   2487 
   2488     D->m[3][3] =
   2489         M->m[0][0] * M->m[1][1] * M->m[2][2] -
   2490         M->m[0][0] * M->m[2][1] * M->m[1][2] -
   2491         M->m[0][1] * M->m[1][0] * M->m[2][2] +
   2492         M->m[0][1] * M->m[2][0] * M->m[1][2] +
   2493         M->m[0][2] * M->m[1][0] * M->m[2][1] -
   2494         M->m[0][2] * M->m[2][0] * M->m[1][1];
   2495 
   2496     det =
   2497         M->m[0][0] * D->m[0][0] +
   2498         M->m[1][0] * D->m[0][1] +
   2499         M->m[2][0] * D->m[0][2] +
   2500         M->m[3][0] * D->m[0][3];
   2501 
   2502     if (det < 1e-30) {/* non inversible */
   2503         *D = *M; /* wine tests */
   2504         return;
   2505     }
   2506 
   2507     det = 1.0 / det;
   2508 
   2509     for (i = 0; i < 4; i++)
   2510     for (k = 0; k < 4; k++)
   2511         D->m[i][k] *= det;
   2512 
   2513 #ifdef DEBUG
   2514     {
   2515         D3DMATRIX I;
   2516 
   2517         nine_d3d_matrix_matrix_mul(&I, D, M);
   2518 
   2519         for (i = 0; i < 4; ++i)
   2520         for (k = 0; k < 4; ++k)
   2521             if (fabsf(I.m[i][k] - (float)(i == k)) > 1e-3)
   2522                 DBG("Matrix inversion check FAILED !\n");
   2523     }
   2524 #endif
   2525 }
   2526