1 /* 2 * Mesa 3-D graphics library 3 * 4 * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included 14 * in all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: 25 * Keith Whitwell <keithw (at) vmware.com> 26 */ 27 28 #include "main/glheader.h" 29 #include "main/context.h" 30 #include "main/imports.h" 31 #include "main/mtypes.h" 32 33 #include "t_context.h" 34 #include "t_pipeline.h" 35 #include "t_vp_build.h" 36 #include "t_vertex.h" 37 38 void _tnl_install_pipeline( struct gl_context *ctx, 39 const struct tnl_pipeline_stage **stages ) 40 { 41 TNLcontext *tnl = TNL_CONTEXT(ctx); 42 GLuint i; 43 44 tnl->pipeline.new_state = ~0; 45 46 /* Create a writeable copy of each stage. 47 */ 48 for (i = 0 ; i < MAX_PIPELINE_STAGES && stages[i] ; i++) { 49 struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i]; 50 memcpy(s, stages[i], sizeof(*s)); 51 if (s->create) 52 s->create(ctx, s); 53 } 54 55 tnl->pipeline.nr_stages = i; 56 } 57 58 void _tnl_destroy_pipeline( struct gl_context *ctx ) 59 { 60 TNLcontext *tnl = TNL_CONTEXT(ctx); 61 GLuint i; 62 63 for (i = 0 ; i < tnl->pipeline.nr_stages ; i++) { 64 struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i]; 65 if (s->destroy) 66 s->destroy(s); 67 } 68 69 tnl->pipeline.nr_stages = 0; 70 } 71 72 73 74 static GLuint check_input_changes( struct gl_context *ctx ) 75 { 76 TNLcontext *tnl = TNL_CONTEXT(ctx); 77 GLuint i; 78 79 for (i = 0; i <= _TNL_LAST_MAT; i++) { 80 if (tnl->vb.AttribPtr[i]->size != tnl->pipeline.last_attrib_size[i] || 81 tnl->vb.AttribPtr[i]->stride != tnl->pipeline.last_attrib_stride[i]) { 82 tnl->pipeline.last_attrib_size[i] = tnl->vb.AttribPtr[i]->size; 83 tnl->pipeline.last_attrib_stride[i] = tnl->vb.AttribPtr[i]->stride; 84 tnl->pipeline.input_changes |= 1<<i; 85 } 86 } 87 88 return tnl->pipeline.input_changes; 89 } 90 91 92 static GLuint check_output_changes( struct gl_context *ctx ) 93 { 94 #if 0 95 TNLcontext *tnl = TNL_CONTEXT(ctx); 96 97 for (i = 0; i < VARYING_SLOT_MAX; i++) { 98 if (tnl->vb.ResultPtr[i]->size != tnl->last_result_size[i] || 99 tnl->vb.ResultPtr[i]->stride != tnl->last_result_stride[i]) { 100 tnl->last_result_size[i] = tnl->vb.ResultPtr[i]->size; 101 tnl->last_result_stride[i] = tnl->vb.ResultPtr[i]->stride; 102 tnl->pipeline.output_changes |= 1<<i; 103 } 104 } 105 106 if (tnl->pipeline.output_changes) 107 tnl->Driver.NotifyOutputChanges( ctx, tnl->pipeline.output_changes ); 108 109 return tnl->pipeline.output_changes; 110 #else 111 return ~0; 112 #endif 113 } 114 115 /** 116 * START/END_FAST_MATH macros: 117 * 118 * START_FAST_MATH: Set x86 FPU to faster, 32-bit precision mode (and save 119 * original mode to a temporary). 120 * END_FAST_MATH: Restore x86 FPU to original mode. 121 */ 122 #if defined(__GNUC__) && defined(__i386__) 123 /* 124 * Set the x86 FPU control word to guarentee only 32 bits of precision 125 * are stored in registers. Allowing the FPU to store more introduces 126 * differences between situations where numbers are pulled out of memory 127 * vs. situations where the compiler is able to optimize register usage. 128 * 129 * In the worst case, we force the compiler to use a memory access to 130 * truncate the float, by specifying the 'volatile' keyword. 131 */ 132 /* Hardware default: All exceptions masked, extended double precision, 133 * round to nearest (IEEE compliant): 134 */ 135 #define DEFAULT_X86_FPU 0x037f 136 /* All exceptions masked, single precision, round to nearest: 137 */ 138 #define FAST_X86_FPU 0x003f 139 /* The fldcw instruction will cause any pending FP exceptions to be 140 * raised prior to entering the block, and we clear any pending 141 * exceptions before exiting the block. Hence, asm code has free 142 * reign over the FPU while in the fast math block. 143 */ 144 #if defined(NO_FAST_MATH) 145 #define START_FAST_MATH(x) \ 146 do { \ 147 static GLuint mask = DEFAULT_X86_FPU; \ 148 __asm__ ( "fnstcw %0" : "=m" (*&(x)) ); \ 149 __asm__ ( "fldcw %0" : : "m" (mask) ); \ 150 } while (0) 151 #else 152 #define START_FAST_MATH(x) \ 153 do { \ 154 static GLuint mask = FAST_X86_FPU; \ 155 __asm__ ( "fnstcw %0" : "=m" (*&(x)) ); \ 156 __asm__ ( "fldcw %0" : : "m" (mask) ); \ 157 } while (0) 158 #endif 159 /* Restore original FPU mode, and clear any exceptions that may have 160 * occurred in the FAST_MATH block. 161 */ 162 #define END_FAST_MATH(x) \ 163 do { \ 164 __asm__ ( "fnclex ; fldcw %0" : : "m" (*&(x)) ); \ 165 } while (0) 166 167 #elif defined(_MSC_VER) && defined(_M_IX86) 168 #define DEFAULT_X86_FPU 0x037f /* See GCC comments above */ 169 #define FAST_X86_FPU 0x003f /* See GCC comments above */ 170 #if defined(NO_FAST_MATH) 171 #define START_FAST_MATH(x) do {\ 172 static GLuint mask = DEFAULT_X86_FPU;\ 173 __asm fnstcw word ptr [x]\ 174 __asm fldcw word ptr [mask]\ 175 } while(0) 176 #else 177 #define START_FAST_MATH(x) do {\ 178 static GLuint mask = FAST_X86_FPU;\ 179 __asm fnstcw word ptr [x]\ 180 __asm fldcw word ptr [mask]\ 181 } while(0) 182 #endif 183 #define END_FAST_MATH(x) do {\ 184 __asm fnclex\ 185 __asm fldcw word ptr [x]\ 186 } while(0) 187 188 #else 189 #define START_FAST_MATH(x) x = 0 190 #define END_FAST_MATH(x) (void)(x) 191 #endif 192 193 194 void _tnl_run_pipeline( struct gl_context *ctx ) 195 { 196 TNLcontext *tnl = TNL_CONTEXT(ctx); 197 unsigned short __tmp; 198 GLuint i; 199 200 if (!tnl->vb.Count) 201 return; 202 203 /* Check for changed input sizes or change in stride to/from zero 204 * (ie const or non-const). 205 */ 206 if (check_input_changes( ctx ) || tnl->pipeline.new_state) { 207 if (ctx->VertexProgram._MaintainTnlProgram) 208 _tnl_UpdateFixedFunctionProgram( ctx ); 209 210 for (i = 0; i < tnl->pipeline.nr_stages ; i++) { 211 struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i]; 212 if (s->validate) 213 s->validate( ctx, s ); 214 } 215 216 tnl->pipeline.new_state = 0; 217 tnl->pipeline.input_changes = 0; 218 219 /* Pipeline can only change its output in response to either a 220 * statechange or an input size/stride change. No other changes 221 * are allowed. 222 */ 223 if (check_output_changes( ctx )) 224 _tnl_notify_pipeline_output_change( ctx ); 225 } 226 227 #ifndef _OPENMP 228 /* Don't adjust FPU precision mode in case multiple threads are to be used. 229 * This would require that the additional threads also changed the FPU mode 230 * which is quite a mess as this had to be done in all parallelized sections; 231 * otherwise the master thread and all other threads are running in different 232 * modes, producing inconsistent results. 233 * Note that all x64 implementations don't define/use START_FAST_MATH, so 234 * this is "hack" is only used in i386 mode 235 */ 236 START_FAST_MATH(__tmp); 237 #endif 238 239 for (i = 0; i < tnl->pipeline.nr_stages ; i++) { 240 struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i]; 241 if (!s->run( ctx, s )) 242 break; 243 } 244 245 #ifndef _OPENMP 246 END_FAST_MATH(__tmp); 247 #endif 248 } 249 250 251 252 /* The default pipeline. This is useful for software rasterizers, and 253 * simple hardware rasterizers. For customization, I don't recommend 254 * tampering with the internals of these stages in the way that 255 * drivers did in Mesa 3.4. These stages are basically black boxes, 256 * and should be left intact. 257 * 258 * To customize the pipeline, consider: 259 * 260 * - removing redundant stages (making sure that the software rasterizer 261 * can cope with this on fallback paths). An example is fog 262 * coordinate generation, which is not required in the FX driver. 263 * 264 * - replacing general-purpose machine-independent stages with 265 * general-purpose machine-specific stages. There is no example of 266 * this to date, though it must be borne in mind that all subsequent 267 * stages that reference the output of the new stage must cope with 268 * any machine-specific data introduced. This may not be easy 269 * unless there are no such stages (ie the new stage is the last in 270 * the pipe). 271 * 272 * - inserting optimized (but specialized) stages ahead of the 273 * general-purpose fallback implementation. For example, the old 274 * fastpath mechanism, which only works when the VB->Elts input is 275 * available, can be duplicated by placing the fastpath stage at the 276 * head of this pipeline. Such specialized stages are currently 277 * constrained to have no outputs (ie. they must either finish the * 278 * pipeline by returning GL_FALSE from run(), or do nothing). 279 * 280 * Some work can be done to lift some of the restrictions in the final 281 * case, if it becomes necessary to do so. 282 */ 283 const struct tnl_pipeline_stage *_tnl_default_pipeline[] = { 284 &_tnl_vertex_transform_stage, 285 &_tnl_normal_transform_stage, 286 &_tnl_lighting_stage, 287 &_tnl_texgen_stage, 288 &_tnl_texture_transform_stage, 289 &_tnl_point_attenuation_stage, 290 &_tnl_vertex_program_stage, 291 &_tnl_fog_coordinate_stage, 292 &_tnl_render_stage, 293 NULL 294 }; 295 296 const struct tnl_pipeline_stage *_tnl_vp_pipeline[] = { 297 &_tnl_vertex_program_stage, 298 &_tnl_render_stage, 299 NULL 300 }; 301