/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Gareth Hughes
 */

#ifndef __M_DEBUG_UTIL_H__
#define __M_DEBUG_UTIL_H__


#ifdef DEBUG_MATH  /* This code only used for debugging */


#include <limits.h>   /* LONG_MAX, used by the cycle-counter macros below */
#include <stdlib.h>   /* rand(), RAND_MAX, used by rnd() */

#include "c99_math.h"


/* Comment this out to deactivate the cycle counter.
 * NOTE: it works only on CPUs which know the 'rdtsc' command (586 or higher)
 * (hope, you don't try to debug Mesa on a 386 ;)
 */
#if defined(__GNUC__) && \
    ((defined(__i386__) && defined(USE_X86_ASM)) || \
     (defined(__sparc__) && defined(USE_SPARC_ASM)))
#define RUN_DEBUG_BENCHMARK
#endif

#define TEST_COUNT		128	/* size of the tested vector array   */

#define REQUIRED_PRECISION	10	/* allow 4 bits to miss              */
#define MAX_PRECISION		24	/* max. precision possible           */


#ifdef RUN_DEBUG_BENCHMARK
/* Overhead of profiling counter in cycles.  Automatically adjusted to
 * your machine at run time - counter initialization should give very
 * consistent results.
 */
extern long counter_overhead;

/* This is the value of the environment variable MESA_PROFILE, and is
 * used to determine if we should benchmark the functions as well as
 * verify their correctness.
 */
extern char *mesa_profile;

/* Modify the number of tests if you like.
 * We take the minimum of all results, because every error should be
 * positive (time used by other processes, task switches etc).
 * It is assumed that all calculations are done in the cache.
 */

#if defined(__i386__)

#if 1 /* PPro, PII, PIII version */

/* Profiling on the P6 architecture requires a little more work, due to
 * the internal out-of-order execution.  We must perform a serializing
 * 'cpuid' instruction before and after the 'rdtsc' instructions to make
 * sure no other uops are executed when we sample the timestamp counter.
 *
 * 'cpuid' clobbers %ebx, which is the PIC register on x86; it is
 * saved/restored around each asm block with push/pop.
 */
#define INIT_COUNTER()							\
   do {									\
      int cycle_i;							\
      counter_overhead = LONG_MAX;					\
      for ( cycle_i = 0 ; cycle_i < 8 ; cycle_i++ ) {			\
	 long cycle_tmp1 = 0, cycle_tmp2 = 0;				\
	 __asm__ __volatile__ ( "push %%ebx       \n"			\
				"xor %%eax, %%eax \n"			\
				"cpuid            \n"			\
				"rdtsc            \n"			\
				"mov %%eax, %0    \n"			\
				"xor %%eax, %%eax \n"			\
				"cpuid            \n"			\
				"pop %%ebx        \n"			\
				"push %%ebx       \n"			\
				"xor %%eax, %%eax \n"			\
				"cpuid            \n"			\
				"rdtsc            \n"			\
				"mov %%eax, %1    \n"			\
				"xor %%eax, %%eax \n"			\
				"cpuid            \n"			\
				"pop %%ebx        \n"			\
				: "=m" (cycle_tmp1), "=m" (cycle_tmp2)	\
				: : "eax", "ecx", "edx" );		\
	 if ( counter_overhead > (cycle_tmp2 - cycle_tmp1) ) {		\
	    counter_overhead = cycle_tmp2 - cycle_tmp1;			\
	 }								\
      }									\
   } while (0)

/* NOTE: BEGIN_RACE opens a for-loop and a scope that END_RACE closes;
 * the two macros must always be used as a matched pair, with the code
 * being timed placed between them.  'cycle_i' must be declared by the
 * caller.  The result 'x' is the minimum observed cycle count, minus
 * the measured counter overhead.
 */
#define BEGIN_RACE(x)							\
   x = LONG_MAX;							\
   for ( cycle_i = 0 ; cycle_i < 10 ; cycle_i++ ) {			\
      long cycle_tmp1 = 0, cycle_tmp2 = 0;				\
      __asm__ __volatile__ ( "push %%ebx       \n"			\
			     "xor %%eax, %%eax \n"			\
			     "cpuid            \n"			\
			     "rdtsc            \n"			\
			     "mov %%eax, %0    \n"			\
			     "xor %%eax, %%eax \n"			\
			     "cpuid            \n"			\
			     "pop %%ebx        \n"			\
			     : "=m" (cycle_tmp1)			\
			     : : "eax", "ecx", "edx" );

#define END_RACE(x)							\
      __asm__ __volatile__ ( "push %%ebx       \n"			\
			     "xor %%eax, %%eax \n"			\
			     "cpuid            \n"			\
			     "rdtsc            \n"			\
			     "mov %%eax, %0    \n"			\
			     "xor %%eax, %%eax \n"			\
			     "cpuid            \n"			\
			     "pop %%ebx        \n"			\
			     : "=m" (cycle_tmp2)			\
			     : : "eax", "ecx", "edx" );			\
      if ( x > (cycle_tmp2 - cycle_tmp1) ) {				\
	 x = cycle_tmp2 - cycle_tmp1;					\
      }									\
   }									\
   x -= counter_overhead;

#else /* PPlain, PMMX version */

/* To ensure accurate results, we stall the pipelines with the
 * non-pairable 'cdq' instruction.  This ensures all the code being
 * profiled is complete when the 'rdtsc' instruction executes.
 *
 * FIX(review): this variant previously took an argument, INIT_COUNTER(x),
 * while the live P6 and x86_64 variants define INIT_COUNTER() writing
 * 'counter_overhead' directly.  Made consistent so switching the '#if 1'
 * above does not break callers.
 */
#define INIT_COUNTER()							\
   do {									\
      int cycle_i;							\
      counter_overhead = LONG_MAX;					\
      for ( cycle_i = 0 ; cycle_i < 32 ; cycle_i++ ) {			\
	 long cycle_tmp1, cycle_tmp2, dummy;				\
	 __asm__ ( "mov %%eax, %0" : "=a" (cycle_tmp1) );		\
	 __asm__ ( "mov %%eax, %0" : "=a" (cycle_tmp2) );		\
	 __asm__ ( "cdq" );						\
	 __asm__ ( "cdq" );						\
	 __asm__ ( "rdtsc" : "=a" (cycle_tmp1), "=d" (dummy) );		\
	 __asm__ ( "cdq" );						\
	 __asm__ ( "cdq" );						\
	 __asm__ ( "rdtsc" : "=a" (cycle_tmp2), "=d" (dummy) );		\
	 if ( counter_overhead > (cycle_tmp2 - cycle_tmp1) )		\
	    counter_overhead = cycle_tmp2 - cycle_tmp1;			\
      }									\
   } while (0)

/* Matched pair, same contract as the P6 BEGIN_RACE/END_RACE above. */
#define BEGIN_RACE(x)							\
   x = LONG_MAX;							\
   for ( cycle_i = 0 ; cycle_i < 16 ; cycle_i++ ) {			\
      long cycle_tmp1, cycle_tmp2, dummy;				\
      __asm__ ( "mov %%eax, %0" : "=a" (cycle_tmp1) );			\
      __asm__ ( "mov %%eax, %0" : "=a" (cycle_tmp2) );			\
      __asm__ ( "cdq" );						\
      __asm__ ( "cdq" );						\
      __asm__ ( "rdtsc" : "=a" (cycle_tmp1), "=d" (dummy) );


#define END_RACE(x)							\
      __asm__ ( "cdq" );						\
      __asm__ ( "cdq" );						\
      __asm__ ( "rdtsc" : "=a" (cycle_tmp2), "=d" (dummy) );		\
      if ( x > (cycle_tmp2 - cycle_tmp1) )				\
	 x = cycle_tmp2 - cycle_tmp1;					\
   }									\
   x -= counter_overhead;

#endif

#elif defined(__x86_64__)

/* Read the 64-bit timestamp counter: combines the two 32-bit halves
 * returned by 'rdtsc' in eax/edx into a single unsigned long.
 */
#define rdtscll(val) do {						\
     unsigned int a,d;							\
     __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));			\
     (val) = ((unsigned long)a) | (((unsigned long)d)<<32);		\
} while(0)

/* Copied from i386 PIII version */
#define INIT_COUNTER()							\
   do {									\
      int cycle_i;							\
      counter_overhead = LONG_MAX;					\
      for ( cycle_i = 0 ; cycle_i < 16 ; cycle_i++ ) {			\
	 unsigned long cycle_tmp1, cycle_tmp2;				\
	 rdtscll(cycle_tmp1);						\
	 rdtscll(cycle_tmp2);						\
	 if ( counter_overhead > (cycle_tmp2 - cycle_tmp1) ) {		\
	    counter_overhead = cycle_tmp2 - cycle_tmp1;			\
	 }								\
      }									\
   } while (0)


/* Matched pair, same contract as the i386 BEGIN_RACE/END_RACE. */
#define BEGIN_RACE(x)							\
   x = LONG_MAX;							\
   for ( cycle_i = 0 ; cycle_i < 10 ; cycle_i++ ) {			\
      unsigned long cycle_tmp1, cycle_tmp2;				\
      rdtscll(cycle_tmp1);

#define END_RACE(x)							\
      rdtscll(cycle_tmp2);						\
      if ( x > (cycle_tmp2 - cycle_tmp1) ) {				\
	 x = cycle_tmp2 - cycle_tmp1;					\
      }									\
   }									\
   x -= counter_overhead;

#elif defined(__sparc__)

#define INIT_COUNTER()	\
   do { counter_overhead = 5; } while(0)

/* The '.word' constants encode 'rd %tick, %lN' for assemblers that do
 * not know the %tick register; matched pair as on the other targets.
 */
#define BEGIN_RACE(x)							\
   x = LONG_MAX;							\
   for (cycle_i = 0; cycle_i <10; cycle_i++) {				\
      register long cycle_tmp1 __asm__("l0");				\
      register long cycle_tmp2 __asm__("l1");				\
      /* rd %tick, %l0 */						\
      __asm__ __volatile__ (".word 0xa1410000" : "=r" (cycle_tmp1));  /*  save timestamp   */

#define END_RACE(x)							\
      /* rd %tick, %l1 */						\
      __asm__ __volatile__ (".word 0xa3410000" : "=r" (cycle_tmp2));	\
      if (x > (cycle_tmp2-cycle_tmp1)) x = cycle_tmp2 - cycle_tmp1;	\
   }									\
   x -= counter_overhead;

#else
#error Your processor is not supported for RUN_XFORM_BENCHMARK
#endif

#else

/* Benchmarking disabled: the macros compile away to nothing. */
#define BEGIN_RACE(x)
#define END_RACE(x)

#endif


/* =============================================================
 * Helper functions
 */

/* Return a pseudo-random float in [-1, 1), quantized to a granularity
 * of 2^-13 so that test values have a limited number of significant
 * mantissa bits (which keeps comparison results reproducible).
 */
static GLfloat rnd( void )
{
   GLfloat f = (GLfloat)rand() / (GLfloat)RAND_MAX;
   GLfloat gran = (GLfloat)(1 << 13);

   /* Quantize: keep only the upper 13 fractional bits. */
   f = (GLfloat)(GLint)(f * gran) / gran;

   /* Map [0, 1] to [-1, 1].  Float constants avoid a pointless
    * promotion to double; the result is bit-identical for these
    * quantized inputs.
    */
   return f * 2.0F - 1.0F;
}

/* Return the number of significand (mantissa) bits on which 'a' and
 * 'b' agree: MAX_PRECISION for an exact match, 0 if either value is
 * zero (and they differ), otherwise the difference between the binary
 * exponent of the smaller input and that of the error term a-b.
 */
static int significand_match( GLfloat a, GLfloat b )
{
   GLfloat d = a - b;
   int a_ex, b_ex, d_ex;

   if ( d == 0.0F ) {
      return MAX_PRECISION;	/* Exact match */
   }

   if ( a == 0.0F || b == 0.0F ) {
      /* It would probably be better to check if the
       * non-zero number is denormalized and return
       * the index of the highest set bit here.
       */
      return 0;
   }

   /* frexpf() stores the binary exponent; the fractions are unused. */
   frexpf( a, &a_ex );
   frexpf( b, &b_ex );
   frexpf( d, &d_ex );

   /* Compare the error's magnitude against the smaller operand. */
   if ( a_ex < b_ex ) {
      return a_ex - d_ex;
   } else {
      return b_ex - d_ex;
   }
}

enum { NIL = 0, ONE = 1, NEG = -1, VAR = 2 };

/* Ensure our arrays are correctly aligned.
 */
#if defined(__GNUC__)
#  define ALIGN16(type, array)	type array __attribute__ ((aligned (16)))
#elif defined(_MSC_VER)
   /* FIX(review): __declspec(align(#)) must precede the declaration on
    * MSVC; it was previously placed after the declarator.
    */
#  define ALIGN16(type, array)	__declspec(align(16)) type array
#elif defined(__xlC__)
#  define ALIGN16(type, array)	type __align (16) array
#else
#  warning "ALIGN16 will not 16-byte align!\n"
   /* FIX(review): the fallback previously defined a parameterless
    * ALIGN16, which broke every ALIGN16(type, array) use site; it now
    * declares the array unaligned instead of failing to compile.
    */
#  define ALIGN16(type, array)	type array
#endif


#endif /* DEBUG_MATH */

#endif /* __M_DEBUG_UTIL_H__ */