Home | History | Annotate | Download | only in perf
      1 // This small program computes a Fast Fourier Transform.  It tests
      2 // Valgrind's handling of FP operations.  It is representative of all
      3 // programs that do a lot of FP operations.
      4 
      5 // Licensing: This program is closely based on the one of the same name from
      6 // http://www.fourmilab.ch/.  The front page of that site says:
      7 //
      8 //   "Except for a few clearly-marked exceptions, all the material on this
      9 //   site is in the public domain and may be used in any manner without
     10 //   permission, restriction, attribution, or compensation."
     11 
     12 /*
     13 
     14 	Two-dimensional FFT benchmark
     15 
     16 	Designed and implemented by John Walker in April of 1989.
     17 
     18 	This  benchmark executes a specified number of passes (default
     19 	20) through a loop in which each  iteration  performs  a  fast
     20 	Fourier transform of a square matrix (default size 256x256) of
     21 	complex numbers (default precision double),  followed  by  the
     22 	inverse  transform.   After  all loop iterations are performed
     23 	the results are checked against known correct values.
     24 
     25 	This benchmark is intended for use on C implementations  which
     26         define  "int"  as  32 bits or longer and permit allocation and
     27 	direct addressing of arrays larger than one megabyte.
     28 
     29 	If CAPOUT is defined,  the  result  after  all	iterations  is
     30 	written  as  a	CA  Lab  pattern  file.   This is intended for
     31 	debugging in case horribly wrong results  are  obtained  on  a
     32 	given machine.
     33 
     34 	Archival  timings  are	run  with the definitions below set as
     35 	follows: Float = double, Asize = 256, Passes = 20, CAPOUT  not
     36 	defined.
     37 
     38 	Time (seconds)		    System
     39 
     40             2393.93       Sun 3/260, SunOS 3.4, C, "-f68881 -O".
     41 			  (John Walker).
     42 
     43             1928          Macintosh IIx, MPW C 3.0, "-mc68020
     44                           -mc68881 -elems881 -m".  (Hugh Hoover).
     45 
     46             1636.1        Sun 4/110, "cc -O3 -lm".  (Michael McClary).
     47 			  The suspicion is that this is software
     48 			  floating point.
     49 
     50             1556.7        Macintosh II, A/UX, "cc -O -lm"
     51 			  (Michael McClary).
     52 
     53 	    1388.8	  Sun 386i/250, SunOS 4.0.1 C
     54                           "-O /usr/lib/trig.il".  (James Carrington).
     55 
     56 	    1331.93	  Sun 3/60, SunOS 4.0.1, C,
     57                           "-O4 -f68881 /usr/lib/libm.il"
     58 			  (Bob Elman).
     59 
     60             1204.0        Apollo Domain DN4000, C, "-cpu 3000 -opt 4".
     61 			  (Sam Crupi).
     62 
     63 	    1174.66	  Compaq 386/25, SCO Xenix 386 C.
     64 			  (Peter Shieh).
     65 
     66 	    1068	  Compaq 386/25, SCO Xenix 386,
     67 			  Metaware High C.  (Robert Wenig).
     68 
     69 	    1064.0	  Sun 3/80, SunOS 4.0.3 Beta C
     70                           "-O3 -f68881 /usr/lib/libm.il".  (James Carrington).
     71 
     72 	    1061.4	  Compaq 386/25, SCO Xenix, High C 1.4.
     73 			  (James Carrington).
     74 
     75 	    1059.79	  Compaq 386/25, 387/25, High C 1.4,
     76 			  DOS|Extender 2.2, 387 inline code
     77 			  generation.  (Nathan Bender).
     78 
     79 	     777.14	  Compaq 386/25, IIT 3C87-25 (387 Compatible),
     80 			  High C 1.5, DOS|Extender 2.2, 387 inline
     81 			  code generation.  (Nathan Bender).
     82 
     83 	     751	  Compaq DeskPro 386/33, High C 1.5 + DOS|Extender,
     84 			  387 code generation.	(James Carrington).
     85 
     86 	     431.44	  Compaq 386/25, Weitek 3167-25, DOS 3.31,
     87 			  High C 1.4, DOS|Extender, Weitek code generation.
     88 			  (Nathan Bender).
     89 
     90 	     344.9	  Compaq 486/25, Metaware High C 1.6, Phar Lap
     91 			  DOS|Extender, in-line floating point.  (Nathan
     92 			  Bender).
     93 
     94 	     324.2	  Data General Motorola 88000, 16 Mhz, Gnu C.
     95 
     96              323.1        Sun 4/280, C, "-O4".  (Eric Hill).
     97 
     98 	     254	  Compaq SystemPro 486/33, High C 1.5 + DOS|Extender,
     99 			  387 code generation.	(James Carrington).
    100 
    101 	     242.8	  Silicon Graphics Personal IRIS, MIPS R2000A,
    102                           12.5 Mhz, "-O3" (highest level optimisation).
    103 			  (Mike Zentner).
    104 
    105              233.0        Sun SPARCStation 1, C, "-O4", SunOS 4.0.3.
    106 			  (Nathan Bender).
    107 
    108 	     187.30	  DEC PMAX 3100, MIPS 2000 chip.
    109 			  (Robert Wenig).
    110 
    111              120.46       Sun SparcStation 2, C, "-O4", SunOS 4.1.1.
    112 			  (John Walker).
    113 
    114              120.21       DEC 3MAX, MIPS 3000, "-O4".
    115 
    116 	      98.0	  Intel i860 experimental environment,
    117 			  OS/2, data caching disabled.	(Kern
    118 			  Sibbald).
    119 
    120 	      34.9	  Silicon Graphics Indigo, MIPS R4400,
    121                           175 Mhz, IRIX 5.2, "-O".
    122 
    123 	      32.4	  Pentium 133, Windows NT, Microsoft Visual
    124 			  C++ 4.0.
    125 
    126 	      17.25	  Silicon Graphics Indigo, MIPS R4400,
    127                           175 Mhz, IRIX 6.5, "-O3".
    128 
    129 	      14.10	  Dell Dimension XPS R100, Pentium II 400 MHz,
    130 			  Windows 98, Microsoft Visual C 5.0.
    131 
    132 	      10.7	  Hewlett-Packard Kayak XU 450Mhz Pentium II,
    133 			  Microsoft Visual C++ 6.0, Windows NT 4.0sp3.	(Nathan Bender).
    134 
    135 	       5.09	  Sun Ultra 2, UltraSPARC V9, 300 MHz, gcc -O3.
    136 
    137 	       0.846	  Dell Inspiron 9100, Pentium 4, 3.4 GHz, gcc -O3.
    138 
    139 */
    140 
    141 #include <stdio.h>
    142 #include <stdlib.h>
    143 #include <math.h>
    144 #include <string.h>
    145 
    146 /*  The  program  may  be  run	with  Float defined as either float or
    147     double.  With IEEE arithmetic, the same answers are generated  for
    148     either floating point mode.  */
    149 
    150 #define Float	 double 	   /* Floating point type used in FFT */
    151 
    152 #define Asize	 256		   /* Array edge size */
    153 #define Passes	 20		   /* Number of FFT/Inverse passes */
    154 
    155 #define max(a,b) ((a)>(b)?(a):(b))
    156 #define min(a,b) ((a)<=(b)?(a):(b))
    157 
    158 /*
    159 
    160 	Multi-dimensional fast Fourier transform
    161 
    162         Adapted from Press et al., "Numerical Recipes in C".
    163 
    164 */
    165 
    166 #define SWAP(a,b) tempr=(a); (a)=(b); (b)=tempr
    167 
    168 static void fourn(data, nn, ndim, isign)
    169   Float data[];
    170   int nn[], ndim, isign;
    171 {
    172 	register int i1, i2, i3;
    173 	int i2rev, i3rev, ip1, ip2, ip3, ifp1, ifp2;
    174 	int ibit, idim, k1, k2, n, nprev, nrem, ntot;
    175 	Float tempi, tempr;
    176 	double theta, wi, wpi, wpr, wr, wtemp;
    177 
    178 	ntot = 1;
    179 	for (idim = 1; idim <= ndim; idim++)
    180 	   ntot *= nn[idim];
    181 	nprev = 1;
    182 	for (idim = ndim; idim >= 1; idim--) {
    183 	   n = nn[idim];
    184 	   nrem = ntot / (n * nprev);
    185 	   ip1 = nprev << 1;
    186 	   ip2 = ip1 * n;
    187 	   ip3 = ip2 * nrem;
    188 	   i2rev = 1;
    189 	   for (i2 = 1; i2 <= ip2; i2 += ip1) {
    190 	      if (i2 < i2rev) {
    191 		 for (i1 = i2; i1 <= i2 + ip1 - 2; i1 += 2) {
    192 		    for (i3 = i1; i3 <= ip3; i3 += ip2) {
    193 		       i3rev = i2rev + i3 - i2;
    194 		       SWAP(data[i3], data[i3rev]);
    195 		       SWAP(data[i3 + 1], data[i3rev + 1]);
    196 		    }
    197 		 }
    198 	      }
    199 	      ibit = ip2 >> 1;
    200 	      while (ibit >= ip1 && i2rev > ibit) {
    201 		 i2rev -= ibit;
    202 		 ibit >>= 1;
    203 	      }
    204 	      i2rev += ibit;
    205 	   }
    206 	   ifp1 = ip1;
    207 	   while (ifp1 < ip2) {
    208 	      ifp2 = ifp1 << 1;
    209 	      theta = isign * 6.28318530717959 / (ifp2 / ip1);
    210 	      wtemp = sin(0.5 * theta);
    211 	      wpr = -2.0 * wtemp * wtemp;
    212 	      wpi = sin(theta);
    213 	      wr = 1.0;
    214 	      wi = 0.0;
    215 	      for (i3 = 1; i3 <= ifp1; i3 += ip1) {
    216 		 for (i1 = i3; i1 <= i3 + ip1 - 2; i1 += 2) {
    217 		    for (i2 = i1; i2 <= ip3; i2 += ifp2) {
    218 		       k1 = i2;
    219 		       k2 = k1 + ifp1;
    220 		       tempr = wr * data[k2] - wi * data[k2 + 1];
    221 		       tempi = wr * data[k2 + 1] + wi * data[k2];
    222 		       data[k2] = data[k1] - tempr;
    223 		       data[k2 + 1] = data[k1 + 1] - tempi;
    224 		       data[k1] += tempr;
    225 		       data[k1 + 1] += tempi;
    226 		    }
    227 		 }
    228 		 wr = (wtemp = wr) * wpr - wi * wpi + wr;
    229 		 wi = wi * wpr + wtemp * wpi + wi;
    230 	      }
    231 	      ifp1 = ifp2;
    232 	   }
    233 	   nprev *= n;
    234 	}
    235 }
    236 #undef SWAP
    237 
    238 int main()
    239 {
    240 	int i, j, k, l, m, npasses = Passes, faedge;
    241 	Float *fdata /* , *fd */ ;
    242 	static int nsize[] = {0, 0, 0};
    243 	long fanum, fasize;
    244 	double mapbase, mapscale, /* x, */ rmin, rmax, imin, imax;
    245 
    246 	faedge = Asize; 	   /* FFT array edge size */
    247 	fanum = faedge * faedge;   /* Elements in FFT array */
    248 	fasize = ((fanum + 1) * 2 * sizeof(Float)); /* FFT array size */
    249 	nsize[1] = nsize[2] = faedge;
    250 
    251 	fdata = (Float *) malloc(fasize);
    252 	if (fdata == NULL) {
    253            fprintf(stdout, "Can't allocate data array.\n");
    254 	   exit(1);
    255 	}
    256 
    257 	/*  Generate data array to process.  */
    258 
    259 #define Re(x,y) fdata[1 + (faedge * (x) + (y)) * 2]
    260 #define Im(x,y) fdata[2 + (faedge * (x) + (y)) * 2]
    261 
    262 	memset(fdata, 0, fasize);
    263 	for (i = 0; i < faedge; i++) {
    264 	   for (j = 0; j < faedge; j++) {
    265 	      if (((i & 15) == 8) || ((j & 15) == 8))
    266 		 Re(i, j) = 128.0;
    267 	   }
    268 	}
    269 
    270 	for (i = 0; i < npasses; i++) {
    271 /*printf("Pass %d\n", i);*/
    272 	   /* Transform image to frequency domain. */
    273 	   fourn(fdata, nsize, 2, 1);
    274 
    275 	   /*  Back-transform to image. */
    276 	   fourn(fdata, nsize, 2, -1);
    277 	}
    278 
    279 	{
    280 	   double r, ij, ar, ai;
    281 	   rmin = 1e10; rmax = -1e10;
    282 	   imin = 1e10; imax = -1e10;
    283 	   ar = 0;
    284 	   ai = 0;
    285 
    286 	   for (i = 1; i <= fanum; i += 2) {
    287 	      r = fdata[i];
    288 	      ij = fdata[i + 1];
    289 	      ar += r;
    290 	      ai += ij;
    291 	      rmin = min(r, rmin);
    292 	      rmax = max(r, rmax);
    293 	      imin = min(ij, imin);
    294 	      imax = max(ij, imax);
    295 	   }
    296 #ifdef DEBUG
    297            printf("Real min %.4g, max %.4g.  Imaginary min %.4g, max %.4g.\n",
    298 	      rmin, rmax, imin, imax);
    299            printf("Average real %.4g, imaginary %.4g.\n",
    300 	      ar / fanum, ai / fanum);
    301 #endif
    302 	   mapbase = rmin;
    303 	   mapscale = 255 / (rmax - rmin);
    304 	}
    305 
    306 	/* See if we got the right answers. */
    307 
    308 	m = 0;
    309 	for (i = 0; i < faedge; i++) {
    310 	   for (j = 0; j < faedge; j++) {
    311 	      k = (Re(i, j) - mapbase) * mapscale;
    312 	      l = (((i & 15) == 8) || ((j & 15) == 8)) ? 255 : 0;
    313 	      if (k != l) {
    314 		 m++;
    315 		 fprintf(stdout,
    316                     "Wrong answer at (%d,%d)!  Expected %d, got %d.\n",
    317 		    i, j, l, k);
    318 	      }
    319 	   }
    320 	}
    321 	if (m == 0) {
    322            fprintf(stdout, "%d passes.  No errors in results.\n", npasses);
    323 	} else {
    324            fprintf(stdout, "%d passes.  %d errors in results.\n",
    325 	      npasses, m);
    326 	}
    327 
    328 #ifdef CAPOUT
    329 
    330 	/* Output the result of the transform as a CA Lab pattern
    331 	   file for debugging. */
    332 
    333 	{
    334 #define SCRX  322
    335 #define SCRY  200
    336 #define SCRN  (SCRX * SCRY)
    337 	   unsigned char patarr[SCRY][SCRX];
    338 	   FILE *fp;
    339 
    340 /*  Map user external state numbers to internal state index  */
    341 
    342 #define UtoI(x)     (((((x) >> 1) & 0x7F) | ((x) << 7)) & 0xFF)
    343 
    344 	   /* Copy data from FFT buffer to map. */
    345 
    346 	   memset(patarr, 0, sizeof patarr);
    347 	   l = (SCRX - faedge) / 2;
    348 	   m = (faedge > SCRY) ? 0 : ((SCRY - faedge) / 2);
    349 	   for (i = 1; i < faedge; i++) {
    350 	      for (j = 0; j < min(SCRY, faedge); j++) {
    351 		 k = (Re(i, j) - mapbase) * mapscale;
    352 		 patarr[j + m][i + l] = UtoI(k);
    353 	      }
    354 	   }
    355 
    356 	   /* Dump pattern map to file. */
    357 
    358            fp = fopen("fft.cap", "w");
    359 	   if (fp == NULL) {
    360               fprintf(stdout, "Cannot open output file.\n");
    361 	      exit(0);
    362 	   }
    363            putc(':', fp);
    364 	   putc(1, fp);
    365 	   fwrite(patarr, SCRN, 1, fp);
    366 	   putc(6, fp);
    367 	   fclose(fp);
    368 	}
    369 #endif
    370 
    371 	return 0;
    372 }
    373