1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.commons.math.random; 19 20 import java.io.BufferedReader; 21 import java.io.File; 22 import java.io.FileReader; 23 import java.io.IOException; 24 import java.io.InputStreamReader; 25 import java.io.Serializable; 26 import java.net.URL; 27 import java.util.ArrayList; 28 import java.util.List; 29 30 import org.apache.commons.math.MathRuntimeException; 31 import org.apache.commons.math.exception.util.LocalizedFormats; 32 import org.apache.commons.math.stat.descriptive.StatisticalSummary; 33 import org.apache.commons.math.stat.descriptive.SummaryStatistics; 34 import org.apache.commons.math.util.FastMath; 35 36 /** 37 * Implements <code>EmpiricalDistribution</code> interface. This implementation 38 * uses what amounts to the 39 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html"> 40 * Variable Kernel Method</a> with Gaussian smoothing:<p> 41 * <strong>Digesting the input file</strong> 42 * <ol><li>Pass the file once to compute min and max.</li> 43 * <li>Divide the range from min-max into <code>binCount</code> "bins."</li> 44 * <li>Pass the data file again, computing bin counts and univariate 45 * statistics (mean, std dev.) for each of the bins </li> 46 * <li>Divide the interval (0,1) into subintervals associated with the bins, 47 * with the length of a bin's subinterval proportional to its count.</li></ol> 48 * <strong>Generating random values from the distribution</strong><ol> 49 * <li>Generate a uniformly distributed value in (0,1) </li> 50 * <li>Select the subinterval to which the value belongs. 51 * <li>Generate a random Gaussian value with mean = mean of the associated 52 * bin and std dev = std dev of associated bin.</li></ol></p><p> 53 *<strong>USAGE NOTES:</strong><ul> 54 *<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb 55 * is to set the bin count to approximately the length of the input file divided 56 * by 10. </li> 57 *<li>The input file <i>must</i> be a plain text file containing one valid numeric 58 * entry per line.</li> 59 * </ul></p> 60 * 61 * @version $Revision: 1003886 $ $Date: 2010-10-02 23:04:44 +0200 (sam. 02 oct. 2010) $ 62 */ 63 public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution { 64 65 /** Serializable version identifier */ 66 private static final long serialVersionUID = 5729073523949762654L; 67 68 /** List of SummaryStatistics objects characterizing the bins */ 69 private final List<SummaryStatistics> binStats; 70 71 /** Sample statistics */ 72 private SummaryStatistics sampleStats = null; 73 74 /** Max loaded value */ 75 private double max = Double.NEGATIVE_INFINITY; 76 77 /** Min loaded value */ 78 private double min = Double.POSITIVE_INFINITY; 79 80 /** Grid size */ 81 private double delta = 0d; 82 83 /** number of bins */ 84 private final int binCount; 85 86 /** is the distribution loaded? */ 87 private boolean loaded = false; 88 89 /** upper bounds of subintervals in (0,1) "belonging" to the bins */ 90 private double[] upperBounds = null; 91 92 /** RandomData instance to use in repeated calls to getNext() */ 93 private final RandomData randomData = new RandomDataImpl(); 94 95 /** 96 * Creates a new EmpiricalDistribution with the default bin count. 97 */ 98 public EmpiricalDistributionImpl() { 99 binCount = 1000; 100 binStats = new ArrayList<SummaryStatistics>(); 101 } 102 103 /** 104 * Creates a new EmpiricalDistribution with the specified bin count. 105 * 106 * @param binCount number of bins 107 */ 108 public EmpiricalDistributionImpl(int binCount) { 109 this.binCount = binCount; 110 binStats = new ArrayList<SummaryStatistics>(); 111 } 112 113 /** 114 * Computes the empirical distribution from the provided 115 * array of numbers. 116 * 117 * @param in the input data array 118 */ 119 public void load(double[] in) { 120 DataAdapter da = new ArrayDataAdapter(in); 121 try { 122 da.computeStats(); 123 fillBinStats(in); 124 } catch (IOException e) { 125 throw new MathRuntimeException(e); 126 } 127 loaded = true; 128 129 } 130 131 /** 132 * Computes the empirical distribution using data read from a URL. 133 * @param url url of the input file 134 * 135 * @throws IOException if an IO error occurs 136 */ 137 public void load(URL url) throws IOException { 138 BufferedReader in = 139 new BufferedReader(new InputStreamReader(url.openStream())); 140 try { 141 DataAdapter da = new StreamDataAdapter(in); 142 da.computeStats(); 143 if (sampleStats.getN() == 0) { 144 throw MathRuntimeException.createEOFException(LocalizedFormats.URL_CONTAINS_NO_DATA, 145 url); 146 } 147 in = new BufferedReader(new InputStreamReader(url.openStream())); 148 fillBinStats(in); 149 loaded = true; 150 } finally { 151 try { 152 in.close(); 153 } catch (IOException ex) { 154 // ignore 155 } 156 } 157 } 158 159 /** 160 * Computes the empirical distribution from the input file. 161 * 162 * @param file the input file 163 * @throws IOException if an IO error occurs 164 */ 165 public void load(File file) throws IOException { 166 BufferedReader in = new BufferedReader(new FileReader(file)); 167 try { 168 DataAdapter da = new StreamDataAdapter(in); 169 da.computeStats(); 170 in = new BufferedReader(new FileReader(file)); 171 fillBinStats(in); 172 loaded = true; 173 } finally { 174 try { 175 in.close(); 176 } catch (IOException ex) { 177 // ignore 178 } 179 } 180 } 181 182 /** 183 * Provides methods for computing <code>sampleStats</code> and 184 * <code>beanStats</code> abstracting the source of data. 185 */ 186 private abstract class DataAdapter{ 187 188 /** 189 * Compute bin stats. 190 * 191 * @throws IOException if an error occurs computing bin stats 192 */ 193 public abstract void computeBinStats() throws IOException; 194 195 /** 196 * Compute sample statistics. 197 * 198 * @throws IOException if an error occurs computing sample stats 199 */ 200 public abstract void computeStats() throws IOException; 201 202 } 203 204 /** 205 * Factory of <code>DataAdapter</code> objects. For every supported source 206 * of data (array of doubles, file, etc.) an instance of the proper object 207 * is returned. 208 */ 209 private class DataAdapterFactory{ 210 /** 211 * Creates a DataAdapter from a data object 212 * 213 * @param in object providing access to the data 214 * @return DataAdapter instance 215 */ 216 public DataAdapter getAdapter(Object in) { 217 if (in instanceof BufferedReader) { 218 BufferedReader inputStream = (BufferedReader) in; 219 return new StreamDataAdapter(inputStream); 220 } else if (in instanceof double[]) { 221 double[] inputArray = (double[]) in; 222 return new ArrayDataAdapter(inputArray); 223 } else { 224 throw MathRuntimeException.createIllegalArgumentException( 225 LocalizedFormats.INPUT_DATA_FROM_UNSUPPORTED_DATASOURCE, 226 in.getClass().getName(), 227 BufferedReader.class.getName(), double[].class.getName()); 228 } 229 } 230 } 231 /** 232 * <code>DataAdapter</code> for data provided through some input stream 233 */ 234 private class StreamDataAdapter extends DataAdapter{ 235 236 /** Input stream providing access to the data */ 237 private BufferedReader inputStream; 238 239 /** 240 * Create a StreamDataAdapter from a BufferedReader 241 * 242 * @param in BufferedReader input stream 243 */ 244 public StreamDataAdapter(BufferedReader in){ 245 super(); 246 inputStream = in; 247 } 248 249 /** {@inheritDoc} */ 250 @Override 251 public void computeBinStats() throws IOException { 252 String str = null; 253 double val = 0.0d; 254 while ((str = inputStream.readLine()) != null) { 255 val = Double.parseDouble(str); 256 SummaryStatistics stats = binStats.get(findBin(val)); 257 stats.addValue(val); 258 } 259 260 inputStream.close(); 261 inputStream = null; 262 } 263 264 /** {@inheritDoc} */ 265 @Override 266 public void computeStats() throws IOException { 267 String str = null; 268 double val = 0.0; 269 sampleStats = new SummaryStatistics(); 270 while ((str = inputStream.readLine()) != null) { 271 val = Double.valueOf(str).doubleValue(); 272 sampleStats.addValue(val); 273 } 274 inputStream.close(); 275 inputStream = null; 276 } 277 } 278 279 /** 280 * <code>DataAdapter</code> for data provided as array of doubles. 281 */ 282 private class ArrayDataAdapter extends DataAdapter { 283 284 /** Array of input data values */ 285 private double[] inputArray; 286 287 /** 288 * Construct an ArrayDataAdapter from a double[] array 289 * 290 * @param in double[] array holding the data 291 */ 292 public ArrayDataAdapter(double[] in){ 293 super(); 294 inputArray = in; 295 } 296 297 /** {@inheritDoc} */ 298 @Override 299 public void computeStats() throws IOException { 300 sampleStats = new SummaryStatistics(); 301 for (int i = 0; i < inputArray.length; i++) { 302 sampleStats.addValue(inputArray[i]); 303 } 304 } 305 306 /** {@inheritDoc} */ 307 @Override 308 public void computeBinStats() throws IOException { 309 for (int i = 0; i < inputArray.length; i++) { 310 SummaryStatistics stats = 311 binStats.get(findBin(inputArray[i])); 312 stats.addValue(inputArray[i]); 313 } 314 } 315 } 316 317 /** 318 * Fills binStats array (second pass through data file). 319 * 320 * @param in object providing access to the data 321 * @throws IOException if an IO error occurs 322 */ 323 private void fillBinStats(Object in) throws IOException { 324 // Set up grid 325 min = sampleStats.getMin(); 326 max = sampleStats.getMax(); 327 delta = (max - min)/(Double.valueOf(binCount)).doubleValue(); 328 329 // Initialize binStats ArrayList 330 if (!binStats.isEmpty()) { 331 binStats.clear(); 332 } 333 for (int i = 0; i < binCount; i++) { 334 SummaryStatistics stats = new SummaryStatistics(); 335 binStats.add(i,stats); 336 } 337 338 // Filling data in binStats Array 339 DataAdapterFactory aFactory = new DataAdapterFactory(); 340 DataAdapter da = aFactory.getAdapter(in); 341 da.computeBinStats(); 342 343 // Assign upperBounds based on bin counts 344 upperBounds = new double[binCount]; 345 upperBounds[0] = 346 ((double) binStats.get(0).getN()) / (double) sampleStats.getN(); 347 for (int i = 1; i < binCount-1; i++) { 348 upperBounds[i] = upperBounds[i-1] + 349 ((double) binStats.get(i).getN()) / (double) sampleStats.getN(); 350 } 351 upperBounds[binCount-1] = 1.0d; 352 } 353 354 /** 355 * Returns the index of the bin to which the given value belongs 356 * 357 * @param value the value whose bin we are trying to find 358 * @return the index of the bin containing the value 359 */ 360 private int findBin(double value) { 361 return FastMath.min( 362 FastMath.max((int) FastMath.ceil((value- min) / delta) - 1, 0), 363 binCount - 1); 364 } 365 366 /** 367 * Generates a random value from this distribution. 368 * 369 * @return the random value. 370 * @throws IllegalStateException if the distribution has not been loaded 371 */ 372 public double getNextValue() throws IllegalStateException { 373 374 if (!loaded) { 375 throw MathRuntimeException.createIllegalStateException(LocalizedFormats.DISTRIBUTION_NOT_LOADED); 376 } 377 378 // Start with a uniformly distributed random number in (0,1) 379 double x = FastMath.random(); 380 381 // Use this to select the bin and generate a Gaussian within the bin 382 for (int i = 0; i < binCount; i++) { 383 if (x <= upperBounds[i]) { 384 SummaryStatistics stats = binStats.get(i); 385 if (stats.getN() > 0) { 386 if (stats.getStandardDeviation() > 0) { // more than one obs 387 return randomData.nextGaussian 388 (stats.getMean(),stats.getStandardDeviation()); 389 } else { 390 return stats.getMean(); // only one obs in bin 391 } 392 } 393 } 394 } 395 throw new MathRuntimeException(LocalizedFormats.NO_BIN_SELECTED); 396 } 397 398 /** 399 * Returns a {@link StatisticalSummary} describing this distribution. 400 * <strong>Preconditions:</strong><ul> 401 * <li>the distribution must be loaded before invoking this method</li></ul> 402 * 403 * @return the sample statistics 404 * @throws IllegalStateException if the distribution has not been loaded 405 */ 406 public StatisticalSummary getSampleStats() { 407 return sampleStats; 408 } 409 410 /** 411 * Returns the number of bins. 412 * 413 * @return the number of bins. 414 */ 415 public int getBinCount() { 416 return binCount; 417 } 418 419 /** 420 * Returns a List of {@link SummaryStatistics} instances containing 421 * statistics describing the values in each of the bins. The list is 422 * indexed on the bin number. 423 * 424 * @return List of bin statistics. 425 */ 426 public List<SummaryStatistics> getBinStats() { 427 return binStats; 428 } 429 430 /** 431 * <p>Returns a fresh copy of the array of upper bounds for the bins. 432 * Bins are: <br/> 433 * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],..., 434 * (upperBounds[binCount-2], upperBounds[binCount-1] = max].</p> 435 * 436 * <p>Note: In versions 1.0-2.0 of commons-math, this method 437 * incorrectly returned the array of probability generator upper 438 * bounds now returned by {@link #getGeneratorUpperBounds()}.</p> 439 * 440 * @return array of bin upper bounds 441 * @since 2.1 442 */ 443 public double[] getUpperBounds() { 444 double[] binUpperBounds = new double[binCount]; 445 binUpperBounds[0] = min + delta; 446 for (int i = 1; i < binCount - 1; i++) { 447 binUpperBounds[i] = binUpperBounds[i-1] + delta; 448 } 449 binUpperBounds[binCount - 1] = max; 450 return binUpperBounds; 451 } 452 453 /** 454 * <p>Returns a fresh copy of the array of upper bounds of the subintervals 455 * of [0,1] used in generating data from the empirical distribution. 456 * Subintervals correspond to bins with lengths proportional to bin counts.</p> 457 * 458 * <p>In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned 459 * by {@link #getUpperBounds()}.</p> 460 * 461 * @since 2.1 462 * @return array of upper bounds of subintervals used in data generation 463 */ 464 public double[] getGeneratorUpperBounds() { 465 int len = upperBounds.length; 466 double[] out = new double[len]; 467 System.arraycopy(upperBounds, 0, out, 0, len); 468 return out; 469 } 470 471 /** 472 * Property indicating whether or not the distribution has been loaded. 473 * 474 * @return true if the distribution has been loaded 475 */ 476 public boolean isLoaded() { 477 return loaded; 478 } 479 } 480