Home | History | Annotate | Download | only in random
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one or more
      3  * contributor license agreements.  See the NOTICE file distributed with
      4  * this work for additional information regarding copyright ownership.
      5  * The ASF licenses this file to You under the Apache License, Version 2.0
      6  * (the "License"); you may not use this file except in compliance with
      7  * the License.  You may obtain a copy of the License at
      8  *
      9  *      http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  */
     17 
     18 package org.apache.commons.math.random;
     19 
     20 import java.io.BufferedReader;
     21 import java.io.File;
     22 import java.io.FileReader;
     23 import java.io.IOException;
     24 import java.io.InputStreamReader;
     25 import java.io.Serializable;
     26 import java.net.URL;
     27 import java.util.ArrayList;
     28 import java.util.List;
     29 
     30 import org.apache.commons.math.MathRuntimeException;
     31 import org.apache.commons.math.exception.util.LocalizedFormats;
     32 import org.apache.commons.math.stat.descriptive.StatisticalSummary;
     33 import org.apache.commons.math.stat.descriptive.SummaryStatistics;
     34 import org.apache.commons.math.util.FastMath;
     35 
     36 /**
     37  * Implements <code>EmpiricalDistribution</code> interface.  This implementation
     38  * uses what amounts to the
     39  * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
     40  * Variable Kernel Method</a> with Gaussian smoothing:<p>
     41  * <strong>Digesting the input file</strong>
     42  * <ol><li>Pass the file once to compute min and max.</li>
     43  * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
     44  * <li>Pass the data file again, computing bin counts and univariate
     45  *     statistics (mean, std dev.) for each of the bins </li>
     46  * <li>Divide the interval (0,1) into subintervals associated with the bins,
     47  *     with the length of a bin's subinterval proportional to its count.</li></ol>
     48  * <strong>Generating random values from the distribution</strong><ol>
     49  * <li>Generate a uniformly distributed value in (0,1) </li>
     50  * <li>Select the subinterval to which the value belongs.
     51  * <li>Generate a random Gaussian value with mean = mean of the associated
     52  *     bin and std dev = std dev of associated bin.</li></ol></p><p>
     53  *<strong>USAGE NOTES:</strong><ul>
     54  *<li>The <code>binCount</code> is set by default to 1000.  A good rule of thumb
     55  *    is to set the bin count to approximately the length of the input file divided
     56  *    by 10. </li>
     57  *<li>The input file <i>must</i> be a plain text file containing one valid numeric
     58  *    entry per line.</li>
     59  * </ul></p>
     60  *
     61  * @version $Revision: 1003886 $ $Date: 2010-10-02 23:04:44 +0200 (sam. 02 oct. 2010) $
     62  */
     63 public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
     64 
     65     /** Serializable version identifier */
     66     private static final long serialVersionUID = 5729073523949762654L;
     67 
     68     /** List of SummaryStatistics objects characterizing the bins */
     69     private final List<SummaryStatistics> binStats;
     70 
     71     /** Sample statistics */
     72     private SummaryStatistics sampleStats = null;
     73 
     74     /** Max loaded value */
     75     private double max = Double.NEGATIVE_INFINITY;
     76 
     77     /** Min loaded value */
     78     private double min = Double.POSITIVE_INFINITY;
     79 
     80     /** Grid size */
     81     private double delta = 0d;
     82 
     83     /** number of bins */
     84     private final int binCount;
     85 
     86     /** is the distribution loaded? */
     87     private boolean loaded = false;
     88 
     89     /** upper bounds of subintervals in (0,1) "belonging" to the bins */
     90     private double[] upperBounds = null;
     91 
     92     /** RandomData instance to use in repeated calls to getNext() */
     93     private final RandomData randomData = new RandomDataImpl();
     94 
     95     /**
     96      * Creates a new EmpiricalDistribution with the default bin count.
     97      */
     98     public EmpiricalDistributionImpl() {
     99         binCount = 1000;
    100         binStats = new ArrayList<SummaryStatistics>();
    101     }
    102 
    103     /**
    104      * Creates a new EmpiricalDistribution  with the specified bin count.
    105      *
    106      * @param binCount number of bins
    107      */
    108     public EmpiricalDistributionImpl(int binCount) {
    109         this.binCount = binCount;
    110         binStats = new ArrayList<SummaryStatistics>();
    111     }
    112 
    113      /**
    114      * Computes the empirical distribution from the provided
    115      * array of numbers.
    116      *
    117      * @param in the input data array
    118      */
    119     public void load(double[] in) {
    120         DataAdapter da = new ArrayDataAdapter(in);
    121         try {
    122             da.computeStats();
    123             fillBinStats(in);
    124         } catch (IOException e) {
    125             throw new MathRuntimeException(e);
    126         }
    127         loaded = true;
    128 
    129     }
    130 
    131     /**
    132      * Computes the empirical distribution using data read from a URL.
    133      * @param url  url of the input file
    134      *
    135      * @throws IOException if an IO error occurs
    136      */
    137     public void load(URL url) throws IOException {
    138         BufferedReader in =
    139             new BufferedReader(new InputStreamReader(url.openStream()));
    140         try {
    141             DataAdapter da = new StreamDataAdapter(in);
    142             da.computeStats();
    143             if (sampleStats.getN() == 0) {
    144                 throw MathRuntimeException.createEOFException(LocalizedFormats.URL_CONTAINS_NO_DATA,
    145                                                               url);
    146             }
    147             in = new BufferedReader(new InputStreamReader(url.openStream()));
    148             fillBinStats(in);
    149             loaded = true;
    150         } finally {
    151            try {
    152                in.close();
    153            } catch (IOException ex) {
    154                // ignore
    155            }
    156         }
    157     }
    158 
    159     /**
    160      * Computes the empirical distribution from the input file.
    161      *
    162      * @param file the input file
    163      * @throws IOException if an IO error occurs
    164      */
    165     public void load(File file) throws IOException {
    166         BufferedReader in = new BufferedReader(new FileReader(file));
    167         try {
    168             DataAdapter da = new StreamDataAdapter(in);
    169             da.computeStats();
    170             in = new BufferedReader(new FileReader(file));
    171             fillBinStats(in);
    172             loaded = true;
    173         } finally {
    174             try {
    175                 in.close();
    176             } catch (IOException ex) {
    177                 // ignore
    178             }
    179         }
    180     }
    181 
    182     /**
    183      * Provides methods for computing <code>sampleStats</code> and
    184      * <code>beanStats</code> abstracting the source of data.
    185      */
    186     private abstract class DataAdapter{
    187 
    188         /**
    189          * Compute bin stats.
    190          *
    191          * @throws IOException  if an error occurs computing bin stats
    192          */
    193         public abstract void computeBinStats() throws IOException;
    194 
    195         /**
    196          * Compute sample statistics.
    197          *
    198          * @throws IOException if an error occurs computing sample stats
    199          */
    200         public abstract void computeStats() throws IOException;
    201 
    202     }
    203 
    204     /**
    205      * Factory of <code>DataAdapter</code> objects. For every supported source
    206      * of data (array of doubles, file, etc.) an instance of the proper object
    207      * is returned.
    208      */
    209     private class DataAdapterFactory{
    210         /**
    211          * Creates a DataAdapter from a data object
    212          *
    213          * @param in object providing access to the data
    214          * @return DataAdapter instance
    215          */
    216         public DataAdapter getAdapter(Object in) {
    217             if (in instanceof BufferedReader) {
    218                 BufferedReader inputStream = (BufferedReader) in;
    219                 return new StreamDataAdapter(inputStream);
    220             } else if (in instanceof double[]) {
    221                 double[] inputArray = (double[]) in;
    222                 return new ArrayDataAdapter(inputArray);
    223             } else {
    224                 throw MathRuntimeException.createIllegalArgumentException(
    225                       LocalizedFormats.INPUT_DATA_FROM_UNSUPPORTED_DATASOURCE,
    226                       in.getClass().getName(),
    227                       BufferedReader.class.getName(), double[].class.getName());
    228             }
    229         }
    230     }
    231     /**
    232      * <code>DataAdapter</code> for data provided through some input stream
    233      */
    234     private class StreamDataAdapter extends DataAdapter{
    235 
    236         /** Input stream providing access to the data */
    237         private BufferedReader inputStream;
    238 
    239         /**
    240          * Create a StreamDataAdapter from a BufferedReader
    241          *
    242          * @param in BufferedReader input stream
    243          */
    244         public StreamDataAdapter(BufferedReader in){
    245             super();
    246             inputStream = in;
    247         }
    248 
    249         /** {@inheritDoc} */
    250         @Override
    251         public void computeBinStats() throws IOException {
    252             String str = null;
    253             double val = 0.0d;
    254             while ((str = inputStream.readLine()) != null) {
    255                 val = Double.parseDouble(str);
    256                 SummaryStatistics stats = binStats.get(findBin(val));
    257                 stats.addValue(val);
    258             }
    259 
    260             inputStream.close();
    261             inputStream = null;
    262         }
    263 
    264         /** {@inheritDoc} */
    265         @Override
    266         public void computeStats() throws IOException {
    267             String str = null;
    268             double val = 0.0;
    269             sampleStats = new SummaryStatistics();
    270             while ((str = inputStream.readLine()) != null) {
    271                 val = Double.valueOf(str).doubleValue();
    272                 sampleStats.addValue(val);
    273             }
    274             inputStream.close();
    275             inputStream = null;
    276         }
    277     }
    278 
    279     /**
    280      * <code>DataAdapter</code> for data provided as array of doubles.
    281      */
    282     private class ArrayDataAdapter extends DataAdapter {
    283 
    284         /** Array of input  data values */
    285         private double[] inputArray;
    286 
    287         /**
    288          * Construct an ArrayDataAdapter from a double[] array
    289          *
    290          * @param in double[] array holding the data
    291          */
    292         public ArrayDataAdapter(double[] in){
    293             super();
    294             inputArray = in;
    295         }
    296 
    297         /** {@inheritDoc} */
    298         @Override
    299         public void computeStats() throws IOException {
    300             sampleStats = new SummaryStatistics();
    301             for (int i = 0; i < inputArray.length; i++) {
    302                 sampleStats.addValue(inputArray[i]);
    303             }
    304         }
    305 
    306         /** {@inheritDoc} */
    307         @Override
    308         public void computeBinStats() throws IOException {
    309             for (int i = 0; i < inputArray.length; i++) {
    310                 SummaryStatistics stats =
    311                     binStats.get(findBin(inputArray[i]));
    312                 stats.addValue(inputArray[i]);
    313             }
    314         }
    315     }
    316 
    317     /**
    318      * Fills binStats array (second pass through data file).
    319      *
    320      * @param in object providing access to the data
    321      * @throws IOException  if an IO error occurs
    322      */
    323     private void fillBinStats(Object in) throws IOException {
    324         // Set up grid
    325         min = sampleStats.getMin();
    326         max = sampleStats.getMax();
    327         delta = (max - min)/(Double.valueOf(binCount)).doubleValue();
    328 
    329         // Initialize binStats ArrayList
    330         if (!binStats.isEmpty()) {
    331             binStats.clear();
    332         }
    333         for (int i = 0; i < binCount; i++) {
    334             SummaryStatistics stats = new SummaryStatistics();
    335             binStats.add(i,stats);
    336         }
    337 
    338         // Filling data in binStats Array
    339         DataAdapterFactory aFactory = new DataAdapterFactory();
    340         DataAdapter da = aFactory.getAdapter(in);
    341         da.computeBinStats();
    342 
    343         // Assign upperBounds based on bin counts
    344         upperBounds = new double[binCount];
    345         upperBounds[0] =
    346         ((double) binStats.get(0).getN()) / (double) sampleStats.getN();
    347         for (int i = 1; i < binCount-1; i++) {
    348             upperBounds[i] = upperBounds[i-1] +
    349             ((double) binStats.get(i).getN()) / (double) sampleStats.getN();
    350         }
    351         upperBounds[binCount-1] = 1.0d;
    352     }
    353 
    354     /**
    355      * Returns the index of the bin to which the given value belongs
    356      *
    357      * @param value  the value whose bin we are trying to find
    358      * @return the index of the bin containing the value
    359      */
    360     private int findBin(double value) {
    361         return FastMath.min(
    362                 FastMath.max((int) FastMath.ceil((value- min) / delta) - 1, 0),
    363                 binCount - 1);
    364         }
    365 
    366     /**
    367      * Generates a random value from this distribution.
    368      *
    369      * @return the random value.
    370      * @throws IllegalStateException if the distribution has not been loaded
    371      */
    372     public double getNextValue() throws IllegalStateException {
    373 
    374         if (!loaded) {
    375             throw MathRuntimeException.createIllegalStateException(LocalizedFormats.DISTRIBUTION_NOT_LOADED);
    376         }
    377 
    378         // Start with a uniformly distributed random number in (0,1)
    379         double x = FastMath.random();
    380 
    381         // Use this to select the bin and generate a Gaussian within the bin
    382         for (int i = 0; i < binCount; i++) {
    383            if (x <= upperBounds[i]) {
    384                SummaryStatistics stats = binStats.get(i);
    385                if (stats.getN() > 0) {
    386                    if (stats.getStandardDeviation() > 0) {  // more than one obs
    387                         return randomData.nextGaussian
    388                             (stats.getMean(),stats.getStandardDeviation());
    389                    } else {
    390                        return stats.getMean(); // only one obs in bin
    391                    }
    392                }
    393            }
    394         }
    395         throw new MathRuntimeException(LocalizedFormats.NO_BIN_SELECTED);
    396     }
    397 
    398     /**
    399      * Returns a {@link StatisticalSummary} describing this distribution.
    400      * <strong>Preconditions:</strong><ul>
    401      * <li>the distribution must be loaded before invoking this method</li></ul>
    402      *
    403      * @return the sample statistics
    404      * @throws IllegalStateException if the distribution has not been loaded
    405      */
    406     public StatisticalSummary getSampleStats() {
    407         return sampleStats;
    408     }
    409 
    410     /**
    411      * Returns the number of bins.
    412      *
    413      * @return the number of bins.
    414      */
    415     public int getBinCount() {
    416         return binCount;
    417     }
    418 
    419     /**
    420      * Returns a List of {@link SummaryStatistics} instances containing
    421      * statistics describing the values in each of the bins.  The list is
    422      * indexed on the bin number.
    423      *
    424      * @return List of bin statistics.
    425      */
    426     public List<SummaryStatistics> getBinStats() {
    427         return binStats;
    428     }
    429 
    430     /**
    431      * <p>Returns a fresh copy of the array of upper bounds for the bins.
    432      * Bins are: <br/>
    433      * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
    434      *  (upperBounds[binCount-2], upperBounds[binCount-1] = max].</p>
    435      *
    436      * <p>Note: In versions 1.0-2.0 of commons-math, this method
    437      * incorrectly returned the array of probability generator upper
    438      * bounds now returned by {@link #getGeneratorUpperBounds()}.</p>
    439      *
    440      * @return array of bin upper bounds
    441      * @since 2.1
    442      */
    443     public double[] getUpperBounds() {
    444         double[] binUpperBounds = new double[binCount];
    445         binUpperBounds[0] = min + delta;
    446         for (int i = 1; i < binCount - 1; i++) {
    447             binUpperBounds[i] = binUpperBounds[i-1] + delta;
    448         }
    449         binUpperBounds[binCount - 1] = max;
    450         return binUpperBounds;
    451     }
    452 
    453     /**
    454      * <p>Returns a fresh copy of the array of upper bounds of the subintervals
    455      * of [0,1] used in generating data from the empirical distribution.
    456      * Subintervals correspond to bins with lengths proportional to bin counts.</p>
    457      *
    458      * <p>In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned
    459      * by {@link #getUpperBounds()}.</p>
    460      *
    461      * @since 2.1
    462      * @return array of upper bounds of subintervals used in data generation
    463      */
    464     public double[] getGeneratorUpperBounds() {
    465         int len = upperBounds.length;
    466         double[] out = new double[len];
    467         System.arraycopy(upperBounds, 0, out, 0, len);
    468         return out;
    469     }
    470 
    471     /**
    472      * Property indicating whether or not the distribution has been loaded.
    473      *
    474      * @return true if the distribution has been loaded
    475      */
    476     public boolean isLoaded() {
    477         return loaded;
    478     }
    479 }
    480