Home | History | Annotate | Download | only in inference
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one or more
      3  * contributor license agreements.  See the NOTICE file distributed with
      4  * this work for additional information regarding copyright ownership.
      5  * The ASF licenses this file to You under the Apache License, Version 2.0
      6  * (the "License"); you may not use this file except in compliance with
      7  * the License.  You may obtain a copy of the License at
      8  *
      9  *      http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  */
     17 package org.apache.commons.math.stat.inference;
     18 
     19 import java.util.Collection;
     20 
     21 import org.apache.commons.math.MathException;
     22 import org.apache.commons.math.MathRuntimeException;
     23 import org.apache.commons.math.distribution.FDistribution;
     24 import org.apache.commons.math.distribution.FDistributionImpl;
     25 import org.apache.commons.math.exception.util.LocalizedFormats;
     26 import org.apache.commons.math.stat.descriptive.summary.Sum;
     27 import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
     28 
     29 
     30 /**
     31  * Implements one-way ANOVA statistics defined in the {@link OneWayAnovaImpl}
     32  * interface.
     33  *
     34  * <p>Uses the
     35  * {@link org.apache.commons.math.distribution.FDistribution
     36  *  commons-math F Distribution implementation} to estimate exact p-values.</p>
     37  *
     38  * <p>This implementation is based on a description at
     39  * http://faculty.vassar.edu/lowry/ch13pt1.html</p>
     40  * <pre>
     41  * Abbreviations: bg = between groups,
     42  *                wg = within groups,
     43  *                ss = sum squared deviations
     44  * </pre>
     45  *
     46  * @since 1.2
     47  * @version $Revision: 983921 $ $Date: 2010-08-10 12:46:06 +0200 (mar. 10 aot 2010) $
     48  */
     49 public class OneWayAnovaImpl implements OneWayAnova  {
     50 
     51     /**
     52      * Default constructor.
     53      */
     54     public OneWayAnovaImpl() {
     55     }
     56 
     57     /**
     58      * {@inheritDoc}<p>
     59      * This implementation computes the F statistic using the definitional
     60      * formula<pre>
     61      *   F = msbg/mswg</pre>
     62      * where<pre>
     63      *  msbg = between group mean square
     64      *  mswg = within group mean square</pre>
     65      * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
     66      * here</a></p>
     67      */
     68     public double anovaFValue(Collection<double[]> categoryData)
     69         throws IllegalArgumentException, MathException {
     70         AnovaStats a = anovaStats(categoryData);
     71         return a.F;
     72     }
     73 
     74     /**
     75      * {@inheritDoc}<p>
     76      * This implementation uses the
     77      * {@link org.apache.commons.math.distribution.FDistribution
     78      * commons-math F Distribution implementation} to estimate the exact
     79      * p-value, using the formula<pre>
     80      *   p = 1 - cumulativeProbability(F)</pre>
     81      * where <code>F</code> is the F value and <code>cumulativeProbability</code>
     82      * is the commons-math implementation of the F distribution.</p>
     83      */
     84     public double anovaPValue(Collection<double[]> categoryData)
     85         throws IllegalArgumentException, MathException {
     86         AnovaStats a = anovaStats(categoryData);
     87         FDistribution fdist = new FDistributionImpl(a.dfbg, a.dfwg);
     88         return 1.0 - fdist.cumulativeProbability(a.F);
     89     }
     90 
     91     /**
     92      * {@inheritDoc}<p>
     93      * This implementation uses the
     94      * {@link org.apache.commons.math.distribution.FDistribution
     95      * commons-math F Distribution implementation} to estimate the exact
     96      * p-value, using the formula<pre>
     97      *   p = 1 - cumulativeProbability(F)</pre>
     98      * where <code>F</code> is the F value and <code>cumulativeProbability</code>
     99      * is the commons-math implementation of the F distribution.</p>
    100      * <p>True is returned iff the estimated p-value is less than alpha.</p>
    101      */
    102     public boolean anovaTest(Collection<double[]> categoryData, double alpha)
    103         throws IllegalArgumentException, MathException {
    104         if ((alpha <= 0) || (alpha > 0.5)) {
    105             throw MathRuntimeException.createIllegalArgumentException(
    106                   LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
    107                   alpha, 0, 0.5);
    108         }
    109         return anovaPValue(categoryData) < alpha;
    110     }
    111 
    112 
    113     /**
    114      * This method actually does the calculations (except P-value).
    115      *
    116      * @param categoryData <code>Collection</code> of <code>double[]</code>
    117      * arrays each containing data for one category
    118      * @return computed AnovaStats
    119      * @throws IllegalArgumentException if categoryData does not meet
    120      * preconditions specified in the interface definition
    121      * @throws MathException if an error occurs computing the Anova stats
    122      */
    123     private AnovaStats anovaStats(Collection<double[]> categoryData)
    124         throws IllegalArgumentException, MathException {
    125 
    126         // check if we have enough categories
    127         if (categoryData.size() < 2) {
    128             throw MathRuntimeException.createIllegalArgumentException(
    129                   LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
    130                   categoryData.size());
    131         }
    132 
    133         // check if each category has enough data and all is double[]
    134         for (double[] array : categoryData) {
    135             if (array.length <= 1) {
    136                 throw MathRuntimeException.createIllegalArgumentException(
    137                       LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
    138                       array.length);
    139             }
    140         }
    141 
    142         int dfwg = 0;
    143         double sswg = 0;
    144         Sum totsum = new Sum();
    145         SumOfSquares totsumsq = new SumOfSquares();
    146         int totnum = 0;
    147 
    148         for (double[] data : categoryData) {
    149 
    150             Sum sum = new Sum();
    151             SumOfSquares sumsq = new SumOfSquares();
    152             int num = 0;
    153 
    154             for (int i = 0; i < data.length; i++) {
    155                 double val = data[i];
    156 
    157                 // within category
    158                 num++;
    159                 sum.increment(val);
    160                 sumsq.increment(val);
    161 
    162                 // for all categories
    163                 totnum++;
    164                 totsum.increment(val);
    165                 totsumsq.increment(val);
    166             }
    167             dfwg += num - 1;
    168             double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num;
    169             sswg += ss;
    170         }
    171         double sst = totsumsq.getResult() - totsum.getResult() *
    172             totsum.getResult()/totnum;
    173         double ssbg = sst - sswg;
    174         int dfbg = categoryData.size() - 1;
    175         double msbg = ssbg/dfbg;
    176         double mswg = sswg/dfwg;
    177         double F = msbg/mswg;
    178 
    179         return new AnovaStats(dfbg, dfwg, F);
    180     }
    181 
    182     /**
    183         Convenience class to pass dfbg,dfwg,F values around within AnovaImpl.
    184         No get/set methods provided.
    185     */
    186     private static class AnovaStats {
    187 
    188         /** Degrees of freedom in numerator (between groups). */
    189         private int dfbg;
    190 
    191         /** Degrees of freedom in denominator (within groups). */
    192         private int dfwg;
    193 
    194         /** Statistic. */
    195         private double F;
    196 
    197         /**
    198          * Constructor
    199          * @param dfbg degrees of freedom in numerator (between groups)
    200          * @param dfwg degrees of freedom in denominator (within groups)
    201          * @param F statistic
    202          */
    203         private AnovaStats(int dfbg, int dfwg, double F) {
    204             this.dfbg = dfbg;
    205             this.dfwg = dfwg;
    206             this.F = F;
    207         }
    208     }
    209 
    210 }
    211