Home | History | Annotate | Download | only in inference
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one or more
      3  * contributor license agreements.  See the NOTICE file distributed with
      4  * this work for additional information regarding copyright ownership.
      5  * The ASF licenses this file to You under the Apache License, Version 2.0
      6  * (the "License"); you may not use this file except in compliance with
      7  * the License.  You may obtain a copy of the License at
      8  *
      9  *      http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  */
     17 package org.apache.commons.math.stat.inference;
     18 
     19 import org.apache.commons.math.MathException;
     20 import org.apache.commons.math.MathRuntimeException;
     21 import org.apache.commons.math.distribution.ChiSquaredDistribution;
     22 import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;
     23 import org.apache.commons.math.exception.util.LocalizedFormats;
     24 import org.apache.commons.math.util.FastMath;
     25 
     26 /**
     27  * Implements Chi-Square test statistics defined in the
     28  * {@link UnknownDistributionChiSquareTest} interface.
     29  *
     30  * @version $Revision: 990655 $ $Date: 2010-08-29 23:49:40 +0200 (dim. 29 aot 2010) $
     31  */
     32 public class ChiSquareTestImpl implements UnknownDistributionChiSquareTest {
     33 
     34     /** Distribution used to compute inference statistics. */
     35     private ChiSquaredDistribution distribution;
     36 
     37     /**
     38      * Construct a ChiSquareTestImpl
     39      */
     40     public ChiSquareTestImpl() {
     41         this(new ChiSquaredDistributionImpl(1.0));
     42     }
     43 
     44     /**
     45      * Create a test instance using the given distribution for computing
     46      * inference statistics.
     47      * @param x distribution used to compute inference statistics.
     48      * @since 1.2
     49      */
     50     public ChiSquareTestImpl(ChiSquaredDistribution x) {
     51         super();
     52         setDistribution(x);
     53     }
     54      /**
     55      * {@inheritDoc}
     56      * <p><strong>Note: </strong>This implementation rescales the
     57      * <code>expected</code> array if necessary to ensure that the sum of the
     58      * expected and observed counts are equal.</p>
     59      *
     60      * @param observed array of observed frequency counts
     61      * @param expected array of expected frequency counts
     62      * @return chi-square test statistic
     63      * @throws IllegalArgumentException if preconditions are not met
     64      * or length is less than 2
     65      */
     66     public double chiSquare(double[] expected, long[] observed)
     67         throws IllegalArgumentException {
     68         if (expected.length < 2) {
     69             throw MathRuntimeException.createIllegalArgumentException(
     70                   LocalizedFormats.INSUFFICIENT_DIMENSION, expected.length, 2);
     71         }
     72         if (expected.length != observed.length) {
     73             throw MathRuntimeException.createIllegalArgumentException(
     74                   LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE, expected.length, observed.length);
     75         }
     76         checkPositive(expected);
     77         checkNonNegative(observed);
     78         double sumExpected = 0d;
     79         double sumObserved = 0d;
     80         for (int i = 0; i < observed.length; i++) {
     81             sumExpected += expected[i];
     82             sumObserved += observed[i];
     83         }
     84         double ratio = 1.0d;
     85         boolean rescale = false;
     86         if (FastMath.abs(sumExpected - sumObserved) > 10E-6) {
     87             ratio = sumObserved / sumExpected;
     88             rescale = true;
     89         }
     90         double sumSq = 0.0d;
     91         for (int i = 0; i < observed.length; i++) {
     92             if (rescale) {
     93                 final double dev = observed[i] - ratio * expected[i];
     94                 sumSq += dev * dev / (ratio * expected[i]);
     95             } else {
     96                 final double dev = observed[i] - expected[i];
     97                 sumSq += dev * dev / expected[i];
     98             }
     99         }
    100         return sumSq;
    101     }
    102 
    103     /**
    104      * {@inheritDoc}
    105      * <p><strong>Note: </strong>This implementation rescales the
    106      * <code>expected</code> array if necessary to ensure that the sum of the
    107      * expected and observed counts are equal.</p>
    108      *
    109      * @param observed array of observed frequency counts
    110      * @param expected array of expected frequency counts
    111      * @return p-value
    112      * @throws IllegalArgumentException if preconditions are not met
    113      * @throws MathException if an error occurs computing the p-value
    114      */
    115     public double chiSquareTest(double[] expected, long[] observed)
    116         throws IllegalArgumentException, MathException {
    117         distribution.setDegreesOfFreedom(expected.length - 1.0);
    118         return 1.0 - distribution.cumulativeProbability(
    119             chiSquare(expected, observed));
    120     }
    121 
    122     /**
    123      * {@inheritDoc}
    124      * <p><strong>Note: </strong>This implementation rescales the
    125      * <code>expected</code> array if necessary to ensure that the sum of the
    126      * expected and observed counts are equal.</p>
    127      *
    128      * @param observed array of observed frequency counts
    129      * @param expected array of expected frequency counts
    130      * @param alpha significance level of the test
    131      * @return true iff null hypothesis can be rejected with confidence
    132      * 1 - alpha
    133      * @throws IllegalArgumentException if preconditions are not met
    134      * @throws MathException if an error occurs performing the test
    135      */
    136     public boolean chiSquareTest(double[] expected, long[] observed,
    137             double alpha) throws IllegalArgumentException, MathException {
    138         if ((alpha <= 0) || (alpha > 0.5)) {
    139             throw MathRuntimeException.createIllegalArgumentException(
    140                   LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
    141                   alpha, 0, 0.5);
    142         }
    143         return chiSquareTest(expected, observed) < alpha;
    144     }
    145 
    146     /**
    147      * @param counts array representation of 2-way table
    148      * @return chi-square test statistic
    149      * @throws IllegalArgumentException if preconditions are not met
    150      */
    151     public double chiSquare(long[][] counts) throws IllegalArgumentException {
    152 
    153         checkArray(counts);
    154         int nRows = counts.length;
    155         int nCols = counts[0].length;
    156 
    157         // compute row, column and total sums
    158         double[] rowSum = new double[nRows];
    159         double[] colSum = new double[nCols];
    160         double total = 0.0d;
    161         for (int row = 0; row < nRows; row++) {
    162             for (int col = 0; col < nCols; col++) {
    163                 rowSum[row] += counts[row][col];
    164                 colSum[col] += counts[row][col];
    165                 total += counts[row][col];
    166             }
    167         }
    168 
    169         // compute expected counts and chi-square
    170         double sumSq = 0.0d;
    171         double expected = 0.0d;
    172         for (int row = 0; row < nRows; row++) {
    173             for (int col = 0; col < nCols; col++) {
    174                 expected = (rowSum[row] * colSum[col]) / total;
    175                 sumSq += ((counts[row][col] - expected) *
    176                         (counts[row][col] - expected)) / expected;
    177             }
    178         }
    179         return sumSq;
    180     }
    181 
    182     /**
    183      * @param counts array representation of 2-way table
    184      * @return p-value
    185      * @throws IllegalArgumentException if preconditions are not met
    186      * @throws MathException if an error occurs computing the p-value
    187      */
    188     public double chiSquareTest(long[][] counts)
    189     throws IllegalArgumentException, MathException {
    190         checkArray(counts);
    191         double df = ((double) counts.length -1) * ((double) counts[0].length - 1);
    192         distribution.setDegreesOfFreedom(df);
    193         return 1 - distribution.cumulativeProbability(chiSquare(counts));
    194     }
    195 
    196     /**
    197      * @param counts array representation of 2-way table
    198      * @param alpha significance level of the test
    199      * @return true iff null hypothesis can be rejected with confidence
    200      * 1 - alpha
    201      * @throws IllegalArgumentException if preconditions are not met
    202      * @throws MathException if an error occurs performing the test
    203      */
    204     public boolean chiSquareTest(long[][] counts, double alpha)
    205     throws IllegalArgumentException, MathException {
    206         if ((alpha <= 0) || (alpha > 0.5)) {
    207             throw MathRuntimeException.createIllegalArgumentException(
    208                   LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
    209                   alpha, 0.0, 0.5);
    210         }
    211         return chiSquareTest(counts) < alpha;
    212     }
    213 
    214     /**
    215      * @param observed1 array of observed frequency counts of the first data set
    216      * @param observed2 array of observed frequency counts of the second data set
    217      * @return chi-square test statistic
    218      * @throws IllegalArgumentException if preconditions are not met
    219      * @since 1.2
    220      */
    221     public double chiSquareDataSetsComparison(long[] observed1, long[] observed2)
    222         throws IllegalArgumentException {
    223 
    224         // Make sure lengths are same
    225         if (observed1.length < 2) {
    226             throw MathRuntimeException.createIllegalArgumentException(
    227                   LocalizedFormats.INSUFFICIENT_DIMENSION, observed1.length, 2);
    228         }
    229         if (observed1.length != observed2.length) {
    230             throw MathRuntimeException.createIllegalArgumentException(
    231                   LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
    232                   observed1.length, observed2.length);
    233         }
    234 
    235         // Ensure non-negative counts
    236         checkNonNegative(observed1);
    237         checkNonNegative(observed2);
    238 
    239         // Compute and compare count sums
    240         long countSum1 = 0;
    241         long countSum2 = 0;
    242         boolean unequalCounts = false;
    243         double weight = 0.0;
    244         for (int i = 0; i < observed1.length; i++) {
    245             countSum1 += observed1[i];
    246             countSum2 += observed2[i];
    247         }
    248         // Ensure neither sample is uniformly 0
    249         if (countSum1 == 0) {
    250             throw MathRuntimeException.createIllegalArgumentException(
    251                   LocalizedFormats.OBSERVED_COUNTS_ALL_ZERO, 1);
    252         }
    253         if (countSum2 == 0) {
    254             throw MathRuntimeException.createIllegalArgumentException(
    255                   LocalizedFormats.OBSERVED_COUNTS_ALL_ZERO, 2);
    256         }
    257         // Compare and compute weight only if different
    258         unequalCounts = countSum1 != countSum2;
    259         if (unequalCounts) {
    260             weight = FastMath.sqrt((double) countSum1 / (double) countSum2);
    261         }
    262         // Compute ChiSquare statistic
    263         double sumSq = 0.0d;
    264         double dev = 0.0d;
    265         double obs1 = 0.0d;
    266         double obs2 = 0.0d;
    267         for (int i = 0; i < observed1.length; i++) {
    268             if (observed1[i] == 0 && observed2[i] == 0) {
    269                 throw MathRuntimeException.createIllegalArgumentException(
    270                       LocalizedFormats.OBSERVED_COUNTS_BOTTH_ZERO_FOR_ENTRY, i);
    271             } else {
    272                 obs1 = observed1[i];
    273                 obs2 = observed2[i];
    274                 if (unequalCounts) { // apply weights
    275                     dev = obs1/weight - obs2 * weight;
    276                 } else {
    277                     dev = obs1 - obs2;
    278                 }
    279                 sumSq += (dev * dev) / (obs1 + obs2);
    280             }
    281         }
    282         return sumSq;
    283     }
    284 
    285     /**
    286      * @param observed1 array of observed frequency counts of the first data set
    287      * @param observed2 array of observed frequency counts of the second data set
    288      * @return p-value
    289      * @throws IllegalArgumentException if preconditions are not met
    290      * @throws MathException if an error occurs computing the p-value
    291      * @since 1.2
    292      */
    293     public double chiSquareTestDataSetsComparison(long[] observed1, long[] observed2)
    294         throws IllegalArgumentException, MathException {
    295         distribution.setDegreesOfFreedom((double) observed1.length - 1);
    296         return 1 - distribution.cumulativeProbability(
    297                 chiSquareDataSetsComparison(observed1, observed2));
    298     }
    299 
    300     /**
    301      * @param observed1 array of observed frequency counts of the first data set
    302      * @param observed2 array of observed frequency counts of the second data set
    303      * @param alpha significance level of the test
    304      * @return true iff null hypothesis can be rejected with confidence
    305      * 1 - alpha
    306      * @throws IllegalArgumentException if preconditions are not met
    307      * @throws MathException if an error occurs performing the test
    308      * @since 1.2
    309      */
    310     public boolean chiSquareTestDataSetsComparison(long[] observed1, long[] observed2,
    311             double alpha) throws IllegalArgumentException, MathException {
    312         if ((alpha <= 0) || (alpha > 0.5)) {
    313             throw MathRuntimeException.createIllegalArgumentException(
    314                   LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
    315                   alpha, 0.0, 0.5);
    316         }
    317         return chiSquareTestDataSetsComparison(observed1, observed2) < alpha;
    318     }
    319 
    320     /**
    321      * Checks to make sure that the input long[][] array is rectangular,
    322      * has at least 2 rows and 2 columns, and has all non-negative entries,
    323      * throwing IllegalArgumentException if any of these checks fail.
    324      *
    325      * @param in input 2-way table to check
    326      * @throws IllegalArgumentException if the array is not valid
    327      */
    328     private void checkArray(long[][] in) throws IllegalArgumentException {
    329 
    330         if (in.length < 2) {
    331             throw MathRuntimeException.createIllegalArgumentException(
    332                   LocalizedFormats.INSUFFICIENT_DIMENSION, in.length, 2);
    333         }
    334 
    335         if (in[0].length < 2) {
    336             throw MathRuntimeException.createIllegalArgumentException(
    337                   LocalizedFormats.INSUFFICIENT_DIMENSION, in[0].length, 2);
    338         }
    339 
    340         checkRectangular(in);
    341         checkNonNegative(in);
    342 
    343     }
    344 
    345     //---------------------  Private array methods -- should find a utility home for these
    346 
    347     /**
    348      * Throws IllegalArgumentException if the input array is not rectangular.
    349      *
    350      * @param in array to be tested
    351      * @throws NullPointerException if input array is null
    352      * @throws IllegalArgumentException if input array is not rectangular
    353      */
    354     private void checkRectangular(long[][] in) {
    355         for (int i = 1; i < in.length; i++) {
    356             if (in[i].length != in[0].length) {
    357                 throw MathRuntimeException.createIllegalArgumentException(
    358                       LocalizedFormats.DIFFERENT_ROWS_LENGTHS,
    359                       in[i].length, in[0].length);
    360             }
    361         }
    362     }
    363 
    364     /**
    365      * Check all entries of the input array are > 0.
    366      *
    367      * @param in array to be tested
    368      * @exception IllegalArgumentException if one entry is not positive
    369      */
    370     private void checkPositive(double[] in) throws IllegalArgumentException {
    371         for (int i = 0; i < in.length; i++) {
    372             if (in[i] <= 0) {
    373                 throw MathRuntimeException.createIllegalArgumentException(
    374                       LocalizedFormats.NOT_POSITIVE_ELEMENT_AT_INDEX,
    375                       i, in[i]);
    376             }
    377         }
    378     }
    379 
    380     /**
    381      * Check all entries of the input array are >= 0.
    382      *
    383      * @param in array to be tested
    384      * @exception IllegalArgumentException if one entry is negative
    385      */
    386     private void checkNonNegative(long[] in) throws IllegalArgumentException {
    387         for (int i = 0; i < in.length; i++) {
    388             if (in[i] < 0) {
    389                 throw MathRuntimeException.createIllegalArgumentException(
    390                       LocalizedFormats.NEGATIVE_ELEMENT_AT_INDEX,
    391                       i, in[i]);
    392             }
    393         }
    394     }
    395 
    396     /**
    397      * Check all entries of the input array are >= 0.
    398      *
    399      * @param in array to be tested
    400      * @exception IllegalArgumentException if one entry is negative
    401      */
    402     private void checkNonNegative(long[][] in) throws IllegalArgumentException {
    403         for (int i = 0; i < in.length; i ++) {
    404             for (int j = 0; j < in[i].length; j++) {
    405                 if (in[i][j] < 0) {
    406                     throw MathRuntimeException.createIllegalArgumentException(
    407                           LocalizedFormats.NEGATIVE_ELEMENT_AT_2D_INDEX,
    408                           i, j, in[i][j]);
    409                 }
    410             }
    411         }
    412     }
    413 
    414     /**
    415      * Modify the distribution used to compute inference statistics.
    416      *
    417      * @param value
    418      *            the new distribution
    419      * @since 1.2
    420      */
    421     public void setDistribution(ChiSquaredDistribution value) {
    422         distribution = value;
    423     }
    424 }
    425