Home | History | Annotate | Download | only in moment
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one or more
      3  * contributor license agreements.  See the NOTICE file distributed with
      4  * this work for additional information regarding copyright ownership.
      5  * The ASF licenses this file to You under the Apache License, Version 2.0
      6  * (the "License"); you may not use this file except in compliance with
      7  * the License.  You may obtain a copy of the License at
      8  *
      9  *      http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  */
     17 package org.apache.commons.math.stat.descriptive.moment;
     18 
     19 import java.io.Serializable;
     20 
     21 import org.apache.commons.math.exception.NullArgumentException;
     22 import org.apache.commons.math.exception.util.LocalizedFormats;
     23 import org.apache.commons.math.stat.descriptive.WeightedEvaluation;
     24 import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic;
     25 
     26 /**
     27  * Computes the variance of the available values.  By default, the unbiased
     28  * "sample variance" definitional formula is used:
     29  * <p>
     30  * variance = sum((x_i - mean)^2) / (n - 1) </p>
     31  * <p>
     32  * where mean is the {@link Mean} and <code>n</code> is the number
     33  * of sample observations.</p>
     34  * <p>
     35  * The definitional formula does not have good numerical properties, so
     36  * this implementation does not compute the statistic using the definitional
     37  * formula. <ul>
     38  * <li> The <code>getResult</code> method computes the variance using
     39  * updating formulas based on West's algorithm, as described in
     40  * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
     41  * J. G. Lewis 1979, <i>Communications of the ACM</i>,
     42  * vol. 22 no. 9, pp. 526-531.</a></li>
     43  * <li> The <code>evaluate</code> methods leverage the fact that they have the
     44  * full array of values in memory to execute a two-pass algorithm.
     45  * Specifically, these methods use the "corrected two-pass algorithm" from
     46  * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
     47  * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
     48  * Note that adding values using <code>increment</code> or
     49  * <code>incrementAll</code> and then executing <code>getResult</code> will
     50  * sometimes give a different, less accurate, result than executing
     51  * <code>evaluate</code> with the full array of values. The former approach
     52  * should only be used when the full array of values is not available.</p>
     53  * <p>
     54  * The "population variance"  ( sum((x_i - mean)^2) / n ) can also
     55  * be computed using this statistic.  The <code>isBiasCorrected</code>
     56  * property determines whether the "population" or "sample" value is
     57  * returned by the <code>evaluate</code> and <code>getResult</code> methods.
     58  * To compute population variances, set this property to <code>false.</code>
     59  * </p>
     60  * <p>
     61  * <strong>Note that this implementation is not synchronized.</strong> If
     62  * multiple threads access an instance of this class concurrently, and at least
     63  * one of the threads invokes the <code>increment()</code> or
     64  * <code>clear()</code> method, it must be synchronized externally.</p>
     65  *
     66  * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $
     67  */
     68 public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation {
     69 
     70     /** Serializable version identifier */
     71     private static final long serialVersionUID = -9111962718267217978L;
     72 
     73     /** SecondMoment is used in incremental calculation of Variance*/
     74     protected SecondMoment moment = null;
     75 
     76     /**
     77      * Boolean test to determine if this Variance should also increment
     78      * the second moment, this evaluates to false when this Variance is
     79      * constructed with an external SecondMoment as a parameter.
     80      */
     81     protected boolean incMoment = true;
     82 
     83     /**
     84      * Determines whether or not bias correction is applied when computing the
     85      * value of the statisic.  True means that bias is corrected.  See
     86      * {@link Variance} for details on the formula.
     87      */
     88     private boolean isBiasCorrected = true;
     89 
     90     /**
     91      * Constructs a Variance with default (true) <code>isBiasCorrected</code>
     92      * property.
     93      */
     94     public Variance() {
     95         moment = new SecondMoment();
     96     }
     97 
     98     /**
     99      * Constructs a Variance based on an external second moment.
    100      *
    101      * @param m2 the SecondMoment (Third or Fourth moments work
    102      * here as well.)
    103      */
    104     public Variance(final SecondMoment m2) {
    105         incMoment = false;
    106         this.moment = m2;
    107     }
    108 
    109     /**
    110      * Constructs a Variance with the specified <code>isBiasCorrected</code>
    111      * property
    112      *
    113      * @param isBiasCorrected  setting for bias correction - true means
    114      * bias will be corrected and is equivalent to using the argumentless
    115      * constructor
    116      */
    117     public Variance(boolean isBiasCorrected) {
    118         moment = new SecondMoment();
    119         this.isBiasCorrected = isBiasCorrected;
    120     }
    121 
    122     /**
    123      * Constructs a Variance with the specified <code>isBiasCorrected</code>
    124      * property and the supplied external second moment.
    125      *
    126      * @param isBiasCorrected  setting for bias correction - true means
    127      * bias will be corrected
    128      * @param m2 the SecondMoment (Third or Fourth moments work
    129      * here as well.)
    130      */
    131     public Variance(boolean isBiasCorrected, SecondMoment m2) {
    132         incMoment = false;
    133         this.moment = m2;
    134         this.isBiasCorrected = isBiasCorrected;
    135     }
    136 
    137     /**
    138      * Copy constructor, creates a new {@code Variance} identical
    139      * to the {@code original}
    140      *
    141      * @param original the {@code Variance} instance to copy
    142      */
    143     public Variance(Variance original) {
    144         copy(original, this);
    145     }
    146 
    147     /**
    148      * {@inheritDoc}
    149      * <p>If all values are available, it is more accurate to use
    150      * {@link #evaluate(double[])} rather than adding values one at a time
    151      * using this method and then executing {@link #getResult}, since
    152      * <code>evaluate</code> leverages the fact that is has the full
    153      * list of values together to execute a two-pass algorithm.
    154      * See {@link Variance}.</p>
    155      */
    156     @Override
    157     public void increment(final double d) {
    158         if (incMoment) {
    159             moment.increment(d);
    160         }
    161     }
    162 
    163     /**
    164      * {@inheritDoc}
    165      */
    166     @Override
    167     public double getResult() {
    168             if (moment.n == 0) {
    169                 return Double.NaN;
    170             } else if (moment.n == 1) {
    171                 return 0d;
    172             } else {
    173                 if (isBiasCorrected) {
    174                     return moment.m2 / (moment.n - 1d);
    175                 } else {
    176                     return moment.m2 / (moment.n);
    177                 }
    178             }
    179     }
    180 
    181     /**
    182      * {@inheritDoc}
    183      */
    184     public long getN() {
    185         return moment.getN();
    186     }
    187 
    188     /**
    189      * {@inheritDoc}
    190      */
    191     @Override
    192     public void clear() {
    193         if (incMoment) {
    194             moment.clear();
    195         }
    196     }
    197 
    198     /**
    199      * Returns the variance of the entries in the input array, or
    200      * <code>Double.NaN</code> if the array is empty.
    201      * <p>
    202      * See {@link Variance} for details on the computing algorithm.</p>
    203      * <p>
    204      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
    205      * <p>
    206      * Throws <code>IllegalArgumentException</code> if the array is null.</p>
    207      * <p>
    208      * Does not change the internal state of the statistic.</p>
    209      *
    210      * @param values the input array
    211      * @return the variance of the values or Double.NaN if length = 0
    212      * @throws IllegalArgumentException if the array is null
    213      */
    214     @Override
    215     public double evaluate(final double[] values) {
    216         if (values == null) {
    217             throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
    218         }
    219         return evaluate(values, 0, values.length);
    220     }
    221 
    222     /**
    223      * Returns the variance of the entries in the specified portion of
    224      * the input array, or <code>Double.NaN</code> if the designated subarray
    225      * is empty.
    226      * <p>
    227      * See {@link Variance} for details on the computing algorithm.</p>
    228      * <p>
    229      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
    230      * <p>
    231      * Does not change the internal state of the statistic.</p>
    232      * <p>
    233      * Throws <code>IllegalArgumentException</code> if the array is null.</p>
    234      *
    235      * @param values the input array
    236      * @param begin index of the first array element to include
    237      * @param length the number of elements to include
    238      * @return the variance of the values or Double.NaN if length = 0
    239      * @throws IllegalArgumentException if the array is null or the array index
    240      *  parameters are not valid
    241      */
    242     @Override
    243     public double evaluate(final double[] values, final int begin, final int length) {
    244 
    245         double var = Double.NaN;
    246 
    247         if (test(values, begin, length)) {
    248             clear();
    249             if (length == 1) {
    250                 var = 0.0;
    251             } else if (length > 1) {
    252                 Mean mean = new Mean();
    253                 double m = mean.evaluate(values, begin, length);
    254                 var = evaluate(values, m, begin, length);
    255             }
    256         }
    257         return var;
    258     }
    259 
    260     /**
    261      * <p>Returns the weighted variance of the entries in the specified portion of
    262      * the input array, or <code>Double.NaN</code> if the designated subarray
    263      * is empty.</p>
    264      * <p>
    265      * Uses the formula <pre>
    266      *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
    267      * </pre>
    268      * where weightedMean is the weighted mean</p>
    269      * <p>
    270      * This formula will not return the same result as the unweighted variance when all
    271      * weights are equal, unless all weights are equal to 1. The formula assumes that
    272      * weights are to be treated as "expansion values," as will be the case if for example
    273      * the weights represent frequency counts. To normalize weights so that the denominator
    274      * in the variance computation equals the length of the input vector minus one, use <pre>
    275      *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code>
    276      * </pre>
    277      * <p>
    278      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
    279      * <p>
    280      * Throws <code>IllegalArgumentException</code> if any of the following are true:
    281      * <ul><li>the values array is null</li>
    282      *     <li>the weights array is null</li>
    283      *     <li>the weights array does not have the same length as the values array</li>
    284      *     <li>the weights array contains one or more infinite values</li>
    285      *     <li>the weights array contains one or more NaN values</li>
    286      *     <li>the weights array contains negative values</li>
    287      *     <li>the start and length arguments do not determine a valid array</li>
    288      * </ul></p>
    289      * <p>
    290      * Does not change the internal state of the statistic.</p>
    291      * <p>
    292      * Throws <code>IllegalArgumentException</code> if either array is null.</p>
    293      *
    294      * @param values the input array
    295      * @param weights the weights array
    296      * @param begin index of the first array element to include
    297      * @param length the number of elements to include
    298      * @return the weighted variance of the values or Double.NaN if length = 0
    299      * @throws IllegalArgumentException if the parameters are not valid
    300      * @since 2.1
    301      */
    302     public double evaluate(final double[] values, final double[] weights,
    303                            final int begin, final int length) {
    304 
    305         double var = Double.NaN;
    306 
    307         if (test(values, weights,begin, length)) {
    308             clear();
    309             if (length == 1) {
    310                 var = 0.0;
    311             } else if (length > 1) {
    312                 Mean mean = new Mean();
    313                 double m = mean.evaluate(values, weights, begin, length);
    314                 var = evaluate(values, weights, m, begin, length);
    315             }
    316         }
    317         return var;
    318     }
    319 
    320     /**
    321      * <p>
    322      * Returns the weighted variance of the entries in the the input array.</p>
    323      * <p>
    324      * Uses the formula <pre>
    325      *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
    326      * </pre>
    327      * where weightedMean is the weighted mean</p>
    328      * <p>
    329      * This formula will not return the same result as the unweighted variance when all
    330      * weights are equal, unless all weights are equal to 1. The formula assumes that
    331      * weights are to be treated as "expansion values," as will be the case if for example
    332      * the weights represent frequency counts. To normalize weights so that the denominator
    333      * in the variance computation equals the length of the input vector minus one, use <pre>
    334      *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code>
    335      * </pre>
    336      * <p>
    337      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
    338      * <p>
    339      * Throws <code>IllegalArgumentException</code> if any of the following are true:
    340      * <ul><li>the values array is null</li>
    341      *     <li>the weights array is null</li>
    342      *     <li>the weights array does not have the same length as the values array</li>
    343      *     <li>the weights array contains one or more infinite values</li>
    344      *     <li>the weights array contains one or more NaN values</li>
    345      *     <li>the weights array contains negative values</li>
    346      * </ul></p>
    347      * <p>
    348      * Does not change the internal state of the statistic.</p>
    349      * <p>
    350      * Throws <code>IllegalArgumentException</code> if either array is null.</p>
    351      *
    352      * @param values the input array
    353      * @param weights the weights array
    354      * @return the weighted variance of the values
    355      * @throws IllegalArgumentException if the parameters are not valid
    356      * @since 2.1
    357      */
    358     public double evaluate(final double[] values, final double[] weights) {
    359         return evaluate(values, weights, 0, values.length);
    360     }
    361 
    362     /**
    363      * Returns the variance of the entries in the specified portion of
    364      * the input array, using the precomputed mean value.  Returns
    365      * <code>Double.NaN</code> if the designated subarray is empty.
    366      * <p>
    367      * See {@link Variance} for details on the computing algorithm.</p>
    368      * <p>
    369      * The formula used assumes that the supplied mean value is the arithmetic
    370      * mean of the sample data, not a known population parameter.  This method
    371      * is supplied only to save computation when the mean has already been
    372      * computed.</p>
    373      * <p>
    374      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
    375      * <p>
    376      * Throws <code>IllegalArgumentException</code> if the array is null.</p>
    377      * <p>
    378      * Does not change the internal state of the statistic.</p>
    379      *
    380      * @param values the input array
    381      * @param mean the precomputed mean value
    382      * @param begin index of the first array element to include
    383      * @param length the number of elements to include
    384      * @return the variance of the values or Double.NaN if length = 0
    385      * @throws IllegalArgumentException if the array is null or the array index
    386      *  parameters are not valid
    387      */
    388     public double evaluate(final double[] values, final double mean,
    389             final int begin, final int length) {
    390 
    391         double var = Double.NaN;
    392 
    393         if (test(values, begin, length)) {
    394             if (length == 1) {
    395                 var = 0.0;
    396             } else if (length > 1) {
    397                 double accum = 0.0;
    398                 double dev = 0.0;
    399                 double accum2 = 0.0;
    400                 for (int i = begin; i < begin + length; i++) {
    401                     dev = values[i] - mean;
    402                     accum += dev * dev;
    403                     accum2 += dev;
    404                 }
    405                 double len = length;
    406                 if (isBiasCorrected) {
    407                     var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
    408                 } else {
    409                     var = (accum - (accum2 * accum2 / len)) / len;
    410                 }
    411             }
    412         }
    413         return var;
    414     }
    415 
    416     /**
    417      * Returns the variance of the entries in the input array, using the
    418      * precomputed mean value.  Returns <code>Double.NaN</code> if the array
    419      * is empty.
    420      * <p>
    421      * See {@link Variance} for details on the computing algorithm.</p>
    422      * <p>
    423      * If <code>isBiasCorrected</code> is <code>true</code> the formula used
    424      * assumes that the supplied mean value is the arithmetic mean of the
    425      * sample data, not a known population parameter.  If the mean is a known
    426      * population parameter, or if the "population" version of the variance is
    427      * desired, set <code>isBiasCorrected</code> to <code>false</code> before
    428      * invoking this method.</p>
    429      * <p>
    430      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
    431      * <p>
    432      * Throws <code>IllegalArgumentException</code> if the array is null.</p>
    433      * <p>
    434      * Does not change the internal state of the statistic.</p>
    435      *
    436      * @param values the input array
    437      * @param mean the precomputed mean value
    438      * @return the variance of the values or Double.NaN if the array is empty
    439      * @throws IllegalArgumentException if the array is null
    440      */
    441     public double evaluate(final double[] values, final double mean) {
    442         return evaluate(values, mean, 0, values.length);
    443     }
    444 
    445     /**
    446      * Returns the weighted variance of the entries in the specified portion of
    447      * the input array, using the precomputed weighted mean value.  Returns
    448      * <code>Double.NaN</code> if the designated subarray is empty.
    449      * <p>
    450      * Uses the formula <pre>
    451      *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
    452      * </pre></p>
    453      * <p>
    454      * The formula used assumes that the supplied mean value is the weighted arithmetic
    455      * mean of the sample data, not a known population parameter. This method
    456      * is supplied only to save computation when the mean has already been
    457      * computed.</p>
    458      * <p>
    459      * This formula will not return the same result as the unweighted variance when all
    460      * weights are equal, unless all weights are equal to 1. The formula assumes that
    461      * weights are to be treated as "expansion values," as will be the case if for example
    462      * the weights represent frequency counts. To normalize weights so that the denominator
    463      * in the variance computation equals the length of the input vector minus one, use <pre>
    464      *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code>
    465      * </pre>
    466      * <p>
    467      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
    468      * <p>
    469      * Throws <code>IllegalArgumentException</code> if any of the following are true:
    470      * <ul><li>the values array is null</li>
    471      *     <li>the weights array is null</li>
    472      *     <li>the weights array does not have the same length as the values array</li>
    473      *     <li>the weights array contains one or more infinite values</li>
    474      *     <li>the weights array contains one or more NaN values</li>
    475      *     <li>the weights array contains negative values</li>
    476      *     <li>the start and length arguments do not determine a valid array</li>
    477      * </ul></p>
    478      * <p>
    479      * Does not change the internal state of the statistic.</p>
    480      *
    481      * @param values the input array
    482      * @param weights the weights array
    483      * @param mean the precomputed weighted mean value
    484      * @param begin index of the first array element to include
    485      * @param length the number of elements to include
    486      * @return the variance of the values or Double.NaN if length = 0
    487      * @throws IllegalArgumentException if the parameters are not valid
    488      * @since 2.1
    489      */
    490     public double evaluate(final double[] values, final double[] weights,
    491                            final double mean, final int begin, final int length) {
    492 
    493         double var = Double.NaN;
    494 
    495         if (test(values, weights, begin, length)) {
    496             if (length == 1) {
    497                 var = 0.0;
    498             } else if (length > 1) {
    499                 double accum = 0.0;
    500                 double dev = 0.0;
    501                 double accum2 = 0.0;
    502                 for (int i = begin; i < begin + length; i++) {
    503                     dev = values[i] - mean;
    504                     accum += weights[i] * (dev * dev);
    505                     accum2 += weights[i] * dev;
    506                 }
    507 
    508                 double sumWts = 0;
    509                 for (int i = 0; i < weights.length; i++) {
    510                     sumWts += weights[i];
    511                 }
    512 
    513                 if (isBiasCorrected) {
    514                     var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
    515                 } else {
    516                     var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
    517                 }
    518             }
    519         }
    520         return var;
    521     }
    522 
    523     /**
    524      * <p>Returns the weighted variance of the values in the input array, using
    525      * the precomputed weighted mean value.</p>
    526      * <p>
    527      * Uses the formula <pre>
    528      *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
    529      * </pre></p>
    530      * <p>
    531      * The formula used assumes that the supplied mean value is the weighted arithmetic
    532      * mean of the sample data, not a known population parameter. This method
    533      * is supplied only to save computation when the mean has already been
    534      * computed.</p>
    535      * <p>
    536      * This formula will not return the same result as the unweighted variance when all
    537      * weights are equal, unless all weights are equal to 1. The formula assumes that
    538      * weights are to be treated as "expansion values," as will be the case if for example
    539      * the weights represent frequency counts. To normalize weights so that the denominator
    540      * in the variance computation equals the length of the input vector minus one, use <pre>
    541      *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code>
    542      * </pre>
    543      * <p>
    544      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
    545      * <p>
    546      * Throws <code>IllegalArgumentException</code> if any of the following are true:
    547      * <ul><li>the values array is null</li>
    548      *     <li>the weights array is null</li>
    549      *     <li>the weights array does not have the same length as the values array</li>
    550      *     <li>the weights array contains one or more infinite values</li>
    551      *     <li>the weights array contains one or more NaN values</li>
    552      *     <li>the weights array contains negative values</li>
    553      * </ul></p>
    554      * <p>
    555      * Does not change the internal state of the statistic.</p>
    556      *
    557      * @param values the input array
    558      * @param weights the weights array
    559      * @param mean the precomputed weighted mean value
    560      * @return the variance of the values or Double.NaN if length = 0
    561      * @throws IllegalArgumentException if the parameters are not valid
    562      * @since 2.1
    563      */
    564     public double evaluate(final double[] values, final double[] weights, final double mean) {
    565         return evaluate(values, weights, mean, 0, values.length);
    566     }
    567 
    568     /**
    569      * @return Returns the isBiasCorrected.
    570      */
    571     public boolean isBiasCorrected() {
    572         return isBiasCorrected;
    573     }
    574 
    575     /**
    576      * @param biasCorrected The isBiasCorrected to set.
    577      */
    578     public void setBiasCorrected(boolean biasCorrected) {
    579         this.isBiasCorrected = biasCorrected;
    580     }
    581 
    582     /**
    583      * {@inheritDoc}
    584      */
    585     @Override
    586     public Variance copy() {
    587         Variance result = new Variance();
    588         copy(this, result);
    589         return result;
    590     }
    591 
    592     /**
    593      * Copies source to dest.
    594      * <p>Neither source nor dest can be null.</p>
    595      *
    596      * @param source Variance to copy
    597      * @param dest Variance to copy to
    598      * @throws NullPointerException if either source or dest is null
    599      */
    600     public static void copy(Variance source, Variance dest) {
    601         if (source == null ||
    602             dest == null) {
    603             throw new NullArgumentException();
    604         }
    605         dest.setData(source.getDataRef());
    606         dest.moment = source.moment.copy();
    607         dest.isBiasCorrected = source.isBiasCorrected;
    608         dest.incMoment = source.incMoment;
    609     }
    610 }
    611