1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.math.stat.descriptive.moment; 18 19 import java.io.Serializable; 20 21 import org.apache.commons.math.exception.NullArgumentException; 22 import org.apache.commons.math.exception.util.LocalizedFormats; 23 import org.apache.commons.math.stat.descriptive.WeightedEvaluation; 24 import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic; 25 26 /** 27 * Computes the variance of the available values. By default, the unbiased 28 * "sample variance" definitional formula is used: 29 * <p> 30 * variance = sum((x_i - mean)^2) / (n - 1) </p> 31 * <p> 32 * where mean is the {@link Mean} and <code>n</code> is the number 33 * of sample observations.</p> 34 * <p> 35 * The definitional formula does not have good numerical properties, so 36 * this implementation does not compute the statistic using the definitional 37 * formula. <ul> 38 * <li> The <code>getResult</code> method computes the variance using 39 * updating formulas based on West's algorithm, as described in 40 * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and 41 * J. G. Lewis 1979, <i>Communications of the ACM</i>, 42 * vol. 22 no. 9, pp. 526-531.</a></li> 43 * <li> The <code>evaluate</code> methods leverage the fact that they have the 44 * full array of values in memory to execute a two-pass algorithm. 45 * Specifically, these methods use the "corrected two-pass algorithm" from 46 * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>, 47 * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul> 48 * Note that adding values using <code>increment</code> or 49 * <code>incrementAll</code> and then executing <code>getResult</code> will 50 * sometimes give a different, less accurate, result than executing 51 * <code>evaluate</code> with the full array of values. The former approach 52 * should only be used when the full array of values is not available.</p> 53 * <p> 54 * The "population variance" ( sum((x_i - mean)^2) / n ) can also 55 * be computed using this statistic. The <code>isBiasCorrected</code> 56 * property determines whether the "population" or "sample" value is 57 * returned by the <code>evaluate</code> and <code>getResult</code> methods. 58 * To compute population variances, set this property to <code>false.</code> 59 * </p> 60 * <p> 61 * <strong>Note that this implementation is not synchronized.</strong> If 62 * multiple threads access an instance of this class concurrently, and at least 63 * one of the threads invokes the <code>increment()</code> or 64 * <code>clear()</code> method, it must be synchronized externally.</p> 65 * 66 * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ 67 */ 68 public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation { 69 70 /** Serializable version identifier */ 71 private static final long serialVersionUID = -9111962718267217978L; 72 73 /** SecondMoment is used in incremental calculation of Variance*/ 74 protected SecondMoment moment = null; 75 76 /** 77 * Boolean test to determine if this Variance should also increment 78 * the second moment, this evaluates to false when this Variance is 79 * constructed with an external SecondMoment as a parameter. 80 */ 81 protected boolean incMoment = true; 82 83 /** 84 * Determines whether or not bias correction is applied when computing the 85 * value of the statisic. True means that bias is corrected. See 86 * {@link Variance} for details on the formula. 87 */ 88 private boolean isBiasCorrected = true; 89 90 /** 91 * Constructs a Variance with default (true) <code>isBiasCorrected</code> 92 * property. 93 */ 94 public Variance() { 95 moment = new SecondMoment(); 96 } 97 98 /** 99 * Constructs a Variance based on an external second moment. 100 * 101 * @param m2 the SecondMoment (Third or Fourth moments work 102 * here as well.) 103 */ 104 public Variance(final SecondMoment m2) { 105 incMoment = false; 106 this.moment = m2; 107 } 108 109 /** 110 * Constructs a Variance with the specified <code>isBiasCorrected</code> 111 * property 112 * 113 * @param isBiasCorrected setting for bias correction - true means 114 * bias will be corrected and is equivalent to using the argumentless 115 * constructor 116 */ 117 public Variance(boolean isBiasCorrected) { 118 moment = new SecondMoment(); 119 this.isBiasCorrected = isBiasCorrected; 120 } 121 122 /** 123 * Constructs a Variance with the specified <code>isBiasCorrected</code> 124 * property and the supplied external second moment. 125 * 126 * @param isBiasCorrected setting for bias correction - true means 127 * bias will be corrected 128 * @param m2 the SecondMoment (Third or Fourth moments work 129 * here as well.) 130 */ 131 public Variance(boolean isBiasCorrected, SecondMoment m2) { 132 incMoment = false; 133 this.moment = m2; 134 this.isBiasCorrected = isBiasCorrected; 135 } 136 137 /** 138 * Copy constructor, creates a new {@code Variance} identical 139 * to the {@code original} 140 * 141 * @param original the {@code Variance} instance to copy 142 */ 143 public Variance(Variance original) { 144 copy(original, this); 145 } 146 147 /** 148 * {@inheritDoc} 149 * <p>If all values are available, it is more accurate to use 150 * {@link #evaluate(double[])} rather than adding values one at a time 151 * using this method and then executing {@link #getResult}, since 152 * <code>evaluate</code> leverages the fact that is has the full 153 * list of values together to execute a two-pass algorithm. 154 * See {@link Variance}.</p> 155 */ 156 @Override 157 public void increment(final double d) { 158 if (incMoment) { 159 moment.increment(d); 160 } 161 } 162 163 /** 164 * {@inheritDoc} 165 */ 166 @Override 167 public double getResult() { 168 if (moment.n == 0) { 169 return Double.NaN; 170 } else if (moment.n == 1) { 171 return 0d; 172 } else { 173 if (isBiasCorrected) { 174 return moment.m2 / (moment.n - 1d); 175 } else { 176 return moment.m2 / (moment.n); 177 } 178 } 179 } 180 181 /** 182 * {@inheritDoc} 183 */ 184 public long getN() { 185 return moment.getN(); 186 } 187 188 /** 189 * {@inheritDoc} 190 */ 191 @Override 192 public void clear() { 193 if (incMoment) { 194 moment.clear(); 195 } 196 } 197 198 /** 199 * Returns the variance of the entries in the input array, or 200 * <code>Double.NaN</code> if the array is empty. 201 * <p> 202 * See {@link Variance} for details on the computing algorithm.</p> 203 * <p> 204 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 205 * <p> 206 * Throws <code>IllegalArgumentException</code> if the array is null.</p> 207 * <p> 208 * Does not change the internal state of the statistic.</p> 209 * 210 * @param values the input array 211 * @return the variance of the values or Double.NaN if length = 0 212 * @throws IllegalArgumentException if the array is null 213 */ 214 @Override 215 public double evaluate(final double[] values) { 216 if (values == null) { 217 throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); 218 } 219 return evaluate(values, 0, values.length); 220 } 221 222 /** 223 * Returns the variance of the entries in the specified portion of 224 * the input array, or <code>Double.NaN</code> if the designated subarray 225 * is empty. 226 * <p> 227 * See {@link Variance} for details on the computing algorithm.</p> 228 * <p> 229 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 230 * <p> 231 * Does not change the internal state of the statistic.</p> 232 * <p> 233 * Throws <code>IllegalArgumentException</code> if the array is null.</p> 234 * 235 * @param values the input array 236 * @param begin index of the first array element to include 237 * @param length the number of elements to include 238 * @return the variance of the values or Double.NaN if length = 0 239 * @throws IllegalArgumentException if the array is null or the array index 240 * parameters are not valid 241 */ 242 @Override 243 public double evaluate(final double[] values, final int begin, final int length) { 244 245 double var = Double.NaN; 246 247 if (test(values, begin, length)) { 248 clear(); 249 if (length == 1) { 250 var = 0.0; 251 } else if (length > 1) { 252 Mean mean = new Mean(); 253 double m = mean.evaluate(values, begin, length); 254 var = evaluate(values, m, begin, length); 255 } 256 } 257 return var; 258 } 259 260 /** 261 * <p>Returns the weighted variance of the entries in the specified portion of 262 * the input array, or <code>Double.NaN</code> if the designated subarray 263 * is empty.</p> 264 * <p> 265 * Uses the formula <pre> 266 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1) 267 * </pre> 268 * where weightedMean is the weighted mean</p> 269 * <p> 270 * This formula will not return the same result as the unweighted variance when all 271 * weights are equal, unless all weights are equal to 1. The formula assumes that 272 * weights are to be treated as "expansion values," as will be the case if for example 273 * the weights represent frequency counts. To normalize weights so that the denominator 274 * in the variance computation equals the length of the input vector minus one, use <pre> 275 * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code> 276 * </pre> 277 * <p> 278 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 279 * <p> 280 * Throws <code>IllegalArgumentException</code> if any of the following are true: 281 * <ul><li>the values array is null</li> 282 * <li>the weights array is null</li> 283 * <li>the weights array does not have the same length as the values array</li> 284 * <li>the weights array contains one or more infinite values</li> 285 * <li>the weights array contains one or more NaN values</li> 286 * <li>the weights array contains negative values</li> 287 * <li>the start and length arguments do not determine a valid array</li> 288 * </ul></p> 289 * <p> 290 * Does not change the internal state of the statistic.</p> 291 * <p> 292 * Throws <code>IllegalArgumentException</code> if either array is null.</p> 293 * 294 * @param values the input array 295 * @param weights the weights array 296 * @param begin index of the first array element to include 297 * @param length the number of elements to include 298 * @return the weighted variance of the values or Double.NaN if length = 0 299 * @throws IllegalArgumentException if the parameters are not valid 300 * @since 2.1 301 */ 302 public double evaluate(final double[] values, final double[] weights, 303 final int begin, final int length) { 304 305 double var = Double.NaN; 306 307 if (test(values, weights,begin, length)) { 308 clear(); 309 if (length == 1) { 310 var = 0.0; 311 } else if (length > 1) { 312 Mean mean = new Mean(); 313 double m = mean.evaluate(values, weights, begin, length); 314 var = evaluate(values, weights, m, begin, length); 315 } 316 } 317 return var; 318 } 319 320 /** 321 * <p> 322 * Returns the weighted variance of the entries in the the input array.</p> 323 * <p> 324 * Uses the formula <pre> 325 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1) 326 * </pre> 327 * where weightedMean is the weighted mean</p> 328 * <p> 329 * This formula will not return the same result as the unweighted variance when all 330 * weights are equal, unless all weights are equal to 1. The formula assumes that 331 * weights are to be treated as "expansion values," as will be the case if for example 332 * the weights represent frequency counts. To normalize weights so that the denominator 333 * in the variance computation equals the length of the input vector minus one, use <pre> 334 * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code> 335 * </pre> 336 * <p> 337 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 338 * <p> 339 * Throws <code>IllegalArgumentException</code> if any of the following are true: 340 * <ul><li>the values array is null</li> 341 * <li>the weights array is null</li> 342 * <li>the weights array does not have the same length as the values array</li> 343 * <li>the weights array contains one or more infinite values</li> 344 * <li>the weights array contains one or more NaN values</li> 345 * <li>the weights array contains negative values</li> 346 * </ul></p> 347 * <p> 348 * Does not change the internal state of the statistic.</p> 349 * <p> 350 * Throws <code>IllegalArgumentException</code> if either array is null.</p> 351 * 352 * @param values the input array 353 * @param weights the weights array 354 * @return the weighted variance of the values 355 * @throws IllegalArgumentException if the parameters are not valid 356 * @since 2.1 357 */ 358 public double evaluate(final double[] values, final double[] weights) { 359 return evaluate(values, weights, 0, values.length); 360 } 361 362 /** 363 * Returns the variance of the entries in the specified portion of 364 * the input array, using the precomputed mean value. Returns 365 * <code>Double.NaN</code> if the designated subarray is empty. 366 * <p> 367 * See {@link Variance} for details on the computing algorithm.</p> 368 * <p> 369 * The formula used assumes that the supplied mean value is the arithmetic 370 * mean of the sample data, not a known population parameter. This method 371 * is supplied only to save computation when the mean has already been 372 * computed.</p> 373 * <p> 374 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 375 * <p> 376 * Throws <code>IllegalArgumentException</code> if the array is null.</p> 377 * <p> 378 * Does not change the internal state of the statistic.</p> 379 * 380 * @param values the input array 381 * @param mean the precomputed mean value 382 * @param begin index of the first array element to include 383 * @param length the number of elements to include 384 * @return the variance of the values or Double.NaN if length = 0 385 * @throws IllegalArgumentException if the array is null or the array index 386 * parameters are not valid 387 */ 388 public double evaluate(final double[] values, final double mean, 389 final int begin, final int length) { 390 391 double var = Double.NaN; 392 393 if (test(values, begin, length)) { 394 if (length == 1) { 395 var = 0.0; 396 } else if (length > 1) { 397 double accum = 0.0; 398 double dev = 0.0; 399 double accum2 = 0.0; 400 for (int i = begin; i < begin + length; i++) { 401 dev = values[i] - mean; 402 accum += dev * dev; 403 accum2 += dev; 404 } 405 double len = length; 406 if (isBiasCorrected) { 407 var = (accum - (accum2 * accum2 / len)) / (len - 1.0); 408 } else { 409 var = (accum - (accum2 * accum2 / len)) / len; 410 } 411 } 412 } 413 return var; 414 } 415 416 /** 417 * Returns the variance of the entries in the input array, using the 418 * precomputed mean value. Returns <code>Double.NaN</code> if the array 419 * is empty. 420 * <p> 421 * See {@link Variance} for details on the computing algorithm.</p> 422 * <p> 423 * If <code>isBiasCorrected</code> is <code>true</code> the formula used 424 * assumes that the supplied mean value is the arithmetic mean of the 425 * sample data, not a known population parameter. If the mean is a known 426 * population parameter, or if the "population" version of the variance is 427 * desired, set <code>isBiasCorrected</code> to <code>false</code> before 428 * invoking this method.</p> 429 * <p> 430 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 431 * <p> 432 * Throws <code>IllegalArgumentException</code> if the array is null.</p> 433 * <p> 434 * Does not change the internal state of the statistic.</p> 435 * 436 * @param values the input array 437 * @param mean the precomputed mean value 438 * @return the variance of the values or Double.NaN if the array is empty 439 * @throws IllegalArgumentException if the array is null 440 */ 441 public double evaluate(final double[] values, final double mean) { 442 return evaluate(values, mean, 0, values.length); 443 } 444 445 /** 446 * Returns the weighted variance of the entries in the specified portion of 447 * the input array, using the precomputed weighted mean value. Returns 448 * <code>Double.NaN</code> if the designated subarray is empty. 449 * <p> 450 * Uses the formula <pre> 451 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1) 452 * </pre></p> 453 * <p> 454 * The formula used assumes that the supplied mean value is the weighted arithmetic 455 * mean of the sample data, not a known population parameter. This method 456 * is supplied only to save computation when the mean has already been 457 * computed.</p> 458 * <p> 459 * This formula will not return the same result as the unweighted variance when all 460 * weights are equal, unless all weights are equal to 1. The formula assumes that 461 * weights are to be treated as "expansion values," as will be the case if for example 462 * the weights represent frequency counts. To normalize weights so that the denominator 463 * in the variance computation equals the length of the input vector minus one, use <pre> 464 * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code> 465 * </pre> 466 * <p> 467 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 468 * <p> 469 * Throws <code>IllegalArgumentException</code> if any of the following are true: 470 * <ul><li>the values array is null</li> 471 * <li>the weights array is null</li> 472 * <li>the weights array does not have the same length as the values array</li> 473 * <li>the weights array contains one or more infinite values</li> 474 * <li>the weights array contains one or more NaN values</li> 475 * <li>the weights array contains negative values</li> 476 * <li>the start and length arguments do not determine a valid array</li> 477 * </ul></p> 478 * <p> 479 * Does not change the internal state of the statistic.</p> 480 * 481 * @param values the input array 482 * @param weights the weights array 483 * @param mean the precomputed weighted mean value 484 * @param begin index of the first array element to include 485 * @param length the number of elements to include 486 * @return the variance of the values or Double.NaN if length = 0 487 * @throws IllegalArgumentException if the parameters are not valid 488 * @since 2.1 489 */ 490 public double evaluate(final double[] values, final double[] weights, 491 final double mean, final int begin, final int length) { 492 493 double var = Double.NaN; 494 495 if (test(values, weights, begin, length)) { 496 if (length == 1) { 497 var = 0.0; 498 } else if (length > 1) { 499 double accum = 0.0; 500 double dev = 0.0; 501 double accum2 = 0.0; 502 for (int i = begin; i < begin + length; i++) { 503 dev = values[i] - mean; 504 accum += weights[i] * (dev * dev); 505 accum2 += weights[i] * dev; 506 } 507 508 double sumWts = 0; 509 for (int i = 0; i < weights.length; i++) { 510 sumWts += weights[i]; 511 } 512 513 if (isBiasCorrected) { 514 var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0); 515 } else { 516 var = (accum - (accum2 * accum2 / sumWts)) / sumWts; 517 } 518 } 519 } 520 return var; 521 } 522 523 /** 524 * <p>Returns the weighted variance of the values in the input array, using 525 * the precomputed weighted mean value.</p> 526 * <p> 527 * Uses the formula <pre> 528 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1) 529 * </pre></p> 530 * <p> 531 * The formula used assumes that the supplied mean value is the weighted arithmetic 532 * mean of the sample data, not a known population parameter. This method 533 * is supplied only to save computation when the mean has already been 534 * computed.</p> 535 * <p> 536 * This formula will not return the same result as the unweighted variance when all 537 * weights are equal, unless all weights are equal to 1. The formula assumes that 538 * weights are to be treated as "expansion values," as will be the case if for example 539 * the weights represent frequency counts. To normalize weights so that the denominator 540 * in the variance computation equals the length of the input vector minus one, use <pre> 541 * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code> 542 * </pre> 543 * <p> 544 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 545 * <p> 546 * Throws <code>IllegalArgumentException</code> if any of the following are true: 547 * <ul><li>the values array is null</li> 548 * <li>the weights array is null</li> 549 * <li>the weights array does not have the same length as the values array</li> 550 * <li>the weights array contains one or more infinite values</li> 551 * <li>the weights array contains one or more NaN values</li> 552 * <li>the weights array contains negative values</li> 553 * </ul></p> 554 * <p> 555 * Does not change the internal state of the statistic.</p> 556 * 557 * @param values the input array 558 * @param weights the weights array 559 * @param mean the precomputed weighted mean value 560 * @return the variance of the values or Double.NaN if length = 0 561 * @throws IllegalArgumentException if the parameters are not valid 562 * @since 2.1 563 */ 564 public double evaluate(final double[] values, final double[] weights, final double mean) { 565 return evaluate(values, weights, mean, 0, values.length); 566 } 567 568 /** 569 * @return Returns the isBiasCorrected. 570 */ 571 public boolean isBiasCorrected() { 572 return isBiasCorrected; 573 } 574 575 /** 576 * @param biasCorrected The isBiasCorrected to set. 577 */ 578 public void setBiasCorrected(boolean biasCorrected) { 579 this.isBiasCorrected = biasCorrected; 580 } 581 582 /** 583 * {@inheritDoc} 584 */ 585 @Override 586 public Variance copy() { 587 Variance result = new Variance(); 588 copy(this, result); 589 return result; 590 } 591 592 /** 593 * Copies source to dest. 594 * <p>Neither source nor dest can be null.</p> 595 * 596 * @param source Variance to copy 597 * @param dest Variance to copy to 598 * @throws NullPointerException if either source or dest is null 599 */ 600 public static void copy(Variance source, Variance dest) { 601 if (source == null || 602 dest == null) { 603 throw new NullArgumentException(); 604 } 605 dest.setData(source.getDataRef()); 606 dest.moment = source.moment.copy(); 607 dest.isBiasCorrected = source.isBiasCorrected; 608 dest.incMoment = source.incMoment; 609 } 610 } 611