1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.math.stat.descriptive.rank; 18 19 import java.io.Serializable; 20 import java.util.Arrays; 21 22 import org.apache.commons.math.MathRuntimeException; 23 import org.apache.commons.math.exception.util.LocalizedFormats; 24 import org.apache.commons.math.stat.descriptive.AbstractUnivariateStatistic; 25 import org.apache.commons.math.util.FastMath; 26 27 /** 28 * Provides percentile computation. 29 * <p> 30 * There are several commonly used methods for estimating percentiles (a.k.a. 31 * quantiles) based on sample data. For large samples, the different methods 32 * agree closely, but when sample sizes are small, different methods will give 33 * significantly different results. The algorithm implemented here works as follows: 34 * <ol> 35 * <li>Let <code>n</code> be the length of the (sorted) array and 36 * <code>0 < p <= 100</code> be the desired percentile.</li> 37 * <li>If <code> n = 1 </code> return the unique array element (regardless of 38 * the value of <code>p</code>); otherwise </li> 39 * <li>Compute the estimated percentile position 40 * <code> pos = p * (n + 1) / 100</code> and the difference, <code>d</code> 41 * between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional 42 * part of <code>pos</code>). If <code>pos >= n</code> return the largest 43 * element in the array; otherwise</li> 44 * <li>Let <code>lower</code> be the element in position 45 * <code>floor(pos)</code> in the array and let <code>upper</code> be the 46 * next element in the array. Return <code>lower + d * (upper - lower)</code> 47 * </li> 48 * </ol></p> 49 * <p> 50 * To compute percentiles, the data must be at least partially ordered. Input 51 * arrays are copied and recursively partitioned using an ordering definition. 52 * The ordering used by <code>Arrays.sort(double[])</code> is the one determined 53 * by {@link java.lang.Double#compareTo(Double)}. This ordering makes 54 * <code>Double.NaN</code> larger than any other value (including 55 * <code>Double.POSITIVE_INFINITY</code>). Therefore, for example, the median 56 * (50th percentile) of 57 * <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5.</code></p> 58 * <p> 59 * Since percentile estimation usually involves interpolation between array 60 * elements, arrays containing <code>NaN</code> or infinite values will often 61 * result in <code>NaN<code> or infinite values returned.</p> 62 * <p> 63 * Since 2.2, Percentile implementation uses only selection instead of complete 64 * sorting and caches selection algorithm state between calls to the various 65 * {@code evaluate} methods when several percentiles are to be computed on the same data. 66 * This greatly improves efficiency, both for single percentile and multiple 67 * percentiles computations. However, it also induces a need to be sure the data 68 * at one call to {@code evaluate} is the same as the data with the cached algorithm 69 * state from the previous calls. Percentile does this by checking the array reference 70 * itself and a checksum of its content by default. If the user already knows he calls 71 * {@code evaluate} on an immutable array, he can save the checking time by calling the 72 * {@code evaluate} methods that do <em>not</em> 73 * </p> 74 * <p> 75 * <strong>Note that this implementation is not synchronized.</strong> If 76 * multiple threads access an instance of this class concurrently, and at least 77 * one of the threads invokes the <code>increment()</code> or 78 * <code>clear()</code> method, it must be synchronized externally.</p> 79 * 80 * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $ 81 */ 82 public class Percentile extends AbstractUnivariateStatistic implements Serializable { 83 84 /** Serializable version identifier */ 85 private static final long serialVersionUID = -8091216485095130416L; 86 87 /** Minimum size under which we use a simple insertion sort rather than Hoare's select. */ 88 private static final int MIN_SELECT_SIZE = 15; 89 90 /** Maximum number of partitioning pivots cached (each level double the number of pivots). */ 91 private static final int MAX_CACHED_LEVELS = 10; 92 93 /** Determines what percentile is computed when evaluate() is activated 94 * with no quantile argument */ 95 private double quantile = 0.0; 96 97 /** Cached pivots. */ 98 private int[] cachedPivots; 99 100 /** 101 * Constructs a Percentile with a default quantile 102 * value of 50.0. 103 */ 104 public Percentile() { 105 this(50.0); 106 } 107 108 /** 109 * Constructs a Percentile with the specific quantile value. 110 * @param p the quantile 111 * @throws IllegalArgumentException if p is not greater than 0 and less 112 * than or equal to 100 113 */ 114 public Percentile(final double p) { 115 setQuantile(p); 116 cachedPivots = null; 117 } 118 119 /** 120 * Copy constructor, creates a new {@code Percentile} identical 121 * to the {@code original} 122 * 123 * @param original the {@code Percentile} instance to copy 124 */ 125 public Percentile(Percentile original) { 126 copy(original, this); 127 } 128 129 /** {@inheritDoc} */ 130 @Override 131 public void setData(final double[] values) { 132 if (values == null) { 133 cachedPivots = null; 134 } else { 135 cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1]; 136 Arrays.fill(cachedPivots, -1); 137 } 138 super.setData(values); 139 } 140 141 /** {@inheritDoc} */ 142 @Override 143 public void setData(final double[] values, final int begin, final int length) { 144 if (values == null) { 145 cachedPivots = null; 146 } else { 147 cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1]; 148 Arrays.fill(cachedPivots, -1); 149 } 150 super.setData(values, begin, length); 151 } 152 153 /** 154 * Returns the result of evaluating the statistic over the stored data. 155 * <p> 156 * The stored array is the one which was set by previous calls to 157 * </p> 158 * @param p the percentile value to compute 159 * @return the value of the statistic applied to the stored data 160 */ 161 public double evaluate(final double p) { 162 return evaluate(getDataRef(), p); 163 } 164 165 /** 166 * Returns an estimate of the <code>p</code>th percentile of the values 167 * in the <code>values</code> array. 168 * <p> 169 * Calls to this method do not modify the internal <code>quantile</code> 170 * state of this statistic.</p> 171 * <p> 172 * <ul> 173 * <li>Returns <code>Double.NaN</code> if <code>values</code> has length 174 * <code>0</code></li> 175 * <li>Returns (for any value of <code>p</code>) <code>values[0]</code> 176 * if <code>values</code> has length <code>1</code></li> 177 * <li>Throws <code>IllegalArgumentException</code> if <code>values</code> 178 * is null or p is not a valid quantile value (p must be greater than 0 179 * and less than or equal to 100) </li> 180 * </ul></p> 181 * <p> 182 * See {@link Percentile} for a description of the percentile estimation 183 * algorithm used.</p> 184 * 185 * @param values input array of values 186 * @param p the percentile value to compute 187 * @return the percentile value or Double.NaN if the array is empty 188 * @throws IllegalArgumentException if <code>values</code> is null 189 * or p is invalid 190 */ 191 public double evaluate(final double[] values, final double p) { 192 test(values, 0, 0); 193 return evaluate(values, 0, values.length, p); 194 } 195 196 /** 197 * Returns an estimate of the <code>quantile</code>th percentile of the 198 * designated values in the <code>values</code> array. The quantile 199 * estimated is determined by the <code>quantile</code> property. 200 * <p> 201 * <ul> 202 * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li> 203 * <li>Returns (for any value of <code>quantile</code>) 204 * <code>values[begin]</code> if <code>length = 1 </code></li> 205 * <li>Throws <code>IllegalArgumentException</code> if <code>values</code> 206 * is null, or <code>start</code> or <code>length</code> 207 * is invalid</li> 208 * </ul></p> 209 * <p> 210 * See {@link Percentile} for a description of the percentile estimation 211 * algorithm used.</p> 212 * 213 * @param values the input array 214 * @param start index of the first array element to include 215 * @param length the number of elements to include 216 * @return the percentile value 217 * @throws IllegalArgumentException if the parameters are not valid 218 * 219 */ 220 @Override 221 public double evaluate( final double[] values, final int start, final int length) { 222 return evaluate(values, start, length, quantile); 223 } 224 225 /** 226 * Returns an estimate of the <code>p</code>th percentile of the values 227 * in the <code>values</code> array, starting with the element in (0-based) 228 * position <code>begin</code> in the array and including <code>length</code> 229 * values. 230 * <p> 231 * Calls to this method do not modify the internal <code>quantile</code> 232 * state of this statistic.</p> 233 * <p> 234 * <ul> 235 * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li> 236 * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code> 237 * if <code>length = 1 </code></li> 238 * <li>Throws <code>IllegalArgumentException</code> if <code>values</code> 239 * is null , <code>begin</code> or <code>length</code> is invalid, or 240 * <code>p</code> is not a valid quantile value (p must be greater than 0 241 * and less than or equal to 100)</li> 242 * </ul></p> 243 * <p> 244 * See {@link Percentile} for a description of the percentile estimation 245 * algorithm used.</p> 246 * 247 * @param values array of input values 248 * @param p the percentile to compute 249 * @param begin the first (0-based) element to include in the computation 250 * @param length the number of array elements to include 251 * @return the percentile value 252 * @throws IllegalArgumentException if the parameters are not valid or the 253 * input array is null 254 */ 255 public double evaluate(final double[] values, final int begin, 256 final int length, final double p) { 257 258 test(values, begin, length); 259 260 if ((p > 100) || (p <= 0)) { 261 throw MathRuntimeException.createIllegalArgumentException( 262 LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p); 263 } 264 if (length == 0) { 265 return Double.NaN; 266 } 267 if (length == 1) { 268 return values[begin]; // always return single value for n = 1 269 } 270 double n = length; 271 double pos = p * (n + 1) / 100; 272 double fpos = FastMath.floor(pos); 273 int intPos = (int) fpos; 274 double dif = pos - fpos; 275 double[] work; 276 int[] pivotsHeap; 277 if (values == getDataRef()) { 278 work = getDataRef(); 279 pivotsHeap = cachedPivots; 280 } else { 281 work = new double[length]; 282 System.arraycopy(values, begin, work, 0, length); 283 pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1]; 284 Arrays.fill(pivotsHeap, -1); 285 } 286 287 if (pos < 1) { 288 return select(work, pivotsHeap, 0); 289 } 290 if (pos >= n) { 291 return select(work, pivotsHeap, length - 1); 292 } 293 double lower = select(work, pivotsHeap, intPos - 1); 294 double upper = select(work, pivotsHeap, intPos); 295 return lower + dif * (upper - lower); 296 } 297 298 /** 299 * Select the k<sup>th</sup> smallest element from work array 300 * @param work work array (will be reorganized during the call) 301 * @param pivotsHeap set of pivot index corresponding to elements that 302 * are already at their sorted location, stored as an implicit heap 303 * (i.e. a sorted binary tree stored in a flat array, where the 304 * children of a node at index n are at indices 2n+1 for the left 305 * child and 2n+2 for the right child, with 0-based indices) 306 * @param k index of the desired element 307 * @return k<sup>th</sup> smallest element 308 */ 309 private double select(final double[] work, final int[] pivotsHeap, final int k) { 310 311 int begin = 0; 312 int end = work.length; 313 int node = 0; 314 315 while (end - begin > MIN_SELECT_SIZE) { 316 317 final int pivot; 318 if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) { 319 // the pivot has already been found in a previous call 320 // and the array has already been partitioned around it 321 pivot = pivotsHeap[node]; 322 } else { 323 // select a pivot and partition work array around it 324 pivot = partition(work, begin, end, medianOf3(work, begin, end)); 325 if (node < pivotsHeap.length) { 326 pivotsHeap[node] = pivot; 327 } 328 } 329 330 if (k == pivot) { 331 // the pivot was exactly the element we wanted 332 return work[k]; 333 } else if (k < pivot) { 334 // the element is in the left partition 335 end = pivot; 336 node = Math.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow 337 } else { 338 // the element is in the right partition 339 begin = pivot + 1; 340 node = Math.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow 341 } 342 343 } 344 345 // the element is somewhere in the small sub-array 346 // sort the sub-array using insertion sort 347 insertionSort(work, begin, end); 348 return work[k]; 349 350 } 351 352 /** Select a pivot index as the median of three 353 * @param work data array 354 * @param begin index of the first element of the slice 355 * @param end index after the last element of the slice 356 * @return the index of the median element chosen between the 357 * first, the middle and the last element of the array slice 358 */ 359 int medianOf3(final double[] work, final int begin, final int end) { 360 361 final int inclusiveEnd = end - 1; 362 final int middle = begin + (inclusiveEnd - begin) / 2; 363 final double wBegin = work[begin]; 364 final double wMiddle = work[middle]; 365 final double wEnd = work[inclusiveEnd]; 366 367 if (wBegin < wMiddle) { 368 if (wMiddle < wEnd) { 369 return middle; 370 } else { 371 return (wBegin < wEnd) ? inclusiveEnd : begin; 372 } 373 } else { 374 if (wBegin < wEnd) { 375 return begin; 376 } else { 377 return (wMiddle < wEnd) ? inclusiveEnd : middle; 378 } 379 } 380 381 } 382 383 /** 384 * Partition an array slice around a pivot 385 * <p> 386 * Partitioning exchanges array elements such that all elements 387 * smaller than pivot are before it and all elements larger than 388 * pivot are after it 389 * </p> 390 * @param work data array 391 * @param begin index of the first element of the slice 392 * @param end index after the last element of the slice 393 * @param pivot initial index of the pivot 394 * @return index of the pivot after partition 395 */ 396 private int partition(final double[] work, final int begin, final int end, final int pivot) { 397 398 final double value = work[pivot]; 399 work[pivot] = work[begin]; 400 401 int i = begin + 1; 402 int j = end - 1; 403 while (i < j) { 404 while ((i < j) && (work[j] >= value)) { 405 --j; 406 } 407 while ((i < j) && (work[i] <= value)) { 408 ++i; 409 } 410 411 if (i < j) { 412 final double tmp = work[i]; 413 work[i++] = work[j]; 414 work[j--] = tmp; 415 } 416 } 417 418 if ((i >= end) || (work[i] > value)) { 419 --i; 420 } 421 work[begin] = work[i]; 422 work[i] = value; 423 return i; 424 425 } 426 427 /** 428 * Sort in place a (small) array slice using insertion sort 429 * @param work array to sort 430 * @param begin index of the first element of the slice to sort 431 * @param end index after the last element of the slice to sort 432 */ 433 private void insertionSort(final double[] work, final int begin, final int end) { 434 for (int j = begin + 1; j < end; j++) { 435 final double saved = work[j]; 436 int i = j - 1; 437 while ((i >= begin) && (saved < work[i])) { 438 work[i + 1] = work[i]; 439 i--; 440 } 441 work[i + 1] = saved; 442 } 443 } 444 445 /** 446 * Returns the value of the quantile field (determines what percentile is 447 * computed when evaluate() is called with no quantile argument). 448 * 449 * @return quantile 450 */ 451 public double getQuantile() { 452 return quantile; 453 } 454 455 /** 456 * Sets the value of the quantile field (determines what percentile is 457 * computed when evaluate() is called with no quantile argument). 458 * 459 * @param p a value between 0 < p <= 100 460 * @throws IllegalArgumentException if p is not greater than 0 and less 461 * than or equal to 100 462 */ 463 public void setQuantile(final double p) { 464 if (p <= 0 || p > 100) { 465 throw MathRuntimeException.createIllegalArgumentException( 466 LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p); 467 } 468 quantile = p; 469 } 470 471 /** 472 * {@inheritDoc} 473 */ 474 @Override 475 public Percentile copy() { 476 Percentile result = new Percentile(); 477 copy(this, result); 478 return result; 479 } 480 481 /** 482 * Copies source to dest. 483 * <p>Neither source nor dest can be null.</p> 484 * 485 * @param source Percentile to copy 486 * @param dest Percentile to copy to 487 * @throws NullPointerException if either source or dest is null 488 */ 489 public static void copy(Percentile source, Percentile dest) { 490 dest.setData(source.getDataRef()); 491 if (source.cachedPivots != null) { 492 System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length); 493 } 494 dest.quantile = source.quantile; 495 } 496 497 } 498