Home | History | Annotate | Download | only in rank
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one or more
      3  * contributor license agreements.  See the NOTICE file distributed with
      4  * this work for additional information regarding copyright ownership.
      5  * The ASF licenses this file to You under the Apache License, Version 2.0
      6  * (the "License"); you may not use this file except in compliance with
      7  * the License.  You may obtain a copy of the License at
      8  *
      9  *      http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  */
     17 package org.apache.commons.math.stat.descriptive.rank;
     18 
     19 import java.io.Serializable;
     20 import java.util.Arrays;
     21 
     22 import org.apache.commons.math.MathRuntimeException;
     23 import org.apache.commons.math.exception.util.LocalizedFormats;
     24 import org.apache.commons.math.stat.descriptive.AbstractUnivariateStatistic;
     25 import org.apache.commons.math.util.FastMath;
     26 
     27 /**
     28  * Provides percentile computation.
     29  * <p>
     30  * There are several commonly used methods for estimating percentiles (a.k.a.
     31  * quantiles) based on sample data.  For large samples, the different methods
     32  * agree closely, but when sample sizes are small, different methods will give
     33  * significantly different results.  The algorithm implemented here works as follows:
     34  * <ol>
     35  * <li>Let <code>n</code> be the length of the (sorted) array and
     36  * <code>0 < p <= 100</code> be the desired percentile.</li>
     37  * <li>If <code> n = 1 </code> return the unique array element (regardless of
     38  * the value of <code>p</code>); otherwise </li>
     39  * <li>Compute the estimated percentile position
     40  * <code> pos = p * (n + 1) / 100</code> and the difference, <code>d</code>
     41  * between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional
     42  * part of <code>pos</code>).  If <code>pos >= n</code> return the largest
     43  * element in the array; otherwise</li>
     44  * <li>Let <code>lower</code> be the element in position
     45  * <code>floor(pos)</code> in the array and let <code>upper</code> be the
     46  * next element in the array.  Return <code>lower + d * (upper - lower)</code>
     47  * </li>
     48  * </ol></p>
     49  * <p>
     50  * To compute percentiles, the data must be at least partially ordered.  Input
     51  * arrays are copied and recursively partitioned using an ordering definition.
     52  * The ordering used by <code>Arrays.sort(double[])</code> is the one determined
     53  * by {@link java.lang.Double#compareTo(Double)}.  This ordering makes
     54  * <code>Double.NaN</code> larger than any other value (including
     55  * <code>Double.POSITIVE_INFINITY</code>).  Therefore, for example, the median
     56  * (50th percentile) of
     57  * <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5.</code></p>
     58  * <p>
     59  * Since percentile estimation usually involves interpolation between array
     60  * elements, arrays containing  <code>NaN</code> or infinite values will often
     61  * result in <code>NaN<code> or infinite values returned.</p>
     62  * <p>
     63  * Since 2.2, Percentile implementation uses only selection instead of complete
     64  * sorting and caches selection algorithm state between calls to the various
     65  * {@code evaluate} methods when several percentiles are to be computed on the same data.
     66  * This greatly improves efficiency, both for single percentile and multiple
     67  * percentiles computations. However, it also induces a need to be sure the data
     68  * at one call to {@code evaluate} is the same as the data with the cached algorithm
     69  * state from the previous calls. Percentile does this by checking the array reference
     70  * itself and a checksum of its content by default. If the user already knows he calls
     71  * {@code evaluate} on an immutable array, he can save the checking time by calling the
     72  * {@code evaluate} methods that do <em>not</em>
     73  * </p>
     74  * <p>
     75  * <strong>Note that this implementation is not synchronized.</strong> If
     76  * multiple threads access an instance of this class concurrently, and at least
     77  * one of the threads invokes the <code>increment()</code> or
     78  * <code>clear()</code> method, it must be synchronized externally.</p>
     79  *
     80  * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $
     81  */
     82 public class Percentile extends AbstractUnivariateStatistic implements Serializable {
     83 
     84     /** Serializable version identifier */
     85     private static final long serialVersionUID = -8091216485095130416L;
     86 
     87     /** Minimum size under which we use a simple insertion sort rather than Hoare's select. */
     88     private static final int MIN_SELECT_SIZE = 15;
     89 
     90     /** Maximum number of partitioning pivots cached (each level double the number of pivots). */
     91     private static final int MAX_CACHED_LEVELS = 10;
     92 
     93     /** Determines what percentile is computed when evaluate() is activated
     94      * with no quantile argument */
     95     private double quantile = 0.0;
     96 
     97     /** Cached pivots. */
     98     private int[] cachedPivots;
     99 
    100     /**
    101      * Constructs a Percentile with a default quantile
    102      * value of 50.0.
    103      */
    104     public Percentile() {
    105         this(50.0);
    106     }
    107 
    108     /**
    109      * Constructs a Percentile with the specific quantile value.
    110      * @param p the quantile
    111      * @throws IllegalArgumentException  if p is not greater than 0 and less
    112      * than or equal to 100
    113      */
    114     public Percentile(final double p) {
    115         setQuantile(p);
    116         cachedPivots = null;
    117     }
    118 
    119     /**
    120      * Copy constructor, creates a new {@code Percentile} identical
    121      * to the {@code original}
    122      *
    123      * @param original the {@code Percentile} instance to copy
    124      */
    125     public Percentile(Percentile original) {
    126         copy(original, this);
    127     }
    128 
    129     /** {@inheritDoc} */
    130     @Override
    131     public void setData(final double[] values) {
    132         if (values == null) {
    133             cachedPivots = null;
    134         } else {
    135             cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
    136             Arrays.fill(cachedPivots, -1);
    137         }
    138         super.setData(values);
    139     }
    140 
    141     /** {@inheritDoc} */
    142     @Override
    143     public void setData(final double[] values, final int begin, final int length) {
    144         if (values == null) {
    145             cachedPivots = null;
    146         } else {
    147             cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
    148             Arrays.fill(cachedPivots, -1);
    149         }
    150         super.setData(values, begin, length);
    151     }
    152 
    153     /**
    154      * Returns the result of evaluating the statistic over the stored data.
    155      * <p>
    156      * The stored array is the one which was set by previous calls to
    157      * </p>
    158      * @param p the percentile value to compute
    159      * @return the value of the statistic applied to the stored data
    160      */
    161     public double evaluate(final double p) {
    162         return evaluate(getDataRef(), p);
    163     }
    164 
    165     /**
    166      * Returns an estimate of the <code>p</code>th percentile of the values
    167      * in the <code>values</code> array.
    168      * <p>
    169      * Calls to this method do not modify the internal <code>quantile</code>
    170      * state of this statistic.</p>
    171      * <p>
    172      * <ul>
    173      * <li>Returns <code>Double.NaN</code> if <code>values</code> has length
    174      * <code>0</code></li>
    175      * <li>Returns (for any value of <code>p</code>) <code>values[0]</code>
    176      *  if <code>values</code> has length <code>1</code></li>
    177      * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
    178      * is null or p is not a valid quantile value (p must be greater than 0
    179      * and less than or equal to 100) </li>
    180      * </ul></p>
    181      * <p>
    182      * See {@link Percentile} for a description of the percentile estimation
    183      * algorithm used.</p>
    184      *
    185      * @param values input array of values
    186      * @param p the percentile value to compute
    187      * @return the percentile value or Double.NaN if the array is empty
    188      * @throws IllegalArgumentException if <code>values</code> is null
    189      *     or p is invalid
    190      */
    191     public double evaluate(final double[] values, final double p) {
    192         test(values, 0, 0);
    193         return evaluate(values, 0, values.length, p);
    194     }
    195 
    196     /**
    197      * Returns an estimate of the <code>quantile</code>th percentile of the
    198      * designated values in the <code>values</code> array.  The quantile
    199      * estimated is determined by the <code>quantile</code> property.
    200      * <p>
    201      * <ul>
    202      * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
    203      * <li>Returns (for any value of <code>quantile</code>)
    204      * <code>values[begin]</code> if <code>length = 1 </code></li>
    205      * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
    206      * is null,  or <code>start</code> or <code>length</code>
    207      * is invalid</li>
    208      * </ul></p>
    209      * <p>
    210      * See {@link Percentile} for a description of the percentile estimation
    211      * algorithm used.</p>
    212      *
    213      * @param values the input array
    214      * @param start index of the first array element to include
    215      * @param length the number of elements to include
    216      * @return the percentile value
    217      * @throws IllegalArgumentException if the parameters are not valid
    218      *
    219      */
    220     @Override
    221     public double evaluate( final double[] values, final int start, final int length) {
    222         return evaluate(values, start, length, quantile);
    223     }
    224 
    225      /**
    226      * Returns an estimate of the <code>p</code>th percentile of the values
    227      * in the <code>values</code> array, starting with the element in (0-based)
    228      * position <code>begin</code> in the array and including <code>length</code>
    229      * values.
    230      * <p>
    231      * Calls to this method do not modify the internal <code>quantile</code>
    232      * state of this statistic.</p>
    233      * <p>
    234      * <ul>
    235      * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
    236      * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code>
    237      *  if <code>length = 1 </code></li>
    238      * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
    239      *  is null , <code>begin</code> or <code>length</code> is invalid, or
    240      * <code>p</code> is not a valid quantile value (p must be greater than 0
    241      * and less than or equal to 100)</li>
    242      * </ul></p>
    243      * <p>
    244      * See {@link Percentile} for a description of the percentile estimation
    245      * algorithm used.</p>
    246      *
    247      * @param values array of input values
    248      * @param p  the percentile to compute
    249      * @param begin  the first (0-based) element to include in the computation
    250      * @param length  the number of array elements to include
    251      * @return  the percentile value
    252      * @throws IllegalArgumentException if the parameters are not valid or the
    253      * input array is null
    254      */
    255     public double evaluate(final double[] values, final int begin,
    256             final int length, final double p) {
    257 
    258         test(values, begin, length);
    259 
    260         if ((p > 100) || (p <= 0)) {
    261             throw MathRuntimeException.createIllegalArgumentException(
    262                   LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p);
    263         }
    264         if (length == 0) {
    265             return Double.NaN;
    266         }
    267         if (length == 1) {
    268             return values[begin]; // always return single value for n = 1
    269         }
    270         double n = length;
    271         double pos = p * (n + 1) / 100;
    272         double fpos = FastMath.floor(pos);
    273         int intPos = (int) fpos;
    274         double dif = pos - fpos;
    275         double[] work;
    276         int[] pivotsHeap;
    277         if (values == getDataRef()) {
    278             work = getDataRef();
    279             pivotsHeap = cachedPivots;
    280         } else {
    281             work = new double[length];
    282             System.arraycopy(values, begin, work, 0, length);
    283             pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
    284             Arrays.fill(pivotsHeap, -1);
    285         }
    286 
    287         if (pos < 1) {
    288             return select(work, pivotsHeap, 0);
    289         }
    290         if (pos >= n) {
    291             return select(work, pivotsHeap, length - 1);
    292         }
    293         double lower = select(work, pivotsHeap, intPos - 1);
    294         double upper = select(work, pivotsHeap, intPos);
    295         return lower + dif * (upper - lower);
    296     }
    297 
    298     /**
    299      * Select the k<sup>th</sup> smallest element from work array
    300      * @param work work array (will be reorganized during the call)
    301      * @param pivotsHeap set of pivot index corresponding to elements that
    302      * are already at their sorted location, stored as an implicit heap
    303      * (i.e. a sorted binary tree stored in a flat array, where the
    304      * children of a node at index n are at indices 2n+1 for the left
    305      * child and 2n+2 for the right child, with 0-based indices)
    306      * @param k index of the desired element
    307      * @return k<sup>th</sup> smallest element
    308      */
    309     private double select(final double[] work, final int[] pivotsHeap, final int k) {
    310 
    311         int begin = 0;
    312         int end   = work.length;
    313         int node  = 0;
    314 
    315         while (end - begin > MIN_SELECT_SIZE) {
    316 
    317             final int pivot;
    318             if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) {
    319                 // the pivot has already been found in a previous call
    320                 // and the array has already been partitioned around it
    321                 pivot = pivotsHeap[node];
    322             } else {
    323                 // select a pivot and partition work array around it
    324                 pivot = partition(work, begin, end, medianOf3(work, begin, end));
    325                 if (node < pivotsHeap.length) {
    326                     pivotsHeap[node] =  pivot;
    327                 }
    328             }
    329 
    330             if (k == pivot) {
    331                 // the pivot was exactly the element we wanted
    332                 return work[k];
    333             } else if (k < pivot) {
    334                 // the element is in the left partition
    335                 end  = pivot;
    336                 node = Math.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow
    337             } else {
    338                 // the element is in the right partition
    339                 begin = pivot + 1;
    340                 node  = Math.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow
    341             }
    342 
    343         }
    344 
    345         // the element is somewhere in the small sub-array
    346         // sort the sub-array using insertion sort
    347         insertionSort(work, begin, end);
    348         return work[k];
    349 
    350     }
    351 
    352     /** Select a pivot index as the median of three
    353      * @param work data array
    354      * @param begin index of the first element of the slice
    355      * @param end index after the last element of the slice
    356      * @return the index of the median element chosen between the
    357      * first, the middle and the last element of the array slice
    358      */
    359     int medianOf3(final double[] work, final int begin, final int end) {
    360 
    361         final int inclusiveEnd = end - 1;
    362         final int    middle    = begin + (inclusiveEnd - begin) / 2;
    363         final double wBegin    = work[begin];
    364         final double wMiddle   = work[middle];
    365         final double wEnd      = work[inclusiveEnd];
    366 
    367         if (wBegin < wMiddle) {
    368             if (wMiddle < wEnd) {
    369                 return middle;
    370             } else {
    371                 return (wBegin < wEnd) ? inclusiveEnd : begin;
    372             }
    373         } else {
    374             if (wBegin < wEnd) {
    375                 return begin;
    376             } else {
    377                 return (wMiddle < wEnd) ? inclusiveEnd : middle;
    378             }
    379         }
    380 
    381     }
    382 
    383     /**
    384      * Partition an array slice around a pivot
    385      * <p>
    386      * Partitioning exchanges array elements such that all elements
    387      * smaller than pivot are before it and all elements larger than
    388      * pivot are after it
    389      * </p>
    390      * @param work data array
    391      * @param begin index of the first element of the slice
    392      * @param end index after the last element of the slice
    393      * @param pivot initial index of the pivot
    394      * @return index of the pivot after partition
    395      */
    396     private int partition(final double[] work, final int begin, final int end, final int pivot) {
    397 
    398         final double value = work[pivot];
    399         work[pivot] = work[begin];
    400 
    401         int i = begin + 1;
    402         int j = end - 1;
    403         while (i < j) {
    404             while ((i < j) && (work[j] >= value)) {
    405                 --j;
    406             }
    407             while ((i < j) && (work[i] <= value)) {
    408                 ++i;
    409             }
    410 
    411             if (i < j) {
    412                 final double tmp = work[i];
    413                 work[i++] = work[j];
    414                 work[j--] = tmp;
    415             }
    416         }
    417 
    418         if ((i >= end) || (work[i] > value)) {
    419             --i;
    420         }
    421         work[begin] = work[i];
    422         work[i]     = value;
    423         return i;
    424 
    425     }
    426 
    427     /**
    428      * Sort in place a (small) array slice using insertion sort
    429      * @param work array to sort
    430      * @param begin index of the first element of the slice to sort
    431      * @param end index after the last element of the slice to sort
    432      */
    433     private void insertionSort(final double[] work, final int begin, final int end) {
    434         for (int j = begin + 1; j < end; j++) {
    435             final double saved = work[j];
    436             int i = j - 1;
    437             while ((i >= begin) && (saved < work[i])) {
    438                 work[i + 1] = work[i];
    439                 i--;
    440             }
    441             work[i + 1] = saved;
    442         }
    443     }
    444 
    445     /**
    446      * Returns the value of the quantile field (determines what percentile is
    447      * computed when evaluate() is called with no quantile argument).
    448      *
    449      * @return quantile
    450      */
    451     public double getQuantile() {
    452         return quantile;
    453     }
    454 
    455     /**
    456      * Sets the value of the quantile field (determines what percentile is
    457      * computed when evaluate() is called with no quantile argument).
    458      *
    459      * @param p a value between 0 < p <= 100
    460      * @throws IllegalArgumentException  if p is not greater than 0 and less
    461      * than or equal to 100
    462      */
    463     public void setQuantile(final double p) {
    464         if (p <= 0 || p > 100) {
    465             throw MathRuntimeException.createIllegalArgumentException(
    466                   LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p);
    467         }
    468         quantile = p;
    469     }
    470 
    471     /**
    472      * {@inheritDoc}
    473      */
    474     @Override
    475     public Percentile copy() {
    476         Percentile result = new Percentile();
    477         copy(this, result);
    478         return result;
    479     }
    480 
    481     /**
    482      * Copies source to dest.
    483      * <p>Neither source nor dest can be null.</p>
    484      *
    485      * @param source Percentile to copy
    486      * @param dest Percentile to copy to
    487      * @throws NullPointerException if either source or dest is null
    488      */
    489     public static void copy(Percentile source, Percentile dest) {
    490         dest.setData(source.getDataRef());
    491         if (source.cachedPivots != null) {
    492             System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length);
    493         }
    494         dest.quantile = source.quantile;
    495     }
    496 
    497 }
    498