Home | History | Annotate | Download | only in external
      1 ##  Module statistics.py
      2 ##
      3 ##  Copyright (c) 2013 Steven D'Aprano <steve+python (at] pearwood.info>.
      4 ##
      5 ##  Licensed under the Apache License, Version 2.0 (the "License");
      6 ##  you may not use this file except in compliance with the License.
      7 ##  You may obtain a copy of the License at
      8 ##
      9 ##  http://www.apache.org/licenses/LICENSE-2.0
     10 ##
     11 ##  Unless required by applicable law or agreed to in writing, software
     12 ##  distributed under the License is distributed on an "AS IS" BASIS,
     13 ##  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 ##  See the License for the specific language governing permissions and
     15 ##  limitations under the License.
     16 
     17 
     18 """
     19 Basic statistics module.
     20 
     21 This module provides functions for calculating statistics of data, including
     22 averages, variance, and standard deviation.
     23 
     24 Calculating averages
     25 --------------------
     26 
     27 ==================  =============================================
     28 Function            Description
     29 ==================  =============================================
     30 mean                Arithmetic mean (average) of data.
     31 median              Median (middle value) of data.
     32 median_low          Low median of data.
     33 median_high         High median of data.
     34 median_grouped      Median, or 50th percentile, of grouped data.
     35 mode                Mode (most common value) of data.
     36 ==================  =============================================
     37 
     38 Calculate the arithmetic mean ("the average") of data:
     39 
     40 >>> mean([-1.0, 2.5, 3.25, 5.75])
     41 2.625
     42 
     43 
     44 Calculate the standard median of discrete data:
     45 
     46 >>> median([2, 3, 4, 5])
     47 3.5
     48 
     49 
     50 Calculate the median, or 50th percentile, of data grouped into class intervals
     51 centred on the data values provided. E.g. if your data points are rounded to
     52 the nearest whole number:
     53 
     54 >>> median_grouped([2, 2, 3, 3, 3, 4])  #doctest: +ELLIPSIS
     55 2.8333333333...
     56 
     57 This should be interpreted in this way: you have two data points in the class
     58 interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
     59 the class interval 3.5-4.5. The median of these data points is 2.8333...
     60 
     61 
     62 Calculating variability or spread
     63 ---------------------------------
     64 
     65 ==================  =============================================
     66 Function            Description
     67 ==================  =============================================
     68 pvariance           Population variance of data.
     69 variance            Sample variance of data.
     70 pstdev              Population standard deviation of data.
     71 stdev               Sample standard deviation of data.
     72 ==================  =============================================
     73 
     74 Calculate the standard deviation of sample data:
     75 
     76 >>> stdev([2.5, 3.25, 5.5, 11.25, 11.75])  #doctest: +ELLIPSIS
     77 4.38961843444...
     78 
     79 If you have previously calculated the mean, you can pass it as the optional
     80 second argument to the four "spread" functions to avoid recalculating it:
     81 
     82 >>> data = [1, 2, 2, 4, 4, 4, 5, 6]
     83 >>> mu = mean(data)
     84 >>> pvariance(data, mu)
     85 2.5
     86 
     87 
     88 Exceptions
     89 ----------
     90 
     91 A single exception is defined: StatisticsError is a subclass of ValueError.
     92 
     93 """
     94 
     95 __all__ = [ 'StatisticsError',
     96             'pstdev', 'pvariance', 'stdev', 'variance',
     97             'median',  'median_low', 'median_high', 'median_grouped',
     98             'mean', 'mode',
     99           ]
    100 
    101 
    102 import collections
    103 import math
    104 
    105 from fractions import Fraction
    106 from decimal import Decimal
    107 from itertools import groupby
    108 
    109 
    110 
    111 # === Exceptions ===
    112 
    113 class StatisticsError(ValueError):
    114     pass
    115 
    116 
    117 # === Private utilities ===
    118 
    119 def _sum(data, start=0):
    120     """_sum(data [, start]) -> (type, sum, count)
    121 
    122     Return a high-precision sum of the given numeric data as a fraction,
    123     together with the type to be converted to and the count of items.
    124 
    125     If optional argument ``start`` is given, it is added to the total.
    126     If ``data`` is empty, ``start`` (defaulting to 0) is returned.
    127 
    128 
    129     Examples
    130     --------
    131 
    132     >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    133     (<class 'float'>, Fraction(11, 1), 5)
    134 
    135     Some sources of round-off error will be avoided:
    136 
    137     >>> _sum([1e50, 1, -1e50] * 1000)  # Built-in sum returns zero.
    138     (<class 'float'>, Fraction(1000, 1), 3000)
    139 
    140     Fractions and Decimals are also supported:
    141 
    142     >>> from fractions import Fraction as F
    143     >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    144     (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
    145 
    146     >>> from decimal import Decimal as D
    147     >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    148     >>> _sum(data)
    149     (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
    150 
    151     Mixed types are currently treated as an error, except that int is
    152     allowed.
    153     """
    154     count = 0
    155     n, d = _exact_ratio(start)
    156     partials = {d: n}
    157     partials_get = partials.get
    158     T = _coerce(int, type(start))
    159     for typ, values in groupby(data, type):
    160         T = _coerce(T, typ)  # or raise TypeError
    161         for n,d in map(_exact_ratio, values):
    162             count += 1
    163             partials[d] = partials_get(d, 0) + n
    164     if None in partials:
    165         # The sum will be a NAN or INF. We can ignore all the finite
    166         # partials, and just look at this special one.
    167         total = partials[None]
    168         assert not _isfinite(total)
    169     else:
    170         # Sum all the partial sums using builtin sum.
    171         # FIXME is this faster if we sum them in order of the denominator?
    172         total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    173     return (T, total, count)
    174 
    175 
    176 def _isfinite(x):
    177     try:
    178         return x.is_finite()  # Likely a Decimal.
    179     except AttributeError:
    180         return math.isfinite(x)  # Coerces to float first.
    181 
    182 
    183 def _coerce(T, S):
    184     """Coerce types T and S to a common type, or raise TypeError.
    185 
    186     Coercion rules are currently an implementation detail. See the CoerceTest
    187     test class in test_statistics for details.
    188     """
    189     # See http://bugs.python.org/issue24068.
    190     assert T is not bool, "initial type T is bool"
    191     # If the types are the same, no need to coerce anything. Put this
    192     # first, so that the usual case (no coercion needed) happens as soon
    193     # as possible.
    194     if T is S:  return T
    195     # Mixed int & other coerce to the other type.
    196     if S is int or S is bool:  return T
    197     if T is int:  return S
    198     # If one is a (strict) subclass of the other, coerce to the subclass.
    199     if issubclass(S, T):  return S
    200     if issubclass(T, S):  return T
    201     # Ints coerce to the other type.
    202     if issubclass(T, int):  return S
    203     if issubclass(S, int):  return T
    204     # Mixed fraction & float coerces to float (or float subclass).
    205     if issubclass(T, Fraction) and issubclass(S, float):
    206         return S
    207     if issubclass(T, float) and issubclass(S, Fraction):
    208         return T
    209     # Any other combination is disallowed.
    210     msg = "don't know how to coerce %s and %s"
    211     raise TypeError(msg % (T.__name__, S.__name__))
    212 
    213 
    214 def _exact_ratio(x):
    215     """Return Real number x to exact (numerator, denominator) pair.
    216 
    217     >>> _exact_ratio(0.25)
    218     (1, 4)
    219 
    220     x is expected to be an int, Fraction, Decimal or float.
    221     """
    222     try:
    223         # Optimise the common case of floats. We expect that the most often
    224         # used numeric type will be builtin floats, so try to make this as
    225         # fast as possible.
    226         if type(x) is float:
    227             return x.as_integer_ratio()
    228         try:
    229             # x may be an int, Fraction, or Integral ABC.
    230             return (x.numerator, x.denominator)
    231         except AttributeError:
    232             try:
    233                 # x may be a float subclass.
    234                 return x.as_integer_ratio()
    235             except AttributeError:
    236                 try:
    237                     # x may be a Decimal.
    238                     return _decimal_to_ratio(x)
    239                 except AttributeError:
    240                     # Just give up?
    241                     pass
    242     except (OverflowError, ValueError):
    243         # float NAN or INF.
    244         assert not math.isfinite(x)
    245         return (x, None)
    246     msg = "can't convert type '{}' to numerator/denominator"
    247     raise TypeError(msg.format(type(x).__name__))
    248 
    249 
    250 # FIXME This is faster than Fraction.from_decimal, but still too slow.
    251 def _decimal_to_ratio(d):
    252     """Convert Decimal d to exact integer ratio (numerator, denominator).
    253 
    254     >>> from decimal import Decimal
    255     >>> _decimal_to_ratio(Decimal("2.6"))
    256     (26, 10)
    257 
    258     """
    259     sign, digits, exp = d.as_tuple()
    260     if exp in ('F', 'n', 'N'):  # INF, NAN, sNAN
    261         assert not d.is_finite()
    262         return (d, None)
    263     num = 0
    264     for digit in digits:
    265         num = num*10 + digit
    266     if exp < 0:
    267         den = 10**-exp
    268     else:
    269         num *= 10**exp
    270         den = 1
    271     if sign:
    272         num = -num
    273     return (num, den)
    274 
    275 
    276 def _convert(value, T):
    277     """Convert value to given numeric type T."""
    278     if type(value) is T:
    279         # This covers the cases where T is Fraction, or where value is
    280         # a NAN or INF (Decimal or float).
    281         return value
    282     if issubclass(T, int) and value.denominator != 1:
    283         T = float
    284     try:
    285         # FIXME: what do we do if this overflows?
    286         return T(value)
    287     except TypeError:
    288         if issubclass(T, Decimal):
    289             return T(value.numerator)/T(value.denominator)
    290         else:
    291             raise
    292 
    293 
    294 def _counts(data):
    295     # Generate a table of sorted (value, frequency) pairs.
    296     table = collections.Counter(iter(data)).most_common()
    297     if not table:
    298         return table
    299     # Extract the values with the highest frequency.
    300     maxfreq = table[0][1]
    301     for i in range(1, len(table)):
    302         if table[i][1] != maxfreq:
    303             table = table[:i]
    304             break
    305     return table
    306 
    307 
    308 # === Measures of central tendency (averages) ===
    309 
    310 def mean(data):
    311     """Return the sample arithmetic mean of data.
    312 
    313     >>> mean([1, 2, 3, 4, 4])
    314     2.8
    315 
    316     >>> from fractions import Fraction as F
    317     >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    318     Fraction(13, 21)
    319 
    320     >>> from decimal import Decimal as D
    321     >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    322     Decimal('0.5625')
    323 
    324     If ``data`` is empty, StatisticsError will be raised.
    325     """
    326     if iter(data) is data:
    327         data = list(data)
    328     n = len(data)
    329     if n < 1:
    330         raise StatisticsError('mean requires at least one data point')
    331     T, total, count = _sum(data)
    332     assert count == n
    333     return _convert(total/n, T)
    334 
    335 
    336 # FIXME: investigate ways to calculate medians without sorting? Quickselect?
    337 def median(data):
    338     """Return the median (middle value) of numeric data.
    339 
    340     When the number of data points is odd, return the middle data point.
    341     When the number of data points is even, the median is interpolated by
    342     taking the average of the two middle values:
    343 
    344     >>> median([1, 3, 5])
    345     3
    346     >>> median([1, 3, 5, 7])
    347     4.0
    348 
    349     """
    350     data = sorted(data)
    351     n = len(data)
    352     if n == 0:
    353         raise StatisticsError("no median for empty data")
    354     if n%2 == 1:
    355         return data[n//2]
    356     else:
    357         i = n//2
    358         return (data[i - 1] + data[i])/2
    359 
    360 
    361 def median_low(data):
    362     """Return the low median of numeric data.
    363 
    364     When the number of data points is odd, the middle value is returned.
    365     When it is even, the smaller of the two middle values is returned.
    366 
    367     >>> median_low([1, 3, 5])
    368     3
    369     >>> median_low([1, 3, 5, 7])
    370     3
    371 
    372     """
    373     data = sorted(data)
    374     n = len(data)
    375     if n == 0:
    376         raise StatisticsError("no median for empty data")
    377     if n%2 == 1:
    378         return data[n//2]
    379     else:
    380         return data[n//2 - 1]
    381 
    382 
    383 def median_high(data):
    384     """Return the high median of data.
    385 
    386     When the number of data points is odd, the middle value is returned.
    387     When it is even, the larger of the two middle values is returned.
    388 
    389     >>> median_high([1, 3, 5])
    390     3
    391     >>> median_high([1, 3, 5, 7])
    392     5
    393 
    394     """
    395     data = sorted(data)
    396     n = len(data)
    397     if n == 0:
    398         raise StatisticsError("no median for empty data")
    399     return data[n//2]
    400 
    401 
    402 def median_grouped(data, interval=1):
    403     """Return the 50th percentile (median) of grouped continuous data.
    404 
    405     >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    406     3.7
    407     >>> median_grouped([52, 52, 53, 54])
    408     52.5
    409 
    410     This calculates the median as the 50th percentile, and should be
    411     used when your data is continuous and grouped. In the above example,
    412     the values 1, 2, 3, etc. actually represent the midpoint of classes
    413     0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    414     class 3.5-4.5, and interpolation is used to estimate it.
    415 
    416     Optional argument ``interval`` represents the class interval, and
    417     defaults to 1. Changing the class interval naturally will change the
    418     interpolated 50th percentile value:
    419 
    420     >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    421     3.25
    422     >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    423     3.5
    424 
    425     This function does not check whether the data points are at least
    426     ``interval`` apart.
    427     """
    428     data = sorted(data)
    429     n = len(data)
    430     if n == 0:
    431         raise StatisticsError("no median for empty data")
    432     elif n == 1:
    433         return data[0]
    434     # Find the value at the midpoint. Remember this corresponds to the
    435     # centre of the class interval.
    436     x = data[n//2]
    437     for obj in (x, interval):
    438         if isinstance(obj, (str, bytes)):
    439             raise TypeError('expected number but got %r' % obj)
    440     try:
    441         L = x - interval/2  # The lower limit of the median interval.
    442     except TypeError:
    443         # Mixed type. For now we just coerce to float.
    444         L = float(x) - float(interval)/2
    445     cf = data.index(x)  # Number of values below the median interval.
    446     # FIXME The following line could be more efficient for big lists.
    447     f = data.count(x)  # Number of data points in the median interval.
    448     return L + interval*(n/2 - cf)/f
    449 
    450 
    451 def mode(data):
    452     """Return the most common data point from discrete or nominal data.
    453 
    454     ``mode`` assumes discrete data, and returns a single value. This is the
    455     standard treatment of the mode as commonly taught in schools:
    456 
    457     >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    458     3
    459 
    460     This also works with nominal (non-numeric) data:
    461 
    462     >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    463     'red'
    464 
    465     If there is not exactly one most common value, ``mode`` will raise
    466     StatisticsError.
    467     """
    468     # Generate a table of sorted (value, frequency) pairs.
    469     table = _counts(data)
    470     if len(table) == 1:
    471         return table[0][0]
    472     elif table:
    473         raise StatisticsError(
    474                 'no unique mode; found %d equally common values' % len(table)
    475                 )
    476     else:
    477         raise StatisticsError('no mode for empty data')
    478 
    479 
    480 # === Measures of spread ===
    481 
    482 # See http://mathworld.wolfram.com/Variance.html
    483 #     http://mathworld.wolfram.com/SampleVariance.html
    484 #     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    485 #
    486 # Under no circumstances use the so-called "computational formula for
    487 # variance", as that is only suitable for hand calculations with a small
    488 # amount of low-precision data. It has terrible numeric properties.
    489 #
    490 # See a comparison of three computational methods here:
    491 # http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
    492 
    493 def _ss(data, c=None):
    494     """Return sum of square deviations of sequence data.
    495 
    496     If ``c`` is None, the mean is calculated in one pass, and the deviations
    497     from the mean are calculated in a second pass. Otherwise, deviations are
    498     calculated from ``c`` as given. Use the second case with care, as it can
    499     lead to garbage results.
    500     """
    501     if c is None:
    502         c = mean(data)
    503     T, total, count = _sum((x-c)**2 for x in data)
    504     # The following sum should mathematically equal zero, but due to rounding
    505     # error may not.
    506     U, total2, count2 = _sum((x-c) for x in data)
    507     assert T == U and count == count2
    508     total -=  total2**2/len(data)
    509     assert not total < 0, 'negative sum of square deviations: %f' % total
    510     return (T, total)
    511 
    512 
    513 def variance(data, xbar=None):
    514     """Return the sample variance of data.
    515 
    516     data should be an iterable of Real-valued numbers, with at least two
    517     values. The optional argument xbar, if given, should be the mean of
    518     the data. If it is missing or None, the mean is automatically calculated.
    519 
    520     Use this function when your data is a sample from a population. To
    521     calculate the variance from the entire population, see ``pvariance``.
    522 
    523     Examples:
    524 
    525     >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    526     >>> variance(data)
    527     1.3720238095238095
    528 
    529     If you have already calculated the mean of your data, you can pass it as
    530     the optional second argument ``xbar`` to avoid recalculating it:
    531 
    532     >>> m = mean(data)
    533     >>> variance(data, m)
    534     1.3720238095238095
    535 
    536     This function does not check that ``xbar`` is actually the mean of
    537     ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    538     impossible results.
    539 
    540     Decimals and Fractions are supported:
    541 
    542     >>> from decimal import Decimal as D
    543     >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    544     Decimal('31.01875')
    545 
    546     >>> from fractions import Fraction as F
    547     >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    548     Fraction(67, 108)
    549 
    550     """
    551     if iter(data) is data:
    552         data = list(data)
    553     n = len(data)
    554     if n < 2:
    555         raise StatisticsError('variance requires at least two data points')
    556     T, ss = _ss(data, xbar)
    557     return _convert(ss/(n-1), T)
    558 
    559 
    560 def pvariance(data, mu=None):
    561     """Return the population variance of ``data``.
    562 
    563     data should be an iterable of Real-valued numbers, with at least one
    564     value. The optional argument mu, if given, should be the mean of
    565     the data. If it is missing or None, the mean is automatically calculated.
    566 
    567     Use this function to calculate the variance from the entire population.
    568     To estimate the variance from a sample, the ``variance`` function is
    569     usually a better choice.
    570 
    571     Examples:
    572 
    573     >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    574     >>> pvariance(data)
    575     1.25
    576 
    577     If you have already calculated the mean of the data, you can pass it as
    578     the optional second argument to avoid recalculating it:
    579 
    580     >>> mu = mean(data)
    581     >>> pvariance(data, mu)
    582     1.25
    583 
    584     This function does not check that ``mu`` is actually the mean of ``data``.
    585     Giving arbitrary values for ``mu`` may lead to invalid or impossible
    586     results.
    587 
    588     Decimals and Fractions are supported:
    589 
    590     >>> from decimal import Decimal as D
    591     >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    592     Decimal('24.815')
    593 
    594     >>> from fractions import Fraction as F
    595     >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    596     Fraction(13, 72)
    597 
    598     """
    599     if iter(data) is data:
    600         data = list(data)
    601     n = len(data)
    602     if n < 1:
    603         raise StatisticsError('pvariance requires at least one data point')
    604     ss = _ss(data, mu)
    605     T, ss = _ss(data, mu)
    606     return _convert(ss/n, T)
    607 
    608 
    609 def stdev(data, xbar=None):
    610     """Return the square root of the sample variance.
    611 
    612     See ``variance`` for arguments and other details.
    613 
    614     >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    615     1.0810874155219827
    616 
    617     """
    618     var = variance(data, xbar)
    619     try:
    620         return var.sqrt()
    621     except AttributeError:
    622         return math.sqrt(var)
    623 
    624 
    625 def pstdev(data, mu=None):
    626     """Return the square root of the population variance.
    627 
    628     See ``pvariance`` for arguments and other details.
    629 
    630     >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    631     0.986893273527251
    632 
    633     """
    634     var = pvariance(data, mu)
    635     try:
    636         return var.sqrt()
    637     except AttributeError:
    638         return math.sqrt(var)
    639