[Python-checkins] python/nondist/sandbox/statistics statistics.py, NONE, 1.1 test_statistics.py, NONE, 1.1

rhettinger at users.sourceforge.net rhettinger at users.sourceforge.net
Sun Feb 15 01:03:59 EST 2004


Update of /cvsroot/python/python/nondist/sandbox/statistics
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20081

Added Files:
	statistics.py test_statistics.py 
Log Message:
Load a draft version of a statistics module.

--- NEW FILE: statistics.py ---
"""statistics.py  A collection of functions for summarizing data

The module focuses on everyday data reduction functions and omits more
sophisticated statistical tools.

Unless otherwise noted, each function is designed to accept any iterable
and make only a single pass over the data.  This makes the tools suitable
for use with generator expressions and keeps them as memory-friendly as
possible.  Except for median(), the use of iterators means that the
dataset can be processed one element at a time without bringing it all
into memory.

The functions nlargest() and nsmallest() are also designed to use the fewest
possible data comparisons, and the data elements may be non-numerical (such
as strings) as long as they support __cmp__ or __lt__.

The code is designed to run at its best on Py2.4; however, it is being kept
fully compatible with Py2.3.

Written and maintained by Raymond D. Hettinger <python at rcn.com>.
Copyright (c) 2004 Python Software Foundation.  All rights reserved.

"""

from __future__ import division
import operator
import itertools
import heapq
import bisect

__all__ = ['mean', 'stddev', 'product', 'nlargest', 'nsmallest', 'select',
           'median']

def mean(data):
    """Compute the arithmetic mean of a data sample"""
    try:
        cnt = len(data)                 # do it the fast way if possible
    except TypeError:
        cnt = -1                        # handles an empty iterator
        cum = 0
        for cnt, elem in enumerate(data):
            cum += elem
        cnt += 1                        # enumerate counts from zero
    else:
        cum = sum(data)
    try:
        return cum / cnt                # relies on true division
    except ZeroDivisionError:
        raise ValueError('data must have at least one element')


def stddev(data, sample=True):
    """Computes the standard deviation of the dataset.

    If sample is True, computes the standard deviation with an n-1 divisor
    for an unbiased estimator for a data sample.  If sample is False, computes
    with a divisor of n, giving the standard deviation for a complete population.
    """

    # Formula for recurrence taken from Seminumerical Algorithms, Knuth, 4.2.2
    # It has substantially better numerical performance than computing the
    # result from sum(data) and sum(x*x for x in data).
    it = iter(data)
    try:
        m = it.next()                   # running mean
    except StopIteration:
        raise ValueError('data must have at least one element')
    s = 0                               # running sum((x-mean)**2 for x in data)
    k = 1                               # number of items
    dm = 0                              # carried forward error term
    for x in it:
        k += 1

        # This block computes:  newm = m + (x-m)/k
        # The use of a cumulative error term improves accuracy when the mean is
        # much larger than the standard deviation.  Also, it makes the formula
        # less sensitive to data order (with sorted data resulting in larger
        # relative errors according to experiments by Chris Reedy).
        adjm = (x-m)/k - dm             # relies on true division
        newm = m + adjm
        dm = (newm - m) - adjm

        s += (x-m)*(x-newm)
        m = newm
    if sample:
        try:
            return (s / (k-1)) ** 0.5   # sample standard deviation
        except ZeroDivisionError:
            raise ValueError('sample deviation requires at least two elements')
    else:
        return (s / k) ** 0.5           # population standard deviation
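# Worked illustration of why the recurrence matters (added commentary, not part
# of the original checkin):  for data shifted far from zero, e.g.
# [1e9 + x for x in (4.0, 7.0, 13.0, 16.0)], the deviations from the mean are
# (-6, -3, 3, 6), so s = 90 and the sample deviation is (90/3)**0.5 ~= 5.477.
# The textbook shortcut sqrt((sum(x*x) - n*mean**2)/(n-1)) subtracts two nearly
# equal quantities of order 1e18 and can lose most of its significant digits,
# while the running recurrence above computes the small deviations without
# catastrophic cancellation.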

def partition(iterable):
    # Support routine for select
    under, notunder = [], []
    ua, nua = under.append, notunder.append
    it = iter(iterable)
    try:
        pivot = it.next()
    except StopIteration:
        raise ValueError('iterable must have at least one element')
    for elem in it:
        if elem < pivot:
            ua(elem)
        else:
            nua(elem)
    return under, pivot, notunder

def select(data, n):
    'Find the nth rank ordered element (counting from zero)'
    while 1:
        p, pivot, q = partition(data)
        if n < len(p):
            data = p
        elif n == len(p):
            return pivot
        else:
            data = q
            n -= len(p) + 1
        p = q = None
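# How select() works (added commentary):  this is the quickselect pattern --
# partition around an arbitrary pivot, then narrow the search to whichever
# side holds the nth element, adjusting n when the under side and pivot are
# discarded.  For example, select([7, 1, 5, 3], 2) partitions to
# ([1, 5, 3], 7, []), descends into [1, 5, 3] looking for rank 2, partitions
# to ([], 1, [5, 3]), then looks for rank 1 in [5, 3] and returns the pivot 5,
# the same answer as sorted([7, 1, 5, 3])[2].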

def median(data):
    'Compute the median of the dataset'
    try:
        n = len(data)
    except TypeError:
        data = list(data)
        n = len(data)
    if n == 0:
        raise ValueError('data must have at least one element')
    if n & 1 == 1:                      # if length is an odd number
        return select(data, n//2)
    return (select(data, n//2) + select(data, n//2-1)) / 2
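# Note (added commentary):  for an even-length dataset the two middle ranks
# are averaged with true division, so median([-50, 10, 2, 11]) is
# (2 + 10) / 2 == 6, and the result may be a float even for integer data.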

def product(iterable):
    "Compute the product of the data elements"
    return reduce(operator.mul, iterable, 1)

def nlargest(iterable, n=1):            # XXX key= or cmp= argument
    "Compute the n largest elements in the dataset"
    ## When collections.fibheap is available, use it instead of heapq
    it = iter(iterable)
    result = list(itertools.islice(it, n))
    heapq.heapify(result)
    if len(result) == 0:
        raise ValueError('data must have at least one element')
    subst = heapq.heapreplace
    for elem in it:
        if (elem > result[0]):
            subst(result, elem)
    result.sort(reverse=True)
    return result
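# How nlargest() works (added commentary):  the first n elements seed a
# min-heap, so result[0] is always the smallest of the current candidates;
# any later element that beats it displaces it via heapreplace.  Only n items
# are ever held in memory, and the final sort orders the winners from largest
# to smallest, e.g. nlargest(xrange(1000), 3) == [999, 998, 997].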

def nsmallest(iterable, n=1):
    "Compute the n largest elements in the dataset"
    ## When collections.fibheap is available, use it instead of bisect
    it = iter(iterable)
    result = list(itertools.islice(it, 0, n))
    result.sort()
    if len(result) == 0:
        raise ValueError('data must have at least one element')
    insort = bisect.insort
    pop = result.pop
    for elem in it:
        if (elem < result[-1]):
            insort(result, elem)
            pop()
    return result
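# How nsmallest() works (added commentary):  it keeps a sorted list of the n
# smallest elements seen so far; a new element smaller than the current worst
# (result[-1]) is placed with bisect.insort and the displaced worst is popped,
# so the list never grows beyond n entries.  For example,
# nsmallest([5, 2, 9, 1, 7], 2) == [1, 2].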

XXX = """
    Other possible functions include data groupers for
    binning, counting, and splitting into equivalence classes.
"""

--- NEW FILE: test_statistics.py ---
from __future__ import division
from statistics import mean, stddev, product, nlargest, nsmallest, select, median


### UNITTESTS #################################################

import unittest
import random

def g(n):
    "iterator substitute for xrange (without defining __len__)"
    for i in xrange(n):
        yield i

class TestStats(unittest.TestCase):

    def test_mean(self):
        self.assertEqual(mean(range(6)), 15/6.0)
        self.assertEqual(mean(g(6)), 15/6.0)
        self.assertEqual(mean([10]), 10)
        self.assertRaises(ValueError, mean, [])
        self.assertRaises(TypeError, mean, 'abc')

    def test_stddev(self):
        self.assertEqual(stddev([10,15,20]), 5)
        self.assertEqual(round(stddev([11.1,4,9,13]), 3), 3.878)
        self.assertEqual(round(stddev([11.1,4,9,13], False), 3), 3.358)
        self.assertEqual(stddev([10], False), 0.0)
        self.assertRaises(ValueError, stddev, [1])
        self.assertRaises(ValueError, stddev, [], False)

    def test_median(self):
        data = range(10)
        random.shuffle(data)
        copy = data[:]
        self.assertEqual(median(data), 4.5)
        self.assertEqual(data, copy)
        self.assertEqual(median(g(10)), 4.5)
        data.insert(1, 10)
        self.assertEqual(median(data), 5)
        self.assertEqual(median([-50,10,2,11]), 6)
        self.assertEqual(median([30,10,2]), 10)
        self.assertRaises(ValueError, median, [])

    def test_product(self):
        self.assertEqual(product(range(1, 5)), 24)
        def g(lo, hi):
            for i in xrange(lo, hi):
                yield i
        self.assertEqual(product(g(1, 5)), 24)

    def test_select(self):
        n = 200
        a = range(n)
        random.shuffle(a)
        for i in xrange(100):
            nth = random.randrange(n)
            self.assertEqual(select(a, nth), nth)

    def test_nlargest(self):
        n = 10
        data = range(n)
        random.shuffle(data)
        copy = data[:]
        self.assertEqual(nlargest(data), [n-1])
        self.assertRaises(ValueError, nlargest, data, 0)
        for i in xrange(1, n+1):
            self.assertEqual(nlargest(data, i), range(n-1, n-i-1, -1))
        self.assertEqual(data, copy)
        self.assertEqual(nlargest('abcde', 3), list('edc'))

    def test_nsmallest(self):
        n = 10
        data = range(n)
        random.shuffle(data)
        copy = data[:]
        self.assertEqual(nsmallest(data), [0])
        self.assertRaises(ValueError, nsmallest, data, 0)
        for i in xrange(1, n+1):
            self.assertEqual(nsmallest(data, i), range(i))
        self.assertEqual(data, copy)
        self.assertEqual(nsmallest('abcde', 3), list('abc'))

if __name__ == '__main__':
    suite = unittest.TestSuite()
    suite.addTest(unittest.makeSuite(TestStats))
    unittest.TextTestRunner(verbosity=2).run(suite)



