[Python-checkins] bpo-35892: Fix mode() and add multimode() (#12089)
Raymond Hettinger
webhook-mailer at python.org
Tue Mar 12 03:43:38 EDT 2019
https://github.com/python/cpython/commit/fc06a192fdc44225ef1cc879f615a81931ad0a85
commit: fc06a192fdc44225ef1cc879f615a81931ad0a85
branch: master
author: Raymond Hettinger <rhettinger at users.noreply.github.com>
committer: GitHub <noreply at github.com>
date: 2019-03-12T00:43:27-07:00
summary:
bpo-35892: Fix mode() and add multimode() (#12089)
files:
M Doc/library/statistics.rst
M Doc/whatsnew/3.8.rst
M Lib/statistics.py
M Lib/test/test_statistics.py
diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index 81119da0a382..97e1c3a0a1c2 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -37,7 +37,7 @@ Averages and measures of central location
These functions calculate an average or typical value from a population
or sample.
-======================= =============================================
+======================= ===============================================================
:func:`mean` Arithmetic mean ("average") of data.
:func:`fmean` Fast, floating point arithmetic mean.
:func:`harmonic_mean` Harmonic mean of data.
@@ -45,8 +45,9 @@ or sample.
:func:`median_low` Low median of data.
:func:`median_high` High median of data.
:func:`median_grouped` Median, or 50th percentile, of grouped data.
-:func:`mode` Mode (most common value) of discrete data.
-======================= =============================================
+:func:`mode` Single mode (most common value) of discrete or nominal data.
+:func:`multimode` List of modes (most common values) of discrete or nominal data.
+======================= ===============================================================
Measures of spread
------------------
@@ -287,12 +288,12 @@ However, for reading convenience, most of the examples show sorted sequences.
.. function:: mode(data)
- Return the most common data point from discrete or nominal *data*. The mode
- (when it exists) is the most typical value, and is a robust measure of
- central location.
+ Return the single most common data point from discrete or nominal *data*.
+ The mode (when it exists) is the most typical value and serves as a
+ measure of central location.
- If *data* is empty, or if there is not exactly one most common value,
- :exc:`StatisticsError` is raised.
+ If there are multiple modes, returns the first one encountered in the *data*.
+ If *data* is empty, :exc:`StatisticsError` is raised.
``mode`` assumes discrete data, and returns a single value. This is the
standard treatment of the mode as commonly taught in schools:
@@ -310,6 +311,27 @@ However, for reading convenience, most of the examples show sorted sequences.
>>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
'red'
+ .. versionchanged:: 3.8
+ Now handles multimodal datasets by returning the first mode encountered.
+ Formerly, it raised :exc:`StatisticsError` when more than one mode was
+ found.
+
+
+.. function:: multimode(data)
+
+ Return a list of the most frequently occurring values in the order they
+ were first encountered in the *data*. Will return more than one result if
+ there are multiple modes or an empty list if the *data* is empty:
+
+ .. doctest::
+
+ >>> multimode('aabbbbccddddeeffffgg')
+ ['b', 'd', 'f']
+ >>> multimode('')
+ []
+
+ .. versionadded:: 3.8
+
.. function:: pstdev(data, mu=None)
diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst
index 9cd5a3a937dc..ad86917d0cc7 100644
--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@@ -282,6 +282,9 @@ Added :func:`statistics.fmean` as a faster, floating point variant of
:func:`statistics.mean()`. (Contributed by Raymond Hettinger and
Steven D'Aprano in :issue:`35904`.)
+Added :func:`statistics.multimode` that returns a list of the most
+common values. (Contributed by Raymond Hettinger in :issue:`35892`.)
+
Added :class:`statistics.NormalDist`, a tool for creating
and manipulating normal distributions of a random variable.
(Contributed by Raymond Hettinger in :issue:`36018`.)
@@ -591,6 +594,11 @@ Changes in the Python API
* The function :func:`platform.popen` has been removed, it was deprecated since
Python 3.3: use :func:`os.popen` instead.
+* The :func:`statistics.mode` function no longer raises an exception
+ when given multimodal data. Instead, it returns the first mode
+ encountered in the input data. (Contributed by Raymond Hettinger
+ in :issue:`35892`.)
+
* The :meth:`~tkinter.ttk.Treeview.selection` method of the
:class:`tkinter.ttk.Treeview` class no longer takes arguments. Using it with
arguments for changing the selection was deprecated in Python 3.6. Use
diff --git a/Lib/statistics.py b/Lib/statistics.py
index e85aaa996cc7..97f154373dc0 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -17,6 +17,7 @@
median_high High median of data.
median_grouped Median, or 50th percentile, of grouped data.
mode Mode (most common value) of data.
+multimode List of modes (most common values) of data.
================== =============================================
Calculate the arithmetic mean ("the average") of data:
@@ -79,10 +80,9 @@
__all__ = [ 'StatisticsError', 'NormalDist',
'pstdev', 'pvariance', 'stdev', 'variance',
'median', 'median_low', 'median_high', 'median_grouped',
- 'mean', 'mode', 'harmonic_mean', 'fmean',
+ 'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
]
-import collections
import math
import numbers
import random
@@ -92,8 +92,8 @@
from itertools import groupby
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
-
-
+from operator import itemgetter
+from collections import Counter
# === Exceptions ===
@@ -249,20 +249,6 @@ def _convert(value, T):
raise
-def _counts(data):
- # Generate a table of sorted (value, frequency) pairs.
- table = collections.Counter(iter(data)).most_common()
- if not table:
- return table
- # Extract the values with the highest frequency.
- maxfreq = table[0][1]
- for i in range(1, len(table)):
- if table[i][1] != maxfreq:
- table = table[:i]
- break
- return table
-
-
def _find_lteq(a, x):
'Locate the leftmost value exactly equal to x'
i = bisect_left(a, x)
@@ -334,9 +320,9 @@ def count(x):
nonlocal n
n += 1
return x
- total = math.fsum(map(count, data))
+ total = fsum(map(count, data))
else:
- total = math.fsum(data)
+ total = fsum(data)
try:
return total / n
except ZeroDivisionError:
@@ -523,19 +509,38 @@ def mode(data):
>>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
'red'
- If there is not exactly one most common value, ``mode`` will raise
- StatisticsError.
+ If there are multiple modes, return the first one encountered.
+
+ >>> mode(['red', 'red', 'green', 'blue', 'blue'])
+ 'red'
+
+ If *data* is empty, ``mode`` raises StatisticsError.
+
"""
- # Generate a table of sorted (value, frequency) pairs.
- table = _counts(data)
- if len(table) == 1:
- return table[0][0]
- elif table:
- raise StatisticsError(
- 'no unique mode; found %d equally common values' % len(table)
- )
- else:
- raise StatisticsError('no mode for empty data')
+ data = iter(data)
+ try:
+ return Counter(data).most_common(1)[0][0]
+ except IndexError:
+ raise StatisticsError('no mode for empty data') from None
+
+
+def multimode(data):
+ """ Return a list of the most frequently occurring values.
+
+ Will return more than one result if there are multiple modes
+ or an empty list if *data* is empty.
+
+ >>> multimode('aabbbbbbbbcc')
+ ['b']
+ >>> multimode('aabbbbccddddeeffffgg')
+ ['b', 'd', 'f']
+ >>> multimode('')
+ []
+
+ """
+ counts = Counter(iter(data)).most_common()
+ maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
+ return list(map(itemgetter(0), mode_items))
# === Measures of spread ===
@@ -836,6 +841,7 @@ def __repr__(self):
from math import isclose
from operator import add, sub, mul, truediv
from itertools import repeat
+ import doctest
g1 = NormalDist(10, 20)
g2 = NormalDist(-5, 25)
@@ -893,3 +899,5 @@ def assert_close(G1, G2):
S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
Y.samples(n))])
assert_close(X - Y, S)
+
+ print(doctest.testmod())
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index a63e4bf6cc84..26b22a1c4080 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -1769,7 +1769,7 @@ def prepare_data(self):
def test_range_data(self):
# Override test from UnivariateCommonMixin.
data = range(20, 50, 3)
- self.assertRaises(statistics.StatisticsError, self.func, data)
+ self.assertEqual(self.func(data), 20)
def test_nominal_data(self):
# Test mode with nominal data.
@@ -1790,13 +1790,14 @@ def test_bimodal_data(self):
# Test mode with bimodal data.
data = [1, 1, 2, 2, 2, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 9, 9]
assert data.count(2) == data.count(6) == 4
- # Check for an exception.
- self.assertRaises(statistics.StatisticsError, self.func, data)
+ # mode() should return 2, the first encountered mode
+ self.assertEqual(self.func(data), 2)
- def test_unique_data_failure(self):
- # Test mode exception when data points are all unique.
+ def test_unique_data(self):
+ # Test mode when data points are all unique.
data = list(range(10))
- self.assertRaises(statistics.StatisticsError, self.func, data)
+ # mode() should return 0, the first encountered mode
+ self.assertEqual(self.func(data), 0)
def test_none_data(self):
# Test that mode raises TypeError if given None as data.
@@ -1809,8 +1810,18 @@ def test_counter_data(self):
# Test that a Counter is treated like any other iterable.
data = collections.Counter([1, 1, 1, 2])
# Since the keys of the counter are treated as data points, not the
- # counts, this should raise.
- self.assertRaises(statistics.StatisticsError, self.func, data)
+ # counts, this should return the first mode encountered, 1
+ self.assertEqual(self.func(data), 1)
+
+
+class TestMultiMode(unittest.TestCase):
+
+ def test_basics(self):
+ multimode = statistics.multimode
+ self.assertEqual(multimode('aabbbbbbbbcc'), ['b'])
+ self.assertEqual(multimode('aabbbbccddddeeffffgg'), ['b', 'd', 'f'])
+ self.assertEqual(multimode(''), [])
+
class TestFMean(unittest.TestCase):
More information about the Python-checkins
mailing list