[Python-checkins] GH-95861: Add support for Spearman's rank correlation coefficient (GH-95863)

Thu Aug 18 14:48:32 EDT 2022

https://github.com/python/cpython/commit/29c8f80760869018aa0d7b1d42ab737dc325cfa2
commit: 29c8f80760869018aa0d7b1d42ab737dc325cfa2
branch: main
author: Raymond Hettinger <rhettinger at users.noreply.github.com>
committer: rhettinger <rhettinger at users.noreply.github.com>
date: 2022-08-18T13:48:27-05:00
summary:

GH-95861: Add support for Spearman's rank correlation coefficient (GH-95863)

files:
A Misc/NEWS.d/next/Library/2022-08-10-17-34-07.gh-issue-95861.qv-T5s.rst
M Doc/library/statistics.rst
M Lib/statistics.py
M Lib/test/test_statistics.py

diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index bf869903c0f..c3f9c1f5239 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -104,7 +104,7 @@ These functions calculate statistics regarding relations between two inputs.
 
 =========================  =====================================================
 :func:`covariance`         Sample covariance for two variables.
-:func:`correlation`        Pearson's correlation coefficient for two variables.
+:func:`correlation`        Pearson and Spearman's correlation coefficients.
 :func:`linear_regression`  Slope and intercept for simple linear regression.
 =========================  =====================================================
 
@@ -648,31 +648,57 @@ However, for reading convenience, most of the examples show sorted sequences.
 
    .. versionadded:: 3.10
 
-.. function:: correlation(x, y, /)
+.. function:: correlation(x, y, /, *, method='linear')
 
    Return the `Pearson's correlation coefficient
    <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
    for two inputs. Pearson's correlation coefficient *r* takes values
-   between -1 and +1. It measures the strength and direction of the linear
-   relationship, where +1 means very strong, positive linear relationship,
-   -1 very strong, negative linear relationship, and 0 no linear relationship.
+   between -1 and +1. It measures the strength and direction of a linear
+   relationship.
+
+   If *method* is "ranked", computes `Spearman's rank correlation coefficient
+   <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
+   for two inputs. The data is replaced by ranks.  Ties are averaged so that
+   equal values receive the same rank.  The resulting coefficient measures the
+   strength of a monotonic relationship.
+
+   Spearman's correlation coefficient is appropriate for ordinal data or for
+   continuous data that doesn't meet the linear proportion requirement for
+   Pearson's correlation coefficient.
 
    Both inputs must be of the same length (no less than two), and need
    not to be constant, otherwise :exc:`StatisticsError` is raised.
 
-   Examples:
+   Example with `Kepler's laws of planetary motion
+   <https://en.wikipedia.org/wiki/Kepler's_laws_of_planetary_motion>`_:
 
    .. doctest::
 
-      >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
-      >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
-      >>> correlation(x, x)
+      >>> # Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and  Neptune
+      >>> orbital_period = [88, 225, 365, 687, 4331, 10_756, 30_687, 60_190]    # days
+      >>> dist_from_sun = [58, 108, 150, 228, 778, 1_400, 2_900, 4_500] # million km
+
+      >>> # Show that a perfect monotonic relationship exists
+      >>> correlation(orbital_period, dist_from_sun, method='ranked')
+      1.0
+
+      >>> # Observe that a linear relationship is imperfect
+      >>> round(correlation(orbital_period, dist_from_sun), 4)
+      0.9882
+
+      >>> # Demonstrate Kepler's third law: There is a linear correlation
+      >>> # between the square of the orbital period and the cube of the
+      >>> # distance from the sun.
+      >>> period_squared = [p * p for p in orbital_period]
+      >>> dist_cubed = [d * d * d for d in dist_from_sun]
+      >>> round(correlation(period_squared, dist_cubed), 4)
       1.0
-      >>> correlation(x, y)
-      -1.0
 
    .. versionadded:: 3.10
 
+   .. versionchanged:: 3.12
+      Added support for Spearman's rank correlation coefficient.
+
 .. function:: linear_regression(x, y, /, *, proportional=False)
 
    Return the slope and intercept of `simple linear regression
diff --git a/Lib/statistics.py b/Lib/statistics.py
index c78d6451885..a3f915c1302 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -134,11 +134,11 @@
 
 from fractions import Fraction
 from decimal import Decimal
-from itertools import groupby, repeat
+from itertools import count, groupby, repeat
 from bisect import bisect_left, bisect_right
 from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
 from functools import reduce
-from operator import mul
+from operator import mul, itemgetter
 from collections import Counter, namedtuple, defaultdict
 
 _SQRT2 = sqrt(2.0)
@@ -355,6 +355,50 @@ def _fail_neg(values, errmsg='negative value'):
             raise StatisticsError(errmsg)
         yield x
 
+def _rank(data, /, *, key=None, reverse=False, ties='average') -> list[float]:
+    """Rank order a dataset. The lowest value has rank 1.
+
+    Ties are averaged so that equal values receive the same rank:
+
+        >>> data = [31, 56, 31, 25, 75, 18]
+        >>> _rank(data)
+        [3.5, 5.0, 3.5, 2.0, 6.0, 1.0]
+
+    The operation is idempotent:
+
+        >>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0])
+        [3.5, 5.0, 3.5, 2.0, 6.0, 1.0]
+
+    It is possible to rank the data in reverse order so that
+    the highest value has rank 1.  Also, a key-function can
+    extract the field to be ranked:
+
+        >>> goals = [('eagles', 45), ('bears', 48), ('lions', 44)]
+        >>> _rank(goals, key=itemgetter(1), reverse=True)
+        [2.0, 1.0, 3.0]
+
+    """
+    # If this function becomes public at some point, more thought
+    # needs to be given to the signature.  A list of ints is
+    # plausible when ties is "min" or "max".  When ties is "average",
+    # either list[float] or list[Fraction] is plausible.
+
+    # Default handling of ties matches scipy.stats.mstats.spearmanr.
+    if ties != 'average':
+        raise ValueError(f'Unknown tie resolution method: {ties!r}')
+    if key is not None:
+        data = map(key, data)
+    val_pos = sorted(zip(data, count()), reverse=reverse)
+    i = 0   # To rank starting at 0 instead of 1, set i = -1.
+    result = [0] * len(val_pos)
+    for _, g in groupby(val_pos, key=itemgetter(0)):
+        group = list(g)
+        size = len(group)
+        rank = i + (size + 1) / 2
+        for value, orig_pos in group:
+            result[orig_pos] = rank
+        i += size
+    return result
 
 def _integer_sqrt_of_frac_rto(n: int, m: int) -> int:
     """Square root of n/m, rounded to the nearest integer using round-to-odd."""
@@ -988,14 +1032,12 @@ def covariance(x, y, /):
     return sxy / (n - 1)
 
 
-def correlation(x, y, /):
+def correlation(x, y, /, *, method='linear'):
     """Pearson's correlation coefficient
 
     Return the Pearson's correlation coefficient for two inputs. Pearson's
-    correlation coefficient *r* takes values between -1 and +1. It measures the
-    strength and direction of the linear relationship, where +1 means very
-    strong, positive linear relationship, -1 very strong, negative linear
-    relationship, and 0 no linear relationship.
+    correlation coefficient *r* takes values between -1 and +1. It measures
+    the strength and direction of a linear relationship.
 
     >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
@@ -1004,12 +1046,25 @@ def correlation(x, y, /):
     >>> correlation(x, y)
     -1.0
 
+    If *method* is "ranked", computes Spearman's rank correlation coefficient
+    for two inputs.  The data is replaced by ranks.  Ties are averaged
+    so that equal values receive the same rank.  The resulting coefficient
+    measures the strength of a monotonic relationship.
+
+    Spearman's rank correlation coefficient is appropriate for ordinal
+    data or for continuous data that doesn't meet the linear proportion
+    requirement for Pearson's correlation coefficient.
     """
     n = len(x)
     if len(y) != n:
         raise StatisticsError('correlation requires that both inputs have same number of data points')
     if n < 2:
         raise StatisticsError('correlation requires at least two data points')
+    if method not in {'linear', 'ranked'}:
+        raise ValueError(f'Unknown method: {method!r}')
+    if method == 'ranked':
+        x = _rank(x)
+        y = _rank(y)
     xbar = fsum(x) / n
     ybar = fsum(y) / n
     sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index bf85525dd12..05ce79f1265 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2565,6 +2565,22 @@ def test_different_scales(self):
         self.assertAlmostEqual(statistics.covariance(x, y), 0.1)
 
 
+    def test_correlation_spearman(self):
+        # https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php
+        # Compare with:
+        #     >>> import scipy.stats.mstats
+        #     >>> scipy.stats.mstats.spearmanr(reading, mathematics)
+        #     SpearmanrResult(correlation=0.6686960980480712, pvalue=0.03450954165178532)
+        # And Wolfram Alpha gives: 0.668696
+        #     https://www.wolframalpha.com/input?i=SpearmanRho%5B%7B56%2C+75%2C+45%2C+71%2C+61%2C+64%2C+58%2C+80%2C+76%2C+61%7D%2C+%7B66%2C+70%2C+40%2C+60%2C+65%2C+56%2C+59%2C+77%2C+67%2C+63%7D%5D
+        reading = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]
+        mathematics = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63]
+        self.assertAlmostEqual(statistics.correlation(reading, mathematics, method='ranked'),
+                               0.6686960980480712)
+
+        with self.assertRaises(ValueError):
+            statistics.correlation(reading, mathematics, method='bad_method')
+
 class TestLinearRegression(unittest.TestCase):
 
     def test_constant_input_error(self):
diff --git a/Misc/NEWS.d/next/Library/2022-08-10-17-34-07.gh-issue-95861.qv-T5s.rst b/Misc/NEWS.d/next/Library/2022-08-10-17-34-07.gh-issue-95861.qv-T5s.rst
new file mode 100644
index 00000000000..aae76c74e2f
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-08-10-17-34-07.gh-issue-95861.qv-T5s.rst
@@ -0,0 +1,2 @@
+Add support for computing Spearman's correlation coefficient to the existing
+statistics.correlation() function.