[Scipy-svn] r5160 - in trunk/scipy/stats: . tests
scipy-svn at scipy.org
scipy-svn at scipy.org
Sat Nov 22 00:26:54 EST 2008
Author: josef
Date: 2008-11-21 23:26:45 -0600 (Fri, 21 Nov 2008)
New Revision: 5160
Added:
trunk/scipy/stats/mstats_basic.py
trunk/scipy/stats/mstats_extras.py
trunk/scipy/stats/tests/test_mstats_basic.py
trunk/scipy/stats/tests/test_mstats_extras.py
Removed:
trunk/scipy/stats/mmorestats.py
trunk/scipy/stats/mstats.py
trunk/scipy/stats/tests/test_mmorestats.py
trunk/scipy/stats/tests/test_mstats.py
Log:
rename masked array stats files
Deleted: trunk/scipy/stats/mmorestats.py
===================================================================
--- trunk/scipy/stats/mmorestats.py 2008-11-21 21:09:29 UTC (rev 5159)
+++ trunk/scipy/stats/mmorestats.py 2008-11-22 05:26:45 UTC (rev 5160)
@@ -1,385 +0,0 @@
-"""
-Additional statistics functions, with support to MA.
-
-:author: Pierre GF Gerard-Marchant
-:contact: pierregm_at_uga_edu
-:date: $Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $
-:version: $Id: morestats.py 3473 2007-10-29 15:18:13Z jarrod.millman $
-"""
-__author__ = "Pierre GF Gerard-Marchant"
-__docformat__ = "restructuredtext en"
-
-
-__all__ = ['compare_median_ms',
- 'hdquantiles', 'hdmedian', 'hdquantiles_sd',
- 'idealfourths',
- 'median_cihs','mjci','mquantiles_cimj',
- 'rsh',
- 'trimmed_mean_ci',]
-
-import numpy as np
-from numpy import float_, int_, ndarray
-
-import numpy.ma as ma
-from numpy.ma import MaskedArray
-
-import scipy.stats.mstats as mstats
-
-from scipy.stats.distributions import norm, beta, t, binom
-
-
-#####--------------------------------------------------------------------------
-#---- --- Quantiles ---
-#####--------------------------------------------------------------------------
-def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
- """Computes quantile estimates with the Harrell-Davis method, where the estimates
-are calculated as a weighted linear combination of order statistics.
-
-Parameters
-----------
- data: ndarray
- Data array.
- prob: sequence
- Sequence of quantiles to compute.
- axis : int
- Axis along which to compute the quantiles. If None, use a flattened array.
- var : boolean
- Whether to return the variance of the estimate.
-
-Returns
--------
- A (p,) array of quantiles (if ``var`` is False), or a (2,p) array of quantiles
- and variances (if ``var`` is True), where ``p`` is the number of quantiles.
-
-Notes
------
- The function is restricted to 2D arrays.
-
- """
- def _hd_1D(data,prob,var):
- "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
- xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
- # Don't use length here, in case we have a numpy scalar
- n = xsorted.size
- #.........
- hd = np.empty((2,len(prob)), float_)
- if n < 2:
- hd.flat = np.nan
- if var:
- return hd
- return hd[0]
- #.........
- v = np.arange(n+1) / float(n)
- betacdf = beta.cdf
- for (i,p) in enumerate(prob):
- _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
- w = _w[1:] - _w[:-1]
- hd_mean = np.dot(w, xsorted)
- hd[0,i] = hd_mean
- #
- hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
- #
- hd[0, prob == 0] = xsorted[0]
- hd[0, prob == 1] = xsorted[-1]
- if var:
- hd[1, prob == 0] = hd[1, prob == 1] = np.nan
- return hd
- return hd[0]
- # Initialization & checks ---------
- data = ma.array(data, copy=False, dtype=float_)
- p = np.array(prob, copy=False, ndmin=1)
- # Computes quantiles along axis (or globally)
- if (axis is None) or (data.ndim == 1):
- result = _hd_1D(data, p, var)
- else:
- assert data.ndim <= 2, "Array should be 2D at most !"
- result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
- #
- return ma.fix_invalid(result, copy=False)
-
-#..............................................................................
-def hdmedian(data, axis=-1, var=False):
- """Returns the Harrell-Davis estimate of the median along the given axis.
-
-Parameters
-----------
- data: ndarray
- Data array.
- axis : int
- Axis along which to compute the quantiles. If None, use a flattened array.
- var : boolean
- Whether to return the variance of the estimate.
-
- """
- result = hdquantiles(data,[0.5], axis=axis, var=var)
- return result.squeeze()
-
-
-#..............................................................................
-def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
- """Computes the standard error of the Harrell-Davis quantile estimates by jackknife.
-
-
-Parameters
-----------
- data: ndarray
- Data array.
- prob: sequence
- Sequence of quantiles to compute.
- axis : int
- Axis along which to compute the quantiles. If None, use a flattened array.
-
-Notes
------
- The function is restricted to 2D arrays.
-
- """
- def _hdsd_1D(data,prob):
- "Computes the std error for 1D arrays."
- xsorted = np.sort(data.compressed())
- n = len(xsorted)
- #.........
- hdsd = np.empty(len(prob), float_)
- if n < 2:
- hdsd.flat = np.nan
- #.........
- vv = np.arange(n) / float(n-1)
- betacdf = beta.cdf
- #
- for (i,p) in enumerate(prob):
- _w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
- w = _w[1:] - _w[:-1]
- mx_ = np.fromiter([np.dot(w,xsorted[np.r_[range(0,k),
- range(k+1,n)].astype(int_)])
- for k in range(n)], dtype=float_)
- mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1)
- hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n))
- return hdsd
- # Initialization & checks ---------
- data = ma.array(data, copy=False, dtype=float_)
- p = np.array(prob, copy=False, ndmin=1)
- # Computes quantiles along axis (or globally)
- if (axis is None):
- result = _hdsd_1D(data, p)
- else:
- assert data.ndim <= 2, "Array should be 2D at most !"
- result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
- #
- return ma.fix_invalid(result, copy=False).ravel()
-
-
-#####--------------------------------------------------------------------------
-#---- --- Confidence intervals ---
-#####--------------------------------------------------------------------------
-
-def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
- alpha=0.05, axis=None):
- """Returns the selected confidence interval of the trimmed mean along the
-given axis.
-
-Parameters
-----------
- data : sequence
- Input data. The data is transformed to a masked array
- proportiontocut : float
- Proportion of the data to cut from each side of the data .
- As a result, (2*proportiontocut*n) values are actually trimmed.
- alpha : float
- Confidence level of the intervals.
- inclusive : tuple of boolean
- If relative==False, tuple indicating whether values exactly equal to the
- absolute limits are allowed.
- If relative==True, tuple indicating whether the number of data being masked
- on each side should be rounded (True) or truncated (False).
- axis : int
- Axis along which to cut. If None, uses a flattened version of the input.
-
- """
- data = ma.array(data, copy=False)
- trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
- tmean = trimmed.mean(axis)
- tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
- df = trimmed.count(axis) - 1
- tppf = t.ppf(1-alpha/2.,df)
- return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
-
-#..............................................................................
-def mjci(data, prob=[0.25,0.5,0.75], axis=None):
- """Returns the Maritz-Jarrett estimators of the standard error of selected
-experimental quantiles of the data.
-
-Parameters
------------
- data: ndarray
- Data array.
- prob: sequence
- Sequence of quantiles to compute.
- axis : int
- Axis along which to compute the quantiles. If None, use a flattened array.
-
- """
- def _mjci_1D(data, p):
- data = np.sort(data.compressed())
- n = data.size
- prob = (np.array(p) * n + 0.5).astype(int_)
- betacdf = beta.cdf
- #
- mj = np.empty(len(prob), float_)
- x = np.arange(1,n+1, dtype=float_) / n
- y = x - 1./n
- for (i,m) in enumerate(prob):
- (m1,m2) = (m-1, n-m)
- W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
- C1 = np.dot(W,data)
- C2 = np.dot(W,data**2)
- mj[i] = np.sqrt(C2 - C1**2)
- return mj
- #
- data = ma.array(data, copy=False)
- assert data.ndim <= 2, "Array should be 2D at most !"
- p = np.array(prob, copy=False, ndmin=1)
- # Computes quantiles along axis (or globally)
- if (axis is None):
- return _mjci_1D(data, p)
- else:
- return ma.apply_along_axis(_mjci_1D, axis, data, p)
-
-#..............................................................................
-def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
- """Computes the alpha confidence interval for the selected quantiles of the
-data, with Maritz-Jarrett estimators.
-
-Parameters
-----------
- data: ndarray
- Data array.
- prob: sequence
- Sequence of quantiles to compute.
- alpha : float
- Confidence level of the intervals.
- axis : integer
- Axis along which to compute the quantiles. If None, use a flattened array.
- """
- alpha = min(alpha, 1-alpha)
- z = norm.ppf(1-alpha/2.)
- xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
- smj = mjci(data, prob, axis=axis)
- return (xq - z * smj, xq + z * smj)
-
-
-#.............................................................................
-def median_cihs(data, alpha=0.05, axis=None):
- """Computes the alpha-level confidence interval for the median of the data,
-following the Hettmasperger-Sheather method.
-
-Parameters
-----------
- data : sequence
- Input data. Masked values are discarded. The input should be 1D only, or
- axis should be set to None.
- alpha : float
- Confidence level of the intervals.
- axis : integer
- Axis along which to compute the quantiles. If None, use a flattened array.
- """
- def _cihs_1D(data, alpha):
- data = np.sort(data.compressed())
- n = len(data)
- alpha = min(alpha, 1-alpha)
- k = int(binom._ppf(alpha/2., n, 0.5))
- gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
- if gk < 1-alpha:
- k -= 1
- gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
- gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
- I = (gk - 1 + alpha)/(gk - gkk)
- lambd = (n-k) * I / float(k + (n-2*k)*I)
- lims = (lambd*data[k] + (1-lambd)*data[k-1],
- lambd*data[n-k-1] + (1-lambd)*data[n-k])
- return lims
- data = ma.rray(data, copy=False)
- # Computes quantiles along axis (or globally)
- if (axis is None):
- result = _cihs_1D(data.compressed(), alpha)
- else:
- assert data.ndim <= 2, "Array should be 2D at most !"
- result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
- #
- return result
-
-#..............................................................................
-def compare_medians_ms(group_1, group_2, axis=None):
- """Compares the medians from two independent groups along the given axis.
-
-The comparison is performed using the McKean-Schrader estimate of the standard
-error of the medians.
-
-Parameters
-----------
- group_1 : {sequence}
- First dataset.
- group_2 : {sequence}
- Second dataset.
- axis : {integer}
- Axis along which the medians are estimated. If None, the arrays are flattened.
-
-Returns
--------
- A (p,) array of comparison values.
-
- """
- (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
- (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
- mstats.stde_median(group_2, axis=axis))
- W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
- return 1 - norm.cdf(W)
-
-
-def idealfourths(data, axis=None):
- """Returns an estimate of the lower and upper quartiles of the data along
- the given axis, as computed with the ideal fourths.
- """
- def _idf(data):
- x = data.compressed()
- n = len(x)
- if n < 3:
- return [np.nan,np.nan]
- (j,h) = divmod(n/4. + 5/12.,1)
- qlo = (1-h)*x[j-1] + h*x[j]
- k = n - j
- qup = (1-h)*x[k] + h*x[k-1]
- return [qlo, qup]
- data = ma.sort(data, axis=axis).view(MaskedArray)
- if (axis is None):
- return _idf(data)
- else:
- return ma.apply_along_axis(_idf, axis, data)
-
-
-def rsh(data, points=None):
- """Evaluates Rosenblatt's shifted histogram estimators for each point
-on the dataset 'data'.
-
-Parameters
- data : sequence
- Input data. Masked values are ignored.
- points : sequence
- Sequence of points where to evaluate Rosenblatt shifted histogram.
- If None, use the data.
- """
- data = ma.array(data, copy=False)
- if points is None:
- points = data
- else:
- points = np.array(points, copy=False, ndmin=1)
- if data.ndim != 1:
- raise AttributeError("The input array should be 1D only !")
- n = data.count()
- r = idealfourths(data, axis=None)
- h = 1.2 * (r[-1]-r[0]) / n**(1./5)
- nhi = (data[:,None] <= points[None,:] + h).sum(0)
- nlo = (data[:,None] < points[None,:] - h).sum(0)
- return (nhi-nlo) / (2.*n*h)
-
-
-###############################################################################
Deleted: trunk/scipy/stats/mstats.py
===================================================================
--- trunk/scipy/stats/mstats.py 2008-11-21 21:09:29 UTC (rev 5159)
+++ trunk/scipy/stats/mstats.py 2008-11-22 05:26:45 UTC (rev 5160)
@@ -1,1914 +0,0 @@
-"""
-An extension of scipy.stats.stats to support masked arrays
-
-:author: Pierre GF Gerard-Marchant
-:contact: pierregm_at_uga_edu
-"""
-#TODO : f_value_wilks_lambda looks botched... what are dfnum & dfden for ?
-#TODO : ttest_reel looks botched: what are x1,x2,v1,v2 for ?
-#TODO : reimplement ksonesamp
-
-__author__ = "Pierre GF Gerard-Marchant"
-__docformat__ = "restructuredtext en"
-
-__all__ = ['argstoarray',
- 'betai',
- 'chisquare','count_tied_groups',
- 'describe',
- 'f_oneway','f_value_wilks_lambda','find_repeats','friedmanchisquare',
- 'gmean',
- 'hmean',
- 'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis',
- 'ks_twosamp','ks_2samp','kurtosis','kurtosistest',
- 'linregress',
- 'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign',
- 'normaltest',
- 'obrientransform',
- 'pearsonr','plotting_positions','pointbiserialr',
- 'rankdata',
- 'samplestd','samplevar','scoreatpercentile','sem','std',
- 'sen_seasonal_slopes','signaltonoise','skew','skewtest','spearmanr',
- 'stderr',
- 'theilslopes','threshold','tmax','tmean','tmin','trim','trimboth',
- 'trimtail','trima','trimr','trimmed_mean','trimmed_std',
- 'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
- 'ttest_ind','ttest_rel','tvar',
- 'var','variation',
- 'winsorize',
- 'z','zmap','zs'
- ]
-
-import numpy as np
-from numpy import ndarray
-import numpy.ma as ma
-from numpy.ma import MaskedArray, masked, nomask
-
-import itertools
-import warnings
-
-
-import scipy.stats as stats
-import scipy.special as special
-import scipy.misc as misc
-import scipy.stats.futil as futil
-
-
-genmissingvaldoc = """
-Notes
------
- Missing values are considered pair-wise: if a value is missing in x,
- the corresponding value in y is masked.
-"""
-#------------------------------------------------------------------------------
-def _chk_asarray(a, axis):
- if axis is None:
- a = ma.ravel(a)
- outaxis = 0
- else:
- a = ma.asanyarray(a)
- outaxis = axis
- return a, outaxis
-
-def _chk2_asarray(a, b, axis):
- if axis is None:
- a = ma.ravel(a)
- b = ma.ravel(b)
- outaxis = 0
- else:
- a = ma.asanyarray(a)
- b = ma.asanyarray(b)
- outaxis = axis
- return a, b, outaxis
-
-def _chk_size(a,b):
- a = ma.asanyarray(a)
- b = ma.asanyarray(b)
- (na, nb) = (a.size, b.size)
- if na != nb:
- raise ValueError("The size of the input array should match!"\
- " (%s <> %s)" % (na,nb))
- return (a,b,na)
-
-def argstoarray(*args):
- """Constructs a 2D array from a sequence of sequences. Sequences are filled
- with missing values to match the length of the longest sequence.
-
- Returns
- -------
- output : MaskedArray
- a (mxn) masked array, where m is the number of arguments and n the
- length of the longest argument.
- """
- if len(args) == 1 and not isinstance(args[0], ndarray):
- output = ma.asarray(args[0])
- assert(output.ndim == 2, "The input should be 2D!")
- else:
- n = len(args)
- m = max([len(k) for k in args])
- output = ma.array(np.empty((n,m), dtype=float), mask=True)
- for (k,v) in enumerate(args):
- output[k,:len(v)] = v
- output[np.logical_not(np.isfinite(output._data))] = masked
- return output
-
-
-
-#####--------------------------------------------------------------------------
-#---- --- Ranking ---
-#####--------------------------------------------------------------------------
-
-def find_repeats(arr):
- """Find repeats in arr and return a tuple (repeats, repeat_count).
- Masked values are discarded.
-
-Parameters
-----------
- arr : sequence
- Input array. The array is flattened if it is not 1D.
-
-Returns
--------
- repeats : ndarray
- Array of repeated values.
- counts : ndarray
- Array of counts.
-
- """
- marr = ma.compressed(arr)
- if not marr.size:
- return (np.array(0), np.array(0))
- (v1, v2, n) = futil.dfreps(ma.array(ma.compressed(arr), copy=True))
- return (v1[:n], v2[:n])
-
-
-def count_tied_groups(x, use_missing=False):
- """Counts the number of tied values in x, and returns a dictionary
- (nb of ties: nb of groups).
-
-Parameters
-----------
- x : sequence
- Sequence of data on which to counts the ties
- use_missing : boolean
- Whether to consider missing values as tied.
-
-Example
--------
- >>>z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6]
- >>>count_tied_groups(z)
- >>>{2:1, 3:2}
- >>># The ties were 0 (3x), 2 (3x) and 3 (2x)
- >>>z = ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6])
- >>>count_tied_groups(z)
- >>>{2:2, 3:1}
- >>># The ties were 0 (2x), 2 (3x) and 3 (2x)
- >>>z[[1,-1]] = masked
- >>>count_tied_groups(z)
- >>>{2:2, 3:1}
- >>># The ties were 2 (3x), 3 (2x) and masked (2x)
- """
- nmasked = ma.getmask(x).sum()
- # We need the copy as find_repeats will overwrite the initial data
- data = ma.compressed(x).copy()
- (ties, counts) = find_repeats(data)
- nties = {}
- if len(ties):
- nties = dict(zip(np.unique(counts), itertools.repeat(1)))
- nties.update(dict(zip(*find_repeats(counts))))
- if nmasked and use_missing:
- try:
- nties[nmasked] += 1
- except KeyError:
- nties[nmasked] = 1
- return nties
-
-
-def rankdata(data, axis=None, use_missing=False):
- """Returns the rank (also known as order statistics) of each data point
- along the given axis.
-
- If some values are tied, their rank is averaged.
- If some values are masked, their rank is set to 0 if use_missing is False,
- or set to the average rank of the unmasked values if use_missing is True.
-
- Parameters
- ----------
- data : sequence
- Input data. The data is transformed to a masked array
- axis : {None,int} optional
- Axis along which to perform the ranking.
- If None, the array is first flattened. An exception is raised if
- the axis is specified for arrays with a dimension larger than 2
- use_missing : {boolean} optional
- Whether the masked values have a rank of 0 (False) or equal to the
- average rank of the unmasked values (True).
- """
- #
- def _rank1d(data, use_missing=False):
- n = data.count()
- rk = np.empty(data.size, dtype=float)
- idx = data.argsort()
- rk[idx[:n]] = np.arange(1,n+1)
- #
- if use_missing:
- rk[idx[n:]] = (n+1)/2.
- else:
- rk[idx[n:]] = 0
- #
- repeats = find_repeats(data.copy())
- for r in repeats[0]:
- condition = (data==r).filled(False)
- rk[condition] = rk[condition].mean()
- return rk
- #
- data = ma.array(data, copy=False)
- if axis is None:
- if data.ndim > 1:
- return _rank1d(data.ravel(), use_missing).reshape(data.shape)
- else:
- return _rank1d(data, use_missing)
- else:
- return ma.apply_along_axis(_rank1d,axis,data,use_missing).view(ndarray)
-
-
-#####--------------------------------------------------------------------------
-#---- --- Central tendency ---
-#####--------------------------------------------------------------------------
-
-def gmean(a, axis=0):
- a, axis = _chk_asarray(a, axis)
- log_a = ma.log(a)
- return ma.exp(log_a.mean(axis=axis))
-gmean.__doc__ = stats.gmean.__doc__
-
-
-def hmean(a, axis=0):
- a, axis = _chk_asarray(a, axis)
- if isinstance(a, MaskedArray):
- size = a.count(axis)
- else:
- size = a.shape[axis]
- return size / (1.0/a).sum(axis)
-hmean.__doc__ = stats.hmean.__doc__
-
-
-def mode(a, axis=0):
- def _mode1D(a):
- (rep,cnt) = find_repeats(a)
- if not cnt.ndim:
- return (0, 0)
- elif cnt.size:
- return (rep[cnt.argmax()], cnt.max())
- return (a[0], 1)
- #
- if axis is None:
- output = _mode1D(ma.ravel(a))
- output = (ma.array(output[0]), ma.array(output[1]))
- else:
- output = ma.apply_along_axis(_mode1D, axis, a)
- newshape = list(a.shape)
- newshape[axis] = 1
- slices = [slice(None)] * output.ndim
- slices[axis] = 0
- modes = output[tuple(slices)].reshape(newshape)
- slices[axis] = 1
- counts = output[tuple(slices)].reshape(newshape)
- output = (modes, counts)
- return output
-mode.__doc__ = stats.mode.__doc__
-
-
-#####--------------------------------------------------------------------------
-#---- --- Probabilities ---
-#####--------------------------------------------------------------------------
-
-def betai(a, b, x):
- x = np.asanyarray(x)
- x = ma.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0
- return special.betainc(a, b, x)
-betai.__doc__ = stats.betai.__doc__
-
-
-#####--------------------------------------------------------------------------
-#---- --- Correlation ---
-#####--------------------------------------------------------------------------
-
-def msign(x):
- """Returns the sign of x, or 0 if x is masked."""
- return ma.filled(np.sign(x), 0)
-
-cov = ma.cov
-
-corrcoef = ma.corrcoef
-
-
-def pearsonr(x,y):
- """Calculates a Pearson correlation coefficient and the p-value for testing
- non-correlation.
-
- The Pearson correlation coefficient measures the linear relationship
- between two datasets. Strictly speaking, Pearson's correlation requires
- that each dataset be normally distributed. Like other correlation
- coefficients, this one varies between -1 and +1 with 0 implying no
- correlation. Correlations of -1 or +1 imply an exact linear
- relationship. Positive correlations imply that as x increases, so does
- y. Negative correlations imply that as x increases, y decreases.
-
- The p-value roughly indicates the probability of an uncorrelated system
- producing datasets that have a Pearson correlation at least as extreme
- as the one computed from these datasets. The p-values are not entirely
- reliable but are probably reasonable for datasets larger than 500 or so.
-
- Parameters
- ----------
- x : 1D array
- y : 1D array the same length as x
-
- Returns
- -------
- (Pearson's correlation coefficient,
- 2-tailed p-value)
-
- References
- ----------
- http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
- """
- (x, y, n) = _chk_size(x, y)
- (x, y) = (x.ravel(), y.ravel())
- # Get the common mask and the total nb of unmasked elements
- m = ma.mask_or(ma.getmask(x), ma.getmask(y))
- n -= m.sum()
- df = n-2
- if df < 0:
- return (masked, masked)
- #
- (mx, my) = (x.mean(), y.mean())
- (xm, ym) = (x-mx, y-my)
- #
- r_num = n*(ma.add.reduce(xm*ym))
- r_den = n*ma.sqrt(ma.dot(xm,xm)*ma.dot(ym,ym))
- r = (r_num / r_den)
- # Presumably, if r > 1, then it is only some small artifact of floating
- # point arithmetic.
- r = min(r, 1.0)
- r = max(r, -1.0)
- df = n-2
- #
- t = ma.sqrt(df/((1.0-r)*(1.0+r))) * r
- if t is masked:
- prob = 0.
- else:
- prob = betai(0.5*df,0.5,df/(df+t*t))
- return (r,prob)
-
-
-def spearmanr(x, y, use_ties=True):
- """Calculates a Spearman rank-order correlation coefficient and the p-value
- to test for non-correlation.
-
- The Spearman correlation is a nonparametric measure of the linear
- relationship between two datasets. Unlike the Pearson correlation, the
- Spearman correlation does not assume that both datasets are normally
- distributed. Like other correlation coefficients, this one varies
- between -1 and +1 with 0 implying no correlation. Correlations of -1 or
- +1 imply an exact linear relationship. Positive correlations imply that
- as x increases, so does y. Negative correlations imply that as x
- increases, y decreases.
-
- Missing values are discarded pair-wise: if a value is missing in x, the
- corresponding value in y is masked.
-
- The p-value roughly indicates the probability of an uncorrelated system
- producing datasets that have a Spearman correlation at least as extreme
- as the one computed from these datasets. The p-values are not entirely
- reliable but are probably reasonable for datasets larger than 500 or so.
-
-Parameters
-----------
- x : 1D array
- y : 1D array the same length as x
- The lengths of both arrays must be > 2.
- use_ties : {True, False} optional
- Whether the correction for ties should be computed.
-
-Returns
--------
- (Spearman correlation coefficient,
- 2-tailed p-value)
-
- References
- ----------
- [CRCProbStat2000] section 14.7
- """
- (x, y, n) = _chk_size(x, y)
- (x, y) = (x.ravel(), y.ravel())
- #
- m = ma.mask_or(ma.getmask(x), ma.getmask(y))
- n -= m.sum()
- if m is not nomask:
- x = ma.array(x, mask=m, copy=True)
- y = ma.array(y, mask=m, copy=True)
- df = n-2
- if df < 0:
- raise ValueError("The input must have at least 3 entries!")
- # Gets the ranks and rank differences
- rankx = rankdata(x)
- ranky = rankdata(y)
- dsq = np.add.reduce((rankx-ranky)**2)
- # Tie correction
- if use_ties:
- xties = count_tied_groups(x)
- yties = count_tied_groups(y)
- corr_x = np.sum(v*k*(k**2-1) for (k,v) in xties.iteritems())/12.
- corr_y = np.sum(v*k*(k**2-1) for (k,v) in yties.iteritems())/12.
- else:
- corr_x = corr_y = 0
- denom = n*(n**2 - 1)/6.
- if corr_x != 0 or corr_y != 0:
- rho = denom - dsq - corr_x - corr_y
- rho /= ma.sqrt((denom-2*corr_x)*(denom-2*corr_y))
- else:
- rho = 1. - dsq/denom
- #
- t = ma.sqrt(ma.divide(df,(rho+1.0)*(1.0-rho))) * rho
- if t is masked:
- prob = 0.
- else:
- prob = betai(0.5*df,0.5,df/(df+t*t))
- return rho, prob
-
-
-def kendalltau(x, y, use_ties=True, use_missing=False):
- """Computes Kendall's rank correlation tau on two variables *x* and *y*.
-
- Parameters
- ----------
- xdata: sequence
- First data list (for example, time).
- ydata: sequence
- Second data list.
- use_ties: {True, False} optional
- Whether ties correction should be performed.
- use_missing: {False, True} optional
- Whether missing data should be allocated a rank of 0 (False) or the
- average rank (True)
-
- Returns
- -------
- tau : float
- Kendall tau
- prob : float
- Approximate 2-side p-value.
- """
- (x, y, n) = _chk_size(x, y)
- (x, y) = (x.flatten(), y.flatten())
- m = ma.mask_or(ma.getmask(x), ma.getmask(y))
- if m is not nomask:
- x = ma.array(x, mask=m, copy=True)
- y = ma.array(y, mask=m, copy=True)
- n -= m.sum()
- #
- if n < 2:
- return (np.nan, np.nan)
- #
- rx = ma.masked_equal(rankdata(x, use_missing=use_missing),0)
- ry = ma.masked_equal(rankdata(y, use_missing=use_missing),0)
- idx = rx.argsort()
- (rx, ry) = (rx[idx], ry[idx])
- C = np.sum((((ry[i+1:]>ry[i])*(rx[i+1:]>rx[i])).filled(0).sum()
- for i in range(len(ry)-1)))
- D = np.sum((((ry[i+1:]<ry[i])*(rx[i+1:]>rx[i])).filled(0).sum()
- for i in range(len(ry)-1)))
- if use_ties:
- xties = count_tied_groups(x)
- yties = count_tied_groups(y)
- corr_x = np.sum(v*k*(k-1) for (k,v) in xties.iteritems())
- corr_y = np.sum(v*k*(k-1) for (k,v) in yties.iteritems())
- denom = ma.sqrt((n*(n-1)-corr_x) * (n*(n-1)-corr_y)) / 2.
- else:
- denom = n*(n-1)/2.
- tau = (C-D) / denom
- #
- var_s = n*(n-1)*(2*n+5)
- if use_ties:
- var_s -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in xties.iteritems())
- var_s -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in yties.iteritems())
- v1 = np.sum(v*k*(k-1) for (k,v) in xties.iteritems()) * \
- np.sum(v*k*(k-1) for (k,v) in yties.iteritems())
- v1 /= 2.*n*(n-1)
- if n > 2:
- v2 = np.sum(v*k*(k-1)*(k-2) for (k,v) in xties.iteritems()) * \
- np.sum(v*k*(k-1)*(k-2) for (k,v) in yties.iteritems())
- v2 /= 9.*n*(n-1)*(n-2)
- else:
- v2 = 0
- else:
- v1 = v2 = 0
- var_s /= 18.
- var_s += (v1 + v2)
- z = (C-D)/np.sqrt(var_s)
- prob = special.erfc(abs(z)/np.sqrt(2))
- return (tau,prob)
-
-
-def kendalltau_seasonal(x):
- """Computes a multivariate extension Kendall's rank correlation tau, designed
- for seasonal data.
-
-Parameters
-----------
- x: 2D array
- Array of seasonal data, with seasons in columns.
- """
- x = ma.array(x, subok=True, copy=False, ndmin=2)
- (n,m) = x.shape
- n_p = x.count(0)
- #
- S_szn = np.sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
- S_tot = S_szn.sum()
- #
- n_tot = x.count()
- ties = count_tied_groups(x.compressed())
- corr_ties = np.sum(v*k*(k-1) for (k,v) in ties.iteritems())
- denom_tot = ma.sqrt(n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.
- #
- R = rankdata(x, axis=0, use_missing=True)
- K = ma.empty((m,m), dtype=int)
- covmat = ma.empty((m,m), dtype=float)
-# cov_jj = ma.empty(m, dtype=float)
- denom_szn = ma.empty(m, dtype=float)
- for j in range(m):
- ties_j = count_tied_groups(x[:,j].compressed())
- corr_j = np.sum(v*k*(k-1) for (k,v) in ties_j.iteritems())
- cmb = n_p[j]*(n_p[j]-1)
- for k in range(j,m,1):
- K[j,k] = np.sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum()
- for i in range(n))
- covmat[j,k] = (K[j,k] +4*(R[:,j]*R[:,k]).sum() - \
- n*(n_p[j]+1)*(n_p[k]+1))/3.
- K[k,j] = K[j,k]
- covmat[k,j] = covmat[j,k]
-# cov_jj[j] = (nn_p*(2*n_p[j]+5))
-# cov_jj[j] -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in ties_j.iteritems())
-# cov_jj[j] /= 18.
- denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.
- var_szn = covmat.diagonal()
- #
- z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
- z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
- z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())
- #
- prob_szn = special.erfc(abs(z_szn)/np.sqrt(2))
- prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
- prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))
- #
- chi2_tot = (z_szn*z_szn).sum()
- chi2_trd = m * z_szn.mean()**2
- output = {'seasonal tau': S_szn/denom_szn,
- 'global tau': S_tot/denom_tot,
- 'global tau (alt)': S_tot/denom_szn.sum(),
- 'seasonal p-value': prob_szn,
- 'global p-value (indep)': prob_tot_ind,
- 'global p-value (dep)': prob_tot_dep,
- 'chi2 total': chi2_tot,
- 'chi2 trend': chi2_trd,
- }
- return output
-
-
-def pointbiserialr(x, y):
- x = ma.fix_invalid(x, copy=True).astype(bool)
- y = ma.fix_invalid(y, copy=True).astype(float)
- # Get rid of the missing data ..........
- m = ma.mask_or(ma.getmask(x), ma.getmask(y))
- if m is not nomask:
- unmask = np.logical_not(m)
- x = x[unmask]
- y = y[unmask]
- #
- n = len(x)
- # phat is the fraction of x values that are True
- phat = x.sum() / float(n)
- y0 = y[~x] # y-values where x is False
- y1 = y[x] # y-values where x is True
- y0m = y0.mean()
- y1m = y1.mean()
- #
- rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std()
- #
- df = n-2
- t = rpb*ma.sqrt(df/(1.0-rpb**2))
- prob = betai(0.5*df, 0.5, df/(df+t*t))
- return rpb, prob
-
-if stats.pointbiserialr.__doc__:
- pointbiserialr.__doc__ = stats.pointbiserialr.__doc__ + genmissingvaldoc
-
-
-def linregress(*args):
- if len(args) == 1: # more than 1D array?
- args = ma.array(args[0], copy=True)
- if len(args) == 2:
- x = args[0]
- y = args[1]
- else:
- x = args[:,0]
- y = args[:,1]
- else:
- x = ma.array(args[0]).flatten()
- y = ma.array(args[1]).flatten()
- m = ma.mask_or(ma.getmask(x), ma.getmask(y))
- if m is not nomask:
- x = ma.array(x,mask=m)
- y = ma.array(y,mask=m)
- n = len(x)
- (xmean, ymean) = (x.mean(), y.mean())
- (xm, ym) = (x-xmean, y-ymean)
- (Sxx, Syy) = (ma.add.reduce(xm*xm), ma.add.reduce(ym*ym))
- Sxy = ma.add.reduce(xm*ym)
- r_den = ma.sqrt(Sxx*Syy)
- if r_den == 0.0:
- r = 0.0
- else:
- r = Sxy / r_den
- if (r > 1.0):
- r = 1.0 # from numerical error
- #z = 0.5*log((1.0+r+TINY)/(1.0-r+TINY))
- df = n-2
- t = r * ma.sqrt(df/(1.0-r*r))
- prob = betai(0.5*df,0.5,df/(df+t*t))
- slope = Sxy / Sxx
- intercept = ymean - slope*xmean
- sterrest = ma.sqrt(1.-r*r) * y.std()
- return slope, intercept, r, prob, sterrest, Syy/Sxx
-
-if stats.linregress.__doc__:
- linregress.__doc__ = stats.linregress.__doc__ + genmissingvaldoc
-
-
-def theilslopes(y, x=None, alpha=0.05):
- """Computes the Theil slope over the dataset (x,y), as the median of all slopes
- between paired values.
-
- Parameters
- ----------
- y : sequence
- Dependent variable.
- x : {None, sequence} optional
- Independent variable. If None, use arange(len(y)) instead.
- alpha : float
- Confidence degree.
-
- Returns
- -------
- medslope : float
- Theil slope
- medintercept : float
- Intercept of the Theil line, as median(y)-medslope*median(x)
- lo_slope : float
- Lower bound of the confidence interval on medslope
- up_slope : float
- Upper bound of the confidence interval on medslope
-
- """
- y = ma.asarray(y).flatten()
- y[-1] = masked
- n = len(y)
- if x is None:
- x = ma.arange(len(y), dtype=float)
- else:
- x = ma.asarray(x).flatten()
- if len(x) != n:
- raise ValueError, "Incompatible lengths ! (%s<>%s)" % (n,len(x))
- m = ma.mask_or(ma.getmask(x), ma.getmask(y))
- y._mask = x._mask = m
- ny = y.count()
- #
- slopes = ma.hstack([(y[i+1:]-y[i])/(x[i+1:]-x[i]) for i in range(n-1)])
- slopes.sort()
- medslope = ma.median(slopes)
- medinter = ma.median(y) - medslope*ma.median(x)
- #
- if alpha > 0.5:
- alpha = 1.-alpha
- z = stats.distributions.norm.ppf(alpha/2.)
- #
- (xties, yties) = (count_tied_groups(x), count_tied_groups(y))
- nt = ny*(ny-1)/2.
- sigsq = (ny*(ny-1)*(2*ny+5)/18.)
- sigsq -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in xties.iteritems())
- sigsq -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in yties.iteritems())
- sigma = np.sqrt(sigsq)
-
- Ru = min(np.round((nt - z*sigma)/2. + 1), len(slopes)-1)
- Rl = max(np.round((nt + z*sigma)/2.), 0)
- delta = slopes[[Rl,Ru]]
- return medslope, medinter, delta[0], delta[1]
-
-
-def sen_seasonal_slopes(x):
- x = ma.array(x, subok=True, copy=False, ndmin=2)
- (n,_) = x.shape
- # Get list of slopes per season
- szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1,n-i)[:,None]
- for i in range(n)])
- szn_medslopes = ma.median(szn_slopes, axis=0)
- medslope = ma.median(szn_slopes, axis=None)
- return szn_medslopes, medslope
-
-
-#####--------------------------------------------------------------------------
-#---- --- Inferential statistics ---
-#####--------------------------------------------------------------------------
-
-def ttest_onesamp(a, popmean):
- a = ma.asarray(a)
- x = a.mean(axis=None)
- v = a.var(axis=None,ddof=1)
- n = a.count(axis=None)
- df = n-1
- svar = ((n-1)*v) / float(df)
- t = (x-popmean)/ma.sqrt(svar*(1.0/n))
- prob = betai(0.5*df,0.5,df/(df+t*t))
- return t,prob
-ttest_onesamp.__doc__ = stats.ttest_1samp.__doc__
-ttest_1samp = ttest_onesamp
-
-
-def ttest_ind(a, b, axis=0):
- a, b, axis = _chk2_asarray(a, b, axis)
- (x1, x2) = (a.mean(axis), b.mean(axis))
- (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
- (n1, n2) = (a.count(axis), b.count(axis))
- df = n1+n2-2
- svar = ((n1-1)*v1+(n2-1)*v2) / float(df)
- svar == 0
- t = (x1-x2)/ma.sqrt(svar*(1.0/n1 + 1.0/n2)) # N-D COMPUTATION HERE!!!!!!
- t = ma.filled(t, 1) # replace NaN t-values with 1.0
- probs = betai(0.5*df,0.5,float(df)/(df+t*t)).reshape(t.shape)
- return t, probs.squeeze()
-ttest_ind.__doc__ = stats.ttest_ind.__doc__
-
-
-def ttest_rel(a,b,axis=None):
- a, b, axis = _chk2_asarray(a, b, axis)
- if len(a)!=len(b):
- raise ValueError, 'unequal length arrays'
- (x1, x2) = (a.mean(axis), b.mean(axis))
- (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
- n = a.count(axis)
- df = (n-1.0)
- d = (a-b).astype('d')
- denom = ma.sqrt((n*ma.add.reduce(d*d,axis) - ma.add.reduce(d,axis)**2) /df)
- #zerodivproblem = denom == 0
- t = ma.add.reduce(d, axis) / denom
- t = ma.filled(t, 1)
- probs = betai(0.5*df,0.5,df/(df+t*t)).reshape(t.shape).squeeze()
- return t, probs
-ttest_rel.__doc__ = stats.ttest_rel.__doc__
-
-
-def chisquare(f_obs, f_exp=None):
- f_obs = ma.asarray(f_obs)
- if f_exp is None:
- f_exp = ma.array([f_obs.mean(axis=0)] * len(f_obs))
- f_exp = f_exp.astype(float)
- chisq = ma.add.reduce((f_obs-f_exp)**2 / f_exp)
- return chisq, stats.chisqprob(chisq, f_obs.count(0)-1)
-chisquare.__doc__ = stats.chisquare.__doc__
-
-
-def mannwhitneyu(x,y, use_continuity=True):
- """Computes the Mann-Whitney on samples x and y.
- Missing values in x and/or y are discarded.
-
- Parameters
- ----------
- x : sequence
- y : sequence
- use_continuity : {True, False} optional
- Whether a continuity correction (1/2.) should be taken into account.
-
- Returns
- -------
- u : float
- The Mann-Whitney statistics
- prob : float
- Approximate p-value assuming a normal distribution.
-
- """
- x = ma.asarray(x).compressed().view(ndarray)
- y = ma.asarray(y).compressed().view(ndarray)
- ranks = rankdata(np.concatenate([x,y]))
- (nx, ny) = (len(x), len(y))
- nt = nx + ny
- U = ranks[:nx].sum() - nx*(nx+1)/2.
- U = max(U, nx*ny - U)
- u = nx*ny - U
- #
- mu = (nx*ny)/2.
- sigsq = (nt**3 - nt)/12.
- ties = count_tied_groups(ranks)
- sigsq -= np.sum(v*(k**3-k) for (k,v) in ties.iteritems())/12.
- sigsq *= nx*ny/float(nt*(nt-1))
- #
- if use_continuity:
- z = (U - 1/2. - mu) / ma.sqrt(sigsq)
- else:
- z = (U - mu) / ma.sqrt(sigsq)
- prob = special.erfc(abs(z)/np.sqrt(2))
- return (u, prob)
-
-
-def kruskalwallis(*args):
- output = argstoarray(*args)
- ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
- sumrk = ranks.sum(-1)
- ngrp = ranks.count(-1)
- ntot = ranks.count()
-# ssbg = (sumrk**2/ranks.count(-1)).sum() - ranks.sum()**2/ntotal
-# H = ssbg / (ntotal*(ntotal+1)/12.)
- H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
- # Tie correction
- ties = count_tied_groups(ranks)
- T = 1. - np.sum(v*(k**3-k) for (k,v) in ties.iteritems())/float(ntot**3-ntot)
- if T == 0:
- raise ValueError, 'All numbers are identical in kruskal'
- H /= T
- #
- df = len(output) - 1
- prob = stats.chisqprob(H,df)
- return (H, prob)
-kruskal = kruskalwallis
-kruskalwallis.__doc__ = stats.kruskal.__doc__
-
-
-_kolmog2 = special.kolmogorov
-def _kolmog1(x,n):
- if x <= 0:
- return 0
- if x >= 1:
- return 1
- j = np.arange(np.floor(n*(1-x))+1)
- return 1 - x * np.sum(np.exp(np.log(misc.comb(n,j))
- + (n-j) * np.log(1-x-j/float(n))
- + (j-1) * np.log(x+j/float(n))))
-
-
-def ks_twosamp(data1, data2, alternative="two_sided"):
- """Computes the Kolmogorov-Smirnov test on two samples.
- Missing values are discarded.
-
- Parameters
- ----------
- data1 : sequence
- First data set
- data2 : sequence
- Second data set
- alternative : {'two_sided', 'less', 'greater'} optional
- Indicates the alternative hypothesis.
-
- Returns
- -------
- d : float
- Value of the Kolmogorov Smirnov test
- p : float
- Corresponding p-value.
-
- """
- (data1, data2) = (ma.asarray(data1), ma.asarray(data2))
- (n1, n2) = (data1.count(), data2.count())
- n = (n1*n2/float(n1+n2))
- mix = ma.concatenate((data1.compressed(), data2.compressed()))
- mixsort = mix.argsort(kind='mergesort')
- csum = np.where(mixsort<n1, 1./n1, -1./n2).cumsum()
- # Check for ties
- if len(np.unique(mix)) < (n1+n2):
- csum = csum[np.r_[np.diff(mix[mixsort]).nonzero()[0],-1]]
- #
- alternative = str(alternative).lower()[0]
- if alternative == 't':
- d = ma.abs(csum).max()
- prob = _kolmog2(np.sqrt(n)*d)
- elif alternative == 'l':
- d = -csum.min()
- prob = np.exp(-2*n*d**2)
- elif alternative == 'g':
- d = csum.max()
- prob = np.exp(-2*n*d**2)
- else:
- raise ValueError("Invalid value for the alternative hypothesis: "\
- "should be in 'two_sided', 'less' or 'greater'")
- return (d, prob)
-ks_2samp = ks_twosamp
-
-
-def ks_twosamp_old(data1, data2):
- """ Computes the Kolmogorov-Smirnov statistic on 2 samples.
- Returns: KS D-value, p-value
- #"""
- (data1, data2) = [ma.asarray(d).compressed() for d in (data1,data2)]
- return stats.ks_2samp(data1,data2)
-
-
-#####--------------------------------------------------------------------------
-#---- --- Trimming ---
-#####--------------------------------------------------------------------------
-
-def threshold(a, threshmin=None, threshmax=None, newval=0):
- """Clip array to a given value.
-
-Similar to numpy.clip(), except that values less than threshmin or
-greater than threshmax are replaced by newval, instead of by
-threshmin and threshmax respectively.
-
-Parameters
-----------
- a : ndarray
- Input data
- threshmin : {None, float} optional
- Lower threshold. If None, set to the minimum value.
- threshmax : {None, float} optional
- Upper threshold. If None, set to the maximum value.
- newval : {0, float} optional
- Value outside the thresholds.
-
-Returns
--------
- a, with values less (greater) than threshmin (threshmax) replaced with newval.
-
-"""
- a = ma.array(a, copy=True)
- mask = np.zeros(a.shape, dtype=bool)
- if threshmin is not None:
- mask |= (a < threshmin).filled(False)
- if threshmax is not None:
- mask |= (a > threshmax).filled(False)
- a[mask] = newval
- return a
-
-
-def trima(a, limits=None, inclusive=(True,True)):
- """Trims an array by masking the data outside some given limits.
- Returns a masked version of the input array.
-
- Parameters
- ----------
- a : sequence
- Input array.
- limits : {None, tuple} optional
- Tuple of (lower limit, upper limit) in absolute values.
- Values of the input array lower (greater) than the lower (upper) limit
- will be masked. A limit is None indicates an open interval.
- inclusive : {(True,True) tuple} optional
- Tuple of (lower flag, upper flag), indicating whether values exactly
- equal to the lower (upper) limit are allowed.
-
- """
- a = ma.asarray(a)
- a.unshare_mask()
- if limits is None:
- return a
- (lower_lim, upper_lim) = limits
- (lower_in, upper_in) = inclusive
- condition = False
- if lower_lim is not None:
- if lower_in:
- condition |= (a < lower_lim)
- else:
- condition |= (a <= lower_lim)
- if upper_lim is not None:
- if upper_in:
- condition |= (a > upper_lim)
- else:
- condition |= (a >= upper_lim)
- a[condition.filled(True)] = masked
- return a
-
-
-def trimr(a, limits=None, inclusive=(True, True), axis=None):
- """Trims an array by masking some proportion of the data on each end.
- Returns a masked version of the input array.
-
- Parameters
- ----------
- a : sequence
- Input array.
- limits : {None, tuple} optional
- Tuple of the percentages to cut on each side of the array, with respect
- to the number of unmasked data, as floats between 0. and 1.
- Noting n the number of unmasked data before trimming, the (n*limits[0])th
- smallest data and the (n*limits[1])th largest data are masked, and the
- total number of unmasked data after trimming is n*(1.-sum(limits))
- The value of one limit can be set to None to indicate an open interval.
- inclusive : {(True,True) tuple} optional
- Tuple of flags indicating whether the number of data being masked on the
- left (right) end should be truncated (True) or rounded (False) to integers.
- axis : {None,int} optional
- Axis along which to trim. If None, the whole array is trimmed, but its
- shape is maintained.
-
- """
- def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
- n = a.count()
- idx = a.argsort()
- if low_limit:
- if low_inclusive:
- lowidx = int(low_limit*n)
- else:
- lowidx = np.round(low_limit*n)
- a[idx[:lowidx]] = masked
- if up_limit is not None:
- if up_inclusive:
- upidx = n - int(n*up_limit)
- else:
- upidx = n- np.round(n*up_limit)
- a[idx[upidx:]] = masked
- return a
- #
- a = ma.asarray(a)
- a.unshare_mask()
- if limits is None:
- return a
- # Check the limits
- (lolim, uplim) = limits
- errmsg = "The proportion to cut from the %s should be between 0. and 1."
- if lolim is not None:
- if lolim > 1. or lolim < 0:
- raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
- if uplim is not None:
- if uplim > 1. or uplim < 0:
- raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
- #
- (loinc, upinc) = inclusive
- #
- if axis is None:
- shp = a.shape
- return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
- else:
- return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc)
-
-trimdoc = """
- Parameters
- ----------
- a : sequence
- Input array
- limits : {None, tuple} optional
- If relative == False, tuple (lower limit, upper limit) in absolute values.
- Values of the input array lower (greater) than the lower (upper) limit are
- masked.
- If relative == True, tuple (lower percentage, upper percentage) to cut
- on each side of the array, with respect to the number of unmasked data.
- Noting n the number of unmasked data before trimming, the (n*limits[0])th
- smallest data and the (n*limits[1])th largest data are masked, and the
- total number of unmasked data after trimming is n*(1.-sum(limits))
- In each case, the value of one limit can be set to None to indicate an
- open interval.
- If limits is None, no trimming is performed
- inclusive : {(True, True) tuple} optional
- If relative==False, tuple indicating whether values exactly equal to the
- absolute limits are allowed.
- If relative==True, tuple indicating whether the number of data being masked
- on each side should be rounded (True) or truncated (False).
- relative : {False, True} optional
- Whether to consider the limits as absolute values (False) or proportions
- to cut (True).
- axis : {None, integer}, optional
- Axis along which to trim.
-"""
-
-
-def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None):
- """Trims an array by masking the data outside some given limits.
- Returns a masked version of the input array.
- %s
-
- Examples
- --------
- >>>z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10]
- >>>trim(z,(3,8))
- [--,--, 3, 4, 5, 6, 7, 8,--,--]
- >>>trim(z,(0.1,0.2),relative=True)
- [--, 2, 3, 4, 5, 6, 7, 8,--,--]
-
-
- """
- if relative:
- return trimr(a, limits=limits, inclusive=inclusive, axis=axis)
- else:
- return trima(a, limits=limits, inclusive=inclusive)
-
-if trim.__doc__:
- trim.__doc__ = trim.__doc__ % trimdoc
-
-
-def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None):
- """Trims the data by masking the int(proportiontocut*n) smallest and
- int(proportiontocut*n) largest values of data along the given axis, where n
- is the number of unmasked values before trimming.
-
-Parameters
-----------
- data : ndarray
- Data to trim.
- proportiontocut : {0.2, float} optional
- Percentage of trimming (as a float between 0 and 1).
- If n is the number of unmasked values before trimming, the number of
- values after trimming is:
- (1-2*proportiontocut)*n.
- inclusive : {(True, True) tuple} optional
- Tuple indicating whether the number of data being masked on each side
- should be rounded (True) or truncated (False).
- axis : {None, integer}, optional
- Axis along which to perform the trimming.
- If None, the input array is first flattened.
-
- """
- return trimr(data, limits=(proportiontocut,proportiontocut),
- inclusive=inclusive, axis=axis)
-
-#..............................................................................
-def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True),
- axis=None):
- """Trims the data by masking int(trim*n) values from ONE tail of the
- data along the given axis, where n is the number of unmasked values.
-
-Parameters
-----------
- data : {ndarray}
- Data to trim.
- proportiontocut : {0.2, float} optional
- Percentage of trimming. If n is the number of unmasked values
- before trimming, the number of values after trimming is
- (1-proportiontocut)*n.
- tail : {'left','right'} optional
- If left (right), the ``proportiontocut`` lowest (greatest) values will
- be masked.
- inclusive : {(True, True) tuple} optional
- Tuple indicating whether the number of data being masked on each side
- should be rounded (True) or truncated (False).
- axis : {None, integer}, optional
- Axis along which to perform the trimming.
- If None, the input array is first flattened.
-
- """
- tail = str(tail).lower()[0]
- if tail == 'l':
- limits = (proportiontocut,None)
- elif tail == 'r':
- limits = (None, proportiontocut)
- else:
- raise TypeError("The tail argument should be in ('left','right')")
- return trimr(data, limits=limits, axis=axis, inclusive=inclusive)
-
-trim1 = trimtail
-
-def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
- axis=None):
- """Returns the trimmed mean of the data along the given axis.
-
- %s
-
- """ % trimdoc
- if (not isinstance(limits,tuple)) and isinstance(limits,float):
- limits = (limits, limits)
- if relative:
- return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis)
- else:
- return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis)
-
-
-def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
- axis=None, ddof=0):
- """Returns the trimmed variance of the data along the given axis.
-
- %s
- ddof : {0,integer}, optional
- Means Delta Degrees of Freedom. The denominator used during computations
- is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
- biased estimate of the variance.
-
- """ % trimdoc
- if (not isinstance(limits,tuple)) and isinstance(limits,float):
- limits = (limits, limits)
- if relative:
- out = trimr(a,limits=limits, inclusive=inclusive,axis=axis)
- else:
- out = trima(a,limits=limits,inclusive=inclusive)
- return out.var(axis=axis, ddof=ddof)
-
-
-def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
- axis=None, ddof=0):
- """Returns the trimmed standard deviation of the data along the given axis.
-
- %s
- ddof : {0,integer}, optional
- Means Delta Degrees of Freedom. The denominator used during computations
- is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
- biased estimate of the variance.
-
- """ % trimdoc
- if (not isinstance(limits,tuple)) and isinstance(limits,float):
- limits = (limits, limits)
- if relative:
- out = trimr(a,limits=limits,inclusive=inclusive,axis=axis)
- else:
- out = trima(a,limits=limits,inclusive=inclusive)
- return out.std(axis=axis,ddof=ddof)
-
-
-def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None):
- """Returns the standard error of the trimmed mean of the data along the given
- axis.
- Parameters
- ----------
- a : sequence
- Input array
- limits : {(0.1,0.1), tuple of float} optional
- tuple (lower percentage, upper percentage) to cut on each side of the
- array, with respect to the number of unmasked data.
- Noting n the number of unmasked data before trimming, the (n*limits[0])th
- smallest data and the (n*limits[1])th largest data are masked, and the
- total number of unmasked data after trimming is n*(1.-sum(limits))
- In each case, the value of one limit can be set to None to indicate an
- open interval.
- If limits is None, no trimming is performed
- inclusive : {(True, True) tuple} optional
- Tuple indicating whether the number of data being masked on each side
- should be rounded (True) or truncated (False).
- axis : {None, integer}, optional
- Axis along which to trim.
- """
- #........................
- def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
- "Returns the standard error of the trimmed mean for a 1D input data."
- n = a.count()
- idx = a.argsort()
- if low_limit:
- if low_inclusive:
- lowidx = int(low_limit*n)
- else:
- lowidx = np.round(low_limit*n)
- a[idx[:lowidx]] = masked
- if up_limit is not None:
- if up_inclusive:
- upidx = n - int(n*up_limit)
- else:
- upidx = n- np.round(n*up_limit)
- a[idx[upidx:]] = masked
- nsize = a.count()
- a[idx[:lowidx]] = a[idx[lowidx]]
- a[idx[upidx:]] = a[idx[upidx-1]]
- winstd = a.std(ddof=1)
- return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a)))
- #........................
- a = ma.array(a, copy=True, subok=True)
- a.unshare_mask()
- if limits is None:
- return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis))
- if (not isinstance(limits,tuple)) and isinstance(limits,float):
- limits = (limits, limits)
- # Check the limits
- (lolim, uplim) = limits
- errmsg = "The proportion to cut from the %s should be between 0. and 1."
- if lolim is not None:
- if lolim > 1. or lolim < 0:
- raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
- if uplim is not None:
- if uplim > 1. or uplim < 0:
- raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
- #
- (loinc, upinc) = inclusive
- if (axis is None):
- shp = a.shape
- return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc)
- else:
- assert a.ndim <= 2, "Array should be 2D at most !"
- return ma.apply_along_axis(_trimmed_stde_1D, axis, a,
- lolim,uplim,loinc,upinc)
-
-
-def tmean(a, limits=None, inclusive=(True,True)):
- return trima(a, limits=limits, inclusive=inclusive).mean()
-tmean.__doc__ = stats.tmean.__doc__
-
-def tvar(a, limits=None, inclusive=(True,True)):
- return trima(a, limits=limits, inclusive=inclusive).var()
-tvar.__doc__ = stats.tvar.__doc__
-
-def tmin(a, lowerlimit=None, axis=0, inclusive=True):
- a, axis = _chk_asarray(a, axis)
- am = trima(a, (lowerlimit, None), (inclusive, False))
- return ma.minimum.reduce(am, axis)
-tmin.__doc__ = stats.tmin.__doc__
-
-def tmax(a, upperlimit, axis=0, inclusive=True):
- a, axis = _chk_asarray(a, axis)
- am = trima(a, (None, upperlimit), (False, inclusive))
- return ma.maximum.reduce(am, axis)
-tmax.__doc__ = stats.tmax.__doc__
-
-def tsem(a, limits=None, inclusive=(True,True)):
- a = ma.asarray(a).ravel()
- if limits is None:
- n = float(a.count())
- return a.std()/ma.sqrt(n)
- am = trima(a.ravel(), limits, inclusive)
- sd = np.sqrt(am.var())
- return sd / am.count()
-tsem.__doc__ = stats.tsem.__doc__
-
-
-def winsorize(a, limits=None, inclusive=(True,True), inplace=False, axis=None):
- """Returns a Winsorized version of the input array.
-
- The (limits[0])th lowest values are set to the (limits[0])th percentile,
- and the (limits[1])th highest values are set to the (limits[1])th
- percentile.
- Masked values are skipped.
-
-
- Parameters
- ----------
- a : sequence
- Input array.
- limits : {None, tuple of float} optional
- Tuple of the percentages to cut on each side of the array, with respect
- to the number of unmasked data, as floats between 0. and 1.
- Noting n the number of unmasked data before trimming, the (n*limits[0])th
- smallest data and the (n*limits[1])th largest data are masked, and the
- total number of unmasked data after trimming is n*(1.-sum(limits))
- The value of one limit can be set to None to indicate an open interval.
- inclusive : {(True, True) tuple} optional
- Tuple indicating whether the number of data being masked on each side
- should be rounded (True) or truncated (False).
- inplace : {False, True} optional
- Whether to winsorize in place (True) or to use a copy (False)
- axis : {None, int} optional
- Axis along which to trim. If None, the whole array is trimmed, but its
- shape is maintained.
-
- """
- def _winsorize1D(a, low_limit, up_limit, low_include, up_include):
- n = a.count()
- idx = a.argsort()
- if low_limit:
- if low_include:
- lowidx = int(low_limit*n)
- else:
- lowidx = np.round(low_limit*n)
- a[idx[:lowidx]] = a[idx[lowidx]]
- if up_limit is not None:
- if up_include:
- upidx = n - int(n*up_limit)
- else:
- upidx = n- np.round(n*up_limit)
- a[idx[upidx:]] = a[idx[upidx-1]]
- return a
- # We gonna modify a: better make a copy
- a = ma.array(a, copy=np.logical_not(inplace))
- #
- if limits is None:
- return a
- if (not isinstance(limits,tuple)) and isinstance(limits,float):
- limits = (limits, limits)
- # Check the limits
- (lolim, uplim) = limits
- errmsg = "The proportion to cut from the %s should be between 0. and 1."
- if lolim is not None:
- if lolim > 1. or lolim < 0:
- raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
- if uplim is not None:
- if uplim > 1. or uplim < 0:
- raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
- #
- (loinc, upinc) = inclusive
- #
- if axis is None:
- shp = a.shape
- return _winsorize1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
- else:
- return ma.apply_along_axis(_winsorize1D, axis,a,lolim,uplim,loinc,upinc)
-
-
-#####--------------------------------------------------------------------------
-#---- --- Moments ---
-#####--------------------------------------------------------------------------
-
-def moment(a, moment=1, axis=0):
- a, axis = _chk_asarray(a, axis)
- if moment == 1:
- # By definition the first moment about the mean is 0.
- shape = list(a.shape)
- del shape[axis]
- if shape:
- # return an actual array of the appropriate shape
- return np.zeros(shape, dtype=float)
- else:
- # the input was 1D, so return a scalar instead of a rank-0 array
- return np.float64(0.0)
- else:
- mn = ma.expand_dims(a.mean(axis=axis), axis)
- s = ma.power((a-mn), moment)
- return s.mean(axis=axis)
-moment.__doc__ = stats.moment.__doc__
-
-
-def variation(a, axis=0):
- a, axis = _chk_asarray(a, axis)
- return a.std(axis)/a.mean(axis)
-variation.__doc__ = stats.variation.__doc__
-
-
-def skew(a, axis=0, bias=True):
- a, axis = _chk_asarray(a,axis)
- n = a.count(axis)
- m2 = moment(a, 2, axis)
- m3 = moment(a, 3, axis)
- vals = ma.where(m2 == 0, 0, m3 / m2**1.5)
- if not bias:
- can_correct = (n > 2) & (m2 > 0)
- if can_correct.any():
- m2 = np.extract(can_correct, m2)
- m3 = np.extract(can_correct, m3)
- nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5
- np.place(vals, can_correct, nval)
- return vals
-skew.__doc__ = stats.skew.__doc__
-
-
-def kurtosis(a, axis=0, fisher=True, bias=True):
- a, axis = _chk_asarray(a, axis)
- n = a.count(axis)
- m2 = moment(a,2,axis)
- m4 = moment(a,4,axis)
- vals = ma.where(m2 == 0, 0, m4/ m2**2.0)
- if not bias:
- can_correct = (n > 3) & (m2 > 0)
- if can_correct.any():
- m2 = np.extract(can_correct, m2)
- m4 = np.extract(can_correct, m4)
- nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0)
- np.place(vals, can_correct, nval+3.0)
- if fisher:
- return vals - 3
- else:
- return vals
-kurtosis.__doc__ = stats.kurtosis.__doc__
-
-def describe(a, axis=0):
- """Computes several descriptive statistics of the passed array.
-
- Parameters
- ----------
- a : array
- axis : int or None
-
- Returns
- -------
- (size of the data (discarding missing values),
- (min, max),
- arithmetic mean,
- unbiased variance,
- biased skewness,
- biased kurtosis)
- """
- a, axis = _chk_asarray(a, axis)
- n = a.count(axis)
- mm = (ma.minimum.reduce(a), ma.maximum.reduce(a))
- m = a.mean(axis)
- v = a.var(axis)
- sk = skew(a, axis)
- kurt = kurtosis(a, axis)
- return n, mm, m, v, sk, kurt
-
-#.............................................................................
-def stde_median(data, axis=None):
- """Returns the McKean-Schrader estimate of the standard error of the sample
-median along the given axis. masked values are discarded.
-
- Parameters
- ----------
- data : ndarray
- Data to trim.
- axis : {None,int} optional
- Axis along which to perform the trimming.
- If None, the input array is first flattened.
-
- """
- def _stdemed_1D(data):
- data = np.sort(data.compressed())
- n = len(sorted)
- z = 2.5758293035489004
- k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0))
- return ((data[n-k] - data[k-1])/(2.*z))
- #
- data = ma.array(data, copy=False, subok=True)
- if (axis is None):
- return _stdemed_1D(data)
- else:
- assert data.ndim <= 2, "Array should be 2D at most !"
- return ma.apply_along_axis(_stdemed_1D, axis, data)
-
-#####--------------------------------------------------------------------------
-#---- --- Normality Tests ---
-#####--------------------------------------------------------------------------
-
-def skewtest(a, axis=0):
- a, axis = _chk_asarray(a, axis)
- if axis is None:
- a = a.ravel()
- axis = 0
- b2 = skew(a,axis)
- n = a.count(axis)
- if np.min(n) < 8:
- warnings.warn(
- "skewtest only valid for n>=8 ... continuing anyway, n=%i" %
- np.min(n))
- y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2)) )
- beta2 = ( 3.0*(n*n+27*n-70)*(n+1)*(n+3) ) / ( (n-2.0)*(n+5)*(n+7)*(n+9) )
- W2 = -1 + ma.sqrt(2*(beta2-1))
- delta = 1/ma.sqrt(0.5*ma.log(W2))
- alpha = ma.sqrt(2.0/(W2-1))
- y = ma.where(y==0, 1, y)
- Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1))
- return Z, (1.0 - stats.zprob(Z))*2
-skewtest.__doc__ = stats.skewtest.__doc__
-
-def kurtosistest(a, axis=0):
- a, axis = _chk_asarray(a, axis)
- n = a.count(axis=axis).astype(float)
- if np.min(n) < 20:
- warnings.warn(
- "kurtosistest only valid for n>=20 ... continuing anyway, n=%i" %
- np.min(n))
- b2 = kurtosis(a, axis, fisher=False)
- E = 3.0*(n-1) /(n+1)
- varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1)*(n+3)*(n+5))
- x = (b2-E)/ma.sqrt(varb2)
- sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5))/
- (n*(n-2)*(n-3)))
- A = 6.0 + 8.0/sqrtbeta1 *(2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
- term1 = 1 - 2./(9.0*A)
- denom = 1 + x*ma.sqrt(2/(A-4.0))
- denom[denom < 0] = masked
- term2 = ma.power((1-2.0/A)/denom,1/3.0)
- Z = ( term1 - term2 ) / np.sqrt(2/(9.0*A))
- return Z, (1.0-stats.zprob(Z))*2
-kurtosistest.__doc__ = stats.kurtosistest.__doc__
-
-
-def normaltest(a, axis=0):
- a, axis = _chk_asarray(a, axis)
- s,_ = skewtest(a,axis)
- k,_ = kurtosistest(a,axis)
- k2 = s*s + k*k
- return k2, stats.chisqprob(k2,2)
-normaltest.__doc__ = stats.normaltest.__doc__
-
-# Martinez-Iglewicz test
-# K-S test
-
-
-#####--------------------------------------------------------------------------
-#---- --- Percentiles ---
-#####--------------------------------------------------------------------------
-
-
-def mquantiles(data, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None,
- limit=()):
- """Computes empirical quantiles for a *1xN* data array.
-Samples quantile are defined by:
-*Q(p) = (1-g).x[i] +g.x[i+1]*
-where *x[j]* is the jth order statistic,
-with *i = (floor(n*p+m))*, *m=alpha+p*(1-alpha-beta)* and *g = n*p + m - i)*.
-
-Typical values of (alpha,beta) are:
-
- - (0,1) : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
- - (.5,.5) : *p(k) = (k+1/2.)/n* : piecewise linear function (R, type 5)
- - (0,0) : *p(k) = k/(n+1)* : (R type 6)
- - (1,1) : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
- That's R default (R type 7)
- - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
- The resulting quantile estimates are approximately median-unbiased
- regardless of the distribution of x. (R type 8)
- - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom.
- The resulting quantile estimates are approximately unbiased
- if x is normally distributed (R type 9)
- - (.4,.4) : approximately quantile unbiased (Cunnane)
- - (.35,.35): APL, used with PWM
-
-Parameters
-----------
- x : sequence
- Input data, as a sequence or array of dimension at most 2.
- prob : sequence
- List of quantiles to compute.
- alpha : {0.4, float} optional
- Plotting positions parameter.
- beta : {0.4, float} optional
- Plotting positions parameter.
- axis : {None, int} optional
- Axis along which to perform the trimming.
- If None, the input array is first flattened.
- limit : tuple
- Tuple of (lower, upper) values. Values of a outside this closed interval
- are ignored.
- """
- def _quantiles1D(data,m,p):
- x = np.sort(data.compressed())
- n = len(x)
- if n == 0:
- return ma.array(np.empty(len(p), dtype=float), mask=True)
- elif n == 1:
- return ma.array(np.resize(x, p.shape), mask=nomask)
- aleph = (n*p + m)
- k = np.floor(aleph.clip(1, n-1)).astype(int)
- gamma = (aleph-k).clip(0,1)
- return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]
-
- # Initialization & checks ---------
- data = ma.array(data, copy=False)
- if limit:
- condition = (limit[0]<data) & (data<limit[1])
- data[condition.filled(True)] = masked
- p = np.array(prob, copy=False, ndmin=1)
- m = alphap + p*(1.-alphap-betap)
- # Computes quantiles along axis (or globally)
- if (axis is None):
- return _quantiles1D(data, m, p)
- else:
- assert data.ndim <= 2, "Array should be 2D at most !"
- return ma.apply_along_axis(_quantiles1D, axis, data, m, p)
-
-
-def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4):
- """Calculate the score at the given 'per' percentile of the
- sequence a. For example, the score at per=50 is the median.
-
- This function is a shortcut to mquantile
-
- """
- if (per < 0) or (per > 100.):
- raise ValueError("The percentile should be between 0. and 100. !"\
- " (got %s)" % per)
- return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap,
- limit=limit, axis=0).squeeze()
-
-
-def plotting_positions(data, alpha=0.4, beta=0.4):
- """Returns the plotting positions (or empirical percentile points) for the
- data.
- Plotting positions are defined as (i-alpha)/(n-alpha-beta), where:
- - i is the rank order statistics
- - n is the number of unmasked values along the given axis
- - alpha and beta are two parameters.
-
- Typical values for alpha and beta are:
- - (0,1) : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
- - (.5,.5) : *p(k) = (k-1/2.)/n* : piecewise linear function (R, type 5)
- - (0,0) : *p(k) = k/(n+1)* : Weibull (R type 6)
- - (1,1) : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
- That's R default (R type 7)
- - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
- The resulting quantile estimates are approximately median-unbiased
- regardless of the distribution of x. (R type 8)
- - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom.
- The resulting quantile estimates are approximately unbiased
- if x is normally distributed (R type 9)
- - (.4,.4) : approximately quantile unbiased (Cunnane)
- - (.35,.35): APL, used with PWM
-
-Parameters
-----------
- x : sequence
- Input data, as a sequence or array of dimension at most 2.
- prob : sequence
- List of quantiles to compute.
- alpha : {0.4, float} optional
- Plotting positions parameter.
- beta : {0.4, float} optional
- Plotting positions parameter.
-
- """
- data = ma.array(data, copy=False).reshape(1,-1)
- n = data.count()
- plpos = np.empty(data.size, dtype=float)
- plpos[n:] = 0
- plpos[data.argsort()[:n]] = (np.arange(1,n+1) - alpha)/(n+1-alpha-beta)
- return ma.array(plpos, mask=data._mask)
-
-meppf = plotting_positions
-
-#####--------------------------------------------------------------------------
-#---- --- Variability ---
-#####--------------------------------------------------------------------------
-
-def obrientransform(*args):
- """
-Computes a transform on input data (any number of columns). Used to
-test for homogeneity of variance prior to running one-way stats. Each
-array in *args is one level of a factor. If an F_oneway() run on the
-transformed data and found significant, variances are unequal. From
-Maxwell and Delaney, p.112.
-
-Returns: transformed data for use in an ANOVA
- """
- data = argstoarray(*args).T
- v = data.var(axis=0,ddof=1)
- m = data.mean(0)
- n = data.count(0).astype(float)
- # result = ((N-1.5)*N*(a-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2))
- data -= m
- data **= 2
- data *= (n-1.5)*n
- data -= 0.5*v*(n-1)
- data /= (n-1.)*(n-2.)
- if not ma.allclose(v,data.mean(0)):
- raise ValueError("Lack of convergence in obrientransform.")
- return data
-
-
-def signaltonoise(data, axis=0):
- """Calculates the signal-to-noise ratio, as the ratio of the mean over
- standard deviation along the given axis.
-
- Parameters
- ----------
- data : sequence
- Input data
- axis : {0, int} optional
- Axis along which to compute. If None, the computation is performed
- on a flat version of the array.
-"""
- data = ma.array(data, copy=False)
- m = data.mean(axis)
- sd = data.std(axis, ddof=0)
- return m/sd
-
-
-def samplevar(data, axis=0):
- """Returns a biased estimate of the variance of the data, as the average
- of the squared deviations from the mean.
-
- Parameters
- ----------
- data : sequence
- Input data
- axis : {0, int} optional
- Axis along which to compute. If None, the computation is performed
- on a flat version of the array.
- """
- return ma.asarray(data).var(axis=axis,ddof=0)
-
-
-def samplestd(data, axis=0):
- """Returns a biased estimate of the standard deviation of the data, as the
- square root of the average squared deviations from the mean.
-
- Parameters
- ----------
- data : sequence
- Input data
- axis : {0,int} optional
- Axis along which to compute. If None, the computation is performed
- on a flat version of the array.
-
- Notes
- -----
- samplestd(a) is equivalent to a.std(ddof=0)
-
- """
- return ma.asarray(data).std(axis=axis,ddof=0)
-
-
-def var(a,axis=None):
- return ma.asarray(a).var(axis=axis,ddof=1)
-var.__doc__ = stats.var.__doc__
-
-def std(a,axis=None):
- return ma.asarray(a).std(axis=axis,ddof=1)
-std.__doc__ = stats.std.__doc__
-
-def stderr(a, axis=0):
- a, axis = _chk_asarray(a, axis)
- return a.std(axis=axis, ddof=1) / ma.sqrt(a.count(axis=axis))
-stderr.__doc__ = stats.stderr.__doc__
-
-def sem(a, axis=0):
- a, axis = _chk_asarray(a, axis)
- n = a.count(axis=axis)
- s = a.std(axis=axis,ddof=0) / ma.sqrt(n-1)
- return s
-sem.__doc__ = stats.sem.__doc__
-
-def z(a, score):
- a = ma.asarray(a)
- z = (score-a.mean(None)) / a.std(axis=None, ddof=1)
- return z
-z.__doc__ = stats.z.__doc__
-
-def zs(a):
- a = ma.asarray(a)
- mu = a.mean(axis=0)
- sigma = a.std(axis=0,ddof=0)
- return (a-mu)/sigma
-zs.__doc__ = stats.zs.__doc__
-
-def zmap(scores, compare, axis=0):
- (scores, compare) = (ma.asarray(scores), ma.asarray(compare))
- mns = compare.mean(axis=axis)
- sstd = compare.std(axis=0, ddof=0)
- return (scores - mns) / sstd
-zmap.__doc__ = stats.zmap.__doc__
-
-
-#####--------------------------------------------------------------------------
-#---- --- ANOVA ---
-#####--------------------------------------------------------------------------
-
-
-def f_oneway(*args):
- """
-Performs a 1-way ANOVA, returning an F-value and probability given
-any number of groups. From Heiman, pp.394-7.
-
-Usage: f_oneway (*args) where *args is 2 or more arrays, one per
- treatment group
-Returns: f-value, probability
-"""
- # Construct a single array of arguments: each row is a group
- data = argstoarray(*args)
- ngroups = len(data)
- ntot = data.count()
- sstot = (data**2).sum() - (data.sum())**2/float(ntot)
- ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum()
- sswg = sstot-ssbg
- dfbg = ngroups-1
- dfwg = ntot - ngroups
- msb = ssbg/float(dfbg)
- msw = sswg/float(dfwg)
- f = msb/msw
- prob = stats.fprob(dfbg,dfwg,f)
- return f, prob
-
-
-def f_value_wilks_lambda(ER, EF, dfnum, dfden, a, b):
- """Calculation of Wilks lambda F-statistic for multivarite data, per
- Maxwell & Delaney p.657.
- """
- ER = ma.array(ER, copy=False, ndmin=2)
- EF = ma.array(EF, copy=False, ndmin=2)
- if ma.getmask(ER).any() or ma.getmask(EF).any():
- raise NotImplementedError("Not implemented when the inputs "\
- "have missing data")
- lmbda = np.linalg.det(EF) / np.linalg.det(ER)
- q = ma.sqrt( ((a-1)**2*(b-1)**2 - 2) / ((a-1)**2 + (b-1)**2 -5) )
- q = ma.filled(q, 1)
- n_um = (1 - lmbda**(1.0/q))*(a-1)*(b-1)
- d_en = lmbda**(1.0/q) / (n_um*q - 0.5*(a-1)*(b-1) + 1)
- return n_um / d_en
-
-
-
-def friedmanchisquare(*args):
- """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA.
- This function calculates the Friedman Chi-square test for repeated measures
- and returns the result, along with the associated probability value.
-
- Each input is considered a given group. Ideally, the number of treatments
- among each group should be equal. If this is not the case, only the first
- n treatments are taken into account, where n is the number of treatments
- of the smallest group.
- If a group has some missing values, the corresponding treatments are masked
- in the other groups.
- The test statistic is corrected for ties.
-
- Masked values in one group are propagated to the other groups.
-
- Returns: chi-square statistic, associated p-value
- """
- data = argstoarray(*args).astype(float)
- k = len(data)
- if k < 3:
- raise ValueError("Less than 3 groups (%i): " % k +\
- "the Friedman test is NOT appropriate.")
- ranked = ma.masked_values(rankdata(data, axis=0), 0)
- if ranked._mask is not nomask:
- ranked = ma.mask_cols(ranked)
- ranked = ranked.compressed().reshape(k,-1).view(ndarray)
- else:
- ranked = ranked._data
- (k,n) = ranked.shape
- # Ties correction
- repeats = np.array([find_repeats(_) for _ in ranked.T], dtype=object)
- ties = repeats[repeats.nonzero()].reshape(-1,2)[:,-1].astype(int)
- tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k))
- #
- ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2)
- chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction
- return chisq, stats.chisqprob(chisq,k-1)
-
-#-############################################################################-#
Copied: trunk/scipy/stats/mstats_basic.py (from rev 5134, trunk/scipy/stats/mstats.py)
Copied: trunk/scipy/stats/mstats_extras.py (from rev 5134, trunk/scipy/stats/mmorestats.py)
Deleted: trunk/scipy/stats/tests/test_mmorestats.py
===================================================================
--- trunk/scipy/stats/tests/test_mmorestats.py 2008-11-21 21:09:29 UTC (rev 5159)
+++ trunk/scipy/stats/tests/test_mmorestats.py 2008-11-22 05:26:45 UTC (rev 5160)
@@ -1,102 +0,0 @@
-# pylint: disable-msg=W0611, W0612, W0511,R0201
-"""Tests suite for maskedArray statistics.
-
-:author: Pierre Gerard-Marchant
-:contact: pierregm_at_uga_dot_edu
-"""
-__author__ = "Pierre GF Gerard-Marchant ($Author: backtopop $)"
-
-import numpy as np
-
-import numpy.ma as ma
-
-import scipy.stats.mstats as ms
-import scipy.stats.mmorestats as mms
-
-from numpy.testing import *
-
-
-class TestMisc(TestCase):
- #
- def __init__(self, *args, **kwargs):
- TestCase.__init__(self, *args, **kwargs)
- #
- def test_mjci(self):
- "Tests the Marits-Jarrett estimator"
- data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262,
- 296,299,306,376,428,515,666,1310,2611])
- assert_almost_equal(mms.mjci(data),[55.76819,45.84028,198.87875],5)
- #
- def test_trimmedmeanci(self):
- "Tests the confidence intervals of the trimmed mean."
- data = ma.array([545,555,558,572,575,576,578,580,
- 594,605,635,651,653,661,666])
- assert_almost_equal(ms.trimmed_mean(data,0.2), 596.2, 1)
- assert_equal(np.round(mms.trimmed_mean_ci(data,(0.2,0.2)),1),
- [561.8, 630.6])
- #
- def test_idealfourths(self):
- "Tests ideal-fourths"
- test = np.arange(100)
- assert_almost_equal(np.asarray(mms.idealfourths(test)),
- [24.416667,74.583333],6)
- test_2D = test.repeat(3).reshape(-1,3)
- assert_almost_equal(mms.idealfourths(test_2D, axis=0),
- [[24.416667,24.416667,24.416667],
- [74.583333,74.583333,74.583333]],6)
- assert_almost_equal(mms.idealfourths(test_2D, axis=1),
- test.repeat(2).reshape(-1,2))
- test = [0,0]
- _result = mms.idealfourths(test)
- assert(np.isnan(_result).all())
-
-#..............................................................................
-class TestQuantiles(TestCase):
- #
- def __init__(self, *args, **kwargs):
- TestCase.__init__(self, *args, **kwargs)
- #
- def test_hdquantiles(self):
- data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014,
- 0.887764025,0.239407086,0.349638551,0.972791145,0.149789972,
- 0.936947700,0.132359948,0.046041972,0.641675031,0.945530547,
- 0.224218684,0.771450991,0.820257774,0.336458052,0.589113496,
- 0.509736129,0.696838829,0.491323573,0.622767425,0.775189248,
- 0.641461450,0.118455200,0.773029450,0.319280007,0.752229111,
- 0.047841438,0.466295911,0.583850781,0.840581845,0.550086491,
- 0.466470062,0.504765074,0.226855960,0.362641207,0.891620942,
- 0.127898691,0.490094097,0.044882048,0.041441695,0.317976349,
- 0.504135618,0.567353033,0.434617473,0.636243375,0.231803616,
- 0.230154113,0.160011327,0.819464108,0.854706985,0.438809221,
- 0.487427267,0.786907310,0.408367937,0.405534192,0.250444460,
- 0.995309248,0.144389588,0.739947527,0.953543606,0.680051621,
- 0.388382017,0.863530727,0.006514031,0.118007779,0.924024803,
- 0.384236354,0.893687694,0.626534881,0.473051932,0.750134705,
- 0.241843555,0.432947602,0.689538104,0.136934797,0.150206859,
- 0.474335206,0.907775349,0.525869295,0.189184225,0.854284286,
- 0.831089744,0.251637345,0.587038213,0.254475554,0.237781276,
- 0.827928620,0.480283781,0.594514455,0.213641488,0.024194386,
- 0.536668589,0.699497811,0.892804071,0.093835427,0.731107772]
- #
- assert_almost_equal(mms.hdquantiles(data,[0., 1.]),
- [0.006514031, 0.995309248])
- hdq = mms.hdquantiles(data,[0.25, 0.5, 0.75])
- assert_almost_equal(hdq, [0.253210762, 0.512847491, 0.762232442,])
- hdq = mms.hdquantiles_sd(data,[0.25, 0.5, 0.75])
- assert_almost_equal(hdq, [0.03786954, 0.03805389, 0.03800152,], 4)
- #
- data = np.array(data).reshape(10,10)
- hdq = mms.hdquantiles(data,[0.25,0.5,0.75],axis=0)
- assert_almost_equal(hdq[:,0], mms.hdquantiles(data[:,0],[0.25,0.5,0.75]))
- assert_almost_equal(hdq[:,-1], mms.hdquantiles(data[:,-1],[0.25,0.5,0.75]))
- hdq = mms.hdquantiles(data,[0.25,0.5,0.75],axis=0,var=True)
- assert_almost_equal(hdq[...,0],
- mms.hdquantiles(data[:,0],[0.25,0.5,0.75],var=True))
- assert_almost_equal(hdq[...,-1],
- mms.hdquantiles(data[:,-1],[0.25,0.5,0.75], var=True))
-
-
-###############################################################################
-
-if __name__ == "__main__":
- run_module_suite()
Deleted: trunk/scipy/stats/tests/test_mstats.py
===================================================================
--- trunk/scipy/stats/tests/test_mstats.py 2008-11-21 21:09:29 UTC (rev 5159)
+++ trunk/scipy/stats/tests/test_mstats.py 2008-11-22 05:26:45 UTC (rev 5160)
@@ -1,509 +0,0 @@
-"""
-Tests for the stats.mstats module (support for maskd arrays)
-"""
-
-
-import numpy as np
-from numpy import nan
-import numpy.ma as ma
-from numpy.ma import masked, nomask
-
-import scipy.stats.mstats as mstats
-from numpy.testing import *
-from numpy.ma.testutils import assert_equal, assert_almost_equal, \
- assert_array_almost_equal
-
-
-class TestGMean(TestCase):
- def test_1D(self):
- a = (1,2,3,4)
- actual= mstats.gmean(a)
- desired = np.power(1*2*3*4,1./4.)
- assert_almost_equal(actual, desired,decimal=14)
-
- desired1 = mstats.gmean(a,axis=-1)
- assert_almost_equal(actual, desired1, decimal=14)
- assert not isinstance(desired1, ma.MaskedArray)
- #
- a = ma.array((1,2,3,4),mask=(0,0,0,1))
- actual= mstats.gmean(a)
- desired = np.power(1*2*3,1./3.)
- assert_almost_equal(actual, desired,decimal=14)
-
- desired1 = mstats.gmean(a,axis=-1)
- assert_almost_equal(actual, desired1, decimal=14)
- #
- def test_2D(self):
- a = ma.array(((1,2,3,4),(1,2,3,4),(1,2,3,4)),
- mask=((0,0,0,0),(1,0,0,1),(0,1,1,0)))
- actual= mstats.gmean(a)
- desired = np.array((1,2,3,4))
- assert_array_almost_equal(actual, desired, decimal=14)
- #
- desired1 = mstats.gmean(a,axis=0)
- assert_array_almost_equal(actual, desired1, decimal=14)
- #
- actual= mstats.gmean(a, -1)
- desired = ma.array((np.power(1*2*3*4,1./4.),
- np.power(2*3,1./2.),
- np.power(1*4,1./2.)))
- assert_array_almost_equal(actual, desired, decimal=14)
-
-class TestHMean(TestCase):
- def test_1D(self):
- a = (1,2,3,4)
- actual= mstats.hmean(a)
- desired = 4. / (1./1 + 1./2 + 1./3 + 1./4)
- assert_almost_equal(actual, desired, decimal=14)
- desired1 = mstats.hmean(ma.array(a),axis=-1)
- assert_almost_equal(actual, desired1, decimal=14)
- #
- a = ma.array((1,2,3,4),mask=(0,0,0,1))
- actual= mstats.hmean(a)
- desired = 3. / (1./1 + 1./2 + 1./3)
- assert_almost_equal(actual, desired,decimal=14)
- desired1 = mstats.hmean(a,axis=-1)
- assert_almost_equal(actual, desired1, decimal=14)
-
- def test_2D(self):
- a = ma.array(((1,2,3,4),(1,2,3,4),(1,2,3,4)),
- mask=((0,0,0,0),(1,0,0,1),(0,1,1,0)))
- actual= mstats.hmean(a)
- desired = ma.array((1,2,3,4))
- assert_array_almost_equal(actual, desired, decimal=14)
- #
- actual1 = mstats.hmean(a,axis=-1)
- desired = (4./(1/1.+1/2.+1/3.+1/4.),
- 2./(1/2.+1/3.),
- 2./(1/1.+1/4.)
- )
- assert_array_almost_equal(actual1, desired, decimal=14)
-
-
-class TestRanking(TestCase):
- #
- def __init__(self, *args, **kwargs):
- TestCase.__init__(self, *args, **kwargs)
- #
- def test_ranking(self):
- x = ma.array([0,1,1,1,2,3,4,5,5,6,])
- assert_almost_equal(mstats.rankdata(x),[1,3,3,3,5,6,7,8.5,8.5,10])
- x[[3,4]] = masked
- assert_almost_equal(mstats.rankdata(x),[1,2.5,2.5,0,0,4,5,6.5,6.5,8])
- assert_almost_equal(mstats.rankdata(x,use_missing=True),
- [1,2.5,2.5,4.5,4.5,4,5,6.5,6.5,8])
- x = ma.array([0,1,5,1,2,4,3,5,1,6,])
- assert_almost_equal(mstats.rankdata(x),[1,3,8.5,3,5,7,6,8.5,3,10])
- x = ma.array([[0,1,1,1,2], [3,4,5,5,6,]])
- assert_almost_equal(mstats.rankdata(x),[[1,3,3,3,5],[6,7,8.5,8.5,10]])
- assert_almost_equal(mstats.rankdata(x,axis=1),[[1,3,3,3,5],[1,2,3.5,3.5,5]])
- assert_almost_equal(mstats.rankdata(x,axis=0),[[1,1,1,1,1],[2,2,2,2,2,]])
-
-
-class TestCorr(TestCase):
- #
- def test_pearsonr(self):
- "Tests some computations of Pearson's r"
- x = ma.arange(10)
- assert_almost_equal(mstats.pearsonr(x,x)[0], 1.0)
- assert_almost_equal(mstats.pearsonr(x,x[::-1])[0], -1.0)
- #
- x = ma.array(x, mask=True)
- pr = mstats.pearsonr(x,x)
- assert(pr[0] is masked)
- assert(pr[1] is masked)
- #
- def test_spearmanr(self):
- "Tests some computations of Spearman's rho"
- (x, y) = ([5.05,6.75,3.21,2.66],[1.65,2.64,2.64,6.95])
- assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555)
- (x, y) = ([5.05,6.75,3.21,2.66,np.nan],[1.65,2.64,2.64,6.95,np.nan])
- (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
- assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555)
- #
- x = [ 2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1,
- 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7]
- y = [22.6, 08.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6,
- 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4]
- assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299)
- x = [ 2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1,
- 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7, np.nan]
- y = [22.6, 08.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6,
- 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4, np.nan]
- (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
- assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299)
- #
- def test_kendalltau(self):
- "Tests some computations of Kendall's tau"
- x = ma.fix_invalid([5.05, 6.75, 3.21, 2.66,np.nan])
- y = ma.fix_invalid([1.65, 26.5, -5.93, 7.96, np.nan])
- z = ma.fix_invalid([1.65, 2.64, 2.64, 6.95, np.nan])
- assert_almost_equal(np.asarray(mstats.kendalltau(x,y)),
- [+0.3333333,0.4969059])
- assert_almost_equal(np.asarray(mstats.kendalltau(x,z)),
- [-0.5477226,0.2785987])
- #
- x = ma.fix_invalid([ 0, 0, 0, 0,20,20, 0,60, 0,20,
- 10,10, 0,40, 0,20, 0, 0, 0, 0, 0, np.nan])
- y = ma.fix_invalid([ 0,80,80,80,10,33,60, 0,67,27,
- 25,80,80,80,80,80,80, 0,10,45, np.nan, 0])
- result = mstats.kendalltau(x,y)
- assert_almost_equal(np.asarray(result), [-0.1585188, 0.4128009])
- #
- def test_kendalltau_seasonal(self):
- "Tests the seasonal Kendall tau."
- x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1],
- [ 4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3],
- [ 3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan],
- [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]]
- x = ma.fix_invalid(x).T
- output = mstats.kendalltau_seasonal(x)
- assert_almost_equal(output['global p-value (indep)'], 0.008, 3)
- assert_almost_equal(output['seasonal p-value'].round(2),
- [0.18,0.53,0.20,0.04])
- #
- def test_pointbiserial(self):
- "Tests point biserial"
- x = [1,0,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,
- 0,0,0,0,1,-1]
- y = [14.8,13.8,12.4,10.1,7.1,6.1,5.8,4.6,4.3,3.5,3.3,3.2,3.0,
- 2.8,2.8,2.5,2.4,2.3,2.1,1.7,1.7,1.5,1.3,1.3,1.2,1.2,1.1,
- 0.8,0.7,0.6,0.5,0.2,0.2,0.1,np.nan]
- assert_almost_equal(mstats.pointbiserialr(x, y)[0], 0.36149, 5)
- #
- def test_cov(self):
- "Tests the cov function."
- x = ma.array([[1,2,3],[4,5,6]], mask=[[1,0,0],[0,0,0]])
- c = mstats.cov(x[0])
- assert_equal(c, x[0].var(ddof=1))
- c = mstats.cov(x[1])
- assert_equal(c, x[1].var(ddof=1))
- c = mstats.cov(x)
- assert_equal(c[1,0], (x[0].anom()*x[1].anom()).sum())
- #
- x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1],
- [ 4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3],
- [ 3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan],
- [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]]
- x = ma.fix_invalid(x).T
- (winter,spring,summer,fall) = x.T
- #
- assert_almost_equal(mstats.cov(winter,winter,bias=True),
- winter.var(ddof=0))
- assert_almost_equal(mstats.cov(winter,winter,bias=False),
- winter.var(ddof=1))
- assert_almost_equal(mstats.cov(winter,spring)[0,1], 7.7)
- assert_almost_equal(mstats.cov(winter,spring)[1,0], 7.7)
- assert_almost_equal(mstats.cov(winter,summer)[0,1], 19.1111111, 7)
- assert_almost_equal(mstats.cov(winter,summer)[1,0], 19.1111111, 7)
- assert_almost_equal(mstats.cov(winter,fall)[0,1], 20)
- assert_almost_equal(mstats.cov(winter,fall)[1,0], 20)
-
-
-class TestTrimming(TestCase):
- #
- def test_trim(self):
- "Tests trimming"
- a = ma.arange(10)
- assert_equal(mstats.trim(a), [0,1,2,3,4,5,6,7,8,9])
- a = ma.arange(10)
- assert_equal(mstats.trim(a,(2,8)), [None,None,2,3,4,5,6,7,8,None])
- a = ma.arange(10)
- assert_equal(mstats.trim(a,limits=(2,8),inclusive=(False,False)),
- [None,None,None,3,4,5,6,7,None,None])
- a = ma.arange(10)
- assert_equal(mstats.trim(a,limits=(0.1,0.2),relative=True),
- [None,1,2,3,4,5,6,7,None,None])
- #
- a = ma.arange(12)
- a[[0,-1]] = a[5] = masked
- assert_equal(mstats.trim(a,(2,8)),
- [None,None,2,3,4,None,6,7,8,None,None,None])
- #
- x = ma.arange(100).reshape(10,10)
- trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=None)
- assert_equal(trimx._mask.ravel(),[1]*10+[0]*70+[1]*20)
- trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=0)
- assert_equal(trimx._mask.ravel(),[1]*10+[0]*70+[1]*20)
- trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=-1)
- assert_equal(trimx._mask.T.ravel(),[1]*10+[0]*70+[1]*20)
- #
- x = ma.arange(110).reshape(11,10)
- x[1] = masked
- trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=None)
- assert_equal(trimx._mask.ravel(),[1]*20+[0]*70+[1]*20)
- trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=0)
- assert_equal(trimx._mask.ravel(),[1]*20+[0]*70+[1]*20)
- trimx = mstats.trim(x.T,(0.1,0.2),relative=True,axis=-1)
- assert_equal(trimx.T._mask.ravel(),[1]*20+[0]*70+[1]*20)
- #
- def test_trim_old(self):
- "Tests trimming."
- x = ma.arange(100)
- assert_equal(mstats.trimboth(x).count(), 60)
- assert_equal(mstats.trimtail(x,tail='r').count(), 80)
- x[50:70] = masked
- trimx = mstats.trimboth(x)
- assert_equal(trimx.count(), 48)
- assert_equal(trimx._mask, [1]*16 + [0]*34 + [1]*20 + [0]*14 + [1]*16)
- x._mask = nomask
- x.shape = (10,10)
- assert_equal(mstats.trimboth(x).count(), 60)
- assert_equal(mstats.trimtail(x).count(), 80)
- #
- def test_trimmedmean(self):
- "Tests the trimmed mean."
- data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262,
- 296,299,306,376,428,515,666,1310,2611])
- assert_almost_equal(mstats.trimmed_mean(data,0.1), 343, 0)
- assert_almost_equal(mstats.trimmed_mean(data,(0.1,0.1)), 343, 0)
- assert_almost_equal(mstats.trimmed_mean(data,(0.2,0.2)), 283, 0)
- #
- def test_trimmed_stde(self):
- "Tests the trimmed mean standard error."
- data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262,
- 296,299,306,376,428,515,666,1310,2611])
- assert_almost_equal(mstats.trimmed_stde(data,(0.2,0.2)), 56.13193, 5)
- assert_almost_equal(mstats.trimmed_stde(data,0.2), 56.13193, 5)
- #
- def test_winsorization(self):
- "Tests the Winsorization of the data."
- data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262,
- 296,299,306,376,428,515,666,1310,2611])
- assert_almost_equal(mstats.winsorize(data,(0.2,0.2)).var(ddof=1),
- 21551.4, 1)
- data[5] = masked
- winsorized = mstats.winsorize(data)
- assert_equal(winsorized.mask, data.mask)
-
-
-class TestMoments(TestCase):
- """
- Comparison numbers are found using R v.1.5.1
- note that length(testcase) = 4
- testmathworks comes from documentation for the
- Statistics Toolbox for Matlab and can be found at both
- http://www.mathworks.com/access/helpdesk/help/toolbox/stats/kurtosis.shtml
- http://www.mathworks.com/access/helpdesk/help/toolbox/stats/skewness.shtml
- Note that both test cases came from here.
- """
- testcase = [1,2,3,4]
- testmathworks = ma.fix_invalid([1.165 , 0.6268, 0.0751, 0.3516, -0.6965,
- np.nan])
- def test_moment(self):
- """
- mean((testcase-mean(testcase))**power,axis=0),axis=0))**power))"""
- y = mstats.moment(self.testcase,1)
- assert_almost_equal(y,0.0,10)
- y = mstats.moment(self.testcase,2)
- assert_almost_equal(y,1.25)
- y = mstats.moment(self.testcase,3)
- assert_almost_equal(y,0.0)
- y = mstats.moment(self.testcase,4)
- assert_almost_equal(y,2.5625)
- def test_variation(self):
- """variation = samplestd/mean """
-## y = stats.variation(self.shoes[0])
-## assert_almost_equal(y,21.8770668)
- y = mstats.variation(self.testcase)
- assert_almost_equal(y,0.44721359549996, 10)
-
- def test_skewness(self):
- """
- sum((testmathworks-mean(testmathworks,axis=0))**3,axis=0)/((sqrt(var(testmathworks)*4/5))**3)/5
- """
- y = mstats.skew(self.testmathworks)
- assert_almost_equal(y,-0.29322304336607,10)
- y = mstats.skew(self.testmathworks,bias=0)
- assert_almost_equal(y,-0.437111105023940,10)
- y = mstats.skew(self.testcase)
- assert_almost_equal(y,0.0,10)
-
- def test_kurtosis(self):
- """
- sum((testcase-mean(testcase,axis=0))**4,axis=0)/((sqrt(var(testcase)*3/4))**4)/4
- sum((test2-mean(testmathworks,axis=0))**4,axis=0)/((sqrt(var(testmathworks)*4/5))**4)/5
- Set flags for axis = 0 and
- fisher=0 (Pearson's definition of kurtosis for compatibility with Matlab)
- """
- y = mstats.kurtosis(self.testmathworks,0,fisher=0,bias=1)
- assert_almost_equal(y, 2.1658856802973,10)
- # Note that MATLAB has confusing docs for the following case
- # kurtosis(x,0) gives an unbiased estimate of Pearson's skewness
- # kurtosis(x) gives a biased estimate of Fisher's skewness (Pearson-3)
- # The MATLAB docs imply that both should give Fisher's
- y = mstats.kurtosis(self.testmathworks,fisher=0,bias=0)
- assert_almost_equal(y, 3.663542721189047,10)
- y = mstats.kurtosis(self.testcase,0,0)
- assert_almost_equal(y,1.64)
- #
- def test_mode(self):
- "Tests the mode"
- #
- a1 = [0,0,0,1,1,1,2,3,3,3,3,4,5,6,7]
- a2 = np.reshape(a1, (3,5))
- ma1 = ma.masked_where(ma.array(a1)>2,a1)
- ma2 = ma.masked_where(a2>2, a2)
- assert_equal(mstats.mode(a1, axis=None), (3,4))
- assert_equal(mstats.mode(ma1, axis=None), (0,3))
- assert_equal(mstats.mode(a2, axis=None), (3,4))
- assert_equal(mstats.mode(ma2, axis=None), (0,3))
- assert_equal(mstats.mode(a2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]]))
- assert_equal(mstats.mode(ma2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]]))
- assert_equal(mstats.mode(a2, axis=-1), ([[0],[3],[3]], [[3],[3],[1]]))
- assert_equal(mstats.mode(ma2, axis=-1), ([[0],[1],[0]], [[3],[1],[0]]))
-
-
-class TestPercentile(TestCase):
- def setUp(self):
- self.a1 = [3,4,5,10,-3,-5,6]
- self.a2 = [3,-6,-2,8,7,4,2,1]
- self.a3 = [3.,4,5,10,-3,-5,-6,7.0]
-
- def test_percentile(self):
- x = np.arange(8) * 0.5
- assert_equal(mstats.scoreatpercentile(x, 0), 0.)
- assert_equal(mstats.scoreatpercentile(x, 100), 3.5)
- assert_equal(mstats.scoreatpercentile(x, 50), 1.75)
-
- def test_2D(self):
- x = ma.array([[1, 1, 1],
- [1, 1, 1],
- [4, 4, 3],
- [1, 1, 1],
- [1, 1, 1]])
- assert_equal(mstats.scoreatpercentile(x,50), [1,1,1])
-
-
-class TestVariability(TestCase):
- """ Comparison numbers are found using R v.1.5.1
- note that length(testcase) = 4
- """
- testcase = ma.fix_invalid([1,2,3,4,np.nan])
- #
- def test_std(self):
- y = mstats.std(self.testcase)
- assert_almost_equal(y,1.290994449)
-
- def test_var(self):
- """
- var(testcase) = 1.666666667 """
- #y = stats.var(self.shoes[0])
- #assert_approx_equal(y,6.009)
- y = mstats.var(self.testcase)
- assert_almost_equal(y,1.666666667)
-
- def test_samplevar(self):
- """
- R does not have 'samplevar' so the following was used
- var(testcase)*(4-1)/4 where 4 = length(testcase)
- """
- #y = stats.samplevar(self.shoes[0])
- #assert_approx_equal(y,5.4081)
- y = mstats.samplevar(self.testcase)
- assert_almost_equal(y,1.25)
-
- def test_samplestd(self):
- #y = stats.samplestd(self.shoes[0])
- #assert_approx_equal(y,2.325532197)
- y = mstats.samplestd(self.testcase)
- assert_almost_equal(y,1.118033989)
-
- def test_signaltonoise(self):
- """
- this is not in R, so used
- mean(testcase,axis=0)/(sqrt(var(testcase)*3/4)) """
- #y = stats.signaltonoise(self.shoes[0])
- #assert_approx_equal(y,4.5709967)
- y = mstats.signaltonoise(self.testcase)
- assert_almost_equal(y,2.236067977)
-
- def test_stderr(self):
- """
- this is not in R, so used
- sqrt(var(testcase))/sqrt(4)
- """
-## y = stats.stderr(self.shoes[0])
-## assert_approx_equal(y,0.775177399)
- y = mstats.stderr(self.testcase)
- assert_almost_equal(y,0.6454972244)
-
- def test_sem(self):
- """
- this is not in R, so used
- sqrt(var(testcase)*3/4)/sqrt(3)
- """
- #y = stats.sem(self.shoes[0])
- #assert_approx_equal(y,0.775177399)
- y = mstats.sem(self.testcase)
- assert_almost_equal(y,0.6454972244)
-
- def test_z(self):
- """
- not in R, so used
- (10-mean(testcase,axis=0))/sqrt(var(testcase)*3/4)
- """
- y = mstats.z(self.testcase, ma.array(self.testcase).mean())
- assert_almost_equal(y,0.0)
-
- def test_zs(self):
- """
- not in R, so tested by using
- (testcase[i]-mean(testcase,axis=0))/sqrt(var(testcase)*3/4)
- """
- y = mstats.zs(self.testcase)
- desired = ma.fix_invalid([-1.3416407864999, -0.44721359549996 ,
- 0.44721359549996 , 1.3416407864999, np.nan])
- assert_almost_equal(desired,y,decimal=12)
-
-
-
-class TestMisc(TestCase):
- #
- def test_obrientransform(self):
- "Tests Obrien transform"
- args = [[5]*5+[6]*11+[7]*9+[8]*3+[9]*2+[10]*2,
- [6]+[7]*2+[8]*4+[9]*9+[10]*16]
- result = [5*[3.1828]+11*[0.5591]+9*[0.0344]+3*[1.6086]+2*[5.2817]+2*[11.0538],
- [10.4352]+2*[4.8599]+4*[1.3836]+9*[0.0061]+16*[0.7277]]
- assert_almost_equal(np.round(mstats.obrientransform(*args).T,4),
- result,4)
- #
- def test_kstwosamp(self):
- "Tests the Kolmogorov-Smirnov 2 samples test"
- x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1],
- [ 4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3],
- [ 3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan],
- [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]]
- x = ma.fix_invalid(x).T
- (winter,spring,summer,fall) = x.T
- #
- assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring),4),
- (0.1818,0.9892))
- assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring,'g'),4),
- (0.1469,0.7734))
- assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring,'l'),4),
- (0.1818,0.6744))
- #
- def test_friedmanchisq(self):
- "Tests the Friedman Chi-square test"
- # No missing values
- args = ([9.0,9.5,5.0,7.5,9.5,7.5,8.0,7.0,8.5,6.0],
- [7.0,6.5,7.0,7.5,5.0,8.0,6.0,6.5,7.0,7.0],
- [6.0,8.0,4.0,6.0,7.0,6.5,6.0,4.0,6.5,3.0])
- result = mstats.friedmanchisquare(*args)
- assert_almost_equal(result[0], 10.4737, 4)
- assert_almost_equal(result[1], 0.005317, 6)
- # Missing values
- x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1],
- [ 4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3],
- [ 3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan],
- [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]]
- x = ma.fix_invalid(x)
- result = mstats.friedmanchisquare(*x)
- assert_almost_equal(result[0], 2.0156, 4)
- assert_almost_equal(result[1], 0.5692, 4)
-
-
-if __name__ == "__main__":
- run_module_suite()
Copied: trunk/scipy/stats/tests/test_mstats_basic.py (from rev 5134, trunk/scipy/stats/tests/test_mstats.py)
Copied: trunk/scipy/stats/tests/test_mstats_extras.py (from rev 5134, trunk/scipy/stats/tests/test_mmorestats.py)
More information about the Scipy-svn
mailing list