[Scipy-svn] r6893 - trunk/scipy/stats

Sun Nov 14 05:02:01 EST 2010

Author: rgommers
Date: 2010-11-14 04:02:01 -0600 (Sun, 14 Nov 2010)
New Revision: 6893

Modified:
   trunk/scipy/stats/__init__.py
   trunk/scipy/stats/distributions.py
   trunk/scipy/stats/morestats.py
   trunk/scipy/stats/mstats_basic.py
   trunk/scipy/stats/stats.py
Log:
DOC: merge more wiki edits for stats module.

Modified: trunk/scipy/stats/__init__.py
===================================================================

--- trunk/scipy/stats/__init__.py	2010-11-14 10:01:37 UTC (rev 6892)
+++ trunk/scipy/stats/__init__.py	2010-11-14 10:02:01 UTC (rev 6893)
@@ -1,3 +1,259 @@
+"""
+Statistical Functions
+=====================
+
+This module contains a large number of probability distributions as
+well as a growing library of statistical functions.
+
+Each included distribution is an instance of the class rv_continuous.
+For each given name the following methods are available.  See docstring
+for rv_continuous for more information.
+
+:rvs:
+   random variates with the distribution
+:pdf:
+   probability density function
+:cdf:
+   cumulative distribution function
+:sf:
+   survival function (1.0 - cdf)
+:ppf:
+   percent-point function (inverse of cdf)
+:isf:
+   inverse survival function
+:stats:
+   mean, variance, and optionally skew and kurtosis
+
+Calling the instance as a function returns a frozen pdf whose shape,
+location, and scale parameters are fixed.
+
+Distributions
+---------------
+
+The distributions available with the above methods are:
+
+
+Continuous  (Total == 81 distributions)
+---------------------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   norm              Normal (Gaussian)
+   alpha             Alpha
+   anglit            Anglit
+   arcsine           Arcsine
+   beta              Beta
+   betaprime         Beta Prime
+   bradford          Bradford
+   burr              Burr
+   cauchy            Cauchy
+   chi               Chi
+   chi2              Chi-squared
+   cosine            Cosine
+   dgamma            Double Gamma
+   dweibull          Double Weibull
+   erlang            Erlang
+   expon             Exponential
+   exponweib         Exponentiated Weibull
+   exponpow          Exponential Power
+   f                 F (Snedecor F)
+   fatiguelife       Fatigue Life (Birnbaum-Saunders)
+   fisk              Fisk
+   foldcauchy        Folded Cauchy
+   foldnorm          Folded Normal
+   frechet_r         Frechet Right Sided, Extreme Value Type II (Extreme LB) or weibull_min
+   frechet_l         Frechet Left Sided, Weibull_max
+   genlogistic       Generalized Logistic
+   genpareto         Generalized Pareto
+   genexpon          Generalized Exponential
+   genextreme        Generalized Extreme Value
+   gausshyper        Gauss Hypergeometric
+   gamma             Gamma
+   gengamma          Generalized gamma
+   genhalflogistic   Generalized Half Logistic
+   gompertz          Gompertz (Truncated Gumbel)
+   gumbel_r          Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
+   gumbel_l          Left Sided Gumbel, etc.
+   halfcauchy        Half Cauchy
+   halflogistic      Half Logistic
+   halfnorm          Half Normal
+   hypsecant         Hyperbolic Secant
+   invgamma          Inverse Gamma
+   invnorm           Inverse Normal
+   invweibull        Inverse Weibull
+   johnsonsb         Johnson SB
+   johnsonsu         Johnson SU
+   ksone             Kolmogorov-Smirnov one-sided (no stats)
+   kstwobign         Kolmogorov-Smirnov two-sided test for Large N (no stats)
+   laplace           Laplace
+   logistic          Logistic
+   loggamma          Log-Gamma
+   loglaplace        Log-Laplace (Log Double Exponential)
+   lognorm           Log-Normal
+   gilbrat           Gilbrat
+   lomax             Lomax (Pareto of the second kind)
+   maxwell           Maxwell
+   mielke            Mielke's Beta-Kappa
+   nakagami          Nakagami
+   ncx2              Non-central chi-squared
+   ncf               Non-central F
+   nct               Non-central Student's T
+   pareto            Pareto
+   powerlaw          Power-function
+   powerlognorm      Power log normal
+   powernorm         Power normal
+   rdist             R distribution
+   reciprocal        Reciprocal
+   rayleigh          Rayleigh
+   rice              Rice
+   recipinvgauss     Reciprocal Inverse Gaussian
+   semicircular      Semicircular
+   t                 Student's T
+   triang            Triangular
+   truncexpon        Truncated Exponential
+   truncnorm         Truncated Normal
+   tukeylambda       Tukey-Lambda
+   uniform           Uniform
+   von_mises         Von-Mises (Circular)
+   wald              Wald
+   weibull_min       Minimum Weibull (see Frechet)
+   weibull_max       Maximum Weibull (see Frechet)
+   wrapcauchy        Wrapped Cauchy
+
+
+=============== ==============================================================
+Discrete    (Total == 10 distributions)
+==============================================================================
+binom             Binomial
+bernoulli         Bernoulli
+nbinom            Negative Binomial
+geom              Geometric
+hypergeom         Hypergeometric
+logser            Logarithmic (Log-Series, Series)
+poisson           Poisson
+planck            Planck (Discrete Exponential)
+boltzmann         Boltzmann (Truncated Discrete Exponential)
+randint           Discrete Uniform
+zipf              Zipf
+dlaplace          Discrete Laplacian
+=============== ==============================================================
+
+Statistical Functions (adapted from Gary Strangman)
+-----------------------------------------------------
+
+================= ==============================================================
+gmean             Geometric mean
+hmean             Harmonic mean
+mean              Arithmetic mean
+cmedian           Computed median
+median            Median
+mode              Modal value
+tmean             Truncated arithmetic mean
+tvar              Truncated variance
+tmin              _
+tmax              _
+tstd              _
+tsem              _
+moment            Central moment
+variation         Coefficient of variation
+skew              Skewness
+kurtosis          Fisher or Pearson kurtosis
+describe          Descriptive statistics
+skewtest          _
+kurtosistest      _
+normaltest        _
+================= ==============================================================
+
+================= ==============================================================
+itemfreq          _
+scoreatpercentile _
+percentileofscore _
+histogram2        _
+histogram         _
+cumfreq           _
+relfreq           _
+================= ==============================================================
+
+================= ==============================================================
+obrientransform   _
+samplevar         _
+samplestd         _
+signaltonoise     _
+bayes_mvs         _
+var               _
+std               _
+stderr            _
+sem               _
+z                 _
+zs                _
+zmap              _
+================= ==============================================================
+
+================= ==============================================================
+threshold         _
+trimboth          _
+trim1             _
+cov               _
+corrcoef          _
+================= ==============================================================
+
+================= ==============================================================
+f_oneway          _
+paired            _
+pearsonr          _
+spearmanr         _
+pointbiserialr    _
+kendalltau        _
+linregress        _
+================= ==============================================================
+
+================= ==============================================================
+ttest_1samp       _
+ttest_ind         _
+ttest_rel         _
+kstest            _
+chisquare         _
+ks_2samp          _
+mannwhitneyu      _
+tiecorrect        _
+ranksums          _
+wilcoxon          _
+kruskal           _
+friedmanchisquare _
+================= ==============================================================
+
+================= ==============================================================
+ansari            _
+bartlett          _
+levene            _
+shapiro           _
+anderson          _
+binom_test        _
+fligner           _
+mood              _
+oneway            _
+================= ==============================================================
+
+================= ==============================================================
+glm               _
+anova             _
+================= ==============================================================
+
+
+================= ==============================================================
+Plot-tests
+================================================================================
+probplot          _
+ppcc_max          _
+ppcc_plot         _
+================= ==============================================================
+
+
+For many more stat related functions install the software R and the
+interface package rpy.
+
+"""
 #
 # stats - Statistical Functions
 #

Modified: trunk/scipy/stats/distributions.py
===================================================================
--- trunk/scipy/stats/distributions.py	2010-11-14 10:01:37 UTC (rev 6892)
+++ trunk/scipy/stats/distributions.py	2010-11-14 10:02:01 UTC (rev 6893)
@@ -1637,40 +1637,49 @@
             
     def fit(self, data, *args, **kwds):
         """
-        Return max like estimators to shape, location, and scale from data
+        Return MLEs for shape, location, and scale parameters from data.
 
-        Starting points for the fit are given by input arguments.  For any 
-        arguments not given starting points, self._fitstart(data) is called 
-        to get the starting estimates.
+        MLE stands for Maximum Likelihood Estimate.  Starting estimates for
+        the fit are given by input arguments; for any arguments not provided
+        with starting estimates, ``self._fitstart(data)`` is called to generate
+        such.
 
-        You can hold some parameters fixed to specific values by passing in 
-        keyword arguments f0..fn for shape paramters and floc, fscale for 
-        location and scale parameters.
+        One can hold some parameters fixed to specific values by passing in
+        keyword arguments ``f0``, ``f1``, ..., ``fn`` (for shape parameters)
+        and ``floc`` and ``fscale`` (for location and scale parameters,
+        respectively).
 
         Parameters
         ----------
-        data : array-like
-            Data to use in calculating the MLE
-        args : optional
-            Starting values for any shape arguments (those not specified 
-            will be determined by _fitstart(data))
-        kwds : loc, scale
-            Starting values for the location and scale parameters 
-            Special keyword arguments are recognized as holding certain 
-              parameters fixed:
-               f0..fn : hold respective shape paramters fixed
-               floc : hold location parameter fixed to specified value
-               fscale : hold scale parameter fixed to specified value
-            optimizer : The optimizer to use.  The optimizer must take func, 
-                         and starting position as the first two arguments, 
-                         plus args (for extra arguments to pass to the 
-                         function to be optimized) and disp=0 to suppress
-                         output as keyword arguments.
-              
-        Return
-        ------
-        shape, loc, scale : tuple of float
-            MLE estimates for any shape arguments followed by location and scale
+        data : array_like
+            Data to use in calculating the MLEs
+        args : floats, optional
+            Starting value(s) for any shape-characterizing arguments (those not
+            provided will be determined by a call to ``_fitstart(data)``).
+            No default value.
+        kwds : floats, optional
+            Starting values for the location and scale parameters; no default.
+            Special keyword arguments are recognized as holding certain
+            parameters fixed:
+
+            f0..fn : hold respective shape parameters fixed.
+
+            floc : hold location parameter fixed to specified value.
+
+            fscale : hold scale parameter fixed to specified value.
+
+            optimizer : The optimizer to use.  The optimizer must take func,
+                        and starting position as the first two arguments,
+                        plus args (for extra arguments to pass to the
+                        function to be optimized) and disp=0 to suppress
+                        output as keyword arguments.
+
+        Returns
+        -------
+        shape, loc, scale : tuple of floats
+            MLEs for any shape parameters, followed by those for location and
+            scale.
+
         """
         Narg = len(args)
         if Narg > self.numargs:

Modified: trunk/scipy/stats/morestats.py
===================================================================
--- trunk/scipy/stats/morestats.py	2010-11-14 10:01:37 UTC (rev 6892)
+++ trunk/scipy/stats/morestats.py	2010-11-14 10:02:01 UTC (rev 6893)
@@ -196,10 +196,24 @@
 ##
 
 def kstat(data,n=2):
-    """Return the nth k-statistic (1<=n<=4 so far).
+    """
+    Return the nth k-statistic (1<=n<=4 so far).
 
     The nth k-statistic is the unique symmetric unbiased estimator of the nth
-    cumulant kappa_n
+    cumulant kappa_n.
+
+    Parameters
+    ----------
+    data : array_like
+        Input array.
+    n : int, {1, 2, 3, 4}, optional
+        Default is equal to 2.
+
+    Returns
+    -------
+    kstat : float
+        The nth k-statistic.
+
     """
     if n > 4 or n < 1:
         raise ValueError("k-statistics only supported for 1<=n<=4")
@@ -223,8 +237,22 @@
         raise ValueError("Should not be here.")
 
 def kstatvar(data,n=2):
-    """Returns an unbiased estimator of the variance of the k-statistic:  n=1 or 2
     """
+    Returns an unbiased estimator of the variance of the k-statistic.
+
+    Parameters
+    ----------
+    data : array_like
+        Input array.
+    n : int, {1, 2}, optional
+        Default is equal to 2.
+
+    Returns
+    -------
+    kstatvar : float
+        The nth k-statistic variance.
+
+    """
     data = ravel(data)
     N = len(data)
     if n == 1:
@@ -1343,8 +1371,25 @@
     #return pdf_fromgamma(g1, g2, g3, g4)
 
 def circmean(samples, high=2*pi, low=0):
-    """Compute the circular mean for samples assumed to be in the range [low to high]
     """
+    Compute the circular mean for samples assumed to be in the range
+    [low to high].
+
+    Parameters
+    ----------
+    samples : array_like
+        Input array.
+    low : float or int, optional
+        Low boundary for circular mean range.  Default is 0.
+    high : float or int, optional
+        High boundary for circular mean range.  Default is 2*pi.
+
+    Returns
+    -------
+    circmean : float
+        Circular mean.
+
+    """
     ang = (samples - low)*2*pi / (high-low)
     res = angle(np.mean(exp(1j*ang), axis=0))
     if (res < 0):
@@ -1352,16 +1397,50 @@
     return res*(high-low)/2.0/pi + low
 
 def circvar(samples, high=2*pi, low=0):
-    """Compute the circular variance for samples assumed to be in the range [low to high]
     """
+    Compute the circular variance for samples assumed to be in the range
+    [low to high].
+
+    Parameters
+    ----------
+    samples : array_like
+        Input array.
+    low : float or int, optional
+        Low boundary for circular variance range.  Default is 0.
+    high : float or int, optional
+        High boundary for circular variance range.  Default is 2*pi.
+
+    Returns
+    -------
+    circvar : float
+        Circular variance.
+
+    """
     ang = (samples - low)*2*pi / (high-low)
     res = np.mean(exp(1j*ang), axis=0)
     V = 1-abs(res)
     return ((high-low)/2.0/pi)**2 * V
 
 def circstd(samples, high=2*pi, low=0):
-    """Compute the circular standard deviation for samples assumed to be in the range [low to high]
     """
+    Compute the circular standard deviation for samples assumed to be in the
+    range [low to high].
+
+    Parameters
+    ----------
+    samples : array_like
+        Input array.
+    low : float or int, optional
+        Low boundary for circular standard deviation range.  Default is 0.
+    high : float or int, optional
+        High boundary for circular standard deviation range.  Default is 2*pi.
+
+    Returns
+    -------
+    circstd : float
+        Circular standard deviation.
+
+    """
     ang = (samples - low)*2*pi / (high-low)
     res = np.mean(exp(1j*ang), axis=0)
     V = 1-abs(res)

Modified: trunk/scipy/stats/mstats_basic.py
===================================================================
--- trunk/scipy/stats/mstats_basic.py	2010-11-14 10:01:37 UTC (rev 6892)
+++ trunk/scipy/stats/mstats_basic.py	2010-11-14 10:02:01 UTC (rev 6893)
@@ -1465,21 +1465,45 @@
 kurtosis.__doc__ = stats.kurtosis.__doc__
 
 def describe(a, axis=0):
-    """Computes several descriptive statistics of the passed array.
+    """
+    Computes several descriptive statistics of the passed array.
 
     Parameters
     ----------
     a : array
+
     axis : int or None
 
     Returns
     -------
-    (size of the data (discarding missing values),
-     (min, max),
-     arithmetic mean,
-     unbiased variance,
-     biased skewness,
-     biased kurtosis)
+    n : int
+        Size of the data (discarding missing values).
+    mm : (int, int)
+        min, max
+
+    arithmetic mean : float
+
+    unbiased variance : float
+
+    biased skewness : float
+
+    biased kurtosis : float
+
+    Examples
+    --------
+
+    >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1])
+    >>> describe(ma)
+    (array(3),
+     (0, 2),
+     1.0,
+     1.0,
+     masked_array(data = 0.0,
+                 mask = False,
+           fill_value = 1e+20)
+    ,
+     -1.5)
+
     """
     a, axis = _chk_asarray(a, axis)
     n = a.count(axis)
@@ -1523,6 +1547,30 @@
 #####--------------------------------------------------------------------------
 
 def skewtest(a, axis=0):
+    """
+    Tests whether the skew is different from the normal distribution.
+
+    This function tests the null hypothesis that the skewness of
+    the population that the sample was drawn from is the same
+    as that of a corresponding normal distribution.
+
+    Parameters
+    ----------
+    a : array
+    axis : int or None
+
+    Returns
+    -------
+    z-score : float
+
+    p-value : float
+        a 2-sided p-value for the hypothesis test
+
+    Notes
+    -----
+    The sample size should be at least 8.
+
+    """
     a, axis = _chk_asarray(a, axis)
     if axis is None:
         a = a.ravel()
@@ -1544,6 +1592,33 @@
 skewtest.__doc__ = stats.skewtest.__doc__
 
 def kurtosistest(a, axis=0):
+    """
+    Tests whether a dataset has normal kurtosis.
+
+    This function tests the null hypothesis that the kurtosis
+    of the population from which the sample was drawn is that
+    of the normal distribution: kurtosis=3(n-1)/(n+1).
+
+    Parameters
+    ----------
+    a : array
+        array of the sample data
+    axis : int or None
+        the axis to operate along, or None to work on the whole array.
+        The default is the first axis.
+
+    Returns
+    -------
+    z-score : float
+
+    p-value : float
+        The 2-sided p-value for the hypothesis test
+
+    Notes
+    -----
+    Valid only for n>20.  The Z-score is set to 0 for bad entries.
+
+    """
     a, axis = _chk_asarray(a, axis)
     n = a.count(axis=axis).astype(float)
     if np.min(n) < 20:
@@ -1610,9 +1685,9 @@
 
     Parameters
     ----------
-    a : array-like
+    a : array_like
         Input data, as a sequence or array of dimension at most 2.
-    prob : array-like, optional
+    prob : array_like, optional
         List of quantiles to compute.
     alpha : float, optional
         Plotting positions parameter, default is 0.4.

Modified: trunk/scipy/stats/stats.py
===================================================================
--- trunk/scipy/stats/stats.py	2010-11-14 10:01:37 UTC (rev 6892)
+++ trunk/scipy/stats/stats.py	2010-11-14 10:02:01 UTC (rev 6893)
@@ -996,13 +996,16 @@
 
 
 def variation(a, axis=0):
-    """Computes the coefficient of variation, the ratio of the biased standard
+    """
+    Computes the coefficient of variation, the ratio of the biased standard
     deviation to the mean.
 
     Parameters
     ----------
-    a : array
+    a : array_like
+        Input array.
     axis : int or None
+        Axis along which to calculate the coefficient of variation.
 
     References
     ----------
@@ -1586,12 +1589,30 @@
 
 def cumfreq(a, numbins=10, defaultreallimits=None, weights=None):
     """
-Returns a cumulative frequency histogram, using the histogram function.
-Defaultreallimits can be None (use all data), or a 2-sequence containing
-lower and upper limits on values to include.
+    Returns a cumulative frequency histogram, using the histogram function.
 
-Returns: array of cumfreq bin values, lowerreallimit, binsize, extrapoints
-"""
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    numbins : int, optional
+        Number of bins.
+    defaultreallimits : 2-sequence or None, optional
+        None (use all data), or a 2-sequence containing lower and upper limits
+        on values to include.
+
+    Returns
+    -------
+    cumfreq : ndarray
+        Binned values of cumulative frequency.
+    lowerreallimit : float
+        Lower real limit
+    binsize : float
+        Width of each bin.
+    extrapoints : int
+        Extra points.
+
+    """
     h,l,b,e = histogram(a, numbins, defaultreallimits, weights=weights)
     cumhist = np.cumsum(h*1, axis=0)
     return cumhist,l,b,e
@@ -2099,12 +2120,27 @@
 
 def trim1(a, proportiontocut, tail='right'):
     """
-    Slices off the passed proportion of items from ONE end of the passed
-    array (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost'
-    10% of scores).  Slices off LESS if proportion results in a non-integer
-    slice index (i.e., conservatively slices off proportiontocut).
+    Slices off a proportion of items from ONE end of the passed array
+    distribution.
 
-    Returns: trimmed version of array a
+    If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost'
+    10% of scores.  Slices off LESS if proportion results in a non-integer
+    slice index (i.e., conservatively slices off `proportiontocut` ).
+
+    Parameters
+    ----------
+    a : array_like
+        Input array
+    proportiontocut : float
+        Fraction to cut off of 'left' or 'right' of distribution
+    tail : string, {'left', 'right'}, optional
+        Defaults to 'right'.
+
+    Returns
+    -------
+    trim1 : ndarray
+        Trimmed version of array `a`
+
     """
     a = asarray(a)
     if tail.lower() == 'right':
@@ -2116,9 +2152,27 @@
     return a[lowercut:uppercut]
 
 def trim_mean(a, proportiontocut):
-    """Return mean with proportiontocut chopped from each of the lower and
-    upper tails.
     """
+    Return mean of array after trimming distribution from both lower and upper
+    tails.
+
+    If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of
+    scores. Slices off LESS if proportion results in a non-integer slice
+    index (i.e., conservatively slices off `proportiontocut` ).
+
+    Parameters
+    ----------
+    a : array_like
+        Input array
+    proportiontocut : float
+        Fraction to cut off of both tails of the distribution
+
+    Returns
+    -------
+    trim_mean : ndarray
+        Mean of trimmed array.
+
+    """
     newa = trimboth(np.sort(a),proportiontocut)
     return np.mean(newa,axis=0)
 
@@ -3466,20 +3520,23 @@
                                       new_name="scipy.special.erfc")
 
 def chisqprob(chisq, df):
-    """Returns the (1-tail) probability value associated with the provided
-    chi-square value and degrees of freedom.
+    """
+    Probability value (1-tail) for the Chi^2 probability distribution.
 
     Broadcasting rules apply.
 
     Parameters
     ----------
-    chisq : array or float > 0
-    df : array or float, probably int >= 1
+    chisq : array_like or float > 0
 
+    df : array_like or float, probably int >= 1
+
     Returns
     -------
-    The area from chisq to infinity under the Chi^2 probability distribution
-    with degrees of freedom df.
+    chisqprob : ndarray
+        The area from `chisq` to infinity under the Chi^2 probability
+        distribution with degrees of freedom `df`.
+
     """
     return special.chdtrc(df,chisq)
 
@@ -3487,7 +3544,8 @@
 fprob = special.fdtrc
 
 def betai(a, b, x):
-    """Returns the incomplete beta function.
+    """
+    Returns the incomplete beta function.
 
     I_x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt)
 
@@ -3498,13 +3556,17 @@
 
     Parameters
     ----------
-    a : array or float > 0
-    b : array or float > 0
-    x : array or float
+    a : array_like or float > 0
+
+    b : array_like or float > 0
+
+    x : array_like or float
         x will be clipped to be no greater than 1.0 .
 
     Returns
     -------
+    betai : ndarray
+        Incomplete beta function.
 
     """
     x = np.asarray(x)