[Numpy-svn] r3644 - in trunk/numpy/lib: . tests

Mon Apr 2 14:25:55 EDT 2007

Author: oliphant
Date: 2007-04-02 13:25:50 -0500 (Mon, 02 Apr 2007)
New Revision: 3644

Modified:
   trunk/numpy/lib/function_base.py
   trunk/numpy/lib/tests/test_function_base.py
   trunk/numpy/lib/tests/test_twodim_base.py
   trunk/numpy/lib/twodim_base.py
Log:
Add patch in Ticket #189 for histogramdd.  Fixes bug reported by Ben Granett

Modified: trunk/numpy/lib/function_base.py
===================================================================

--- trunk/numpy/lib/function_base.py	2007-04-02 13:27:42 UTC (rev 3643)
+++ trunk/numpy/lib/function_base.py	2007-04-02 18:25:50 UTC (rev 3644)
@@ -1,3 +1,4 @@
+__docformat__ = "restructuredtext en"
 __all__ = ['logspace', 'linspace',
            'select', 'piecewise', 'trim_zeros',
            'copy', 'iterable', #'base_repr', 'binary_repr',
@@ -117,46 +118,49 @@
     else:
         return n, bins
 
-def histogramdd(sample, bins=10, range=None, normed=False):
-    """histogramdd(sample, bins = 10, range = None, normed = False) -> H, edges
+def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
+    """histogramdd(sample, bins=10, range=None, normed=False, weights=None)                                                                
 
-    Return the D-dimensional histogram computed from sample.
+    Return the D-dimensional histogram of the sample.
 
-    Parameters
-    ----------
-    sample: A sequence of D arrays, or an NxD array.
-    bins:   A sequence of edge arrays, or a sequence of the number of bins.
-            If a scalar is given, it is assumed to be the number of bins
-            for all dimensions.
-    range:  A sequence of lower and upper bin edges (default: [min, max]).
-    normed: If False, returns the number of samples in each bin.
-            If True, returns the frequency distribution.
+    :Parameters:
+      - `sample` : A sequence of D arrays, or an NxD array. 
+      - `bins` : A sequence of edge arrays, a sequence of bin number, 
+                or a scalar (the number of bins for all dimensions.) 
+      - `range` : A sequence of lower and upper bin edges (default: [min, max]).
+      - `normed` : Boolean, if False, return the number of samples in each bin,
+                if True, returns the density.
+      - `weights` : An array of weights. The weights are normed only if normed is True. 
+             Should weights.sum() not equal N, the total bin count will 
+             not be equal to the number of samples.
 
+    :Return:
+      - `hist` : Histogram array.
+      - `edges` : List of arrays defining the bin edges. 
+    
 
-    Output
-    ------
-    H:      Histogram array.
-    edges:  List of arrays defining the bin edges.
-
     Example:
-    x = random.randn(100,3)
-    H, edges = histogramdd(x, bins = (5, 6, 7))
+      >>> x = random.randn(100,3)
+      >>> hist3d, edges = histogramdd(x, bins = (5, 6, 7))
 
-    See also: histogram
+    :SeeAlso: histogram
 
     """
 
-    try:
+    try: 
+        # Sample is an ND-array.
         N, D = sample.shape
-    except (AttributeError, ValueError):
-        ss = atleast_2d(sample)
-        sample = ss.transpose()
+    except (AttributeError, ValueError): 
+        # Sample is a sequence of 1D arrays.
+        sample = atleast_2d(sample).T
         N, D = sample.shape
 
     nbin = empty(D, int)
     edges = D*[None]
     dedges = D*[None]
-
+    if weights is not None:
+        weights = asarray(weights)
+    
     try:
         M = len(bins)
         if M != D:
@@ -164,6 +168,8 @@
     except TypeError:
         bins = D*[bins]
 
+    # Select range for each dimension
+    # Used only if number of bins is given.
     if range is None:
         smin = atleast_1d(sample.min(0))
         smax = atleast_1d(sample.max(0))
@@ -173,39 +179,37 @@
         for i in arange(D):
             smin[i], smax[i] = range[i]
 
+    # Create edge arrays
     for i in arange(D):
         if isscalar(bins[i]):
-            nbin[i] = bins[i]
-            edges[i] = linspace(smin[i], smax[i], nbin[i]+1)
+            nbin[i] = bins[i] + 2 # +2 for outlier bins
+            edges[i] = linspace(smin[i], smax[i], nbin[i]-1)
         else:
             edges[i] = asarray(bins[i], float)
-            nbin[i] = len(edges[i])-1
-
-
-
+            nbin[i] = len(edges[i])+1  # +1 for outlier bins
+        dedges[i] = diff(edges[i])
+        
+    nbin =  asarray(nbin)
+    
+    # Compute the bin number each sample falls into. 
     Ncount = {}
-    nbin =  asarray(nbin)
-
     for i in arange(D):
         Ncount[i] = digitize(sample[:,i], edges[i])
-        dedges[i] = diff(edges[i])
-    # Remove values falling outside of bins
-    # Values that fall on an edge are put in the right bin.
+    
+    # Using digitize, values that fall on an edge are put in the right bin.
     # For the rightmost bin, we want values equal to the right
     # edge to be counted in the last bin, and not as an outlier.
     outliers = zeros(N, int)
     for i in arange(D):
+        # Rounding precision
         decimal = int(-log10(dedges[i].min())) +6
+        # Find which points are on the rightmost edge.
         on_edge = where(around(sample[:,i], decimal) == around(edges[i][-1], decimal))[0]
+        # Shift these points one bin to the left. 
         Ncount[i][on_edge] -= 1
-        outliers += (Ncount[i] == 0) | (Ncount[i] == nbin[i]+1)
-    indices = where(outliers == 0)[0]
-    for i in arange(D):
-        Ncount[i] = Ncount[i][indices] - 1
-    N = len(indices)
 
     # Flattened histogram matrix (1D)
-    hist = zeros(nbin.prod(), int)
+    hist = zeros(nbin.prod(), float)
 
     # Compute the sample indices in the flattened histogram matrix.
     ni = nbin.argsort()
@@ -213,29 +217,33 @@
     xy = zeros(N, int)
     for i in arange(0, D-1):
         xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod()
-
     xy += Ncount[ni[-1]]
 
     # Compute the number of repetitions in xy and assign it to the flattened histmat.
     if len(xy) == 0:
-        return zeros(nbin, int)
+        return zeros(nbin-2, int), edges
 
-    flatcount = bincount(xy)
+    flatcount = bincount(xy, weights)
     a = arange(len(flatcount))
     hist[a] = flatcount
 
     # Shape into a proper matrix
     hist = hist.reshape(sort(nbin))
-    for i,j in enumerate(ni):
+    for i in arange(nbin.size):
+        j = ni[i]
         hist = hist.swapaxes(i,j)
-        if (hist.shape == nbin).all():
-            break
+        ni[i],ni[j] = ni[j],ni[i]
 
+    # Remove outliers (indices 0 and -1 for each dimension).
+    core = D*[slice(1,-1)]
+    hist = hist[core]
+    
+    # Normalize if normed is True
     if normed:
         s = hist.sum()
         for i in arange(D):
             shape = ones(D, int)
-            shape[i] = nbin[i]
+            shape[i] = nbin[i]-2
             hist = hist / dedges[i].reshape(shape)
         hist /= s
 

Modified: trunk/numpy/lib/tests/test_function_base.py
===================================================================
--- trunk/numpy/lib/tests/test_function_base.py	2007-04-02 13:27:42 UTC (rev 3643)
+++ trunk/numpy/lib/tests/test_function_base.py	2007-04-02 18:25:50 UTC (rev 3644)
@@ -365,7 +365,7 @@
         [.5, .5, 1.5], [.5, 1.5, 2.5], [.5, 2.5, 2.5]])
         H, edges = histogramdd(x, (2,3,3), range = [[-1,1], [0,3], [0,3]])
         answer = asarray([[[0,1,0], [0,0,1], [1,0,0]], [[0,1,0], [0,0,1], [0,0,1]]])
-        assert(all(H == answer))
+        assert_array_equal(H,answer)
         # Check normalization
         ed = [[-2,0,2], [0,1,2,3], [0,1,2,3]]
         H, edges = histogramdd(x, bins = ed, normed = True)
@@ -383,6 +383,27 @@
                           [[0,0],[0,0],[0,0]]])
         assert_array_equal(H, answer)
 
+        Z = zeros((5,5,5))
+        Z[range(5), range(5), range(5)] = 1.
+        H,edges = histogramdd([arange(5), arange(5), arange(5)], 5)
+        assert_array_equal(H, Z)
+    
+    def check_shape(self):
+        x = rand(100,3)
+        hist3d, edges = histogramdd(x, bins = (5, 7, 6))
+        assert_array_equal(hist3d.shape, (5,7,6))
+        
+    def check_weights(self):
+        v = rand(100,2)
+        hist, edges = histogramdd(v)
+        n_hist, edges = histogramdd(v, normed=True)
+        w_hist, edges = histogramdd(v, weights=ones(100))
+        assert_array_equal(w_hist, hist)
+        w_hist, edges = histogramdd(v, weights=ones(100)*2, normed=True)
+        assert_array_equal(w_hist, n_hist)
+        w_hist, edges = histogramdd(v, weights=ones(100, int)*2)
+        assert_array_equal(w_hist, 2*hist)
+            
 
 class test_unique(NumpyTestCase):
     def check_simple(self):

Modified: trunk/numpy/lib/tests/test_twodim_base.py
===================================================================
--- trunk/numpy/lib/tests/test_twodim_base.py	2007-04-02 13:27:42 UTC (rev 3643)
+++ trunk/numpy/lib/tests/test_twodim_base.py	2007-04-02 18:25:50 UTC (rev 3644)
@@ -149,6 +149,13 @@
                            [0, 0, 0, 0, 0, 0, 0, 0, 0],
                            [0, 0, 0, 0, 0, 0, 0, 0, 0]])
         assert_array_equal(H.T, answer)
+        H = histogram2d(x, y, xedges)[0]
+        assert_array_equal(H.T, answer)
+        H,xedges,yedges = histogram2d(range(10),range(10))
+        assert_array_equal(H, eye(10,10))
+        assert_array_equal(xedges, np.linspace(0,9,11))
+        assert_array_equal(yedges, np.linspace(0,9,11))
+        
     def check_asym(self):
         x = array([1, 1, 2, 3, 4, 4, 4, 5])
         y = array([1, 3, 2, 0, 1, 2, 3, 4])
@@ -160,6 +167,8 @@
                         [0,1,1,1,0],
                         [0,0,0,0,1]])
         assert_array_almost_equal(H, answer/8., 3)
+        assert_array_equal(xed, np.linspace(0,6,7))
+        assert_array_equal(yed, np.linspace(0,5,6))
     def check_norm(self):
         x = array([1,2,3,1,2,3,1,2,3])
         y = array([1,1,1,2,2,2,3,3,3])
@@ -169,5 +178,10 @@
                      [.5,.5,.25]])/9.
         assert_array_almost_equal(H, answer, 3)
 
+    def check_all_outliers(self):
+        r = rand(100)+1.
+        H, xed, yed = histogram2d(r, r, (4, 5), range=([0,1], [0,1]))
+        assert_array_equal(H, 0)
+        
 if __name__ == "__main__":
     NumpyTest().run()

Modified: trunk/numpy/lib/twodim_base.py
===================================================================
--- trunk/numpy/lib/twodim_base.py	2007-04-02 13:27:42 UTC (rev 3643)
+++ trunk/numpy/lib/twodim_base.py	2007-04-02 18:25:50 UTC (rev 3644)
@@ -143,105 +143,42 @@
         X[:,i] = x**(N-i-1)
     return X
 
-def histogram2d(x,y, bins=10, range=None, normed=False):
+    
+def histogram2d(x,y, bins=10, range=None, normed=False, weights=None):
     """histogram2d(x,y, bins=10, range=None, normed=False) -> H, xedges, yedges
 
     Compute the 2D histogram from samples x,y.
 
-    Parameters
-    ----------
-    x,y: 1D data series. Both arrays must have the same length.
-    bins: Number of bins -or- [nbin x, nbin y] -or-
-         [bin edges] -or- [x bin edges, y bin edges].
-    range:  A sequence of lower and upper bin edges (default: [min, max]).
-    normed: True or False.
-
-    The histogram array is a count of the number of samples in each
-    two dimensional bin.
-    Setting normed to True returns a density rather than a bin count.
+    :Parameters:
+      - `x,y` : Sample arrays (1D).
+      - `bins` : Number of bins -or- [nbin x, nbin y] -or- 
+             [bin edges] -or- [x bin edges, y bin edges].
+      - `range` : A sequence of lower and upper bin edges (default: [min, max]).
+      - `normed` : Boolean, if False, return the number of samples in each bin,
+                if True, returns the density.
+      - `weights` : An array of weights. The weights are normed only if normed 
+                is True. Should weights.sum() not equal N, the total bin count \
+                will not be equal to the number of samples.
+    
+    :Return:
+      - `hist` :    Histogram array.
+      - `xedges, yedges` : Arrays defining the bin edges. 
+    
+    Example:
+      >>> x = random.randn(100,2)
+      >>> hist2d, xedges, yedges = histogram2d(x, bins = (6, 7))
+    
+    :SeeAlso: histogramdd
     """
-    import numpy as np
+    from numpy import histogramdd
+    
     try:
         N = len(bins)
     except TypeError:
         N = 1
-        bins = [bins]
-    x = asarray(x)
-    y = asarray(y)
-    if range is None:
-        xmin, xmax = x.min(), x.max()
-        ymin, ymax = y.min(), y.max()
-    else:
-        xmin, xmax = range[0]
-        ymin, ymax = range[1]
-    if N == 2:
-        if np.isscalar(bins[0]):
-            xnbin = bins[0]
-            xedges = np.linspace(xmin, xmax, xnbin+1)
-        else:
-            xedges = asarray(bins[0], float)
-            xnbin = len(xedges)-1
-        if np.isscalar(bins[1]):
-            ynbin = bins[1]
-            yedges = np.linspace(ymin, ymax, ynbin+1)
-        else:
-            yedges = asarray(bins[1], float)
-            ynbin = len(yedges)-1
-    elif N == 1:
-        ynbin = xnbin = bins[0]
-        xedges = np.linspace(xmin, xmax, xnbin+1)
-        yedges = np.linspace(ymin, ymax, ynbin+1)
-    else:
-        yedges = asarray(bins, float)
-        xedges = yedges.copy()
-        ynbin = len(yedges)-1
-        xnbin = len(xedges)-1
 
-    dxedges = np.diff(xedges)
-    dyedges = np.diff(yedges)
-
-    # Flattened histogram matrix (1D)
-    hist = np.zeros((xnbin)*(ynbin), int)
-
-    # Count the number of sample in each bin (1D)
-    xbin = np.digitize(x,xedges)
-    ybin = np.digitize(y,yedges)
-
-    # Values that fall on an edge are put in the right bin.
-    # For the rightmost bin, we want values equal to the right
-    # edge to be counted in the last bin, and not as an outlier.
-    xdecimal = int(-np.log10(dxedges.min()))+6
-    ydecimal = int(-np.log10(dyedges.min()))+6
-    on_edge_x = np.where(np.around(x,xdecimal) == np.around(xedges[-1], xdecimal))[0]
-    on_edge_y = np.where(np.around(y,ydecimal) == np.around(yedges[-1], ydecimal))[0]
-    xbin[on_edge_x] -= 1
-    ybin[on_edge_y] -= 1
-    # Remove the true outliers
-    outliers = (xbin==0) | (xbin==xnbin+1) | (ybin==0) | (ybin == ynbin+1)
-    xbin = xbin[outliers==False] - 1
-    ybin = ybin[outliers==False] - 1
-
-    # Compute the sample indices in the flattened histogram matrix.
-    if xnbin >= ynbin:
-        xy = ybin*(xnbin) + xbin
-
-    else:
-        xy = xbin*(ynbin) + ybin
-
-
-    # Compute the number of repetitions in xy and assign it to the flattened
-    #  histogram matrix.
-
-    flatcount = np.bincount(xy)
-    indices = np.arange(len(flatcount))
-    hist[indices] = flatcount
-
-    shape = np.sort([xnbin, ynbin])
-    # Shape into a proper matrix
-    histmat = hist.reshape(shape)
-    if (shape == (ynbin, xnbin)).all():
-        histmat = histmat.T
-    if normed:
-        diff2 = np.outer(dxedges, dyedges)
-        histmat = histmat / diff2 / histmat.sum()
-    return histmat, xedges, yedges
+    if N != 1 and N != 2:
+        xedges = yedges = asarray(bins, float)
+        bins = [xedges, yedges]
+    hist, edges = histogramdd([x,y], bins, range, normed, weights)
+    return hist, edges[0], edges[1]