[SciPy-user] indices of consecutive elements

Tue Dec 2 12:13:16 EST 2008

Daniel,
I coded a generic class that does what you want. It's not optimized,
but it should at least get you started. Let me know if you find it useful
and if you find ways to tweak it...
Cheers,
P.

_____

class Cluster(object):
    """
    Group consecutive data from an array according to a clustering condition.

    A cluster is a run of consecutive values (in flattened, C order) for
    which ``operator(abs(difference between neighbours), increment)`` holds.
    With the defaults, neighbours belong to the same cluster when they
    differ by at most ``increment``.

    Missing values are **not** handled: the input sequence must therefore
    be free of missing values.

    The enclosing module is expected to provide NumPy under the name ``np``.

    Parameters
    ----------
    darray : ndarray
        Input data array to clusterize.
    increment : {float}, optional
        Increment between two consecutive values to group.
        By default, use a value of 1.
    operator : {function}, optional
        Comparison operator for the definition of clusters.
        By default, use :func:`numpy.less_equal`.

    Attributes
    ----------
    inishape
        Shape of the argument array (stored for resizing).
    inisize
        Size of the argument array.
    uniques : ndarray
        First value of each cluster, in order of appearance.
    slices : ndarray
        Object array of the slices (into the flattened input) corresponding
        to each cluster of data.
    starts : ndarray
        Array of the (flattened) indices at which the clusters start.
    sizes : ndarray
        Number of elements in each cluster.
    clustered : list
        List of clustered data (one 1-D array per cluster).

    Examples
    --------
    >>> A = [0, 0, 1, 2, 2, 2, 3, 4, 3, 4, 4, 4]
    >>> klust = Cluster(A, 0)
    >>> [_.tolist() for _ in klust.clustered]
    [[0, 0], [1], [2, 2, 2], [3], [4], [3], [4, 4, 4]]
    >>> klust.uniques.tolist()
    [0, 1, 2, 3, 4, 3, 4]

    >>> x = [ 1.8, 1.3, 2.4, 1.2, 2.5, 3.9, 1. , 3.8, 4.2, 3.3,
    ...       1.2, 0.2, 0.9, 2.7, 2.4, 2.8, 2.7, 4.7, 4.2, 0.4]
    >>> Cluster(x, 1.5).starts.tolist()
    [0, 6, 7, 10, 13, 17, 19]
    >>> Cluster(x, 2.5).starts.tolist()
    [0, 6, 7, 19]
    >>> Cluster(x, 2.5, np.greater).starts.tolist()
    [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
    >>> y = [ 0, -1, 0, 0, 0, 1, 1, -1, -1, -1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]
    >>> Cluster(y, 0).starts.tolist()
    [0, 1, 2, 5, 7, 10, 12, 16, 18]

    """

    def __init__(self, darray, increment=1, operator=np.less_equal):
        """
        Initialize the instance and compute all the cluster attributes.

        Parameters
        ----------
        darray : ndarray
            Input data array to clusterize.
        increment : {float}, optional
            Increment between two consecutive values to group.
            By default, use a value of 1.
        operator : {function}, optional
            Comparison operator for the definition of clusters.
            By default, use :func:`np.less_equal`

        Raises
        ------
        numpy.ma.MAError
            If ``darray`` has a ``mask`` attribute with at least one
            masked entry.
        """
        if hasattr(darray, 'mask') and darray.mask.any():
            raise np.ma.MAError(
                "Masked arrays should be filled prior clustering.")
        darray = np.asanyarray(darray)
        # All the index bookkeeping is done on the flattened data, so that
        # N-d inputs are clustered consistently with ``starts``/``slices``.
        flat = darray.ravel()
        n = flat.size
        self.inishape = darray.shape
        self.inisize = n
        if n == 0:
            # A single boundary means "no cluster at all": every derived
            # attribute below ends up empty instead of raising.
            bounds = np.zeros(1, dtype=int)
        else:
            grouped = np.asarray(
                operator(np.absolute(np.diff(flat)), increment), dtype=bool)
            # Element i+1 opens a new cluster when the condition fails
            # between elements i and i+1.
            breaks = np.flatnonzero(np.logical_not(grouped)) + 1
            bounds = np.concatenate(([0], breaks, [n]))
        starts = bounds[:-1]
        stops = bounds[1:]
        # Fill an explicitly object-typed array: this keeps the dtype stable
        # (and boolean indexing usable) even when there are no clusters.
        slobj = np.empty(len(starts), dtype=object)
        for (pos, (i, d)) in enumerate(zip(starts, stops)):
            slobj[pos] = slice(int(i), int(d))
        #
        self.uniques = flat[starts]
        self.clustered = [flat[k] for k in slobj]
        self.sizes = np.asarray(stops - starts)
        self.slices = slobj
        self.starts = starts

    def markonsize(self, operator, sizethresh):
        """
        Create a **mask** for the clusters that do not meet a size requirement.

        The output is ``False`` where the size requirement is met, ``True``
        otherwise.

        Parameters
        ----------
        operator : function
            Comparison operator, called as ``operator(sizes, sizethresh)``.
        sizethresh : float
            Requirement for the sizes of the clusters

        Returns
        -------
        ndarray
            Boolean array with the shape of the initial input.
        """
        resmask = np.ones(self.inisize, dtype=bool)
        selected = np.asarray(operator(self.sizes, sizethresh), dtype=bool)
        for k in self.slices[selected]:
            resmask[k] = False
        return resmask.reshape(self.inishape)

    def mark_greaterthan(self, sizemin):
        """
        Shortcut for :meth:`markonsize(greater_equal,sizemin)`.

        The output is ``False`` for clusters holding at least ``sizemin``
        elements, and ``True`` for clusters smaller than ``sizemin``.

        Parameters
        ----------
        sizemin : int
            Minimum size of the clusters.

        See Also
        --------
        :meth:`markonsize`
            Creates a **mask** for the clusters that do not meet a size
            requirement.
        """
        return self.markonsize(np.greater_equal, sizemin)

    def grouped_slices(self):
        """
        Return a dictionary with the unique values of ``self.uniques`` as
        keys, and a list of slices for the corresponding values.

        See Also
        --------
        :meth:`~Cluster.grouped_limits`
            that does the same thing with (start, stop) pairs
        """
        # Keys are created up front from np.unique, hence in sorted order.
        output = {k: [] for k in np.unique(self.uniques)}
        for (k, v) in zip(self.uniques, self.slices):
            output[k].append(v)
        return output

    def grouped_limits(self):
        """
        Return a dictionary with the unique values of ``self.uniques`` as
        keys, and an array of (starting index, ending index) pairs for the
        corresponding values.

        The ending index is the ``stop`` of the cluster slice, i.e. it is
        exclusive.

        See Also
        --------
        :meth:`~Cluster.grouped_slices`
        """
        output = {k: [] for k in np.unique(self.uniques)}
        for (k, v) in zip(self.uniques, self.slices):
            output[k].append((v.start, v.stop))
        for k in output:
            output[k] = np.array(output[k])
        return output

_____

On Dec 2, 2008, at 11:43 AM, Daniel Ashbrook wrote:

> I'm trying to figure out a way to return the indices of the start and
> end of a run of consecutive elements that match some condition, but  
> only
> if there are more than a certain number.
>
> For example, take the array (with indices in comment for clarity):
>
> #0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22
> [0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0]
>
> I want to find the start and end indices of all runs of 1s with length
> of 4 or longer; so here the answer would be:
>
> [[2,5], [15,18]]
>
> Is there a reasonable way to do this without looping? I've been  
> playing
> around with diff() and where() but without too much progress.
>
> Thanks,
>
>
> dan
> _______________________________________________
> SciPy-user mailing list
> SciPy-user at scipy.org
> http://projects.scipy.org/mailman/listinfo/scipy-user