[Numpy-discussion] List/location of consecutive integers

Fri May 22 16:15:19 EDT 2009

On May 22, 2009, at 12:31 PM, Andrea Gavana wrote:

> Hi All,
>
>    this should be a very easy question but I am trying to make a
> script run as fast as possible, so please bear with me if the solution
> is easy and I just overlooked it.
>
> I have a list of integers, like this one:
>
> indices = [1,2,3,4,5,6,7,8,9,255,256,257,258,10001,10002,10003,10004]
>
>> From this list, I would like to find out which values are consecutive
> and store them in another list of tuples (begin_consecutive,
> end_consecutive) or a simple list: as an example, the previous list
> will become:
>
> new_list = [(1, 9), (255, 258), (10001, 10004)]

Josef's and Chris's solutions are pretty neat in this case. I've been  
recently working on a more generic case where integers are grouped  
depending on some condition (equals, differing by 1 or 2...). A  
version in pure Python/numpy, the `Cluster` class is available in  
scikits.hydroclimpy.core.tools (hydroclimpy.sourceforge.net).  
Otherwise, here's a Cython version of the same class.
Let me know if it works. And I'm not ultra happy with the name, so if  
you have any suggestions...

cdef class Brackets:
     """
     Groups consecutive data from an array according to a clustering  
condition.

     A cluster is defined as a group of consecutive values differing  
by at most the
     increment value.

     Missing values are **not** handled: the input sequence must  
therefore be free
     of missing values.

     Parameters
     ----------
     darray : ndarray
         Input data array to clusterize.
     increment : {float}, optional
         Increment between two consecutive values to group.
         By default, use a value of 1.
     operator : {function}, optional
         Comparison operator for the definition of clusters.
         By default, use :func:`numpy.less_equal`.

     Attributes
     ----------
     inishape
         Shape of the argument array (stored for resizing).
     inisize
         Size of the argument array.
     uniques : sequence
         List of unique cluster values, as they appear in  
chronological order.
     slices : sequence
         List of the slices corresponding to each cluster of data.
     starts : ndarray
         Lists of the indices at which the clusters start.
     ends : ndarray
         Lists of the indices at which the clusters end.
     clustered : list
         List of clustered data.

     Examples
     --------
     >>> A = [0, 0, 1, 2, 2, 2, 3, 4, 3, 4, 4, 4]
     >>> klust = cluster(A,0)
     >>> [list(_) for _ in klust.clustered]
     [[0, 0], [1], [2, 2, 2], [3], [4], [3], [4, 4, 4]]
     >>> klust.uniques
     array([0, 1, 2, 3, 4, 3, 4])

     >>> x = [ 1.8, 1.3, 2.4, 1.2, 2.5, 3.9, 1. , 3.8, 4.2, 3.3,
     ...       1.2, 0.2, 0.9, 2.7, 2.4, 2.8, 2.7, 4.7, 4.2, 0.4]
     >>> Brackets(x,1).starts
     array([ 0,  2,  3,  4,  5,  6,  7, 10, 11, 13, 17, 19])
     >>> Brackets(x,1.5).starts
     array([ 0,  6,  7, 10, 13, 17, 19])
     >>> Brackets(x,2.5).starts
     array([ 0,  6,  7, 19])
     >>> Brackets(x,2.5,greater).starts
     array([ 0,  1,  2,  3,  4,  5,  8,  9, 10,
     ...    11, 12, 13, 14, 15, 16, 17, 18])
     >>> y = [ 0, -1, 0, 0, 0, 1, 1, -1, -1, -1, 1, 1, 0, 0, 0, 0, 1,  
1, 0, 0]
     >>> Brackets(y,1).starts
     array([ 0,  1,  2,  5,  7, 10, 12, 16, 18])

     """

     cdef readonly double increment
     cdef readonly np.ndarray data
     cdef readonly list _starts
     cdef readonly list _ends

     def __init__(Brackets self, object data, double increment=1,
                  object operator=np.less_equal):
         """
         """
         cdef int i, n, ifirst, ilast, test
         cdef double last
         cdef list starts, ends
         #
         self.increment = increment
         self.data = np.asanyarray(data)
         data = np.asarray(data)
         #
         n = len(data)
         starts = []
         ends = []
         #
         last = data[0]
         ifirst = 0
         ilast = 0
         for 1 <= i < n:
             test = operator(abs(data[i] - last), increment)
             ilast = i
             if not test:
                 starts.append(ifirst)
                 ends.append(ilast-1)
                 ifirst = i
             last = data[i]
         starts.append(ifirst)
         ends.append(n-1)
         self._starts = starts
         self._ends = ends

     def __len__(self):
         return len(self.starts)

     property starts:
         #
         def __get__(Brackets self):
             return np.asarray(self._starts)

     property ends:
         #
         def __get__(Brackets self):
             return np.asarray(self._ends)

     property sizes:
         #
         def __get__(Brackets self):
             return np.asarray(self._ends) - np.asarray(self._firsts)

     property slices:
         #
         def __get__(Brackets self):
             cdef int i
             cdef list starts = self._starts, ends = self._ends
             cdef list slices = []
             for 0 <= i < len(starts):
                 slices.append(slice(starts[i], ends[i]+1))
             return slices

     property clustered:
         #
         def __get__(self):
             cdef int i
             cdef list starts = self._starts, ends = self._ends
             cdef list groups = []
             cdef np.ndarray data = self.data
             for 0 <= i < len(starts):
                 groups.append(data[starts[i]:ends[i]+1])
             return groups

     property uniques:
         def __get__(self):
             return self.data[self.starts]

     def grouped_slices(self):
         """
     Returns a dictionary with the unique values of ``self`` as keys,  
and a list
     of slices for the corresponding values.

     See Also
     --------
     Brackets.grouped_limits
         that does the same thing
         """
         # Define shortcuts
         cdef int i, ifirst, n = len(self)
         cdef list starts = self._starts, ends = self._ends
         cdef np.ndarray data = self.data
         # Define new variables
         cdef list seen = []
         cdef double value
         cdef dict grouped = {}
         for 0 <= i < n:
             ifirst = starts[i]
             value = data[ifirst]
             if not (value in seen):
                 grouped[value] = []
                 seen.append(value)
             grouped[value].append(slice(ifirst, ends[i]+1))
         return grouped

     def grouped_limits(self):
         """
     Returns a dictionary with the unique values of ``self`` as keys,  
and a list
     of tuples (starting index, ending index) for the corresponding  
values.

     See Also
     --------
     Cluster.grouped_slices
         """
         # Define shortcuts
         cdef int i, ifirst, n = len(self)
         cdef list starts = self.starts, ends = self.ends
         cdef np.ndarray data = self.data
         # Define new variables
         cdef list seen = []
         cdef double value
         cdef dict grouped = {}
         for 0 <= i < n:
             ifirst = starts[i]
             value = data[ifirst]
             if not (value in seen):
                 grouped[value] = []
                 seen.append(value)
             grouped[value].append((ifirst, ends[i]+1))
         for k in grouped:
             grouped[k] = np.asarray(grouped[k])
         return grouped