[Numpy-discussion] List/location of consecutive integers
Pierre GM
pgmdevlist at gmail.com
Fri May 22 16:15:19 EDT 2009
On May 22, 2009, at 12:31 PM, Andrea Gavana wrote:
> Hi All,
>
> this should be a very easy question but I am trying to make a
> script run as fast as possible, so please bear with me if the solution
> is easy and I just overlooked it.
>
> I have a list of integers, like this one:
>
> indices = [1,2,3,4,5,6,7,8,9,255,256,257,258,10001,10002,10003,10004]
>
>> From this list, I would like to find out which values are consecutive
> and store them in another list of tuples (begin_consecutive,
> end_consecutive) or a simple list: as an example, the previous list
> will become:
>
> new_list = [(1, 9), (255, 258), (10001, 10004)]
Josef's and Chris's solutions are pretty neat in this case. I've been
recently working on a more generic case where integers are grouped
depending on some condition (equals, differing by 1 or 2...). A
version in pure Python/numpy, the `Cluster` class is available in
scikits.hydroclimpy.core.tools (hydroclimpy.sourceforge.net).
Otherwise, here's a Cython version of the same class.
Let me know if it works. And I'm not ultra happy with the name, so if
you have any suggestions...
cdef class Brackets:
"""
Groups consecutive data from an array according to a clustering
condition.
A cluster is defined as a group of consecutive values differing
by at most the
increment value.
Missing values are **not** handled: the input sequence must
therefore be free
of missing values.
Parameters
----------
darray : ndarray
Input data array to clusterize.
increment : {float}, optional
Increment between two consecutive values to group.
By default, use a value of 1.
operator : {function}, optional
Comparison operator for the definition of clusters.
By default, use :func:`numpy.less_equal`.
Attributes
----------
inishape
Shape of the argument array (stored for resizing).
inisize
Size of the argument array.
uniques : sequence
List of unique cluster values, as they appear in
chronological order.
slices : sequence
List of the slices corresponding to each cluster of data.
starts : ndarray
Lists of the indices at which the clusters start.
ends : ndarray
Lists of the indices at which the clusters end.
clustered : list
List of clustered data.
Examples
--------
>>> A = [0, 0, 1, 2, 2, 2, 3, 4, 3, 4, 4, 4]
>>> klust = cluster(A,0)
>>> [list(_) for _ in klust.clustered]
[[0, 0], [1], [2, 2, 2], [3], [4], [3], [4, 4, 4]]
>>> klust.uniques
array([0, 1, 2, 3, 4, 3, 4])
>>> x = [ 1.8, 1.3, 2.4, 1.2, 2.5, 3.9, 1. , 3.8, 4.2, 3.3,
... 1.2, 0.2, 0.9, 2.7, 2.4, 2.8, 2.7, 4.7, 4.2, 0.4]
>>> Brackets(x,1).starts
array([ 0, 2, 3, 4, 5, 6, 7, 10, 11, 13, 17, 19])
>>> Brackets(x,1.5).starts
array([ 0, 6, 7, 10, 13, 17, 19])
>>> Brackets(x,2.5).starts
array([ 0, 6, 7, 19])
>>> Brackets(x,2.5,greater).starts
array([ 0, 1, 2, 3, 4, 5, 8, 9, 10,
... 11, 12, 13, 14, 15, 16, 17, 18])
>>> y = [ 0, -1, 0, 0, 0, 1, 1, -1, -1, -1, 1, 1, 0, 0, 0, 0, 1,
1, 0, 0]
>>> Brackets(y,1).starts
array([ 0, 1, 2, 5, 7, 10, 12, 16, 18])
"""
cdef readonly double increment
cdef readonly np.ndarray data
cdef readonly list _starts
cdef readonly list _ends
def __init__(Brackets self, object data, double increment=1,
object operator=np.less_equal):
"""
"""
cdef int i, n, ifirst, ilast, test
cdef double last
cdef list starts, ends
#
self.increment = increment
self.data = np.asanyarray(data)
data = np.asarray(data)
#
n = len(data)
starts = []
ends = []
#
last = data[0]
ifirst = 0
ilast = 0
for 1 <= i < n:
test = operator(abs(data[i] - last), increment)
ilast = i
if not test:
starts.append(ifirst)
ends.append(ilast-1)
ifirst = i
last = data[i]
starts.append(ifirst)
ends.append(n-1)
self._starts = starts
self._ends = ends
def __len__(self):
return len(self.starts)
property starts:
#
def __get__(Brackets self):
return np.asarray(self._starts)
property ends:
#
def __get__(Brackets self):
return np.asarray(self._ends)
property sizes:
#
def __get__(Brackets self):
return np.asarray(self._ends) - np.asarray(self._firsts)
property slices:
#
def __get__(Brackets self):
cdef int i
cdef list starts = self._starts, ends = self._ends
cdef list slices = []
for 0 <= i < len(starts):
slices.append(slice(starts[i], ends[i]+1))
return slices
property clustered:
#
def __get__(self):
cdef int i
cdef list starts = self._starts, ends = self._ends
cdef list groups = []
cdef np.ndarray data = self.data
for 0 <= i < len(starts):
groups.append(data[starts[i]:ends[i]+1])
return groups
property uniques:
def __get__(self):
return self.data[self.starts]
def grouped_slices(self):
"""
Returns a dictionary with the unique values of ``self`` as keys,
and a list
of slices for the corresponding values.
See Also
--------
Brackets.grouped_limits
that does the same thing
"""
# Define shortcuts
cdef int i, ifirst, n = len(self)
cdef list starts = self._starts, ends = self._ends
cdef np.ndarray data = self.data
# Define new variables
cdef list seen = []
cdef double value
cdef dict grouped = {}
for 0 <= i < n:
ifirst = starts[i]
value = data[ifirst]
if not (value in seen):
grouped[value] = []
seen.append(value)
grouped[value].append(slice(ifirst, ends[i]+1))
return grouped
def grouped_limits(self):
"""
Returns a dictionary with the unique values of ``self`` as keys,
and a list
of tuples (starting index, ending index) for the corresponding
values.
See Also
--------
Cluster.grouped_slices
"""
# Define shortcuts
cdef int i, ifirst, n = len(self)
cdef list starts = self.starts, ends = self.ends
cdef np.ndarray data = self.data
# Define new variables
cdef list seen = []
cdef double value
cdef dict grouped = {}
for 0 <= i < n:
ifirst = starts[i]
value = data[ifirst]
if not (value in seen):
grouped[value] = []
seen.append(value)
grouped[value].append((ifirst, ends[i]+1))
for k in grouped:
grouped[k] = np.asarray(grouped[k])
return grouped
More information about the NumPy-Discussion
mailing list