[Numpy-discussion] fromiter + dtype='S' -> Python crash
Zbyszek Szmek
zbyszek at in.waw.pl
Fri Mar 28 14:04:36 EDT 2008
On Fri, Mar 14, 2008 at 02:53:17PM -0500, Travis E. Oliphant wrote:
> Zbyszek Szmek wrote:
> > On Thu, Mar 13, 2008 at 05:44:54PM -0400, Alan G Isaac wrote:
> >> In principle you should be able to use ``fromiter``,
> >> I believe, but it does not work. BUG? (Crasher.)
> >>
> >>>>> import numpy as N
> >>>>> x = [1,2,3]
> >>>>> fmt="%03d"
> >>>>> N.fromiter([xi for xi in x],dtype='S')
> >>>>>
> >> Python crashes.
> >
> > 2. what does dtype with dtype.elsize==0 mean? Should it be allowed at all?
> > If it is sometimes valid, then PyArray_FromIter should be fixed.
>
> It is a bug that needs to be fixed in PyArray_FromIter, I think.
>
Upon deeper review, this function has more problems.
1. The bug above:
PyArray_NewFromDescr sometimes returns an array with dtype
different then the one specified. E.g.:
>>> dtype('S'); empty(300, dtype('S')).dtype
dtype('|S0')
dtype('|S1')
so the element size should be taken from the created array, not
from specified dtype.
2. The check for overflow is incorrect, when elsize==1 the function
always returns a MemoryError.
3. From the docstring: 'If count is
nonegative, the new array will have count elements, otherwise it's
size is determined by the generator.'
However, later we test for count == -1.
I think it is simplifies things to split out the overflow tests and
resizing into a helper function.
multiarraymodule.c | 74 ++++++++++++++++++++++++++++++++---------------------
1 file changed, 45 insertions(+), 29 deletions(-)
-------------------------------------------------------------------------
--- numpy/core/src/multiarraymodule.c_unmodified 2008-03-28 13:46:38.000000000 +0100
+++ numpy/core/src/multiarraymodule.c 2008-03-28 18:26:51.000000000 +0100
@@ -6301,6 +6301,38 @@
return ret;
}
+/*
+ Grow ret->data:
+ this is similar for the strategy for PyListObject, but we use
+ 50% overallocation => 0, 4, 8, 16, ...
+
+ Returns 1 on success, 0 on error.
+*/
+static int increase_array_size(PyArrayObject* ar, intp* elcount)
+{
+ char *new_data;
+ intp elsize = ar->strides[0];
+
+ /* size_t is unsigned so the behavior on overflow is defined. */
+ size_t bufsize;
+ size_t half = ar->dimensions[0] ? 2 * (size_t)ar->dimensions[0] : 2;
+ *elcount = half * 2;
+ bufsize = *elcount * elsize;
+
+ if( *elcount/2 != half || bufsize/elsize != *elcount)
+ goto error;
+
+ new_data = PyDataMem_RENEW(ar->data, bufsize);
+ if(!new_data)
+ goto error;
+
+ ar->data = new_data;
+ return 1;
+
+error:
+ PyErr_SetString(PyExc_MemoryError, "cannot allocate array memory");
+ return 0;
+}
/* steals a reference to dtype (which cannot be NULL) */
/*OBJECT_API */
@@ -6310,14 +6342,12 @@
PyObject *value;
PyObject *iter = PyObject_GetIter(obj);
PyArrayObject *ret = NULL;
- intp i, elsize, elcount;
+ intp i;
+ intp elcount = (count < 0) ? 0 : count;
char *item, *new_data;
if (iter == NULL) goto done;
- elcount = (count < 0) ? 0 : count;
- elsize = dtype->elsize;
-
/* We would need to alter the memory RENEW code to decrement any
reference counts before throwing away any memory.
*/
@@ -6329,31 +6359,17 @@
ret = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, dtype, 1,
&elcount, NULL,NULL, 0, NULL);
- dtype = NULL;
+ dtype = NULL; /* dtype is always eaten by PA_NewFromDescr */
if (ret == NULL) goto done;
- for (i = 0; (i < count || count == -1) &&
+ for (i = 0; (i < count || count < 0) &&
(value = PyIter_Next(iter)); i++) {
- if (i >= elcount) {
- /*
- Grow ret->data:
- this is similar for the strategy for PyListObject, but we use
- 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ...
- */
- elcount = (i >> 1) + (i < 4 ? 4 : 2) + i;
- if (elcount <= (intp)((~(size_t)0) / elsize))
- new_data = PyDataMem_RENEW(ret->data, elcount * elsize);
- else
- new_data = NULL;
- if (new_data == NULL) {
- PyErr_SetString(PyExc_MemoryError,
- "cannot allocate array memory");
- Py_DECREF(value);
- goto done;
- }
- ret->data = new_data;
+ if (i == elcount && !increase_array_size(ret, &elcount)){
+ Py_DECREF(value);
+ goto done;
}
+
ret->dimensions[0] = i+1;
if (((item = index2ptr(ret, i)) == NULL) ||
@@ -6373,13 +6389,13 @@
/*
Realloc the data so that don't keep extra memory tied up
(assuming realloc is reasonably good about reusing space...)
+
+ If the reallocation fails, it is not a fatal error.
*/
if (i==0) i = 1;
- ret->data = PyDataMem_RENEW(ret->data, i * elsize);
- if (ret->data == NULL) {
- PyErr_SetString(PyExc_MemoryError, "cannot allocate array memory");
- goto done;
- }
+ new_data = PyDataMem_RENEW(ret->data, i * ret->strides[0]);
+ if (new_data != NULL)
+ ret->data = new_data;
done:
Py_XDECREF(iter);
-------------------------------------------------------------------------
And the doctest:
>>> import numpy
>>> x = [1,2,3]
>>> print numpy.fromiter(x, dtype='d')
[ 1. 2. 3.]
>>> print numpy.fromiter(x, dtype='S1')
['1' '2' '3']
>>> print numpy.fromiter((i * 100 for i in x), dtype='S2')
['10' '20' '30']
>>> print numpy.fromiter(x, dtype='S1', count=-1)
['1' '2' '3']
>>> print numpy.fromiter(x, dtype='S1', count=-2)
['1' '2' '3']
>>> print numpy.fromiter(x, dtype='S1', count=3)
['1' '2' '3']
>>> print numpy.fromiter(x, dtype='S1', count=4)
Traceback (most recent call last):
File "doctest.py", line 1248, in __run
compileflags, 1) in test.globs
File "<doctest test_fromiter.py[9]>", line 1, in ?
print numpy.fromiter(x, dtype='S1', count=4)
ValueError: iterator too short
>>> print numpy.fromiter(range(3000000), dtype='S2') # doctest:
+ELLIPSIS
['0' ... '29']
>>> print numpy.fromiter(x, dtype='S')
['1' '2' '3']
Does this look OK?
Cheers,
Zbyszek
-------------- next part --------------
A non-text attachment was scrubbed...
Name: ma_diff.diff
Type: text/x-diff
Size: 3944 bytes
Desc: not available
URL: <http://mail.python.org/pipermail/numpy-discussion/attachments/20080328/870ce04f/attachment.diff>
More information about the NumPy-Discussion
mailing list