[Numpy-discussion] fromiter + dtype='S' -> Python crash

Fri Mar 28 14:04:36 EDT 2008

On Fri, Mar 14, 2008 at 02:53:17PM -0500, Travis E. Oliphant wrote:
> Zbyszek Szmek wrote:
> > On Thu, Mar 13, 2008 at 05:44:54PM -0400, Alan G Isaac wrote:
> >> In principle you should be able to use ``fromiter``,
> >> I believe, but it does not work.  BUG? (Crasher.)
> >>     
> >>>>> import numpy as N
> >>>>> x = [1,2,3]
> >>>>> fmt="%03d"
> >>>>> N.fromiter([xi for xi in x],dtype='S')
> >>>>>           
> >> Python crashes.
> >
> > 2. what does dtype with dtype.elsize==0 mean? Should it be allowed at all?
> >    If it is sometimes valid, then PyArray_FromIter should be fixed.
> 
> It is a bug that needs to be fixed in PyArray_FromIter, I think.
> 

Upon deeper review, this function has more problems. 
1. The bug above:
PyArray_NewFromDescr sometimes returns an array with dtype
different then the one specified. E.g.:
>>> dtype('S'); empty(300, dtype('S')).dtype
dtype('|S0')
dtype('|S1')

so the element size should be taken from the created array, not
from specified dtype.

2. The check for overflow is incorrect, when elsize==1 the function 
always returns a MemoryError.

3. From the docstring: 'If count is
    nonegative, the new array will have count elements, otherwise it's
    size is determined by the generator.'
   However, later we test for count == -1.

I think it is simplifies things to split out the overflow tests and 
resizing into a helper function. 

 multiarraymodule.c |   74 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 29 deletions(-)

-------------------------------------------------------------------------

--- numpy/core/src/multiarraymodule.c_unmodified	2008-03-28 13:46:38.000000000 +0100
+++ numpy/core/src/multiarraymodule.c	2008-03-28 18:26:51.000000000 +0100
@@ -6301,6 +6301,38 @@
 	return ret;
 }
 
+/*
+  Grow ret->data:
+  this is similar for the strategy for PyListObject, but we use
+  50% overallocation => 0, 4, 8, 16, ...
+  
+  Returns 1 on success, 0 on error.
+*/
+static int increase_array_size(PyArrayObject* ar, intp* elcount)
+{
+        char *new_data;
+        intp elsize = ar->strides[0];
+
+        /* size_t is unsigned so the behavior on overflow is defined.  */
+        size_t bufsize;
+        size_t half = ar->dimensions[0] ? 2 * (size_t)ar->dimensions[0] : 2;
+        *elcount = half * 2;
+        bufsize = *elcount * elsize;
+        
+        if( *elcount/2 != half || bufsize/elsize != *elcount)
+                goto error;
+
+        new_data = PyDataMem_RENEW(ar->data, bufsize);
+        if(!new_data)
+                goto error;
+
+        ar->data = new_data;
+        return 1;
+
+error:
+        PyErr_SetString(PyExc_MemoryError, "cannot allocate array memory");
+        return 0;
+}
 
 /* steals a reference to dtype (which cannot be NULL) */
 /*OBJECT_API */
@@ -6310,14 +6342,12 @@
         PyObject *value;
         PyObject *iter = PyObject_GetIter(obj);
         PyArrayObject *ret = NULL;
-        intp i, elsize, elcount;
+        intp i;
+        intp elcount = (count < 0) ? 0 : count;
         char *item, *new_data;
 
         if (iter == NULL) goto done;
 
-        elcount = (count < 0) ? 0 : count;
-        elsize = dtype->elsize;
-
         /* We would need to alter the memory RENEW code to decrement any
            reference counts before throwing away any memory.
          */
@@ -6329,31 +6359,17 @@
 
         ret = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, dtype, 1,
                                     &elcount, NULL,NULL, 0, NULL);
-        dtype = NULL;
+        dtype = NULL; /* dtype is always eaten by PA_NewFromDescr */
         if (ret == NULL) goto done;
 
-        for (i = 0; (i < count || count == -1) &&
+        for (i = 0; (i < count || count < 0) &&
                      (value = PyIter_Next(iter)); i++) {
 
-            if (i >= elcount) {
-                /*
-                   Grow ret->data:
-                   this is similar for the strategy for PyListObject, but we use
-                   50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ...
-                */
-                elcount = (i >> 1) + (i < 4 ? 4 : 2) + i;
-                if (elcount <= (intp)((~(size_t)0) / elsize))
-                    new_data = PyDataMem_RENEW(ret->data, elcount * elsize);
-                else
-                    new_data = NULL;
-                if (new_data == NULL) {
-                    PyErr_SetString(PyExc_MemoryError,
-                                    "cannot allocate array memory");
-                    Py_DECREF(value);
-                    goto done;
-                }
-                ret->data = new_data;
+            if (i == elcount && !increase_array_size(ret, &elcount)){
+                Py_DECREF(value);
+                goto done;
             }
+
             ret->dimensions[0] = i+1;
 
             if (((item = index2ptr(ret, i)) == NULL) ||
@@ -6373,13 +6389,13 @@
         /*
             Realloc the data so that don't keep extra memory tied up
             (assuming realloc is reasonably good about reusing space...)
+
+            If the reallocation fails, it is not a fatal error.
         */
         if (i==0) i = 1;
-        ret->data = PyDataMem_RENEW(ret->data, i * elsize);
-        if (ret->data == NULL) {
-            PyErr_SetString(PyExc_MemoryError, "cannot allocate array memory");
-            goto done;
-        }
+        new_data = PyDataMem_RENEW(ret->data, i * ret->strides[0]);
+        if (new_data != NULL)
+            ret->data = new_data;
 
 done:
         Py_XDECREF(iter);
-------------------------------------------------------------------------

And the doctest:
>>> import numpy

>>> x = [1,2,3]
>>> print numpy.fromiter(x, dtype='d')
[ 1.  2.  3.]

>>> print numpy.fromiter(x, dtype='S1')
['1' '2' '3']

>>> print numpy.fromiter((i * 100 for i in x), dtype='S2')
['10' '20' '30']

>>> print numpy.fromiter(x, dtype='S1', count=-1)
['1' '2' '3']

>>> print numpy.fromiter(x, dtype='S1', count=-2)
['1' '2' '3']

>>> print numpy.fromiter(x, dtype='S1', count=3)
['1' '2' '3']

>>> print numpy.fromiter(x, dtype='S1', count=4)
Traceback (most recent call last):
  File "doctest.py", line 1248, in __run
    compileflags, 1) in test.globs
  File "<doctest test_fromiter.py[9]>", line 1, in ?
    print numpy.fromiter(x, dtype='S1', count=4)
ValueError: iterator too short

>>> print numpy.fromiter(range(3000000), dtype='S2') # doctest: 
+ELLIPSIS
['0' ... '29']

>>> print numpy.fromiter(x, dtype='S')
['1' '2' '3']


Does this look OK?

Cheers,
Zbyszek

-------------- next part --------------
A non-text attachment was scrubbed...
Name: ma_diff.diff
Type: text/x-diff
Size: 3944 bytes
Desc: not available
URL: <http://mail.python.org/pipermail/numpy-discussion/attachments/20080328/870ce04f/attachment.diff>