[Numpy-discussion] Possible modification to bincount()

Stephen Simmons mail at stevesimmons.com
Thu Jul 20 23:44:57 EDT 2006


While playing a little more with bincount(), one modification would be 
handy: allow negative integers in the bin list, but skip them when 
counting bins.

My specific use case is calculating subtotals on columns of large 
datasets (1m rows x 30 cols), where some rows need to be excluded.  The 
groupings are expensive to compute, and sometimes will involve ~99% of 
the rows (eliminate only outliers/errors), and other times only ~5% of 
the rows (focus in on a subset).

I'd like to calculate subtotals like this using bincount(), without 
having to copy the large datasets just to eliminate the unwanted rows:

# Assign each row to a group numbered from 0..G, except for -1 for rows
# to exclude
row_groups = expensive_function(data)

# Count number in each group, excluding those with grp==-1
grp_counts = bincount(list=row_groups)

# Use bincount() to form subtotals by column, excluding those with grp==-1
subtotals = column_stack([ bincount(list=row_groups, weights=data[:,i]) 
for i in range(G+1) ])

Is there any appetite to make such a change to bincount()?

This would require two simple changes to bincount() in _compiled_base.c 
and an update to the docstring. Here is the diff file with enough 
context to show the entire arr_bincount() function:

*** orig_compiled_base.c    2006-07-21 13:14:21.250000000 +1000
--- _compiled_base.c    2006-07-21 13:34:41.718750000 +1000
***************
*** 70,143 ****
      intp j ;
      for ( j = 1 ; j < len; j ++ )
      if ( i [j] < min )
          {min = i [j] ;
          mn = j ;}
      return mn;
  }
 
 
  static PyObject *
  arr_bincount(PyObject *self, PyObject *args, PyObject *kwds)
  {
       /* histogram accepts one or two arguments. The first is an array
!       * of non-negative integers and the second, if present, is an
        * array of weights, which must be promotable to double.
        * Call these arguments list and weight. Both must be one-
        * dimensional. len (weight) == len(list)
        * If weight is not present:
!       *   histogram (list) [i] is the number of occurrences of i in list.
        * If weight is present:
        *   histogram (list, weight) [i] is the sum of all weight [j]
!       * where list [j] == i.                                              */
       /* self is not used */
      PyArray_Descr *type;
      PyObject *list = NULL, *weight=Py_None ;
      PyObject *lst=NULL, *ans=NULL, *wts=NULL;
!     intp *numbers, *ians, len , mxi, mni, ans_size;
      int i;
      double *weights , *dans;
      static char *kwlist[] = {"list", "weights", NULL};
 
 
      Py_Try(PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwlist,
                         &list, &weight));
      Py_Try(lst = PyArray_ContiguousFromAny(list, PyArray_INTP, 1, 1));
      len = PyArray_SIZE(lst);
      numbers = (intp *) PyArray_DATA(lst);
      mxi = mxx (numbers, len) ;
-     mni = mnx (numbers, len) ;
-     Py_Assert(numbers[mni] >= 0,
-           "irst argument of bincount must be non-negative");
      ans_size = numbers [mxi] + 1 ;
      type = PyArray_DescrFromType(PyArray_INTP);
      if (weight == Py_None) {
      Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0));
      ians = (intp *)(PyArray_DATA(ans));
      for (i = 0 ; i < len ; i++)
!         ians [numbers [i]] += 1 ;
      Py_DECREF(lst);
      }
      else {
          Py_Try(wts = PyArray_ContiguousFromAny(weight,
                             PyArray_DOUBLE, 1, 1));
      weights = (double *)PyArray_DATA (wts);
      Py_Assert(PyArray_SIZE(wts) == len, "bincount: length of weights " \
            "does not match that of list");
      type = PyArray_DescrFromType(PyArray_DOUBLE);
      Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0));
      dans = (double *)PyArray_DATA (ans);
      for (i = 0 ; i < len ; i++) {
!         dans[numbers[i]] += weights[i];
      }
      Py_DECREF(lst);
      Py_DECREF(wts);
      }
      return ans;
 
   fail:
      Py_XDECREF(lst);
      Py_XDECREF(wts);
      Py_XDECREF(ans);
      return NULL;
  }
 
--- 70,145 ----
      intp j ;
      for ( j = 1 ; j < len; j ++ )
      if ( i [j] < min )
          {min = i [j] ;
          mn = j ;}
      return mn;
  }
 
 
  static PyObject *
  arr_bincount(PyObject *self, PyObject *args, PyObject *kwds)
  {
       /* histogram accepts one or two arguments. The first is an array
!       * of integers and the second, if present, is an
        * array of weights, which must be promotable to double.
        * Call these arguments list and weight. Both must be one-
        * dimensional. len (weight) == len(list)
        * If weight is not present:
!       *   histogram (list) [i] is the number of occurrences of i in list
!       *   for i>=0. Negative i values are ignored.
        * If weight is present:
        *   histogram (list, weight) [i] is the sum of all weight [j]
!       *   where list [j] == i and i>=0.                                */
       /* self is not used */
      PyArray_Descr *type;
      PyObject *list = NULL, *weight=Py_None ;
      PyObject *lst=NULL, *ans=NULL, *wts=NULL;
!     intp *numbers, *ians, len , mxi, ans_size;
      int i;
      double *weights , *dans;
      static char *kwlist[] = {"list", "weights", NULL};
 
 
      Py_Try(PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwlist,
                         &list, &weight));
      Py_Try(lst = PyArray_ContiguousFromAny(list, PyArray_INTP, 1, 1));
      len = PyArray_SIZE(lst);
      numbers = (intp *) PyArray_DATA(lst);
      mxi = mxx (numbers, len) ;
      ans_size = numbers [mxi] + 1 ;
      type = PyArray_DescrFromType(PyArray_INTP);
      if (weight == Py_None) {
      Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0));
      ians = (intp *)(PyArray_DATA(ans));
      for (i = 0 ; i < len ; i++)
!         if (numbers[i]>=0) {
!             ians[numbers [i]] += 1 ;
!         }
      Py_DECREF(lst);
      }
      else {
          Py_Try(wts = PyArray_ContiguousFromAny(weight,
                             PyArray_DOUBLE, 1, 1));
      weights = (double *)PyArray_DATA (wts);
      Py_Assert(PyArray_SIZE(wts) == len, "bincount: length of weights " \
            "does not match that of list");
      type = PyArray_DescrFromType(PyArray_DOUBLE);
      Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0));
      dans = (double *)PyArray_DATA (ans);
      for (i = 0 ; i < len ; i++) {
!         if (numbers[i]>=0) {
!             dans[numbers[i]] += weights[i];
!         }
      }
      Py_DECREF(lst);
      Py_DECREF(wts);
      }
      return ans;
 
   fail:
      Py_XDECREF(lst);
      Py_XDECREF(wts);
      Py_XDECREF(ans);
      return NULL;
  }
 


Cheers

Stephen




More information about the NumPy-Discussion mailing list