[Numpy-discussion] Masked arrays to Compressed arrays

Fri Oct 1 08:41:13 EDT 2010

Hi numpy users

I am using masked arrays (MA) for gridded ocean forecast fields
(temperature, salinity, sea level and so on). These fields always have
the same masked elements (land). And in some cases memory usage is a
real issue. I have therefore made a class which stores masked arrays
in a more memory efficient way than the standard MA (I am calling it
Compressed arrays). Furthermore, in some of my applications I perform
arithmetic that only needs to update the unmasked values in the
array and does not need to know anything per se about the mask. I have
therefore tried to put logic into my class which takes advantage of
this knowledge and applies the arithmetic operators efficiently only
on the unmasked values.

MA stores, as far as I know, two arrays: one containing the data values
and one containing the mask. Both have the same shape. My class stores
the mask (or rather a flattened version of it and shape info) and a 1D
data array which only contains values at unmasked elements.

The class is not entirely finished but I would like to hear your
constructive criticism of it. So to summarize:

1. Uses less memory except when temporarily converting to masked array
(for example to perform arithmetics - hints on avoiding this would be
greatly appreciated).
2. Has more efficient arithmetics in some cases.
3. Is less efficient in other cases.

Here is the class (I know that the inline arithmetic operator
overrides are wrong). I hope you have some good suggestions. I have
not subclassed MA but that might be an alternative solution:

"""
Compressed array. A compressed array is an alternative to numpy
masked arrays designed to reduce memory consumption.

Jesper Larsen, 2010
"""
# External imports
import numpy as np

def _compress(arr):
    """Compresses array."""
    mask = np.ma.getmaskarray(arr)
    arrout = np.ma.compressed(arr)
    return arrout, mask

def _decompress(arr, mask_flat, shape):
    """Decompresses array."""
    arr_out = np.ma.masked_all(mask_flat.shape, dtype=arr.dtype)
    arr_out[~mask_flat] = arr[:]
    arr_out = arr_out.reshape(shape)
    return arr_out

class carray(object):
    """Compressed array.

    Only the unmasked values of a masked array are stored, as the 1D
    array ``carray``, together with the flattened mask ``_mask_flat`` and
    the original ``shape``. The attributes ``masked_array`` and ``mask``
    are computed on access, and any other unknown attribute is looked up
    on the decompressed masked array.
    """

    # Attributes that hold the instance's own state. They are never
    # delegated to the decompressed masked array: building that array
    # needs them, so delegating would recurse without end whenever they
    # are not set yet (for example while an instance is being copied).
    _own_attributes = ('carray', 'shape', '_mask_flat', 'same_masks')

    # In-place operators. These update the instance and return it instead
    # of creating a new compressed array.
    _inplace_methods = frozenset((
        '__iadd__', '__isub__', '__imul__', '__idiv__', '__itruediv__',
        '__ifloordiv__', '__imod__', '__ipow__', '__ilshift__',
        '__irshift__', '__iand__', '__ixor__', '__ior__'))

    def __init__(self, array, mask=None, same_masks=False):
        """\
        Compressed array. Can be initialized with either a masked array
        instance or a flat numpy array and corresponding mask. If
        same_masks is set to True it is assumed that arithmetic operations
        on two such carrays with identical shapes have identical mask and
        thus optimized arithmetics can be used for some operations.

        Raises TypeError if array is neither a masked array nor a numpy
        array, or if a plain numpy array is given without a mask. When a
        masked array is given, the mask argument is ignored.
        """
        if isinstance(array, np.ma.masked_array):
            # Initialize from masked array
            self.carray, mask = _compress(array)
        elif isinstance(array, np.ndarray):
            # Construct a compressed array from raw input.
            if mask is None:
                raise TypeError('Mask must be present')
            self.carray = array
        else:
            raise TypeError('Invalid input type')
        self.shape = mask.shape
        self._mask_flat = mask.ravel()
        self.same_masks = same_masks

    def __getattr__(self, name):
        """Compute ``masked_array`` and ``mask``; delegate everything else.

        Only called when normal attribute lookup fails. Raises
        AttributeError for the instance's own storage attributes when they
        are missing instead of delegating them.
        """
        if name in self._own_attributes:
            raise AttributeError(name)
        if name == 'masked_array':
            return _decompress(self.carray, self._mask_flat, self.shape)
        if name == 'mask':
            return self._mask_flat.reshape(self.shape)
        # Find attribute in array data structure
        return getattr(self.masked_array, name)

    def __setattr__(self, name, value):
        """Assigning ``masked_array`` recompresses; others are stored as is."""
        if name == 'masked_array':
            self.carray, mask = _compress(value)
            self.shape = mask.shape
            self._mask_flat = mask.ravel()
        else:
            object.__setattr__(self, name, value)

    def _wrap_compressed(self, result):
        """Wrap a result computed on the compressed data.

        Arrays become a new carray that reuses this instance's mask and
        same_masks setting, tuples are wrapped element by element and any
        other value (a scalar for instance) is returned unchanged.
        """
        if isinstance(result, tuple):
            return tuple(self._wrap_compressed(part) for part in result)
        if isinstance(result, np.ndarray):
            return carray(result, self.mask, same_masks=self.same_masks)
        return result

    def _wrap_masked(self, result):
        """Wrap a result computed on the decompressed masked array.

        Masked arrays are compressed into a new carray, tuples are wrapped
        element by element and any other value is returned unchanged. The
        new carray does not inherit same_masks because the mask of the
        result may differ from the mask of this instance.
        """
        if isinstance(result, tuple):
            return tuple(self._wrap_masked(part) for part in result)
        if isinstance(result, np.ma.masked_array):
            return carray(result)
        return result

    def _optimized_arithmetics(self, method, *args):
        """\
        Generic method for efficiently performing arithmetics on
        compressed arrays.

        The operation is applied directly to the compressed data when
        every operand is a scalar, when the operand is this very instance,
        or when both carrays have same_masks set and identical shapes.
        Otherwise the operation is performed on the decompressed masked
        array. In-place operators update this instance and return it.
        """
        if len(args) > 0 and isinstance(args[0], carray):
            other = args[0]
        else:
            other = None
        inplace = method in self._inplace_methods

        if other is None:
            # An array operand has the full shape and cannot be combined
            # with the 1D compressed data, so only scalars qualify.
            fast = all(np.isscalar(arg) for arg in args)
        else:
            fast = self is other or (self.same_masks and other.same_masks
                                     and self.shape == other.shape)

        if fast:
            if other is None:
                operands = args
            else:
                operands = (other.carray,) + tuple(args[1:])
            result = getattr(self.carray, method)(*operands)
            if result is NotImplemented:
                return result
            if inplace:
                # The numpy in-place method has already updated self.carray
                return self
            return self._wrap_compressed(result)

        operands = tuple(arg.masked_array if isinstance(arg, carray) else arg
                         for arg in args)
        result = getattr(self.masked_array, method)(*operands)
        if result is NotImplemented:
            return result
        if inplace:
            # The operation ran on a temporary array, store it back
            self.masked_array = result
            return self
        return self._wrap_masked(result)

    def _optimized_arithmetic_methods(self, method, *args, **kwargs):
        """\
        Generic method for efficiently applying arithmetic methods on
        compressed arrays.

        The method is applied to the compressed data when it operates on
        the entire array (no positional arguments and no axis), otherwise
        it is applied to the decompressed masked array.
        """
        # keepdims is excluded since the result would not match the mask
        if (len(args) == 0 and kwargs.get('axis') is None
                and not kwargs.get('keepdims')):
            np_method = getattr(self.carray, method)
            return self._wrap_compressed(np_method(**kwargs))
        ma_method = getattr(self.masked_array, method)
        return self._wrap_masked(ma_method(*args, **kwargs))

    def _scalar_conversion(self, method):
        """Apply a scalar conversion method of the masked array."""
        return getattr(self.masked_array, method)()

    # Expose access to the "masked_array" container at the top level
    def __repr__(self):
        return 'carray(\n' + self.masked_array.__repr__() + ')\n'
    def __str__(self):
        return self.masked_array.__str__()
    def __len__(self):
        return self.masked_array.__len__()
    def __getitem__(self, index):
        return self.masked_array.__getitem__(index)
    def __setitem__(self, index, value):
        # masked_array is a temporary copy, so it has to be stored back
        # for the assignment to have any effect.
        if isinstance(value, carray):
            value = value.masked_array
        expanded = self.masked_array
        expanded[index] = value
        self.masked_array = expanded
    def __delitem__(self, index):
        return self.masked_array.__delitem__(index)
    def __iter__(self):
        return self.masked_array.__iter__()
    def __contains__(self, item):
        return item in self.masked_array

    # Optimized arithmetics
    def __add__(self, *args):
        return self._optimized_arithmetics('__add__', *args)
    def __sub__(self, *args):
        return self._optimized_arithmetics('__sub__', *args)
    def __mul__(self, *args):
        return self._optimized_arithmetics('__mul__', *args)
    def __floordiv__(self, *args):
        return self._optimized_arithmetics('__floordiv__', *args)
    def __mod__(self, *args):
        return self._optimized_arithmetics('__mod__', *args)
    def __divmod__(self, *args):
        return self._optimized_arithmetics('__divmod__', *args)
    def __pow__(self, *args):
        return self._optimized_arithmetics('__pow__', *args)
    def __lshift__(self, *args):
        return self._optimized_arithmetics('__lshift__', *args)
    def __rshift__(self, *args):
        return self._optimized_arithmetics('__rshift__', *args)
    def __and__(self, *args):
        return self._optimized_arithmetics('__and__', *args)
    def __xor__(self, *args):
        return self._optimized_arithmetics('__xor__', *args)
    def __or__(self, *args):
        return self._optimized_arithmetics('__or__', *args)
    def __div__(self, *args):
        return self._optimized_arithmetics('__div__', *args)
    def __truediv__(self, *args):
        return self._optimized_arithmetics('__truediv__', *args)
    def __radd__(self, *args):
        return self._optimized_arithmetics('__radd__', *args)
    def __rsub__(self, *args):
        return self._optimized_arithmetics('__rsub__', *args)
    def __rmul__(self, *args):
        return self._optimized_arithmetics('__rmul__', *args)
    def __rdiv__(self, *args):
        return self._optimized_arithmetics('__rdiv__', *args)
    def __rtruediv__(self, *args):
        return self._optimized_arithmetics('__rtruediv__', *args)
    def __rfloordiv__(self, *args):
        return self._optimized_arithmetics('__rfloordiv__', *args)
    def __rmod__(self, *args):
        return self._optimized_arithmetics('__rmod__', *args)
    def __rdivmod__(self, *args):
        return self._optimized_arithmetics('__rdivmod__', *args)
    def __rpow__(self, *args):
        return self._optimized_arithmetics('__rpow__', *args)
    def __rlshift__(self, *args):
        return self._optimized_arithmetics('__rlshift__', *args)
    def __rrshift__(self, *args):
        return self._optimized_arithmetics('__rrshift__', *args)
    def __rand__(self, *args):
        return self._optimized_arithmetics('__rand__', *args)
    def __rxor__(self, *args):
        return self._optimized_arithmetics('__rxor__', *args)
    def __ror__(self, *args):
        return self._optimized_arithmetics('__ror__', *args)
    def __iadd__(self, *args):
        return self._optimized_arithmetics('__iadd__', *args)
    def __isub__(self, *args):
        return self._optimized_arithmetics('__isub__', *args)
    def __imul__(self, *args):
        return self._optimized_arithmetics('__imul__', *args)
    def __idiv__(self, *args):
        return self._optimized_arithmetics('__idiv__', *args)
    def __itruediv__(self, *args):
        return self._optimized_arithmetics('__itruediv__', *args)
    def __ifloordiv__(self, *args):
        return self._optimized_arithmetics('__ifloordiv__', *args)
    def __imod__(self, *args):
        return self._optimized_arithmetics('__imod__', *args)
    def __ipow__(self, *args):
        return self._optimized_arithmetics('__ipow__', *args)
    def __ilshift__(self, *args):
        return self._optimized_arithmetics('__ilshift__', *args)
    def __irshift__(self, *args):
        return self._optimized_arithmetics('__irshift__', *args)
    def __iand__(self, *args):
        return self._optimized_arithmetics('__iand__', *args)
    def __ixor__(self, *args):
        return self._optimized_arithmetics('__ixor__', *args)
    def __ior__(self, *args):
        return self._optimized_arithmetics('__ior__', *args)
    def __neg__(self):
        return self._optimized_arithmetics('__neg__')
    def __pos__(self):
        return self._optimized_arithmetics('__pos__')
    def __abs__(self):
        return self._optimized_arithmetics('__abs__')
    def __invert__(self):
        return self._optimized_arithmetics('__invert__')

    # Scalar conversions are delegated to the masked array so that the
    # masked elements are taken into account.
    def __complex__(self):
        return self._scalar_conversion('__complex__')
    def __int__(self):
        return self._scalar_conversion('__int__')
    def __long__(self):
        return self._scalar_conversion('__long__')
    def __float__(self):
        return self._scalar_conversion('__float__')
    def __oct__(self):
        return self._scalar_conversion('__oct__')
    def __hex__(self):
        return self._scalar_conversion('__hex__')
    def __index__(self):
        return self.masked_array.__index__()

    # Optimized methods (only sum implemented for now)
    # We can optimize some methods when they operate on the entire
    # flattened array. Otherwise we delegate to the masked array
    def sum(self, *args, **kwargs):
        return self._optimized_arithmetic_methods('sum', *args, **kwargs)