[Numpy-svn] r4586 - branches/lib_for_io

numpy-svn at scipy.org numpy-svn at scipy.org
Sat Dec 15 13:56:46 EST 2007


Author: oliphant
Date: 2007-12-15 12:56:40 -0600 (Sat, 15 Dec 2007)
New Revision: 4586

Added:
   branches/lib_for_io/_datasource.py
   branches/lib_for_io/io.py
Modified:
   branches/lib_for_io/__init__.py
   branches/lib_for_io/utils.py
Log:
Start changes for numpy.lib to support deprecations in scipy.io

Modified: branches/lib_for_io/__init__.py
===================================================================
--- branches/lib_for_io/__init__.py	2007-12-15 18:54:52 UTC (rev 4585)
+++ branches/lib_for_io/__init__.py	2007-12-15 18:56:40 UTC (rev 4586)
@@ -15,6 +15,7 @@
 #import convertcode
 from utils import *
 from arraysetops import *
+from io import *
 import math
 
 __all__ = ['emath','math']
@@ -29,6 +30,7 @@
 __all__ += getlimits.__all__
 __all__ += utils.__all__
 __all__ += arraysetops.__all__
+__all__ += io.__all__
 
 def test(level=1, verbosity=1):
     from numpy.testing import NumpyTest

Added: branches/lib_for_io/_datasource.py
===================================================================
--- branches/lib_for_io/_datasource.py	2007-12-15 18:54:52 UTC (rev 4585)
+++ branches/lib_for_io/_datasource.py	2007-12-15 18:56:40 UTC (rev 4586)
@@ -0,0 +1,457 @@
+"""A file interface for handling local and remote data files.
+The goal of datasource is to abstract some of the file system operations when
+dealing with data files so the researcher doesn't have to know all the
+low-level details.  Through datasource, a researcher can obtain and use a
+file with one function call, regardless of location of the file.
+
+DataSource is meant to augment standard python libraries, not replace them.
+It should work seamlessly with standard file IO operations and the os module.
+
+DataSource files can originate locally or remotely:
+
+- local files : '/home/guido/src/local/data.txt'
+- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
+
+DataSource files can also be compressed or uncompressed.  Currently only gzip
+and bz2 are supported.
+
+Example:
+
+    >>> # Create a DataSource, use os.curdir (default) for local storage.
+    >>> ds = datasource.DataSource()
+    >>>
+    >>> # Open a remote file.
+    >>> # DataSource downloads the file, stores it locally in:
+    >>> #     './www.google.com/index.html'
+    >>> # opens the file and returns a file object.
+    >>> fp = ds.open('http://www.google.com/index.html')
+    >>>
+    >>> # Use the file as you normally would
+    >>> fp.read()
+    >>> fp.close()
+
+"""
+
+__docformat__ = "restructuredtext en"
+
+import bz2
+import gzip
+import os
+import tempfile
+from shutil import rmtree
+from urllib2 import urlopen, URLError
+from urlparse import urlparse
+
+import warnings
+
+# datasource has been used for a while in the NIPY project for analyzing
+# large fmri imaging files hosted over a network.  Data would be fetched
+# via URLs, cached locally and analyzed. Under these conditions the code
+# worked well, however it needs to be documented, tested and reviewed
+# before being fully exposed to SciPy.  We hope to do this before the
+# 0.7 release.
+_api_warning = "The datasource API will be changing frequently before \
+the 0.7 release as the code is ported from the NIPY project to SciPy. \
+Some of the current public interface may become private during the port! \
+Use this module minimally, if at all, until it is stabilized."
+
+warnings.warn(_api_warning)
+
+# TODO: .zip support, .tar support?
+_file_openers = {".gz":gzip.open, ".bz2":bz2.BZ2File, None:file}
+
+
+def open(path, mode='r', destpath=os.curdir):
+    """Open ``path`` with ``mode`` and return the file object.
+
+    If ``path`` is an URL, it will be downloaded, stored in the DataSource
+    directory and opened from there.
+
+    *Parameters*:
+
+        path : {string}
+
+        mode : {string}, optional
+
+        destpath : {string}, optional
+            Destination directory where URLs will be downloaded and stored.
+
+    *Returns*:
+
+        file object
+
+    """
+
+    ds = DataSource(destpath)
+    return ds.open(path, mode)
+
+
+class DataSource (object):
+    """A generic data source file (file, http, ftp, ...).
+
+    DataSources could be local files or remote files/URLs.  The files may
+    also be compressed or uncompressed.  DataSource hides some of the low-level
+    details of downloading the file, allowing you to simply pass in a valid
+    file path (or URL) and obtain a file object.
+
+    *Methods*:
+
+        - exists : test if the file exists locally or remotely
+        - abspath : get absolute path of the file in the DataSource directory
+        - open : open the file
+
+    *Example URL DataSource*::
+
+        # Initialize DataSource with a local directory, default is os.curdir.
+        ds = DataSource('/home/guido')
+
+        # Open remote file.
+        # File will be downloaded and opened from here:
+        #     /home/guido/site/xyz.txt
+        ds.open('http://fake.xyz.web/site/xyz.txt')
+
+    *Example using DataSource for temporary files*::
+
+        # Initialize DataSource with 'None' for the local directory.
+        ds = DataSource(None)
+
+        # Open local file.
+        # Opened file exists in a temporary directory like:
+        #     /tmp/tmpUnhcvM/foobar.txt
+        # Temporary directories are deleted when the DataSource is deleted.
+        ds.open('/home/guido/foobar.txt')
+
+    *Notes*:
+        BUG : URLs require a scheme string ('http://') to be used.
+              www.google.com will fail.
+
+              >>> repos.exists('www.google.com/index.html')
+              False
+
+              >>> repos.exists('http://www.google.com/index.html')
+              True
+
+    """
+
+    def __init__(self, destpath=os.curdir):
+        """Create a DataSource with a local path at destpath."""
+        if destpath:
+            self._destpath = os.path.abspath(destpath)
+            self._istmpdest = False
+        else:
+            self._destpath = tempfile.mkdtemp()
+            self._istmpdest = True
+
+    def __del__(self):
+        # Remove temp directories
+        if self._istmpdest:
+            rmtree(self._destpath)
+
+    def _iszip(self, filename):
+        """Test if the filename is a zip file by looking at the file extension.
+        """
+        fname, ext = os.path.splitext(filename)
+        return ext in _file_openers.keys()
+
+    def _iswritemode(self, mode):
+        """Test if the given mode will open a file for writing."""
+
+        # Currently only used to test the bz2 files.
+        _writemodes = ("w", "+")
+        for c in mode:
+            if c in _writemodes:
+                return True
+        return False
+
+    def _splitzipext(self, filename):
+        """Split zip extension from filename and return filename.
+
+        *Returns*:
+            base, zip_ext : {tuple}
+
+        """
+
+        if self._iszip(filename):
+            return os.path.splitext(filename)
+        else:
+            return filename, None
+
+    def _possible_names(self, filename):
+        """Return a tuple containing compressed filename variations."""
+        names = [filename]
+        if not self._iszip(filename):
+            for zipext in _file_openers.keys():
+                if zipext:
+                    names.append(filename+zipext)
+        return names
+
+    def _isurl(self, path):
+        """Test if path is a net location.  Tests the scheme and netloc."""
+
+        # BUG : URLs require a scheme string ('http://') to be used.
+        #       www.google.com will fail.
+        #       Should we prepend the scheme for those that don't have it and
+        #       test that also?  Similar to the way we append .gz and test
+        #       for compressed versions of files.
+
+        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
+        return bool(scheme and netloc)
+
+    def _cache(self, path):
+        """Cache the file specified by path.
+
+        Creates a copy of the file in the datasource cache.
+
+        """
+
+        upath = self.abspath(path)
+
+        # ensure directory exists
+        if not os.path.exists(os.path.dirname(upath)):
+            os.makedirs(os.path.dirname(upath))
+
+        # TODO: Doesn't handle compressed files!
+        if self._isurl(path):
+            try:
+                openedurl = urlopen(path)
+                file(upath, 'w').write(openedurl.read())
+            except URLError:
+                raise URLError("URL not found: ", path)
+        else:
+            try:
+                # TODO: Why not just copy the file with shutils.copyfile?
+                fp = file(path, 'r')
+                file(upath, 'w').write(fp.read())
+            except IOError:
+                raise IOError("File not found: ", path)
+        return upath
+
+    def _findfile(self, path):
+        """Searches for ``path`` and returns full path if found.
+
+        If path is an URL, _findfile will cache a local copy and return
+        the path to the cached file.
+        If path is a local file, _findfile will return a path to that local
+        file.
+
+        The search will include possible compressed versions of the file and
+        return the first occurrence found.
+
+        """
+
+        # Build list of possible local file paths
+        if not self._isurl(path):
+            # Valid local paths
+            filelist = self._possible_names(path)
+            # Paths in self._destpath
+            filelist += self._possible_names(self.abspath(path))
+        else:
+            # Cached URLs in self._destpath
+            filelist = self._possible_names(self.abspath(path))
+            # Remote URLs
+            filelist = filelist + self._possible_names(path)
+
+        for name in filelist:
+            if self.exists(name):
+                if self._isurl(name):
+                    name = self._cache(name)
+                return name
+        return None
+
+    def abspath(self, path):
+        """Return absolute path of ``path`` in the DataSource directory.
+
+        If ``path`` is an URL, the ``abspath`` will be either the location
+        the file exists locally or the location it would exist when opened
+        using the ``open`` method.
+
+        The functionality is identical to os.path.abspath.
+
+        *Parameters*:
+
+            path : {string}
+                Can be a local file or a remote URL.
+
+        *Returns*:
+
+            Complete path, rooted in the DataSource destination directory.
+
+        *See Also*:
+
+            `open` : Method that downloads and opens files.
+
+        """
+
+        # TODO:  This should be more robust.  Handles case where path includes
+        #        the destpath, but not other sub-paths. Failing case:
+        #        path = /home/guido/datafile.txt
+        #        destpath = /home/alex/
+        #        upath = self.abspath(path)
+        #        upath == '/home/alex/home/guido/datafile.txt'
+
+        # handle case where path includes self._destpath
+        splitpath = path.split(self._destpath, 2)
+        if len(splitpath) > 1:
+            path = splitpath[1]
+        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
+        return os.path.join(self._destpath, netloc, upath.strip(os.sep))
+
+    def exists(self, path):
+        """Test if ``path`` exists.
+
+        Test if ``path`` exists as (and in this order):
+
+        - a local file.
+        - a remote URL that has been downloaded and stored locally in the
+          DataSource directory.
+        - a remote URL that has not been downloaded, but is valid and
+          accessible.
+
+        *Parameters*:
+
+            path : {string}
+                Can be a local file or a remote URL.
+
+        *Returns*:
+
+            boolean
+
+        *See Also*:
+
+            `abspath`
+
+        *Notes*
+
+            When ``path`` is an URL, ``exists`` will return True if it's either
+            stored locally in the DataSource directory, or is a valid remote
+            URL.  DataSource does not discriminate between the two, the file
+            is accessible if it exists in either location.
+
+        """
+
+        # Test local path
+        if os.path.exists(path):
+            return True
+
+        # Test cached url
+        upath = self.abspath(path)
+        if os.path.exists(upath):
+            return True
+
+        # Test remote url
+        if self._isurl(path):
+            try:
+                netfile = urlopen(path)
+                del(netfile)
+                return True
+            except URLError:
+                return False
+        return False
+
+    def open(self, path, mode='r'):
+        """Open ``path`` with ``mode`` and return the file object.
+
+        If ``path`` is an URL, it will be downloaded, stored in the DataSource
+        directory and opened from there.
+
+        *Parameters*:
+
+            path : {string}
+
+            mode : {string}, optional
+
+
+        *Returns*:
+
+            file object
+
+        """
+
+        # TODO: There is no support for opening a file for writing which
+        #       doesn't exist yet (creating a file).  Should there be?
+
+        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
+        #       used to store URLs in self._destpath.
+
+        if self._isurl(path) and self._iswritemode(mode):
+            raise ValueError("URLs are not writeable")
+
+        # NOTE: _findfile will fail on a new file opened for writing.
+        found = self._findfile(path)
+        if found:
+            _fname, ext = self._splitzipext(found)
+            if ext == 'bz2':
+                mode.replace("+", "")
+            return _file_openers[ext](found, mode=mode)
+        else:
+            raise IOError("%s not found." % path)
+
+
+class Repository (DataSource):
+    """A data Repository where multiple DataSource's share a base URL/directory.
+
+    Repository extends DataSource by prepending a base URL (or directory) to
+    all the files it handles. Use a Repository when you will be working with
+    multiple files from one base URL.  Initialize the Repository with the
+    base URL, then refer to each file by its filename only.
+
+    *Methods*:
+
+        - exists : test if the file exists locally or remotely
+        - abspath : get absolute path of the file in the DataSource directory
+        - open : open the file
+
+    *Toy example*::
+
+        # Analyze all files in the repository.
+        repos = Repository('/home/user/data/dir/')
+        for filename in filelist:
+            fp = repos.open(filename)
+            fp.analyze()
+            fp.close()
+
+        # Similarly you could use a URL for a repository.
+        repos = Repository('http://www.xyz.edu/data')
+
+    """
+
+    def __init__(self, baseurl, destpath=os.curdir):
+        """Create a Repository with a shared url or directory of baseurl."""
+        DataSource.__init__(self, destpath=destpath)
+        self._baseurl = baseurl
+
+    def __del__(self):
+        DataSource.__del__(self)
+
+    def _fullpath(self, path):
+        """Return complete path for path.  Prepends baseurl if necessary."""
+        splitpath = path.split(self._baseurl, 2)
+        if len(splitpath) == 1:
+            result = os.path.join(self._baseurl, path)
+        else:
+            result = path    # path contains baseurl already
+        return result
+
+    def _findfile(self, path):
+        """Extend DataSource method to prepend baseurl to ``path``."""
+        return DataSource._findfile(self, self._fullpath(path))
+
+    def abspath(self, path):
+        """Extend DataSource method to prepend baseurl to ``path``."""
+        return DataSource.abspath(self, self._fullpath(path))
+
+    def exists(self, path):
+        """Extend DataSource method to prepend baseurl to ``path``."""
+        return DataSource.exists(self, self._fullpath(path))
+
+    def open(self, path, mode='r'):
+        """Extend DataSource method to prepend baseurl to ``path``."""
+        return DataSource.open(self, self._fullpath(path), mode)
+
+    def listdir(self):
+        '''List files in the source Repository.'''
+        if self._isurl(self._baseurl):
+            raise NotImplementedError, \
+                  "Directory listing of URLs, not supported yet."
+        else:
+            return os.listdir(self._baseurl)


Property changes on: branches/lib_for_io/_datasource.py
___________________________________________________________________
Name: svn:keywords
   + Id
Name: svn:eol-style
   + native

Added: branches/lib_for_io/io.py
===================================================================
--- branches/lib_for_io/io.py	2007-12-15 18:54:52 UTC (rev 4585)
+++ branches/lib_for_io/io.py	2007-12-15 18:56:40 UTC (rev 4586)
@@ -0,0 +1,198 @@
+
+__all__ = ['savetxt', 'loadtxt', 
+           'dump', 'dumps', 'loads', 
+           'save', 'load',
+           'DataFile']
+
+from cPickle import load as _cload, loads
+from _datasource import DataFile
+_file = file
+
+def load(file):
+    """Load a binary file.
+
+    Read a binary file (either a pickle or a binary NumPy array file .npy) and
+    return the resulting arrays. 
+
+    Parameters:
+    -----------
+    file - the file to read. This can be a string, or any file-like object
+
+    Returns:
+    --------
+    result - array or tuple of arrays stored in the file.  If file contains 
+             pickle data, then whatever is stored in the pickle is returned.
+
+    """
+    if isinstance(file, type("")):
+        file = _file(file,"rb")
+    return _cload(file)
+
+# Adapted from matplotlib
+
+def _getconv(dtype):
+    typ = dtype.type
+    if issubclass(typ, bool_):
+        return lambda x: bool(int(x))
+    if issubclass(typ, integer):
+        return int
+    elif issubclass(typ, floating):
+        return float
+    elif issubclass(typ, complex):
+        return complex
+    else:
+        return str
+
+
+def _string_like(obj):
+    try: obj + ''
+    except (TypeError, ValueError): return 0
+    return 1
+
+def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None,
+            skiprows=0, usecols=None, unpack=False):
+    """
+    Load ASCII data from fname into an array and return the array.
+
+    The data must be regular, same number of values in every row
+
+    fname can be a filename or a file handle.  Support for gzipped files is
+    automatic, if the filename ends in .gz
+
+    See scipy.loadmat to read and write matfiles.
+
+    Example usage:
+
+      X = loadtxt('test.dat')  # data in two columns
+      t = X[:,0]
+      y = X[:,1]
+
+    Alternatively, you can do the same with "unpack"; see below
+
+      X = loadtxt('test.dat')    # a matrix of data
+      x = loadtxt('test.dat')    # a single column of data
+
+
+    dtype - the data-type of the resulting array.  If this is a
+    record data-type, then the resulting array will be 1-d and each row will
+    be interpreted as an element of the array. The number of columns
+    used must match the number of fields in the data-type in this case.
+
+    comments - the character used to indicate the start of a comment
+    in the file
+
+    delimiter is a string-like character used to separate values in the
+    file. If delimiter is unspecified or none, any whitespace string is
+    a separator.
+
+    converters, if not None, is a dictionary mapping column number to
+    a function that will convert that column to a float.  Eg, if
+    column 0 is a date string: converters={0:datestr2num}
+
+    skiprows is the number of rows from the top to skip
+
+    usecols, if not None, is a sequence of integer column indexes to
+    extract where 0 is the first column, eg usecols=(1,4,5) to extract
+    just the 2nd, 5th and 6th columns
+
+    unpack, if True, will transpose the matrix allowing you to unpack
+    into named arguments on the left hand side
+
+        t,y = load('test.dat', unpack=True) # for  two column data
+        x,y,z = load('somefile.dat', usecols=(3,5,7), unpack=True)
+
+    """
+
+    if _string_like(fname):
+        if fname.endswith('.gz'):
+            import gzip
+            fh = gzip.open(fname)
+        else:
+            fh = file(fname)
+    elif hasattr(fname, 'seek'):
+        fh = fname
+    else:
+        raise ValueError('fname must be a string or file handle')
+    X = []
+
+    dtype = multiarray.dtype(dtype)
+    defconv = _getconv(dtype)
+    converterseq = None
+    if converters is None:
+        converters = {}
+        if dtype.names is not None:
+            converterseq = [_getconv(dtype.fields[name][0]) \
+                            for name in dtype.names]
+
+    for i,line in enumerate(fh):
+        if i<skiprows: continue
+        line = line[:line.find(comments)].strip()
+        if not len(line): continue
+        vals = line.split(delimiter)
+        if converterseq is None:
+            converterseq = [converters.get(j,defconv) \
+                            for j in xrange(len(vals))]
+        if usecols is not None:
+            row = [converterseq[j](vals[j]) for j in usecols]
+        else:
+            row = [converterseq[j](val) for j,val in enumerate(vals)]
+        if dtype.names is not None:
+            row = tuple(row)
+        X.append(row)
+
+    X = array(X, dtype)
+    r,c = X.shape
+    if r==1 or c==1:
+        X.shape = max([r,c]),
+    if unpack: return X.T
+    else:  return X
+
+
+# adjust so that fmt can change across columns if desired.
+
+def savetxt(fname, X, fmt='%.18e',delimiter=' '):
+    """
+    Save the data in X to file fname using fmt string to convert the
+    data to strings
+
+    fname can be a filename or a file handle.  If the filename ends in .gz,
+    the file is automatically saved in compressed gzip format.  The load()
+    command understands gzipped files transparently.
+
+    Example usage:
+
+    save('test.out', X)         # X is an array
+    save('test1.out', (x,y,z))  # x,y,z equal sized 1D arrays
+    save('test2.out', x)        # x is 1D
+    save('test3.out', x, fmt='%1.4e')  # use exponential notation
+
+    delimiter is used to separate the fields, eg delimiter ',' for
+    comma-separated values
+    """
+
+    if _string_like(fname):
+        if fname.endswith('.gz'):
+            import gzip
+            fh = gzip.open(fname,'wb')
+        else:
+            fh = file(fname,'w')
+    elif hasattr(fname, 'seek'):
+        fh = fname
+    else:
+        raise ValueError('fname must be a string or file handle')
+
+
+    X = asarray(X)
+    origShape = None
+    if len(X.shape)==1:
+        origShape = X.shape
+        X.shape = len(X), 1
+    for row in X:
+        fh.write(delimiter.join([fmt%val for val in row]) + '\n')
+
+    if origShape is not None:
+        X.shape = origShape
+
+
+
+


Property changes on: branches/lib_for_io/io.py
___________________________________________________________________
Name: svn:keywords
   + Id
Name: svn:eol-style
   + native

Modified: branches/lib_for_io/utils.py
===================================================================
--- branches/lib_for_io/utils.py	2007-12-15 18:54:52 UTC (rev 4585)
+++ branches/lib_for_io/utils.py	2007-12-15 18:56:40 UTC (rev 4586)
@@ -90,7 +90,7 @@
         return func(*args, **kwds)
     newfunc = _set_function_name(newfunc, oldname)
     doc = func.__doc__
-    depdoc = '%s is DEPRECATED in numpy: use %s instead' % (oldname, newname,)
+    depdoc = '%s is DEPRECATED: use %s instead' % (oldname, newname,)
     if doc is None:
         doc = depdoc
     else:




More information about the Numpy-svn mailing list