[Numpy-svn] r4586 - branches/lib_for_io
numpy-svn at scipy.org
numpy-svn at scipy.org
Sat Dec 15 13:56:46 EST 2007
Author: oliphant
Date: 2007-12-15 12:56:40 -0600 (Sat, 15 Dec 2007)
New Revision: 4586
Added:
branches/lib_for_io/_datasource.py
branches/lib_for_io/io.py
Modified:
branches/lib_for_io/__init__.py
branches/lib_for_io/utils.py
Log:
Start changes for numpy.lib to support deprecations in scipy.io
Modified: branches/lib_for_io/__init__.py
===================================================================
--- branches/lib_for_io/__init__.py 2007-12-15 18:54:52 UTC (rev 4585)
+++ branches/lib_for_io/__init__.py 2007-12-15 18:56:40 UTC (rev 4586)
@@ -15,6 +15,7 @@
#import convertcode
from utils import *
from arraysetops import *
+from io import *
import math
__all__ = ['emath','math']
@@ -29,6 +30,7 @@
__all__ += getlimits.__all__
__all__ += utils.__all__
__all__ += arraysetops.__all__
+__all__ += io.__all__
def test(level=1, verbosity=1):
from numpy.testing import NumpyTest
Added: branches/lib_for_io/_datasource.py
===================================================================
--- branches/lib_for_io/_datasource.py 2007-12-15 18:54:52 UTC (rev 4585)
+++ branches/lib_for_io/_datasource.py 2007-12-15 18:56:40 UTC (rev 4586)
@@ -0,0 +1,457 @@
+"""A file interface for handling local and remote data files.
+The goal of datasource is to abstract some of the file system operations when
+dealing with data files so the researcher doesn't have to know all the
+low-level details. Through datasource, a researcher can obtain and use a
+file with one function call, regardless of location of the file.
+
+DataSource is meant to augment standard python libraries, not replace them.
+It should work seamlessly with standard file IO operations and the os module.
+
+DataSource files can originate locally or remotely:
+
+- local files : '/home/guido/src/local/data.txt'
+- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
+
+DataSource files can also be compressed or uncompressed. Currently only gzip
+and bz2 are supported.
+
+Example:
+
+ >>> # Create a DataSource, use os.curdir (default) for local storage.
+ >>> ds = datasource.DataSource()
+ >>>
+ >>> # Open a remote file.
+ >>> # DataSource downloads the file, stores it locally in:
+ >>> # './www.google.com/index.html'
+ >>> # opens the file and returns a file object.
+ >>> fp = ds.open('http://www.google.com/index.html')
+ >>>
+ >>> # Use the file as you normally would
+ >>> fp.read()
+ >>> fp.close()
+
+"""
+
+__docformat__ = "restructuredtext en"
+
+import bz2
+import gzip
+import os
+import tempfile
+from shutil import rmtree
+from urllib2 import urlopen, URLError
+from urlparse import urlparse
+
+import warnings
+
+# datasource has been used for a while in the NIPY project for analyzing
+# large fmri imaging files hosted over a network. Data would be fetched
+# via URLs, cached locally and analyzed. Under these conditions the code
+# worked well, however it needs to be documented, tested and reviewed
+# before being fully exposed to SciPy. We hope to do this before the
+# 0.7 release.
+_api_warning = "The datasource API will be changing frequently before \
+the 0.7 release as the code is ported from the NIPY project to SciPy. \
+Some of the current public interface may become private during the port! \
+Use this module minimally, if at all, until it is stabilized."
+
+warnings.warn(_api_warning)
+
+# TODO: .zip support, .tar support?
+_file_openers = {".gz":gzip.open, ".bz2":bz2.BZ2File, None:file}
+
+
+def open(path, mode='r', destpath=os.curdir):
+ """Open ``path`` with ``mode`` and return the file object.
+
+ If ``path`` is an URL, it will be downloaded, stored in the DataSource
+ directory and opened from there.
+
+ *Parameters*:
+
+ path : {string}
+
+ mode : {string}, optional
+
+ destpath : {string}, optional
+ Destination directory where URLs will be downloaded and stored.
+
+ *Returns*:
+
+ file object
+
+ """
+
+ ds = DataSource(destpath)
+ return ds.open(path, mode)
+
+
+class DataSource (object):
+ """A generic data source file (file, http, ftp, ...).
+
+ DataSources could be local files or remote files/URLs. The files may
+ also be compressed or uncompressed. DataSource hides some of the low-level
+ details of downloading the file, allowing you to simply pass in a valid
+ file path (or URL) and obtain a file object.
+
+ *Methods*:
+
+ - exists : test if the file exists locally or remotely
+ - abspath : get absolute path of the file in the DataSource directory
+ - open : open the file
+
+ *Example URL DataSource*::
+
+ # Initialize DataSource with a local directory, default is os.curdir.
+ ds = DataSource('/home/guido')
+
+ # Open remote file.
+ # File will be downloaded and opened from here:
+ # /home/guido/site/xyz.txt
+ ds.open('http://fake.xyz.web/site/xyz.txt')
+
+ *Example using DataSource for temporary files*::
+
+ # Initialize DataSource with 'None' for the local directory.
+ ds = DataSource(None)
+
+ # Open local file.
+ # Opened file exists in a temporary directory like:
+ # /tmp/tmpUnhcvM/foobar.txt
+ # Temporary directories are deleted when the DataSource is deleted.
+ ds.open('/home/guido/foobar.txt')
+
+ *Notes*:
+ BUG : URLs require a scheme string ('http://') to be used.
+ www.google.com will fail.
+
+ >>> repos.exists('www.google.com/index.html')
+ False
+
+ >>> repos.exists('http://www.google.com/index.html')
+ True
+
+ """
+
+ def __init__(self, destpath=os.curdir):
+ """Create a DataSource with a local path at destpath."""
+ if destpath:
+ self._destpath = os.path.abspath(destpath)
+ self._istmpdest = False
+ else:
+ self._destpath = tempfile.mkdtemp()
+ self._istmpdest = True
+
+ def __del__(self):
+ # Remove temp directories
+ if self._istmpdest:
+ rmtree(self._destpath)
+
+ def _iszip(self, filename):
+ """Test if the filename is a zip file by looking at the file extension.
+ """
+ fname, ext = os.path.splitext(filename)
+ return ext in _file_openers.keys()
+
+ def _iswritemode(self, mode):
+ """Test if the given mode will open a file for writing."""
+
+ # Currently only used to test the bz2 files.
+ _writemodes = ("w", "+")
+ for c in mode:
+ if c in _writemodes:
+ return True
+ return False
+
+ def _splitzipext(self, filename):
+ """Split zip extension from filename and return filename.
+
+ *Returns*:
+ base, zip_ext : {tuple}
+
+ """
+
+ if self._iszip(filename):
+ return os.path.splitext(filename)
+ else:
+ return filename, None
+
+ def _possible_names(self, filename):
+ """Return a tuple containing compressed filename variations."""
+ names = [filename]
+ if not self._iszip(filename):
+ for zipext in _file_openers.keys():
+ if zipext:
+ names.append(filename+zipext)
+ return names
+
+ def _isurl(self, path):
+ """Test if path is a net location. Tests the scheme and netloc."""
+
+ # BUG : URLs require a scheme string ('http://') to be used.
+ # www.google.com will fail.
+ # Should we prepend the scheme for those that don't have it and
+ # test that also? Similar to the way we append .gz and test for
+ # compressed versions of files.
+
+ scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
+ return bool(scheme and netloc)
+
+ def _cache(self, path):
+ """Cache the file specified by path.
+
+ Creates a copy of the file in the datasource cache.
+
+ """
+
+ upath = self.abspath(path)
+
+ # ensure directory exists
+ if not os.path.exists(os.path.dirname(upath)):
+ os.makedirs(os.path.dirname(upath))
+
+ # TODO: Doesn't handle compressed files!
+ if self._isurl(path):
+ try:
+ openedurl = urlopen(path)
+ file(upath, 'w').write(openedurl.read())
+ except URLError:
+ raise URLError("URL not found: ", path)
+ else:
+ try:
+ # TODO: Why not just copy the file with shutils.copyfile?
+ fp = file(path, 'r')
+ file(upath, 'w').write(fp.read())
+ except IOError:
+ raise IOError("File not found: ", path)
+ return upath
+
+ def _findfile(self, path):
+ """Searches for ``path`` and returns full path if found.
+
+ If path is an URL, _findfile will cache a local copy and return
+ the path to the cached file.
+ If path is a local file, _findfile will return a path to that local
+ file.
+
+ The search will include possible compressed versions of the file and
+ return the first occurrence found.
+
+ """
+
+ # Build list of possible local file paths
+ if not self._isurl(path):
+ # Valid local paths
+ filelist = self._possible_names(path)
+ # Paths in self._destpath
+ filelist += self._possible_names(self.abspath(path))
+ else:
+ # Cached URLs in self._destpath
+ filelist = self._possible_names(self.abspath(path))
+ # Remote URLs
+ filelist = filelist + self._possible_names(path)
+
+ for name in filelist:
+ if self.exists(name):
+ if self._isurl(name):
+ name = self._cache(name)
+ return name
+ return None
+
+ def abspath(self, path):
+ """Return absolute path of ``path`` in the DataSource directory.
+
+ If ``path`` is an URL, the ``abspath`` will be either the location
+ the file exists locally or the location it would exist when opened
+ using the ``open`` method.
+
+ The functionality is identical to os.path.abspath.
+
+ *Parameters*:
+
+ path : {string}
+ Can be a local file or a remote URL.
+
+ *Returns*:
+
+ Complete path, rooted in the DataSource destination directory.
+
+ *See Also*:
+
+ `open` : Method that downloads and opens files.
+
+ """
+
+ # TODO: This should be more robust. Handles case where path includes
+ # the destpath, but not other sub-paths. Failing case:
+ # path = /home/guido/datafile.txt
+ # destpath = /home/alex/
+ # upath = self.abspath(path)
+ # upath == '/home/alex/home/guido/datafile.txt'
+
+ # handle case where path includes self._destpath
+ splitpath = path.split(self._destpath, 2)
+ if len(splitpath) > 1:
+ path = splitpath[1]
+ scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
+ return os.path.join(self._destpath, netloc, upath.strip(os.sep))
+
+ def exists(self, path):
+ """Test if ``path`` exists.
+
+ Test if ``path`` exists as (and in this order):
+
+ - a local file.
+ - a remote URL that have been downloaded and stored locally in the
+ DataSource directory.
+ - a remote URL that has not been downloaded, but is valid and
+ accessible.
+
+ *Parameters*:
+
+ path : {string}
+ Can be a local file or a remote URL.
+
+ *Returns*:
+
+ boolean
+
+ *See Also*:
+
+ `abspath`
+
+ *Notes*
+
+ When ``path`` is an URL, ``exists`` will return True if it's either
+ stored locally in the DataSource directory, or is a valid remote
+ URL. DataSource does not discriminate between the two, the file
+ is accessible if it exists in either location.
+
+ """
+
+ # Test local path
+ if os.path.exists(path):
+ return True
+
+ # Test cached url
+ upath = self.abspath(path)
+ if os.path.exists(upath):
+ return True
+
+ # Test remote url
+ if self._isurl(path):
+ try:
+ netfile = urlopen(path)
+ del(netfile)
+ return True
+ except URLError:
+ return False
+ return False
+
+ def open(self, path, mode='r'):
+ """Open ``path`` with ``mode`` and return the file object.
+
+ If ``path`` is an URL, it will be downloaded, stored in the DataSource
+ directory and opened from there.
+
+ *Parameters*:
+
+ path : {string}
+
+ mode : {string}, optional
+
+
+ *Returns*:
+
+ file object
+
+ """
+
+ # TODO: There is no support for opening a file for writing which
+ # doesn't exist yet (creating a file). Should there be?
+
+ # TODO: Add a ``subdir`` parameter for specifying the subdirectory
+ # used to store URLs in self._destpath.
+
+ if self._isurl(path) and self._iswritemode(mode):
+ raise ValueError("URLs are not writeable")
+
+ # NOTE: _findfile will fail on a new file opened for writing.
+ found = self._findfile(path)
+ if found:
+ _fname, ext = self._splitzipext(found)
+ if ext == 'bz2':
+ mode.replace("+", "")
+ return _file_openers[ext](found, mode=mode)
+ else:
+ raise IOError("%s not found." % path)
+
+
+class Repository (DataSource):
+ """A data Repository where multiple DataSource's share a base URL/directory.
+
+ Repository extends DataSource by prepending a base URL (or directory) to
+ all the files it handles. Use a Repository when you will be working with
+ multiple files from one base URL. Initialize the Repository with the
+ base URL, then refer to each file by its filename only.
+
+ *Methods*:
+
+ - exists : test if the file exists locally or remotely
+ - abspath : get absolute path of the file in the DataSource directory
+ - open : open the file
+
+ *Toy example*::
+
+ # Analyze all files in the repository.
+ repos = Repository('/home/user/data/dir/')
+ for filename in filelist:
+ fp = repos.open(filename)
+ fp.analyze()
+ fp.close()
+
+ # Similarly you could use a URL for a repository.
+ repos = Repository('http://www.xyz.edu/data')
+
+ """
+
+ def __init__(self, baseurl, destpath=os.curdir):
+ """Create a Repository with a shared url or directory of baseurl."""
+ DataSource.__init__(self, destpath=destpath)
+ self._baseurl = baseurl
+
+ def __del__(self):
+ DataSource.__del__(self)
+
+ def _fullpath(self, path):
+ """Return complete path for path. Prepends baseurl if necessary."""
+ splitpath = path.split(self._baseurl, 2)
+ if len(splitpath) == 1:
+ result = os.path.join(self._baseurl, path)
+ else:
+ result = path # path contains baseurl already
+ return result
+
+ def _findfile(self, path):
+ """Extend DataSource method to prepend baseurl to ``path``."""
+ return DataSource._findfile(self, self._fullpath(path))
+
+ def abspath(self, path):
+ """Extend DataSource method to prepend baseurl to ``path``."""
+ return DataSource.abspath(self, self._fullpath(path))
+
+ def exists(self, path):
+ """Extend DataSource method to prepend baseurl to ``path``."""
+ return DataSource.exists(self, self._fullpath(path))
+
+ def open(self, path, mode='r'):
+ """Extend DataSource method to prepend baseurl to ``path``."""
+ return DataSource.open(self, self._fullpath(path), mode)
+
+ def listdir(self):
+ '''List files in the source Repository.'''
+ if self._isurl(self._baseurl):
+ raise NotImplementedError, \
+ "Directory listing of URLs, not supported yet."
+ else:
+ return os.listdir(self._baseurl)
Property changes on: branches/lib_for_io/_datasource.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: branches/lib_for_io/io.py
===================================================================
--- branches/lib_for_io/io.py 2007-12-15 18:54:52 UTC (rev 4585)
+++ branches/lib_for_io/io.py 2007-12-15 18:56:40 UTC (rev 4586)
@@ -0,0 +1,198 @@
+
+__all__ = ['savetxt', 'loadtxt',
+ 'dump', 'dumps', 'loads',
+ 'save', 'load',
+ 'DataFile']
+
+from cPickle import load as _cload, loads
+from _datasource import DataFile
+_file = file
+
+def load(file):
+ """Load a binary file.
+
+ Read a binary file (either a pickle or a binary NumPy array file .npy) and
+ return the resulting arrays.
+
+ Parameters:
+ -----------
+ file - the file to read. This can be a string, or any file-like object
+
+ Returns:
+ --------
+ result - array or tuple of arrays stored in the file. If file contains
+ pickle data, then whatever is stored in the pickle is returned.
+
+ """
+ if isinstance(file, type("")):
+ file = _file(file,"rb")
+ return _cload(file)
+
+# Adapted from matplotlib
+
+def _getconv(dtype):
+ typ = dtype.type
+ if issubclass(typ, bool_):
+ return lambda x: bool(int(x))
+ if issubclass(typ, integer):
+ return int
+ elif issubclass(typ, floating):
+ return float
+ elif issubclass(typ, complex):
+ return complex
+ else:
+ return str
+
+
+def _string_like(obj):
+ try: obj + ''
+ except (TypeError, ValueError): return 0
+ return 1
+
+def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None,
+ skiprows=0, usecols=None, unpack=False):
+ """
+ Load ASCII data from fname into an array and return the array.
+
+ The data must be regular, same number of values in every row
+
+ fname can be a filename or a file handle. Support for gzipped files is
+ automatic, if the filename ends in .gz
+
+ See scipy.loadmat to read and write matfiles.
+
+ Example usage:
+
+ X = loadtxt('test.dat') # data in two columns
+ t = X[:,0]
+ y = X[:,1]
+
+ Alternatively, you can do the same with "unpack"; see below
+
+ X = loadtxt('test.dat') # a matrix of data
+ x = loadtxt('test.dat') # a single column of data
+
+
+ dtype - the data-type of the resulting array. If this is a
+ record data-type, then the resulting array will be 1-d and each row will
+ be interpreted as an element of the array. The number of columns
+ used must match the number of fields in the data-type in this case.
+
+ comments - the character used to indicate the start of a comment
+ in the file
+
+ delimiter is a string-like character used to separate values in the
+ file. If delimiter is unspecified or none, any whitespace string is
+ a separator.
+
+ converters, if not None, is a dictionary mapping column number to
+ a function that will convert that column to a float. Eg, if
+ column 0 is a date string: converters={0:datestr2num}
+
+ skiprows is the number of rows from the top to skip
+
+ usecols, if not None, is a sequence of integer column indexes to
+ extract where 0 is the first column, eg usecols=(1,4,5) to extract
+ just the 2nd, 5th and 6th columns
+
+ unpack, if True, will transpose the matrix allowing you to unpack
+ into named arguments on the left hand side
+
+ t,y = load('test.dat', unpack=True) # for two column data
+ x,y,z = load('somefile.dat', usecols=(3,5,7), unpack=True)
+
+ """
+
+ if _string_like(fname):
+ if fname.endswith('.gz'):
+ import gzip
+ fh = gzip.open(fname)
+ else:
+ fh = file(fname)
+ elif hasattr(fname, 'seek'):
+ fh = fname
+ else:
+ raise ValueError('fname must be a string or file handle')
+ X = []
+
+ dtype = multiarray.dtype(dtype)
+ defconv = _getconv(dtype)
+ converterseq = None
+ if converters is None:
+ converters = {}
+ if dtype.names is not None:
+ converterseq = [_getconv(dtype.fields[name][0]) \
+ for name in dtype.names]
+
+ for i,line in enumerate(fh):
+ if i<skiprows: continue
+ line = line[:line.find(comments)].strip()
+ if not len(line): continue
+ vals = line.split(delimiter)
+ if converterseq is None:
+ converterseq = [converters.get(j,defconv) \
+ for j in xrange(len(vals))]
+ if usecols is not None:
+ row = [converterseq[j](vals[j]) for j in usecols]
+ else:
+ row = [converterseq[j](val) for j,val in enumerate(vals)]
+ if dtype.names is not None:
+ row = tuple(row)
+ X.append(row)
+
+ X = array(X, dtype)
+ r,c = X.shape
+ if r==1 or c==1:
+ X.shape = max([r,c]),
+ if unpack: return X.T
+ else: return X
+
+
+# adjust so that fmt can change across columns if desired.
+
+def savetxt(fname, X, fmt='%.18e',delimiter=' '):
+ """
+ Save the data in X to file fname using fmt string to convert the
+ data to strings
+
+ fname can be a filename or a file handle. If the filename ends in .gz,
+ the file is automatically saved in compressed gzip format. The load()
+ command understands gzipped files transparently.
+
+ Example usage:
+
+ save('test.out', X) # X is an array
+ save('test1.out', (x,y,z)) # x,y,z equal sized 1D arrays
+ save('test2.out', x) # x is 1D
+ save('test3.out', x, fmt='%1.4e') # use exponential notation
+
+ delimiter is used to separate the fields, eg delimiter ',' for
+ comma-separated values
+ """
+
+ if _string_like(fname):
+ if fname.endswith('.gz'):
+ import gzip
+ fh = gzip.open(fname,'wb')
+ else:
+ fh = file(fname,'w')
+ elif hasattr(fname, 'seek'):
+ fh = fname
+ else:
+ raise ValueError('fname must be a string or file handle')
+
+
+ X = asarray(X)
+ origShape = None
+ if len(X.shape)==1:
+ origShape = X.shape
+ X.shape = len(X), 1
+ for row in X:
+ fh.write(delimiter.join([fmt%val for val in row]) + '\n')
+
+ if origShape is not None:
+ X.shape = origShape
+
+
+
+
Property changes on: branches/lib_for_io/io.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Modified: branches/lib_for_io/utils.py
===================================================================
--- branches/lib_for_io/utils.py 2007-12-15 18:54:52 UTC (rev 4585)
+++ branches/lib_for_io/utils.py 2007-12-15 18:56:40 UTC (rev 4586)
@@ -90,7 +90,7 @@
return func(*args, **kwds)
newfunc = _set_function_name(newfunc, oldname)
doc = func.__doc__
- depdoc = '%s is DEPRECATED in numpy: use %s instead' % (oldname, newname,)
+ depdoc = '%s is DEPRECATED: use %s instead' % (oldname, newname,)
if doc is None:
doc = depdoc
else:
More information about the Numpy-svn
mailing list