[Python-checkins] cpython: Issue #13815: TarFile.extractfile() now returns io.BufferedReader objects.

lars.gustaebel python-checkins at python.org
Sat May 5 18:16:18 CEST 2012


http://hg.python.org/cpython/rev/254cb4f5d0ff
changeset:   76770:254cb4f5d0ff
user:        Lars Gustäbel <lars at gustaebel.de>
date:        Sat May 05 18:15:03 2012 +0200
summary:
  Issue #13815: TarFile.extractfile() now returns io.BufferedReader objects.
The ExFileObject class was removed, some of its code went into _FileInFile.

files:
  Doc/library/tarfile.rst  |   13 +-
  Lib/tarfile.py           |  202 ++++++--------------------
  Lib/test/test_tarfile.py |   69 ++++-----
  Misc/NEWS                |    2 +
  4 files changed, 83 insertions(+), 203 deletions(-)


diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst
--- a/Doc/library/tarfile.rst
+++ b/Doc/library/tarfile.rst
@@ -376,15 +376,12 @@
 .. method:: TarFile.extractfile(member)
 
    Extract a member from the archive as a file object. *member* may be a filename
-   or a :class:`TarInfo` object. If *member* is a regular file, a :term:`file-like
-   object` is returned. If *member* is a link, a file-like object is constructed from
-   the link's target. If *member* is none of the above, :const:`None` is returned.
+   or a :class:`TarInfo` object. If *member* is a regular file or a link, an
+   :class:`io.BufferedReader` object is returned. Otherwise, :const:`None` is
+   returned.
 
-   .. note::
-
-      The file-like object is read-only.  It provides the methods
-      :meth:`read`, :meth:`readline`, :meth:`readlines`, :meth:`seek`, :meth:`tell`,
-      and :meth:`close`, and also supports iteration over its lines.
+   .. versionchanged:: 3.3
+      Return an :class:`io.BufferedReader` object.
 
 
 .. method:: TarFile.add(name, arcname=None, recursive=True, exclude=None, *, filter=None)
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -668,6 +668,8 @@
         self.offset = offset
         self.size = size
         self.position = 0
+        self.name = getattr(fileobj, "name", None)
+        self.closed = False
 
         if blockinfo is None:
             blockinfo = [(0, size)]
@@ -686,10 +688,16 @@
         if lastpos < self.size:
             self.map.append((False, lastpos, self.size, None))
 
+    def flush(self):
+        pass
+
+    def readable(self):
+        return True
+
+    def writable(self):
+        return False
+
     def seekable(self):
-        if not hasattr(self.fileobj, "seekable"):
-            # XXX gzip.GzipFile and bz2.BZ2File
-            return True
         return self.fileobj.seekable()
 
     def tell(self):
@@ -697,10 +705,21 @@
         """
         return self.position
 
-    def seek(self, position):
+    def seek(self, position, whence=io.SEEK_SET):
         """Seek to a position in the file.
         """
-        self.position = position
+        if whence == io.SEEK_SET:
+            self.position = min(max(position, 0), self.size)
+        elif whence == io.SEEK_CUR:
+            if position < 0:
+                self.position = max(self.position + position, 0)
+            else:
+                self.position = min(self.position + position, self.size)
+        elif whence == io.SEEK_END:
+            self.position = max(min(self.size + position, self.size), 0)
+        else:
+            raise ValueError("Invalid argument")
+        return self.position
 
     def read(self, size=None):
         """Read data from the file.
@@ -729,147 +748,17 @@
             size -= length
             self.position += length
         return buf
+
+    def readinto(self, b):
+        buf = self.read(len(b))
+        b[:len(buf)] = buf
+        return len(buf)
+
+    def close(self):
+        self.closed = True
 #class _FileInFile
 
 
-class ExFileObject(object):
-    """File-like object for reading an archive member.
-       Is returned by TarFile.extractfile().
-    """
-    blocksize = 1024
-
-    def __init__(self, tarfile, tarinfo):
-        self.fileobj = _FileInFile(tarfile.fileobj,
-                                   tarinfo.offset_data,
-                                   tarinfo.size,
-                                   tarinfo.sparse)
-        self.name = tarinfo.name
-        self.mode = "r"
-        self.closed = False
-        self.size = tarinfo.size
-
-        self.position = 0
-        self.buffer = b""
-
-    def readable(self):
-        return True
-
-    def writable(self):
-        return False
-
-    def seekable(self):
-        return self.fileobj.seekable()
-
-    def read(self, size=None):
-        """Read at most size bytes from the file. If size is not
-           present or None, read all data until EOF is reached.
-        """
-        if self.closed:
-            raise ValueError("I/O operation on closed file")
-
-        buf = b""
-        if self.buffer:
-            if size is None:
-                buf = self.buffer
-                self.buffer = b""
-            else:
-                buf = self.buffer[:size]
-                self.buffer = self.buffer[size:]
-
-        if size is None:
-            buf += self.fileobj.read()
-        else:
-            buf += self.fileobj.read(size - len(buf))
-
-        self.position += len(buf)
-        return buf
-
-    # XXX TextIOWrapper uses the read1() method.
-    read1 = read
-
-    def readline(self, size=-1):
-        """Read one entire line from the file. If size is present
-           and non-negative, return a string with at most that
-           size, which may be an incomplete line.
-        """
-        if self.closed:
-            raise ValueError("I/O operation on closed file")
-
-        pos = self.buffer.find(b"\n") + 1
-        if pos == 0:
-            # no newline found.
-            while True:
-                buf = self.fileobj.read(self.blocksize)
-                self.buffer += buf
-                if not buf or b"\n" in buf:
-                    pos = self.buffer.find(b"\n") + 1
-                    if pos == 0:
-                        # no newline found.
-                        pos = len(self.buffer)
-                    break
-
-        if size != -1:
-            pos = min(size, pos)
-
-        buf = self.buffer[:pos]
-        self.buffer = self.buffer[pos:]
-        self.position += len(buf)
-        return buf
-
-    def readlines(self):
-        """Return a list with all remaining lines.
-        """
-        result = []
-        while True:
-            line = self.readline()
-            if not line: break
-            result.append(line)
-        return result
-
-    def tell(self):
-        """Return the current file position.
-        """
-        if self.closed:
-            raise ValueError("I/O operation on closed file")
-
-        return self.position
-
-    def seek(self, pos, whence=io.SEEK_SET):
-        """Seek to a position in the file.
-        """
-        if self.closed:
-            raise ValueError("I/O operation on closed file")
-
-        if whence == io.SEEK_SET:
-            self.position = min(max(pos, 0), self.size)
-        elif whence == io.SEEK_CUR:
-            if pos < 0:
-                self.position = max(self.position + pos, 0)
-            else:
-                self.position = min(self.position + pos, self.size)
-        elif whence == io.SEEK_END:
-            self.position = max(min(self.size + pos, self.size), 0)
-        else:
-            raise ValueError("Invalid argument")
-
-        self.buffer = b""
-        self.fileobj.seek(self.position)
-
-    def close(self):
-        """Close the file object.
-        """
-        self.closed = True
-
-    def __iter__(self):
-        """Get an iterator over the file's lines.
-        """
-        while True:
-            line = self.readline()
-            if not line:
-                break
-            yield line
-#class ExFileObject
-
 #------------------
 # Exported Classes
 #------------------
@@ -1554,7 +1443,8 @@
 
     tarinfo = TarInfo           # The default TarInfo class to use.
 
-    fileobject = ExFileObject   # The default ExFileObject class to use.
+    fileobject = None           # The file-object for extractfile() or
+                                # io.BufferedReader if None.
 
     def __init__(self, name=None, mode="r", fileobj=None, format=None,
             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
@@ -2178,12 +2068,9 @@
 
     def extractfile(self, member):
         """Extract a member from the archive as a file object. `member' may be
-           a filename or a TarInfo object. If `member' is a regular file, a
-           file-like object is returned. If `member' is a link, a file-like
-           object is constructed from the link's target. If `member' is none of
-           the above, None is returned.
-           The file-like object is read-only and provides the following
-           methods: read(), readline(), readlines(), seek() and tell()
+           a filename or a TarInfo object. If `member' is a regular file or a
+           link, an io.BufferedReader object is returned. Otherwise, None is
+           returned.
         """
         self._check("r")
 
@@ -2192,13 +2079,14 @@
         else:
             tarinfo = member
 
-        if tarinfo.isreg():
-            return self.fileobject(self, tarinfo)
-
-        elif tarinfo.type not in SUPPORTED_TYPES:
-            # If a member's type is unknown, it is treated as a
-            # regular file.
-            return self.fileobject(self, tarinfo)
+        if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
+            # Members with unknown types are treated as regular files.
+            if self.fileobject is None:
+                fileobj = _FileInFile(self.fileobj, tarinfo.offset_data, tarinfo.size, tarinfo.sparse)
+                return io.BufferedReader(fileobj)
+            else:
+                # Keep the traditional pre-3.3 API intact.
+                return self.fileobject(self, tarinfo)
 
         elif tarinfo.islnk() or tarinfo.issym():
             if isinstance(self.fileobj, _Stream):
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py
--- a/Lib/test/test_tarfile.py
+++ b/Lib/test/test_tarfile.py
@@ -56,13 +56,10 @@
 
     def test_fileobj_regular_file(self):
         tarinfo = self.tar.getmember("ustar/regtype")
-        fobj = self.tar.extractfile(tarinfo)
-        try:
+        with self.tar.extractfile(tarinfo) as fobj:
             data = fobj.read()
             self.assertTrue((len(data), md5sum(data)) == (tarinfo.size, md5_regtype),
                     "regular file extraction failed")
-        finally:
-            fobj.close()
 
     def test_fileobj_readlines(self):
         self.tar.extract("ustar/regtype", TEMPDIR)
@@ -70,8 +67,7 @@
         with open(os.path.join(TEMPDIR, "ustar/regtype"), "r") as fobj1:
             lines1 = fobj1.readlines()
 
-        fobj = self.tar.extractfile(tarinfo)
-        try:
+        with self.tar.extractfile(tarinfo) as fobj:
             fobj2 = io.TextIOWrapper(fobj)
             lines2 = fobj2.readlines()
             self.assertTrue(lines1 == lines2,
@@ -81,21 +77,16 @@
             self.assertTrue(lines2[83] ==
                     "I will gladly admit that Python is not the fastest running scripting language.\n",
                     "fileobj.readlines() failed")
-        finally:
-            fobj.close()
 
     def test_fileobj_iter(self):
         self.tar.extract("ustar/regtype", TEMPDIR)
         tarinfo = self.tar.getmember("ustar/regtype")
         with open(os.path.join(TEMPDIR, "ustar/regtype"), "r") as fobj1:
             lines1 = fobj1.readlines()
-        fobj2 = self.tar.extractfile(tarinfo)
-        try:
+        with self.tar.extractfile(tarinfo) as fobj2:
             lines2 = list(io.TextIOWrapper(fobj2))
             self.assertTrue(lines1 == lines2,
                          "fileobj.__iter__() failed")
-        finally:
-            fobj2.close()
 
     def test_fileobj_seek(self):
         self.tar.extract("ustar/regtype", TEMPDIR)
@@ -147,17 +138,24 @@
                      "read() after readline() failed")
         fobj.close()
 
+    def test_fileobj_text(self):
+        with self.tar.extractfile("ustar/regtype") as fobj:
+            fobj = io.TextIOWrapper(fobj)
+            data = fobj.read().encode("iso8859-1")
+            self.assertEqual(md5sum(data), md5_regtype)
+            try:
+                fobj.seek(100)
+            except AttributeError:
+                # Issue #13815: seek() complained about a missing
+                # flush() method.
+                self.fail("seeking failed in text mode")
+
     # Test if symbolic and hard links are resolved by extractfile().  The
     # test link members each point to a regular member whose data is
     # supposed to be exported.
     def _test_fileobj_link(self, lnktype, regtype):
-        a = self.tar.extractfile(lnktype)
-        b = self.tar.extractfile(regtype)
-        try:
+        with self.tar.extractfile(lnktype) as a, self.tar.extractfile(regtype) as b:
             self.assertEqual(a.name, b.name)
-        finally:
-            a.close()
-            b.close()
 
     def test_fileobj_link1(self):
         self._test_fileobj_link("ustar/lnktype", "ustar/regtype")
@@ -265,9 +263,8 @@
             t = tar.next()
             name = t.name
             offset = t.offset
-            f = tar.extractfile(t)
-            data = f.read()
-            f.close()
+            with tar.extractfile(t) as f:
+                data = f.read()
         finally:
             tar.close()
 
@@ -439,27 +436,26 @@
         for tarinfo in self.tar:
             if not tarinfo.isreg():
                 continue
-            fobj = self.tar.extractfile(tarinfo)
-            while True:
-                try:
-                    buf = fobj.read(512)
-                except tarfile.StreamError:
-                    self.fail("simple read-through using TarFile.extractfile() failed")
-                if not buf:
-                    break
-            fobj.close()
+            with self.tar.extractfile(tarinfo) as fobj:
+                while True:
+                    try:
+                        buf = fobj.read(512)
+                    except tarfile.StreamError:
+                        self.fail("simple read-through using TarFile.extractfile() failed")
+                    if not buf:
+                        break
 
     def test_fileobj_regular_file(self):
         tarinfo = self.tar.next() # get "regtype" (can't use getmember)
-        fobj = self.tar.extractfile(tarinfo)
-        data = fobj.read()
+        with self.tar.extractfile(tarinfo) as fobj:
+            data = fobj.read()
         self.assertTrue((len(data), md5sum(data)) == (tarinfo.size, md5_regtype),
                 "regular file extraction failed")
 
     def test_provoke_stream_error(self):
         tarinfos = self.tar.getmembers()
-        f = self.tar.extractfile(tarinfos[0]) # read the first member
-        self.assertRaises(tarfile.StreamError, f.read)
+        with self.tar.extractfile(tarinfos[0]) as f: # read the first member
+            self.assertRaises(tarfile.StreamError, f.read)
 
     def test_compare_members(self):
         tar1 = tarfile.open(tarname, encoding="iso8859-1")
@@ -1484,12 +1480,9 @@
         with tarfile.open(tarname, encoding="iso8859-1") as src:
             t = src.getmember("ustar/regtype")
             t.name = "foo"
-            f = src.extractfile(t)
-            try:
+            with src.extractfile(t) as f:
                 with tarfile.open(self.tarname, mode) as tar:
                     tar.addfile(t, f)
-            finally:
-                f.close()
 
     def _test(self, names=["bar"], fileobj=None):
         with tarfile.open(self.tarname, fileobj=fileobj) as tar:
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -89,6 +89,8 @@
 Library
 -------
 
+- Issue #13815: TarFile.extractfile() now returns io.BufferedReader objects.
+
 - Issue #14371: Support bzip2 in zipfile module.
   Patch by Serhiy Storchaka.
 

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list