[Python-3000-checkins] r62337 - in python/branches/py3k: Lib/tarfile.py Misc/NEWS

lars.gustaebel python-3000-checkins at python.org
Mon Apr 14 12:05:49 CEST 2008


Author: lars.gustaebel
Date: Mon Apr 14 12:05:48 2008
New Revision: 62337

Log:
Issue #2058: Remove the buf attribute and add __slots__ to the
TarInfo class in order to reduce tarfile's memory usage.


Modified:
   python/branches/py3k/Lib/tarfile.py
   python/branches/py3k/Misc/NEWS

Modified: python/branches/py3k/Lib/tarfile.py
==============================================================================
--- python/branches/py3k/Lib/tarfile.py	(original)
+++ python/branches/py3k/Lib/tarfile.py	Mon Apr 14 12:05:48 2008
@@ -767,7 +767,7 @@
         self.fileobj = _FileInFile(tarfile.fileobj,
                                    tarinfo.offset_data,
                                    tarinfo.size,
-                                   getattr(tarinfo, "sparse", None))
+                                   tarinfo.sparse)
         self.name = tarinfo.name
         self.mode = "r"
         self.closed = False
@@ -906,6 +906,12 @@
        usually created internally.
     """
 
+    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
+                 "chksum", "type", "linkname", "uname", "gname",
+                 "devmajor", "devminor",
+                 "offset", "offset_data", "pax_headers", "sparse",
+                 "tarfile", "_sparse_structs", "_link_target")
+
     def __init__(self, name=""):
         """Construct a TarInfo object. name is the optional name
            of the member.
@@ -927,6 +933,7 @@
         self.offset = 0         # the tar header starts here
         self.offset_data = 0    # the file's data starts here
 
+        self.sparse = None      # sparse member information
         self.pax_headers = {}   # pax header information
 
     # In pax headers the "name" and "linkname" field are called
@@ -1181,7 +1188,6 @@
             raise HeaderError("bad checksum")
 
         obj = cls()
-        obj.buf = buf
         obj.name = nts(buf[0:100], encoding, errors)
         obj.mode = nti(buf[100:108])
         obj.uid = nti(buf[108:116])
@@ -1202,6 +1208,24 @@
         if obj.type == AREGTYPE and obj.name.endswith("/"):
             obj.type = DIRTYPE
 
+        # The old GNU sparse format occupies some of the unused
+        # space in the buffer for up to 4 sparse structures.
+        # Save them for later processing in _proc_sparse().
+        if obj.type == GNUTYPE_SPARSE:
+            pos = 386
+            structs = []
+            for i in range(4):
+                try:
+                    offset = nti(buf[pos:pos + 12])
+                    numbytes = nti(buf[pos + 12:pos + 24])
+                except ValueError:
+                    break
+                structs.append((offset, numbytes))
+                pos += 24
+            isextended = bool(buf[482])
+            origsize = nti(buf[483:495])
+            obj._sparse_structs = (structs, isextended, origsize)
+
         # Remove redundant slashes from directories.
         if obj.isdir():
             obj.name = obj.name.rstrip("/")
@@ -1288,31 +1312,11 @@
     def _proc_sparse(self, tarfile):
         """Process a GNU sparse header plus extra headers.
         """
-        buf = self.buf
-        sp = _ringbuffer()
-        pos = 386
-        lastpos = 0
-        realpos = 0
-        # There are 4 possible sparse structs in the
-        # first header.
-        for i in range(4):
-            try:
-                offset = nti(buf[pos:pos + 12])
-                numbytes = nti(buf[pos + 12:pos + 24])
-            except ValueError:
-                break
-            if offset > lastpos:
-                sp.append(_hole(lastpos, offset - lastpos))
-            sp.append(_data(offset, numbytes, realpos))
-            realpos += numbytes
-            lastpos = offset + numbytes
-            pos += 24
-
-        isextended = bool(buf[482])
-        origsize = nti(buf[483:495])
+        # We already collected some sparse structures in frombuf().
+        structs, isextended, origsize = self._sparse_structs
+        del self._sparse_structs
 
-        # If the isextended flag is given,
-        # there are extra headers to process.
+        # Collect sparse structures from extended header blocks.
         while isextended:
             buf = tarfile.fileobj.read(BLOCKSIZE)
             pos = 0
@@ -1322,18 +1326,23 @@
                     numbytes = nti(buf[pos + 12:pos + 24])
                 except ValueError:
                     break
-                if offset > lastpos:
-                    sp.append(_hole(lastpos, offset - lastpos))
-                sp.append(_data(offset, numbytes, realpos))
-                realpos += numbytes
-                lastpos = offset + numbytes
+                structs.append((offset, numbytes))
                 pos += 24
             isextended = bool(buf[504])
 
+        # Transform the sparse structures to something we can use
+        # in ExFileObject.
+        self.sparse = _ringbuffer()
+        lastpos = 0
+        realpos = 0
+        for offset, numbytes in structs:
+            if offset > lastpos:
+                self.sparse.append(_hole(lastpos, offset - lastpos))
+            self.sparse.append(_data(offset, numbytes, realpos))
+            realpos += numbytes
+            lastpos = offset + numbytes
         if lastpos < origsize:
-            sp.append(_hole(lastpos, origsize - lastpos))
-
-        self.sparse = sp
+            self.sparse.append(_hole(lastpos, origsize - lastpos))
 
         self.offset_data = tarfile.fileobj.tell()
         tarfile.offset = self.offset_data + self._block(self.size)

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS	(original)
+++ python/branches/py3k/Misc/NEWS	Mon Apr 14 12:05:48 2008
@@ -29,6 +29,9 @@
 Library
 -------
 
+- Issue #2058: Remove the buf attribute and add __slots__ to the TarInfo
+  class in order to reduce tarfile's memory usage.
+
 - Bug #2606: Avoid calling .sort() on a dict_keys object.
 
 - The bundled libffi copy is now in sync with the recently released


More information about the Python-3000-checkins mailing list