[Python-checkins] r77288 - in python/trunk: Lib/gzip.py Lib/test/test_gzip.py Misc/NEWS

antoine.pitrou python-checkins at python.org
Sun Jan 3 23:29:56 CET 2010


Author: antoine.pitrou
Date: Sun Jan  3 23:29:56 2010
New Revision: 77288

Log:
Issue #7471: Improve the performance of GzipFile's buffering mechanism,
and make it implement the `io.BufferedIOBase` ABC to allow for further
speedups by wrapping it in an `io.BufferedReader`.  Patch by Nir Aides.



Modified:
   python/trunk/Lib/gzip.py
   python/trunk/Lib/test/test_gzip.py
   python/trunk/Misc/NEWS

Modified: python/trunk/Lib/gzip.py
==============================================================================
--- python/trunk/Lib/gzip.py	(original)
+++ python/trunk/Lib/gzip.py	Sun Jan  3 23:29:56 2010
@@ -7,6 +7,7 @@
 
 import struct, sys, time, os
 import zlib
+import io
 import __builtin__
 
 __all__ = ["GzipFile","open"]
@@ -32,7 +33,7 @@
     """
     return GzipFile(filename, mode, compresslevel)
 
-class GzipFile:
+class GzipFile(io.BufferedIOBase):
     """The GzipFile class simulates most of the methods of a file object with
     the exception of the readinto() and truncate() methods.
 
@@ -97,8 +98,12 @@
             self.mode = READ
             # Set flag indicating start of a new member
             self._new_member = True
+            # Buffer data read from gzip file. extrastart is offset in
+            # stream where buffer starts. extrasize is number of
+            # bytes remaining in buffer from current stream position.
             self.extrabuf = ""
             self.extrasize = 0
+            self.extrastart = 0
             self.name = filename
             # Starts small, scales exponentially
             self.min_readsize = 100
@@ -196,7 +201,6 @@
         if flag & FHCRC:
             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 
-
     def write(self,data):
         if self.mode != WRITE:
             import errno
@@ -204,12 +208,19 @@
 
         if self.fileobj is None:
             raise ValueError, "write() on closed GzipFile object"
+
+        # Convert data type if called by io.BufferedWriter.
+        if isinstance(data, memoryview):
+            data = data.tobytes()
+
         if len(data) > 0:
             self.size = self.size + len(data)
             self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
             self.fileobj.write( self.compress.compress(data) )
             self.offset += len(data)
 
+        return len(data)
+
     def read(self, size=-1):
         if self.mode != READ:
             import errno
@@ -235,15 +246,14 @@
                 if size > self.extrasize:
                     size = self.extrasize
 
-        chunk = self.extrabuf[:size]
-        self.extrabuf = self.extrabuf[size:]
+        offset = self.offset - self.extrastart
+        chunk = self.extrabuf[offset: offset + size]
         self.extrasize = self.extrasize - size
 
         self.offset += size
         return chunk
 
     def _unread(self, buf):
-        self.extrabuf = buf + self.extrabuf
         self.extrasize = len(buf) + self.extrasize
         self.offset -= len(buf)
 
@@ -299,8 +309,10 @@
 
     def _add_read_data(self, data):
         self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
-        self.extrabuf = self.extrabuf + data
+        offset = self.offset - self.extrastart
+        self.extrabuf = self.extrabuf[offset:] + data
         self.extrasize = self.extrasize + len(data)
+        self.extrastart = self.offset
         self.size = self.size + len(data)
 
     def _read_eof(self):
@@ -318,6 +330,10 @@
         elif isize != (self.size & 0xffffffffL):
             raise IOError, "Incorrect length of data produced"
 
+    @property
+    def closed(self):
+        return self.fileobj is None
+
     def close(self):
         if self.fileobj is None:
             return
@@ -333,15 +349,6 @@
             self.myfileobj.close()
             self.myfileobj = None
 
-    def __del__(self):
-        try:
-            if (self.myfileobj is None and
-                self.fileobj is None):
-                return
-        except AttributeError:
-            return
-        self.close()
-
     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
         if self.mode == WRITE:
             # Ensure the compressor's buffer is flushed
@@ -356,12 +363,6 @@
         """
         return self.fileobj.fileno()
 
-    def isatty(self):
-        return False
-
-    def tell(self):
-        return self.offset
-
     def rewind(self):
         '''Return the uncompressed stream file position indicator to the
         beginning of the file'''
@@ -371,8 +372,18 @@
         self._new_member = True
         self.extrabuf = ""
         self.extrasize = 0
+        self.extrastart = 0
         self.offset = 0
 
+    def readable(self):
+        return self.mode == READ
+
+    def writable(self):
+        return self.mode == WRITE
+
+    def seekable(self):
+        return True
+
     def seek(self, offset, whence=0):
         if whence:
             if whence == 1:
@@ -395,8 +406,18 @@
                 self.read(1024)
             self.read(count % 1024)
 
+        return self.offset
+
     def readline(self, size=-1):
         if size < 0:
+            # Shortcut common case - newline found in buffer.
+            offset = self.offset - self.extrastart
+            i = self.extrabuf.find('\n', offset) + 1
+            if i > 0:
+                self.extrasize -= i - offset
+                self.offset += i - offset
+                return self.extrabuf[offset: i]
+
             size = sys.maxint
             readsize = self.min_readsize
         else:
@@ -426,42 +447,6 @@
             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
         return ''.join(bufs) # Return resulting line
 
-    def readlines(self, sizehint=0):
-        # Negative numbers result in reading all the lines
-        if sizehint <= 0:
-            sizehint = sys.maxint
-        L = []
-        while sizehint > 0:
-            line = self.readline()
-            if line == "":
-                break
-            L.append(line)
-            sizehint = sizehint - len(line)
-
-        return L
-
-    def writelines(self, L):
-        for line in L:
-            self.write(line)
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        line = self.readline()
-        if line:
-            return line
-        else:
-            raise StopIteration
-
-    def __enter__(self):
-        if self.fileobj is None:
-            raise ValueError("I/O operation on closed GzipFile object")
-        return self
-
-    def __exit__(self, *args):
-        self.close()
-
 
 def _test():
     # Act like gzip; with -d, act like gunzip.

Modified: python/trunk/Lib/test/test_gzip.py
==============================================================================
--- python/trunk/Lib/test/test_gzip.py	(original)
+++ python/trunk/Lib/test/test_gzip.py	Sun Jan  3 23:29:56 2010
@@ -5,6 +5,7 @@
 import unittest
 from test import test_support
 import os
+import io
 import struct
 gzip = test_support.import_module('gzip')
 
@@ -80,6 +81,16 @@
         zgfile.close()
         self.assertEquals(contents, 'a'*201)
 
+    def test_buffered_reader(self):
+        # Issue #7471: a GzipFile can be wrapped in a BufferedReader for
+        # performance.
+        self.test_write()
+
+        f = gzip.GzipFile(self.filename, 'rb')
+        with io.BufferedReader(f) as r:
+            lines = [line for line in r]
+
+        self.assertEqual(lines, 50 * data1.splitlines(True))
 
     def test_readline(self):
         self.test_write()

Modified: python/trunk/Misc/NEWS
==============================================================================
--- python/trunk/Misc/NEWS	(original)
+++ python/trunk/Misc/NEWS	Sun Jan  3 23:29:56 2010
@@ -62,7 +62,11 @@
 Library
 -------
 
-_ Issue #3972: httplib.HTTPConnection now accepts an optional source_address
+- Issue #7471: Improve the performance of GzipFile's buffering mechanism,
+  and make it implement the `io.BufferedIOBase` ABC to allow for further
+  speedups by wrapping it in an `io.BufferedReader`.  Patch by Nir Aides.
+
+- Issue #3972: httplib.HTTPConnection now accepts an optional source_address
   parameter to allow specifying where your connections come from.
 
 - socket.create_connection now accepts an optional source_address parameter.


More information about the Python-checkins mailing list