[Python-checkins] cpython (merge 3.5 -> default): Issue #15068: Got rid of excessive buffering in fileinput.

serhiy.storchaka python-checkins at python.org
Tue Mar 8 11:37:17 EST 2016


https://hg.python.org/cpython/rev/fefedbaac640
changeset:   100458:fefedbaac640
parent:      100456:da020e408c7f
parent:      100457:9ead3a6c5f81
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Tue Mar 08 18:35:19 2016 +0200
summary:
  Issue #15068: Got rid of excessive buffering in fileinput.
The bufsize parameter is now deprecated and ignored.

files:
  Doc/library/fileinput.rst  |    7 +-
  Lib/fileinput.py           |  166 +++++++++++-------------
  Lib/test/test_fileinput.py |   79 +++++++++++-
  Misc/NEWS                  |    3 +
  4 files changed, 160 insertions(+), 95 deletions(-)


diff --git a/Doc/library/fileinput.rst b/Doc/library/fileinput.rst
--- a/Doc/library/fileinput.rst
+++ b/Doc/library/fileinput.rst
@@ -71,6 +71,8 @@
    .. versionchanged:: 3.2
       Can be used as a context manager.
 
+   .. deprecated-removed:: 3.6 3.8
+      The *bufsize* parameter.
 
 The following functions use the global state created by :func:`fileinput.input`;
 if there is no active state, :exc:`RuntimeError` is raised.
@@ -161,7 +163,10 @@
       Can be used as a context manager.
 
    .. deprecated:: 3.4
-        The ``'rU'`` and ``'U'`` modes.
+      The ``'rU'`` and ``'U'`` modes.
+
+   .. deprecated-removed:: 3.6 3.8
+      The *bufsize* parameter.
 
 
 **Optional in-place filtering:** if the keyword argument ``inplace=True`` is
diff --git a/Lib/fileinput.py b/Lib/fileinput.py
--- a/Lib/fileinput.py
+++ b/Lib/fileinput.py
@@ -64,13 +64,6 @@
 disabled when standard input is read.  XXX The current implementation
 does not work for MS-DOS 8+3 filesystems.
 
-Performance: this module is unfortunately one of the slower ways of
-processing large numbers of input lines.  Nevertheless, a significant
-speed-up has been obtained by using readlines(bufsize) instead of
-readline().  A new keyword argument, bufsize=N, is present on the
-input() function and the FileInput() class to override the default
-buffer size.
-
 XXX Possible additions:
 
 - optional getopt argument processing
@@ -87,8 +80,6 @@
 
 _state = None
 
-DEFAULT_BUFSIZE = 8*1024
-
 def input(files=None, inplace=False, backup="", bufsize=0,
           mode="r", openhook=None):
     """Return an instance of the FileInput class, which can be iterated.
@@ -208,17 +199,19 @@
         self._files = files
         self._inplace = inplace
         self._backup = backup
-        self._bufsize = bufsize or DEFAULT_BUFSIZE
+        if bufsize:
+            import warnings
+            warnings.warn('bufsize is deprecated and ignored',
+                          DeprecationWarning, stacklevel=2)
         self._savestdout = None
         self._output = None
         self._filename = None
-        self._lineno = 0
+        self._startlineno = 0
         self._filelineno = 0
         self._file = None
+        self._readline = self._start_readline
         self._isstdin = False
         self._backupfilename = None
-        self._buffer = []
-        self._bufindex = 0
         # restrict mode argument to reading modes
         if mode not in ('r', 'rU', 'U', 'rb'):
             raise ValueError("FileInput opening mode must be one of "
@@ -254,22 +247,18 @@
         return self
 
     def __next__(self):
-        try:
-            line = self._buffer[self._bufindex]
-        except IndexError:
-            pass
-        else:
-            self._bufindex += 1
-            self._lineno += 1
+        line = self._readline()
+        if line:
             self._filelineno += 1
             return line
-        line = self.readline()
-        if not line:
+        if not self._file:
             raise StopIteration
-        return line
+        self.nextfile()
+        # Recursive call
+        return self.__next__()
 
     def __getitem__(self, i):
-        if i != self._lineno:
+        if i != self.lineno():
             raise RuntimeError("accessing lines out of order")
         try:
             return self.__next__()
@@ -290,6 +279,7 @@
         finally:
             file = self._file
             self._file = None
+            self._readline = self._start_readline
             try:
                 if file and not self._isstdin:
                     file.close()
@@ -301,85 +291,81 @@
                     except OSError: pass
 
                 self._isstdin = False
-                self._buffer = []
-                self._bufindex = 0
 
     def readline(self):
-        try:
-            line = self._buffer[self._bufindex]
-        except IndexError:
-            pass
+        while True:
+            line = self._readline()
+            if line:
+                self._filelineno += 1
+                return line
+            if not self._file:
+                return line
+            self.nextfile()
+            # repeat with next file
+
+    def _start_readline(self):
+        if not self._files:
+            if 'b' in self._mode:
+                return b''
+            else:
+                return ''
+        self._filename = self._files[0]
+        self._files = self._files[1:]
+        self._startlineno = self.lineno()
+        self._filelineno = 0
+        self._file = None
+        self._isstdin = False
+        self._backupfilename = 0
+        if self._filename == '-':
+            self._filename = '<stdin>'
+            if 'b' in self._mode:
+                self._file = getattr(sys.stdin, 'buffer', sys.stdin)
+            else:
+                self._file = sys.stdin
+            self._isstdin = True
         else:
-            self._bufindex += 1
-            self._lineno += 1
-            self._filelineno += 1
-            return line
-        if not self._file:
-            if not self._files:
-                if 'b' in self._mode:
-                    return b''
+            if self._inplace:
+                self._backupfilename = (
+                    self._filename + (self._backup or ".bak"))
+                try:
+                    os.unlink(self._backupfilename)
+                except OSError:
+                    pass
+                # The next few lines may raise OSError
+                os.rename(self._filename, self._backupfilename)
+                self._file = open(self._backupfilename, self._mode)
+                try:
+                    perm = os.fstat(self._file.fileno()).st_mode
+                except OSError:
+                    self._output = open(self._filename, "w")
                 else:
-                    return ''
-            self._filename = self._files[0]
-            self._files = self._files[1:]
-            self._filelineno = 0
-            self._file = None
-            self._isstdin = False
-            self._backupfilename = 0
-            if self._filename == '-':
-                self._filename = '<stdin>'
-                if 'b' in self._mode:
-                    self._file = getattr(sys.stdin, 'buffer', sys.stdin)
-                else:
-                    self._file = sys.stdin
-                self._isstdin = True
-            else:
-                if self._inplace:
-                    self._backupfilename = (
-                        self._filename + (self._backup or ".bak"))
+                    mode = os.O_CREAT | os.O_WRONLY | os.O_TRUNC
+                    if hasattr(os, 'O_BINARY'):
+                        mode |= os.O_BINARY
+
+                    fd = os.open(self._filename, mode, perm)
+                    self._output = os.fdopen(fd, "w")
                     try:
-                        os.unlink(self._backupfilename)
+                        if hasattr(os, 'chmod'):
+                            os.chmod(self._filename, perm)
                     except OSError:
                         pass
-                    # The next few lines may raise OSError
-                    os.rename(self._filename, self._backupfilename)
-                    self._file = open(self._backupfilename, self._mode)
-                    try:
-                        perm = os.fstat(self._file.fileno()).st_mode
-                    except OSError:
-                        self._output = open(self._filename, "w")
-                    else:
-                        mode = os.O_CREAT | os.O_WRONLY | os.O_TRUNC
-                        if hasattr(os, 'O_BINARY'):
-                            mode |= os.O_BINARY
-
-                        fd = os.open(self._filename, mode, perm)
-                        self._output = os.fdopen(fd, "w")
-                        try:
-                            if hasattr(os, 'chmod'):
-                                os.chmod(self._filename, perm)
-                        except OSError:
-                            pass
-                    self._savestdout = sys.stdout
-                    sys.stdout = self._output
+                self._savestdout = sys.stdout
+                sys.stdout = self._output
+            else:
+                # This may raise OSError
+                if self._openhook:
+                    self._file = self._openhook(self._filename, self._mode)
                 else:
-                    # This may raise OSError
-                    if self._openhook:
-                        self._file = self._openhook(self._filename, self._mode)
-                    else:
-                        self._file = open(self._filename, self._mode)
-        self._buffer = self._file.readlines(self._bufsize)
-        self._bufindex = 0
-        if not self._buffer:
-            self.nextfile()
-        # Recursive call
-        return self.readline()
+                    self._file = open(self._filename, self._mode)
+        self._readline = self._file.readline
+        return self._readline()
 
     def filename(self):
         return self._filename
 
     def lineno(self):
-        return self._lineno
+        return self._startlineno + self._filelineno
 
     def filelineno(self):
         return self._filelineno
diff --git a/Lib/test/test_fileinput.py b/Lib/test/test_fileinput.py
--- a/Lib/test/test_fileinput.py
+++ b/Lib/test/test_fileinput.py
@@ -47,6 +47,42 @@
         if name:
             safe_unlink(name)
 
+class LineReader:
+
+    def __init__(self):
+        self._linesread = []
+
+    @property
+    def linesread(self):
+        try:
+            return self._linesread[:]
+        finally:
+            self._linesread = []
+
+    def openhook(self, filename, mode):
+        self.it = iter(filename.splitlines(True))
+        return self
+
+    def readline(self, size=None):
+        line = next(self.it, '')
+        self._linesread.append(line)
+        return line
+
+    def readlines(self, hint=-1):
+        lines = []
+        size = 0
+        while True:
+            line = self.readline()
+            if not line:
+                return lines
+            lines.append(line)
+            size += len(line)
+            if size >= hint:
+                return lines
+
+    def close(self):
+        pass
+
 class BufferSizesTests(unittest.TestCase):
     def test_buffer_sizes(self):
         # First, run the tests with default and teeny buffer size.
@@ -57,7 +93,11 @@
                 t2 = writeTmp(2, ["Line %s of file 2\n" % (i+1) for i in range(10)])
                 t3 = writeTmp(3, ["Line %s of file 3\n" % (i+1) for i in range(5)])
                 t4 = writeTmp(4, ["Line %s of file 4\n" % (i+1) for i in range(1)])
-                self.buffer_size_test(t1, t2, t3, t4, bs, round)
+                if bs:
+                    with self.assertWarns(DeprecationWarning):
+                        self.buffer_size_test(t1, t2, t3, t4, bs, round)
+                else:
+                    self.buffer_size_test(t1, t2, t3, t4, bs, round)
             finally:
                 remove_tempfiles(t1, t2, t3, t4)
 
@@ -290,7 +330,7 @@
         self.addCleanup(safe_unlink, TESTFN)
 
         with FileInput(files=TESTFN,
-                       openhook=hook_encoded('ascii'), bufsize=8) as fi:
+                       openhook=hook_encoded('ascii')) as fi:
             try:
                 self.assertEqual(fi.readline(), 'A\n')
                 self.assertEqual(fi.readline(), 'B\n')
@@ -458,6 +498,38 @@
 
         self.assertEqual(result, -1, "fileno() should return -1")
 
+    def test_readline_buffering(self):
+        src = LineReader()
+        with FileInput(files=['line1\nline2', 'line3\n'],
+                       openhook=src.openhook) as fi:
+            self.assertEqual(src.linesread, [])
+            self.assertEqual(fi.readline(), 'line1\n')
+            self.assertEqual(src.linesread, ['line1\n'])
+            self.assertEqual(fi.readline(), 'line2')
+            self.assertEqual(src.linesread, ['line2'])
+            self.assertEqual(fi.readline(), 'line3\n')
+            self.assertEqual(src.linesread, ['', 'line3\n'])
+            self.assertEqual(fi.readline(), '')
+            self.assertEqual(src.linesread, [''])
+            self.assertEqual(fi.readline(), '')
+            self.assertEqual(src.linesread, [])
+
+    def test_iteration_buffering(self):
+        src = LineReader()
+        with FileInput(files=['line1\nline2', 'line3\n'],
+                       openhook=src.openhook) as fi:
+            self.assertEqual(src.linesread, [])
+            self.assertEqual(next(fi), 'line1\n')
+            self.assertEqual(src.linesread, ['line1\n'])
+            self.assertEqual(next(fi), 'line2')
+            self.assertEqual(src.linesread, ['line2'])
+            self.assertEqual(next(fi), 'line3\n')
+            self.assertEqual(src.linesread, ['', 'line3\n'])
+            self.assertRaises(StopIteration, next, fi)
+            self.assertEqual(src.linesread, [''])
+            self.assertRaises(StopIteration, next, fi)
+            self.assertEqual(src.linesread, [])
+
 class MockFileInput:
     """A class that mocks out fileinput.FileInput for use during unit tests"""
 
@@ -917,8 +989,7 @@
 class MiscTest(unittest.TestCase):
 
     def test_all(self):
-        blacklist = {'DEFAULT_BUFSIZE'}
-        support.check__all__(self, fileinput, blacklist=blacklist)
+        support.check__all__(self, fileinput)
 
 
 if __name__ == "__main__":
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -201,6 +201,9 @@
 Library
 -------
 
+- Issue #15068: Got rid of excessive buffering in fileinput.
+  The bufsize parameter is now deprecated and ignored.
+
 - Issue #19475: Added an optional argument timespec to the datetime
   isoformat() method to choose the precision of the time component.
 

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list