[pypy-svn] pypy compile-from-stream: New function PythonParser.parse_file(); it accepts a "Stream" that provides source lines

amauryfa commits-noreply at bitbucket.org
Mon Mar 21 21:47:46 CET 2011


Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: compile-from-stream
Changeset: r42826:0b31f3c872f9
Date: 2011-03-18 17:25 +0100
http://bitbucket.org/pypy/pypy/changeset/0b31f3c872f9/

Log:	New function PythonParser.parse_file(); it accepts a "Stream" that
	provides source lines
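
	For reference, a minimal sketch of how the new entry point is meant
	to be driven, based on the test added below. The FileStream class
	and the commented-out driver lines are illustrative only, not part
	of this changeset:

	    from pypy.interpreter.pyparser import pyparse

	    class FileStream(pyparse.Stream):
	        "Feeds source lines to PythonParser.parse_file()."
	        def __init__(self, fileobj):
	            self.fileobj = fileobj
	        def readline(self):
	            # Must return '' at EOF, like file.readline()
	            return self.fileobj.readline()
	        def recode_to_utf8(self, line, encoding):
	            # Only called for encodings other than utf-8/iso-8859-1
	            return line.decode(encoding).encode('utf-8')

	    # parser = pyparse.PythonParser(space)
	    # info = pyparse.CompileInfo("<module>", "exec")
	    # tree = parser.parse_file(FileStream(open("module.py")), info)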

diff --git a/pypy/interpreter/pyparser/test/test_pyparse.py b/pypy/interpreter/pyparser/test/test_pyparse.py
--- a/pypy/interpreter/pyparser/test/test_pyparse.py
+++ b/pypy/interpreter/pyparser/test/test_pyparse.py
@@ -61,8 +61,13 @@
         assert exc.msg == "Unknown encoding: not-here"
         input = u"# coding: ascii\n\xe2".encode('utf-8')
         exc = py.test.raises(SyntaxError, self.parse, input).value
-        assert exc.msg == ("'ascii' codec can't decode byte 0xc3 "
-                           "in position 16: ordinal not in range(128)")
+        if isinstance(self, TestPythonFileParser):
+            # incremental decoder works line by line
+            assert exc.msg == ("'ascii' codec can't decode byte 0xc3 "
+                               "in position 0: ordinal not in range(128)")
+        else:
+            assert exc.msg == ("'ascii' codec can't decode byte 0xc3 "
+                               "in position 16: ordinal not in range(128)")
 
     def test_syntax_error(self):
         parse = self.parse
@@ -144,3 +149,31 @@
         self.parse('0b1101')
         self.parse('0b0l')
         py.test.raises(SyntaxError, self.parse, "0b112")
+
+class TestPythonFileParser(TestPythonParser):
+    def parse(self, source, mode="exec", info=None):
+        if info is None:
+            info = pyparse.CompileInfo("<test>", mode)
+
+        space = self.space
+
+        from pypy.interpreter.error import OperationError
+        import StringIO
+
+        class IOStream(pyparse.Stream):
+            def __init__(self, source):
+                self.stream = StringIO.StringIO(source)
+            def readline(self):
+                return self.stream.readline()
+            def recode_to_utf8(self, line, encoding):
+                try:
+                    if encoding is None or encoding in ('utf-8', 'iso-8859-1'):
+                        return line
+                    return line.decode(encoding).encode('utf-8')
+                except LookupError, e:
+                    raise OperationError(space.w_LookupError,
+                                         space.wrap(e.message))
+                except UnicodeDecodeError, e:
+                    raise SyntaxError(str(e)) # The one from pyparser.error!
+
+        return self.parser.parse_file(IOStream(source), info)
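
A standalone illustration of why the expected error message changes in the
hunk above: when the whole source is decoded at once, the offending byte sits
at offset 16 of the text, but when the stream is decoded line by line it sits
at offset 0 of the second line. This is plain CPython 2, independent of the
classes in this patch (tracebacks abbreviated):

    >>> src = u"# coding: ascii\n\xe2".encode('utf-8')
    >>> src.decode('ascii')
    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 16: ordinal not in range(128)
    >>> src.splitlines(True)[1].decode('ascii')
    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)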

diff --git a/pypy/interpreter/pyparser/pyparse.py b/pypy/interpreter/pyparser/pyparse.py
--- a/pypy/interpreter/pyparser/pyparse.py
+++ b/pypy/interpreter/pyparser/pyparse.py
@@ -12,6 +12,7 @@
 def recode_to_utf8(space, text, encoding):
     return space.str_w(_recode_to_utf8(space, space.wrap(text),
                                           space.wrap(encoding)))
+
 def _normalize_encoding(encoding):
     """returns normalized name for <encoding>
 
@@ -33,17 +34,25 @@
             return 'iso-8859-1'
     return encoding
 
-def _check_for_encoding(s):
-    eol = s.find('\n')
+def _check_for_encoding(s1, s2):
+    eol = s1.find('\n')
     if eol < 0:
-        return _check_line_for_encoding(s)
-    enc = _check_line_for_encoding(s[:eol])
+        enc = _check_line_for_encoding(s1)
+    else:
+        enc = _check_line_for_encoding(s1[:eol])
     if enc:
         return enc
-    eol2 = s.find('\n', eol + 1)
-    if eol2 < 0:
-        return _check_line_for_encoding(s[eol + 1:])
-    return _check_line_for_encoding(s[eol + 1:eol2])
+    if eol:
+        if s2:
+            s = s1 + s2
+        else:
+            s = s1
+        eol2 = s.find('\n', eol + 1)
+        if eol2 < 0:
+            return _check_line_for_encoding(s[eol + 1:])
+        return _check_line_for_encoding(s[eol + 1:eol2])
+    elif s2:
+        return _check_line_for_encoding(s2)
 
 
 def _check_line_for_encoding(line):
@@ -86,75 +95,144 @@
 'exec' : pygram.syms.file_input,
 }
 
+class Stream(object):
+    "Pseudo-file object used by PythonParser.parse_file"
+    def readline(self):
+        raise NotImplementedError
+    def recode_to_utf8(self, text, encoding):
+        raise NotImplementedError
+
 class PythonParser(parser.Parser):
 
     def __init__(self, space, grammar=pygram.python_grammar):
         parser.Parser.__init__(self, grammar)
         self.space = space
 
+    def _detect_encoding(self, text1, text2, compile_info):
+        "Detect source encoding from the beginning of the file"
+        if text1.startswith("\xEF\xBB\xBF"):
+            text1 = text1[3:]
+            compile_info.encoding = 'utf-8'
+            # If an encoding is explicitly given check that it is utf-8.
+            decl_enc = _check_for_encoding(text1, text2)
+            if decl_enc and decl_enc != "utf-8":
+                raise error.SyntaxError("UTF-8 BOM with non-utf8 coding cookie",
+                                        filename=compile_info.filename)
+        elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
+            compile_info.encoding = 'utf-8'
+            if _check_for_encoding(text1, text2) is not None:
+                raise error.SyntaxError("coding declaration in unicode string",
+                                        filename=compile_info.filename)
+        else:
+            compile_info.encoding = _normalize_encoding(
+                _check_for_encoding(text1, text2))
+        return text1
+
+    def _decode_error(self, e, compile_info):
+        space = self.space
+        # if the codec is not found, LookupError is raised.  we
+        # check using 'is_w' not to mask potential IndexError or
+        # KeyError
+        if space.is_w(e.w_type, space.w_LookupError):
+            return error.SyntaxError(
+                "Unknown encoding: %s" % compile_info.encoding,
+                filename=compile_info.filename)
+        # Transform unicode errors into SyntaxError
+        if e.match(space, space.w_UnicodeDecodeError):
+            e.normalize_exception(space)
+            w_message = space.str(e.get_w_value(space))
+            return error.SyntaxError(space.str_w(w_message))
+
     def parse_source(self, textsrc, compile_info):
         """Main entry point for parsing Python source.
 
         Everything from decoding the source to tokenizing to building the parse
         tree is handled here.
         """
-        # Detect source encoding.
-        enc = None
-        if textsrc.startswith("\xEF\xBB\xBF"):
-            textsrc = textsrc[3:]
-            enc = 'utf-8'
-            # If an encoding is explicitly given check that it is utf-8.
-            decl_enc = _check_for_encoding(textsrc)
-            if decl_enc and decl_enc != "utf-8":
-                raise error.SyntaxError("UTF-8 BOM with non-utf8 coding cookie",
-                                        filename=compile_info.filename)
-        elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
-            enc = 'utf-8'
-            if _check_for_encoding(textsrc) is not None:
-                raise error.SyntaxError("coding declaration in unicode string",
-                                        filename=compile_info.filename)
-        else:
-            enc = _normalize_encoding(_check_for_encoding(textsrc))
-            if enc is not None and enc not in ('utf-8', 'iso-8859-1'):
-                try:
-                    textsrc = recode_to_utf8(self.space, textsrc, enc)
-                except OperationError, e:
-                    space = self.space
-                    # if the codec is not found, LookupError is raised.  we
-                    # check using 'is_w' not to mask potential IndexError or
-                    # KeyError
-                    if space.is_w(e.w_type, space.w_LookupError):
-                        raise error.SyntaxError("Unknown encoding: %s" % enc,
-                                                filename=compile_info.filename)
-                    # Transform unicode errors into SyntaxError
-                    if e.match(space, space.w_UnicodeDecodeError):
-                        e.normalize_exception(space)
-                        w_message = space.str(e.get_w_value(space))
-                        raise error.SyntaxError(space.str_w(w_message))
+        textsrc = self._detect_encoding(textsrc, None, compile_info)
+
+        enc = compile_info.encoding
+        if enc is not None and enc not in ('utf-8', 'iso-8859-1'):
+            try:
+                textsrc = recode_to_utf8(self.space, textsrc, enc)
+            except OperationError, e:
+                operror = self._decode_error(e, compile_info)
+                if operror:
+                    raise operror
+                else:
                     raise
 
-        flags = compile_info.flags
-
-        if flags & consts.CO_FUTURE_PRINT_FUNCTION:
-            self.grammar = pygram.python_grammar_no_print
-        else:
-            self.grammar = pygram.python_grammar
         source_lines = textsrc.splitlines(True)
 
-        if textsrc and textsrc[-1] == "\n":
-            compile_info.flags &= ~consts.PyCF_DONT_IMPLY_DEDENT
+        return self.build_tree(source_lines, compile_info)
 
-        if enc is not None:
-            compile_info.encoding = enc
+    def parse_file(self, stream, compile_info):
+        assert isinstance(stream, Stream)
+
+        firstline = stream.readline()
+        secondline = None
+        if firstline:
+            secondline = stream.readline()
+            if secondline:
+                firstline = self._detect_encoding(
+                    firstline, secondline, compile_info)
+            else:
+                firstline = self._detect_encoding(
+                    firstline, '', compile_info)
+
+        enc = compile_info.encoding
+        if enc in ('utf-8', 'iso-8859-1'):
+            enc = None # No need to recode
+
+        source_lines = []
+
+        if enc is None:
+            if firstline:
+                source_lines.append(firstline)
+            if secondline:
+                source_lines.append(secondline)
+            while True:
+                line = stream.readline()
+                if not line:
+                    break
+                source_lines.append(line)
+        else:
+            try:
+                if firstline:
+                    source_lines.append(stream.recode_to_utf8(firstline, enc))
+                if secondline:
+                    source_lines.append(stream.recode_to_utf8(secondline, enc))
+
+                while True:
+                    line = stream.readline()
+                    if not line:
+                        break
+                    source_lines.append(stream.recode_to_utf8(line, enc))
+            except OperationError, e:
+                operror = self._decode_error(e, compile_info)
+                if operror:
+                    raise operror
+                else:
+                    raise
 
         return self.build_tree(source_lines, compile_info)
 
     def build_tree(self, source_lines, compile_info):
         """Builds the parse tree from a list of source lines"""
 
-        # The tokenizer is very picky about how it wants its input.
-        if source_lines and not source_lines[-1].endswith("\n"):
-            source_lines[-1] += '\n'
+        if compile_info.flags & consts.CO_FUTURE_PRINT_FUNCTION:
+            self.grammar = pygram.python_grammar_no_print
+        else:
+            self.grammar = pygram.python_grammar
+
+        if source_lines and source_lines[-1]:
+            last_line = source_lines[-1]
+            if last_line:
+                if last_line[-1] == "\n":
+                    compile_info.flags &= ~consts.PyCF_DONT_IMPLY_DEDENT
+                else:
+                    # The tokenizer is very picky about how it wants its input.
+                    source_lines[-1] += '\n'
 
         self.prepare(_targets[compile_info.mode])
         tp = 0
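
For illustration, the per-line read-and-recode loop that parse_file performs
above can be sketched in plain Python 2, independently of the pypy classes in
this diff; the helper name recode_lines is made up for this note:

    import StringIO

    def recode_lines(readline, encoding):
        # Read lines until EOF, converting each one to utf-8 when the
        # declared encoding is neither utf-8 nor iso-8859-1.
        lines = []
        while True:
            line = readline()
            if not line:
                break
            if encoding not in (None, 'utf-8', 'iso-8859-1'):
                line = line.decode(encoding).encode('utf-8')
            lines.append(line)
        return lines

    f = StringIO.StringIO("# -*- coding: latin-1 -*-\nx = '\xe9'\n")
    print recode_lines(f.readline, 'latin-1')
    # ['# -*- coding: latin-1 -*-\n', "x = '\xc3\xa9'\n"]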

