[pypy-svn] r17971 - pypy/dist/pypy/interpreter/pyparser
ac at codespeak.net
ac at codespeak.net
Thu Sep 29 14:01:15 CEST 2005
Author: ac
Date: Thu Sep 29 14:01:14 2005
New Revision: 17971
Modified:
pypy/dist/pypy/interpreter/pyparser/pythonlexer.py
pypy/dist/pypy/interpreter/pyparser/pythonparse.py
Log:
Refactor source code encoding processing.
Modified: pypy/dist/pypy/interpreter/pyparser/pythonlexer.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/pythonlexer.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/pythonlexer.py Thu Sep 29 14:01:14 2005
@@ -47,24 +47,6 @@
if encoding != '':
return encoding
return None
-
-def _normalize_encoding(encoding):
- """returns normalized name for <encoding>
-
- see dist/src/Parser/tokenizer.c 'get_normal_name()'
- for implementation details / reference
-
- NOTE: for now, parser.suite() raises a MemoryError when
- a bad encoding is used. (SF bug #979739)
- """
- # lower() + '_' / '-' conversion
- encoding = encoding.replace('_', '-').lower()
- if encoding.startswith('utf-8'):
- return 'utf-8'
- for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:
- if encoding.startswith(variant):
- return 'iso-8859-1'
- return encoding
################################################################################
from pypy.interpreter.pyparser import pytoken
@@ -112,16 +94,11 @@
contline = None
indents = [0]
last_comment = ''
- encoding = None
# make the annotator happy
pos = -1
lines.append('') # XXX HACK probably not needed
# look for the bom (byte-order marker) for utf-8
- # XXX encoding support is incomplete at the moment
- if lines[0].startswith('\xEF\xBB\xBF'):
- lines[0] = lines[0][3:]
- encoding = 'utf-8'
# make the annotator happy
endDFA = automata.DFA([], [])
@@ -175,10 +152,6 @@
if line[pos] == '#':
tok = Token(pytoken.COMMENT, line[pos:])
last_comment = line[pos:]
- if lnum <= 2 and encoding is None:
- encoding = match_encoding_declaration(last_comment)
- if encoding is not None:
- encoding = _normalize_encoding(encoding)
else:
tok = Token(pytoken.NL, line[pos:])
last_comment = ''
@@ -237,10 +210,6 @@
elif initial == '#':
tok = Token(pytoken.COMMENT, token)
last_comment = token
- if lnum <= 2 and encoding is None:
- encoding = match_encoding_declaration(last_comment)
- if encoding is not None:
- encoding = _normalize_encoding(encoding)
# XXX Skip # token_list.append((tok, line, lnum, pos))
# token_list.append((COMMENT, token, spos, epos, line))
elif token in triple_quoted:
@@ -317,15 +286,14 @@
#for t in token_list:
# print '%20s %-25s %d' % (pytoken.tok_name.get(t[0].codename, '?'), t[0], t[-2])
#print '----------------------------------------- pyparser/pythonlexer.py'
- return token_list, encoding
+ return token_list
class PythonSource(TokenSource):
"""This source uses Jonathan's tokenizer"""
def __init__(self, strings, flags=0):
# TokenSource.__init__(self)
- tokens, encoding = generate_tokens(strings, flags)
+ tokens = generate_tokens(strings, flags)
self.token_stack = tokens
- self.encoding = encoding
self._current_line = '' # the current line (as a string)
self._lineno = -1
self._offset = 0
Modified: pypy/dist/pypy/interpreter/pyparser/pythonparse.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/pythonparse.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/pythonparse.py Thu Sep 29 14:01:14 2005
@@ -6,9 +6,10 @@
using file_input, single_input and eval_input targets
"""
from pypy.interpreter.error import OperationError, debug_print
+from pypy.interpreter import gateway
from pypy.interpreter.pyparser.error import ParseError
from pypy.tool.option import Options
-from pythonlexer import Source
+from pythonlexer import Source, match_encoding_declaration
import pysymbol
import ebnfparse
import sys
@@ -27,8 +28,18 @@
def parse_source(self, textsrc, goal, builder, flags=0):
"""Parse a python source according to goal"""
+ # Detect source encoding.
+ if textsrc[:3] == '\xEF\xBB\xBF':
+ textsrc = textsrc[3:]
+ enc = 'utf-8'
+ else:
+ enc = _normalize_encoding(_check_for_encoding(textsrc))
+ if enc is not None and enc not in ('utf-8', 'iso-8859-1'):
+ textsrc = _recode_to_utf8(builder.space, textsrc, enc)
+
lines = [line + '\n' for line in textsrc.split('\n')]
- if textsrc.endswith('\n'):
+ builder.source_encoding = enc
+ if textsrc[-1] == '\n':
lines.pop()
flags &= ~PyCF_DONT_IMPLY_DEDENT
return self.parse_lines(lines, goal, builder, flags)
@@ -37,8 +48,7 @@
goalnumber = pysymbol.sym_values[goal]
target = self.rules[goalnumber]
src = Source(lines, flags)
- builder.source_encoding = src.encoding
-
+
result = target.match(src, builder)
if not result:
line, lineno = src.debug()
@@ -46,6 +56,54 @@
raise ParseError("invalid syntax", lineno, -1, line)
# return None
return builder
+
+app_recode_to_utf8 = gateway.applevel(r'''
+ def app_recode_to_utf8(text, encoding):
+ return unicode(text, encoding).encode("utf-8")
+''').interphook('app_recode_to_utf8')
+
+def _recode_to_utf8(space, text, encoding):
+ return space.str_w(app_recode_to_utf8(space, space.wrap(text),
+ space.wrap(encoding)))
+def _normalize_encoding(encoding):
+ """returns normalized name for <encoding>
+
+ see dist/src/Parser/tokenizer.c 'get_normal_name()'
+ for implementation details / reference
+
+ NOTE: for now, parser.suite() raises a MemoryError when
+ a bad encoding is used. (SF bug #979739)
+ """
+ if encoding is None:
+ return None
+ # lower() + '_' / '-' conversion
+ encoding = encoding.replace('_', '-').lower()
+ if encoding.startswith('utf-8'):
+ return 'utf-8'
+ for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:
+ if encoding.startswith(variant):
+ return 'iso-8859-1'
+ return encoding
+
+def _check_for_encoding(s):
+ eol = s.find('\n')
+ if eol == -1:
+ return _check_line_for_encoding(s)
+ enc = _check_line_for_encoding(s[:eol])
+ eol2 = s.find('\n', eol + 1)
+ if eol2 == -1:
+ return _check_line_for_encoding(s[eol + 1:])
+ return _check_line_for_encoding(s[eol + 1:eol2])
+
+def _check_line_for_encoding(line):
+ """returns the declared encoding or None"""
+ i = 0
+ for i in range(len(line)):
+ if line[i] == '#':
+ break
+ if line[i] not in ' \t\014':
+ return None
+ return match_encoding_declaration(line[i:])
PYTHON_VERSION = ".".join([str(i) for i in sys.version_info[:2]])
def get_grammar_file( version ):
More information about the Pypy-commit
mailing list