[pypy-svn] r17971 - pypy/dist/pypy/interpreter/pyparser
ac at codespeak.net
ac at codespeak.net
Thu Sep 29 14:01:15 CEST 2005
Author: ac
Date: Thu Sep 29 14:01:14 2005
New Revision: 17971
Modified:
pypy/dist/pypy/interpreter/pyparser/pythonlexer.py
pypy/dist/pypy/interpreter/pyparser/pythonparse.py
Log:
Refactor source code encoding processing.
Modified: pypy/dist/pypy/interpreter/pyparser/pythonlexer.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/pythonlexer.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/pythonlexer.py Thu Sep 29 14:01:14 2005
@@ -47,24 +47,6 @@
if encoding != '':
return encoding
return None
-
-def _normalize_encoding(encoding):
- """returns normalized name for <encoding>
-
- see dist/src/Parser/tokenizer.c 'get_normal_name()'
- for implementation details / reference
-
- NOTE: for now, parser.suite() raises a MemoryError when
- a bad encoding is used. (SF bug #979739)
- """
- # lower() + '_' / '-' conversion
- encoding = encoding.replace('_', '-').lower()
- if encoding.startswith('utf-8'):
- return 'utf-8'
- for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:
- if encoding.startswith(variant):
- return 'iso-8859-1'
- return encoding
################################################################################
from pypy.interpreter.pyparser import pytoken
@@ -112,16 +94,11 @@
contline = None
indents = [0]
last_comment = ''
- encoding = None
# make the annotator happy
pos = -1
lines.append('') # XXX HACK probably not needed
# look for the bom (byte-order marker) for utf-8
- # XXX encoding support is incomplete at the moment
- if lines[0].startswith('\xEF\xBB\xBF'):
- lines[0] = lines[0][3:]
- encoding = 'utf-8'
# make the annotator happy
endDFA = automata.DFA([], [])
@@ -175,10 +152,6 @@
if line[pos] == '#':
tok = Token(pytoken.COMMENT, line[pos:])
last_comment = line[pos:]
- if lnum <= 2 and encoding is None:
- encoding = match_encoding_declaration(last_comment)
- if encoding is not None:
- encoding = _normalize_encoding(encoding)
else:
tok = Token(pytoken.NL, line[pos:])
last_comment = ''
@@ -237,10 +210,6 @@
elif initial == '#':
tok = Token(pytoken.COMMENT, token)
last_comment = token
- if lnum <= 2 and encoding is None:
- encoding = match_encoding_declaration(last_comment)
- if encoding is not None:
- encoding = _normalize_encoding(encoding)
# XXX Skip # token_list.append((tok, line, lnum, pos))
# token_list.append((COMMENT, token, spos, epos, line))
elif token in triple_quoted:
@@ -317,15 +286,14 @@
#for t in token_list:
# print '%20s %-25s %d' % (pytoken.tok_name.get(t[0].codename, '?'), t[0], t[-2])
#print '----------------------------------------- pyparser/pythonlexer.py'
- return token_list, encoding
+ return token_list
class PythonSource(TokenSource):
"""This source uses Jonathan's tokenizer"""
def __init__(self, strings, flags=0):
# TokenSource.__init__(self)
- tokens, encoding = generate_tokens(strings, flags)
+ tokens = generate_tokens(strings, flags)
self.token_stack = tokens
- self.encoding = encoding
self._current_line = '' # the current line (as a string)
self._lineno = -1
self._offset = 0
Modified: pypy/dist/pypy/interpreter/pyparser/pythonparse.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/pythonparse.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/pythonparse.py Thu Sep 29 14:01:14 2005
@@ -6,9 +6,10 @@
using file_input, single_input and eval_input targets
"""
from pypy.interpreter.error import OperationError, debug_print
+from pypy.interpreter import gateway
from pypy.interpreter.pyparser.error import ParseError
from pypy.tool.option import Options
-from pythonlexer import Source
+from pythonlexer import Source, match_encoding_declaration
import pysymbol
import ebnfparse
import sys
@@ -27,8 +28,18 @@
def parse_source(self, textsrc, goal, builder, flags=0):
"""Parse a python source according to goal"""
+ # Detect source encoding.
+ if textsrc[:3] == '\xEF\xBB\xBF':
+ textsrc = textsrc[3:]
+ enc = 'utf-8'
+ else:
+ enc = _normalize_encoding(_check_for_encoding(textsrc))
+ if enc is not None and enc not in ('utf-8', 'iso-8859-1'):
+ textsrc = _recode_to_utf8(builder.space, textsrc, enc)
+
lines = [line + '\n' for line in textsrc.split('\n')]
- if textsrc.endswith('\n'):
+ builder.source_encoding = enc
+ if textsrc[-1] == '\n':
lines.pop()
flags &= ~PyCF_DONT_IMPLY_DEDENT
return self.parse_lines(lines, goal, builder, flags)
@@ -37,8 +48,7 @@
goalnumber = pysymbol.sym_values[goal]
target = self.rules[goalnumber]
src = Source(lines, flags)
- builder.source_encoding = src.encoding
-
+
result = target.match(src, builder)
if not result:
line, lineno = src.debug()
@@ -46,6 +56,54 @@
raise ParseError("invalid syntax", lineno, -1, line)
# return None
return builder
+
+app_recode_to_utf8 = gateway.applevel(r'''
+ def app_recode_to_utf8(text, encoding):
+ return unicode(text, encoding).encode("utf-8")
+''').interphook('app_recode_to_utf8')
+
+def _recode_to_utf8(space, text, encoding):
+ return space.str_w(app_recode_to_utf8(space, space.wrap(text),
+ space.wrap(encoding)))
+def _normalize_encoding(encoding):
+ """returns normalized name for <encoding>
+
+ see dist/src/Parser/tokenizer.c 'get_normal_name()'
+ for implementation details / reference
+
+ NOTE: for now, parser.suite() raises a MemoryError when
+ a bad encoding is used. (SF bug #979739)
+ """
+ if encoding is None:
+ return None
+ # lower() + '_' / '-' conversion
+ encoding = encoding.replace('_', '-').lower()
+ if encoding.startswith('utf-8'):
+ return 'utf-8'
+ for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:
+ if encoding.startswith(variant):
+ return 'iso-8859-1'
+ return encoding
+
+def _check_for_encoding(s):
+ eol = s.find('\n')
+ if eol == -1:
+ return _check_line_for_encoding(s)
+ enc = _check_line_for_encoding(s[:eol])
+ eol2 = s.find('\n', eol + 1)
+ if eol2 == -1:
+ return _check_line_for_encoding(s[eol + 1:])
+ return _check_line_for_encoding(s[eol + 1:eol2])
+
+def _check_line_for_encoding(line):
+ """returns the declared encoding or None"""
+ i = 0
+ for i in range(len(line)):
+ if line[i] == '#':
+ break
+ if line[i] not in ' \t\014':
+ return None
+ return match_encoding_declaration(line[i:])
PYTHON_VERSION = ".".join([str(i) for i in sys.version_info[:2]])
def get_grammar_file( version ):
More information about the Pypy-commit
mailing list