[pypy-svn] r71038 - in pypy/trunk/pypy/module/_codecs: . test
cfbolz at codespeak.net
cfbolz at codespeak.net
Mon Feb 1 18:10:10 CET 2010
Author: cfbolz
Date: Mon Feb 1 18:10:08 2010
New Revision: 71038
Modified:
pypy/trunk/pypy/module/_codecs/__init__.py
pypy/trunk/pypy/module/_codecs/app_codecs.py
pypy/trunk/pypy/module/_codecs/interp_codecs.py
pypy/trunk/pypy/module/_codecs/test/test_codecs.py
Log:
Move charmap_decode from app-level to interp-level (I chose this one because
html5lib uses it a lot). More of these codecs should be rewritten that way.
Modified: pypy/trunk/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/__init__.py (original)
+++ pypy/trunk/pypy/module/_codecs/__init__.py Mon Feb 1 18:10:08 2010
@@ -5,7 +5,6 @@
appleveldefs = {
'__doc__' : 'app_codecs.__doc__',
'__name__' : 'app_codecs.__name__',
- 'charmap_decode' : 'app_codecs.charmap_decode',
'charmap_encode' : 'app_codecs.charmap_encode',
'escape_decode' : 'app_codecs.escape_decode',
'escape_encode' : 'app_codecs.escape_encode',
@@ -44,6 +43,7 @@
'utf_16_ex_decode' : 'interp_codecs.utf_16_ex_decode',
'charbuffer_encode': 'interp_codecs.buffer_encode',
'readbuffer_encode': 'interp_codecs.buffer_encode',
+ 'charmap_decode' : 'interp_codecs.charmap_decode',
}
def __init__(self, space, *args):
Modified: pypy/trunk/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/app_codecs.py (original)
+++ pypy/trunk/pypy/module/_codecs/app_codecs.py Mon Feb 1 18:10:08 2010
@@ -39,7 +39,9 @@
Copyright (c) Corporation for National Research Initiatives.
"""
-#from unicodecodec import *
+
+# XXX move some of these functions to RPython (like charmap_encode,
+# charmap_build) to make them faster
import sys
@@ -201,13 +203,6 @@
res = ''.join(res)
return res, len(data)
-def charmap_decode( data, errors='strict', mapping=None):
- """None
- """
- res = PyUnicode_DecodeCharmap(data, mapping, errors)
- res = u''.join(res)
- return res, len(data)
-
def utf_7_encode( obj, errors='strict'):
"""None
@@ -841,44 +836,6 @@
inpos += 1
return res
-def PyUnicode_DecodeCharmap(s, mapping, errors):
-
- size = len(s)
-## /* Default to Latin-1 */
- if mapping is None:
- import _codecs
- return _codecs.latin_1_decode(s, errors)[0]
-
- if (size == 0):
- return u''
- p = []
- inpos = 0
- while (inpos< len(s)):
-
- #/* Get mapping (char ordinal -> integer, Unicode char or None) */
- ch = s[inpos]
- try:
- x = mapping[ord(ch)]
- if isinstance(x, int):
- if x < 65536:
- p += unichr(x)
- else:
- raise TypeError("character mapping must be in range(65536)")
- elif isinstance(x, unicode):
- if x == u"\ufffe":
- raise KeyError
- p += x
- elif not x:
- raise KeyError
- else:
- raise TypeError
- inpos += 1
- except (KeyError, IndexError):
- next, inpos = unicode_call_errorhandler(errors, "charmap",
- "character maps to <undefined>", s, inpos, inpos+1)
- p += next
- inpos
- return p
def PyUnicode_DecodeRawUnicodeEscape(s, size, errors):
Modified: pypy/trunk/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/interp_codecs.py (original)
+++ pypy/trunk/pypy/module/_codecs/interp_codecs.py Mon Feb 1 18:10:08 2010
@@ -1,6 +1,7 @@
from pypy.interpreter.error import OperationError, operationerrfmt
from pypy.interpreter.gateway import ObjSpace, NoneNotWrapped
from pypy.interpreter.baseobjspace import W_Root
+from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
class CodecState(object):
def __init__(self, space):
@@ -272,3 +273,77 @@
space.wrap(byteorder)])
utf_16_ex_decode.unwrap_spec = [ObjSpace, str, str, int, W_Root]
+def _extract_from_mapping(space, mapping_w, w_mapping, ch):
+ if mapping_w is not None:
+ try:
+ return mapping_w[ord(ch)]
+ except IndexError:
+ pass
+ else:
+ try:
+ return space.getitem(w_mapping, space.newint(ord(ch)))
+ except OperationError, e:
+ if (not e.match(space, space.w_KeyError) and
+ not e.match(space, space.w_IndexError)):
+ raise
+ pass
+
+def _append_unicode(space, builder, w_x):
+ try:
+ x = space.unicode_w(w_x)
+ except OperationError, e:
+ if not e.match(space, space.w_TypeError):
+ raise
+ else:
+ if x != u"\ufffe":
+ builder.append(x)
+ return True
+ return False
+ try:
+ x = space.int_w(w_x)
+ except OperationError:
+ if not e.match(space, space.w_TypeError):
+ raise
+ else:
+ if x < 65536:
+ builder.append(unichr(x))
+ else:
+ raise OperationError(space.w_TypeError, space.wrap("character mapping must be in range(65536)"))
+ return True
+ if not space.is_true(w_x):
+ return False
+ else:
+ raise OperationError(space.w_TypeError, space.w_None)
+
+
+def charmap_decode(space, s, errors="strict", w_mapping=None):
+ size = len(s)
+## /* Default to Latin-1 */
+ if space.is_true(space.is_(w_mapping, space.w_None)):
+ return latin_1_decode(space, s, errors, space.w_False)
+
+ if (size == 0):
+ return space.wrap(u'')
+
+ # fast path for all the stuff in the encodings module
+ if space.is_true(space.isinstance(w_mapping, space.w_tuple)):
+ mapping_w = space.fixedview(w_mapping)
+ else:
+ mapping_w = None
+
+ builder = UnicodeBuilder(size)
+ inpos = 0
+ while (inpos < len(s)):
+ #/* Get mapping_w (char ordinal -> integer, Unicode char or None) */
+ ch = s[inpos]
+ w_x = _extract_from_mapping(space, mapping_w, w_mapping, ch)
+ if w_x is not None and _append_unicode(space, builder, w_x):
+ inpos += 1
+ continue
+ state = space.fromcache(CodecState)
+ next, inpos = state.decode_error_handler(errors, "charmap",
+ "character maps to <undefined>", s, inpos, inpos+1)
+ builder.append(next)
+ res = builder.build()
+ return space.newtuple([space.wrap(res), space.wrap(size)])
+charmap_decode.unwrap_spec = [ObjSpace, str, str, W_Root]
Modified: pypy/trunk/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/test/test_codecs.py (original)
+++ pypy/trunk/pypy/module/_codecs/test/test_codecs.py Mon Feb 1 18:10:08 2010
@@ -1,7 +1,7 @@
import autopath
from pypy.conftest import gettestobjspace
from pypy.module._codecs.app_codecs import unicode_escape_encode,\
- charmap_encode, charmap_decode, unicode_escape_decode
+ charmap_encode, unicode_escape_decode
class AppTestCodecs:
@@ -116,6 +116,14 @@
raises (ValueError, test.decode,'string-escape')
+ def test_charmap_decode(self):
+ # charmap_decode is imported from the _codecs module itself
+ from _codecs import charmap_decode
+ # no mapping argument
+ assert charmap_decode('xxx') == ('xxx', 3)
+ # dict mapping: one byte maps to two characters, the reported length stays 3
+ assert charmap_decode('xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3)
+ # tuple mapping with 256 entries, indexed by byte value
+ map = tuple([unichr(i) for i in range(256)])
+ assert charmap_decode('xxx\xff', 'strict', map) == (u'xxx\xff', 4)
+
+
class AppTestPartialEvaluation:
def test_partial_utf8(self):
@@ -551,10 +559,6 @@
assert charmap_encode(u'xxx') == ('xxx', 3)
assert charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 6)
- def test_charmap_decode(self):
- assert charmap_decode('xxx') == ('xxx', 3)
- assert charmap_decode('xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3)
-
def test_unicode_escape(self):
assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
More information about the Pypy-commit
mailing list