[pypy-svn] r75655 - in pypy/branch/interplevel-codecs/pypy: module/_codecs module/_codecs/test rlib
afa at codespeak.net
afa at codespeak.net
Tue Jun 29 13:20:26 CEST 2010
Author: afa
Date: Tue Jun 29 13:20:24 2010
New Revision: 75655
Modified:
pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
Log:
rewrite the "charmap" codec at interp-level
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py Tue Jun 29 13:20:24 2010
@@ -6,7 +6,6 @@
appleveldefs = {
'__doc__' : 'app_codecs.__doc__',
'__name__' : 'app_codecs.__name__',
- 'charmap_encode' : 'app_codecs.charmap_encode',
'escape_decode' : 'app_codecs.escape_decode',
'escape_encode' : 'app_codecs.escape_encode',
'unicode_internal_decode' : 'app_codecs.unicode_internal_decode',
@@ -40,6 +39,7 @@
'charbuffer_encode': 'interp_codecs.buffer_encode',
'readbuffer_encode': 'interp_codecs.buffer_encode',
'charmap_decode' : 'interp_codecs.charmap_decode',
+ 'charmap_encode' : 'interp_codecs.charmap_encode',
'unicode_escape_decode' : 'interp_codecs.unicode_escape_decode',
'unicode_escape_encode' : 'interp_codecs.unicode_escape_encode',
'raw_unicode_escape_decode' : 'interp_codecs.raw_unicode_escape_decode',
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py Tue Jun 29 13:20:24 2010
@@ -36,9 +36,6 @@
"""
-# XXX move some of these functions to RPython (like charmap_encode,
-# charmap_build) to make them faster
-
def escape_encode( obj, errors='strict'):
"""None
"""
@@ -46,13 +43,6 @@
v = s[1:-1]
return v, len(v)
-def charmap_encode(obj, errors='strict', mapping=None):
- """None
- """
- res = PyUnicode_EncodeCharmap(obj, mapping, errors)
- res = ''.join(res)
- return res, len(res)
-
def unicode_internal_encode( obj, errors='strict'):
"""None
"""
@@ -191,54 +181,6 @@
raise TypeError("encoding error handler must return (unicode, int) tuple, not %s" % repr(res))
-
-def charmapencode_output(c, mapping):
-
- rep = mapping[c]
- if isinstance(rep, int) or isinstance(rep, long):
- if rep < 256:
- return chr(rep)
- else:
- raise TypeError("character mapping must be in range(256)")
- elif isinstance(rep, str):
- return rep
- elif rep == None:
- raise KeyError("character maps to <undefined>")
- else:
- raise TypeError("character mapping must return integer, None or str")
-
-def PyUnicode_EncodeCharmap(p, mapping='latin-1', errors='strict'):
-
-## /* the following variable is used for caching string comparisons
-## * -1=not initialized, 0=unknown, 1=strict, 2=replace,
-## * 3=ignore, 4=xmlcharrefreplace */
-
-# /* Default to Latin-1 */
- if mapping == None:
- import _codecs
- return _codecs.latin_1_encode(p, errors)[0]
- size = len(p)
- if (size == 0):
- return ''
- inpos = 0
- res = []
- while (inpos<size):
- #/* try to encode it */
- try:
- x = charmapencode_output(ord(p[inpos]), mapping)
- res += x
- except KeyError:
- x = unicode_call_errorhandler(errors, "charmap",
- "character maps to <undefined>", p, inpos, inpos+1, False)
- try:
- res += [charmapencode_output(ord(y), mapping) for y in x[0]]
- except KeyError:
- raise UnicodeEncodeError("charmap", p, inpos, inpos+1,
- "character maps to <undefined>")
- inpos += 1
- return res
-
-
def charmap_build(somestring):
m = {}
num = 0
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py Tue Jun 29 13:20:24 2010
@@ -414,80 +414,128 @@
space.wrap(byteorder)])
utf_16_ex_decode.unwrap_spec = [ObjSpace, str, str, int, W_Root]
-def _extract_from_mapping(space, mapping_w, w_mapping, ch):
- if mapping_w is not None:
+# ____________________________________________________________
+# Charmap
+
+class Charmap_Decode:
+ def __init__(self, space, w_mapping):
+ self.space = space
+ self.w_mapping = w_mapping
+
+ # fast path for all the stuff in the encodings module
+ if space.is_true(space.isinstance(w_mapping, space.w_tuple)):
+ self.mapping_w = space.fixedview(w_mapping)
+ else:
+ self.mapping_w = None
+
+ def get(self, ch, errorchar):
+ space = self.space
+
+ # get the character from the mapping
+ if self.mapping_w is not None:
+ w_ch = self.mapping_w[ord(ch)]
+ else:
+ try:
+ w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
+ except OperationError, e:
+ if not e.match(space, space.w_LookupError):
+ raise
+ return errorchar
+
+ # Charmap may return a unicode string
try:
- return mapping_w[ord(ch)]
- except IndexError:
- pass
- else:
+ x = space.unicode_w(w_ch)
+ except OperationError, e:
+ if not e.match(space, space.w_TypeError):
+ raise
+ else:
+ return x
+
+ # Charmap may return a number
+ try:
+ x = space.int_w(w_ch)
+ except OperationError:
+ if not e.match(space, space.w_TypeError):
+ raise
+ else:
+ return unichr(x)
+
+ # Charmap may return None
+ if space.is_w(w_ch, space.w_None):
+ return errorchar
+
+ raise OperationError(space.w_TypeError, space.wrap("invalid mapping"))
+
+class Charmap_Encode:
+ def __init__(self, space, w_mapping):
+ self.space = space
+ self.w_mapping = w_mapping
+
+ def get(self, ch, errorchar):
+ space = self.space
+
+ # get the character from the mapping
try:
- return space.getitem(w_mapping, space.newint(ord(ch)))
+ w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
except OperationError, e:
- if (not e.match(space, space.w_KeyError) and
- not e.match(space, space.w_IndexError)):
+ if not e.match(space, space.w_LookupError):
raise
- pass
+ return errorchar
-def _append_unicode(space, builder, w_x):
- try:
- x = space.unicode_w(w_x)
- except OperationError, e:
- if not e.match(space, space.w_TypeError):
- raise
- else:
- if x != u"\ufffe":
- builder.append(x)
- return True
- return False
- try:
- x = space.int_w(w_x)
- except OperationError:
- if not e.match(space, space.w_TypeError):
- raise
- else:
- if x < 65536:
- builder.append(unichr(x))
+ # Charmap may return a string
+ try:
+ x = space.realstr_w(w_ch)
+ except OperationError, e:
+ if not e.match(space, space.w_TypeError):
+ raise
else:
- raise OperationError(space.w_TypeError, space.wrap("character mapping must be in range(65536)"))
- return True
- if not space.is_true(w_x):
- return False
- else:
- raise OperationError(space.w_TypeError, space.w_None)
+ return x
+
+ # Charmap may return a number
+ try:
+ x = space.int_w(w_ch)
+ except OperationError:
+ if not e.match(space, space.w_TypeError):
+ raise
+ else:
+ return chr(x)
+
+ # Charmap may return None
+ if space.is_w(w_ch, space.w_None):
+ return errorchar
+ raise OperationError(space.w_TypeError, space.wrap("invalid mapping"))
-def charmap_decode(space, s, errors="strict", w_mapping=None):
- size = len(s)
- # Default to Latin-1
- if space.is_true(space.is_(w_mapping, space.w_None)):
- return latin_1_decode(space, s, errors, space.w_False)
- if (size == 0):
+@unwrap_spec(ObjSpace, str, str, W_Root)
+def charmap_decode(space, string, errors="strict", w_mapping=None):
+ if len(string) == 0:
return space.newtuple([space.wrap(u''), space.wrap(0)])
-
- # fast path for all the stuff in the encodings module
- if space.is_true(space.isinstance(w_mapping, space.w_tuple)):
- mapping_w = space.fixedview(w_mapping)
+
+ if space.is_w(w_mapping, space.w_None):
+ mapping = None
else:
- mapping_w = None
+ mapping = Charmap_Decode(space, w_mapping)
- builder = UnicodeBuilder(size)
- inpos = 0
- while (inpos < len(s)):
- #/* Get mapping_w (char ordinal -> integer, Unicode char or None) */
- ch = s[inpos]
- w_x = _extract_from_mapping(space, mapping_w, w_mapping, ch)
- if w_x is not None and _append_unicode(space, builder, w_x):
- inpos += 1
- continue
- state = space.fromcache(CodecState)
- next, inpos = state.decode_error_handler(errors, "charmap",
- "character maps to <undefined>", s, inpos, inpos+1)
- builder.append(next)
- res = builder.build()
- return space.newtuple([space.wrap(res), space.wrap(size)])
-charmap_decode.unwrap_spec = [ObjSpace, str, str, W_Root]
+ final = True
+ state = space.fromcache(CodecState)
+ result, consumed = runicode.str_decode_charmap(
+ string, len(string), errors,
+ final, state.decode_error_handler, mapping)
+ return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
+@unwrap_spec(ObjSpace, unicode, str, W_Root)
+def charmap_encode(space, uni, errors="strict", w_mapping=None):
+ if space.is_w(w_mapping, space.w_None):
+ mapping = None
+ else:
+ mapping = Charmap_Encode(space, w_mapping)
+
+ state = space.fromcache(CodecState)
+ result = runicode.unicode_encode_charmap(
+ uni, len(uni), errors,
+ state.encode_error_handler, mapping)
+ return space.newtuple([space.wrap(result), space.wrap(len(uni))])
# ____________________________________________________________
# Unicode escape
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py Tue Jun 29 13:20:24 2010
@@ -1,6 +1,5 @@
import autopath
from pypy.conftest import gettestobjspace
-from pypy.module._codecs.app_codecs import charmap_encode
class AppTestCodecs:
@@ -377,6 +376,9 @@
def test_charmap_decode_1(self):
import codecs
+ assert codecs.charmap_encode(u'xxx') == ('xxx', 3)
+ assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 3)
+
res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
assert res == (u"ab\ufffd", 3)
res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
@@ -556,9 +558,3 @@
assert u'caf\xe9'.encode('mbcs') == 'caf\xe9'
assert u'\u040a'.encode('mbcs') == '?' # some cyrillic letter
assert 'cafx\e9'.decode('mbcs') == u'cafx\e9'
-
-
-class TestDirect:
- def test_charmap_encode(self):
- assert charmap_encode(u'xxx') == ('xxx', 3)
- assert charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 6)
Modified: pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/rlib/runicode.py (original)
+++ pypy/branch/interplevel-codecs/pypy/rlib/runicode.py Tue Jun 29 13:20:24 2010
@@ -1,7 +1,7 @@
import sys
from pypy.rlib.bitmanipulation import splitter
from pypy.rpython.lltypesystem import lltype, rffi
-from pypy.rlib.objectmodel import we_are_translated
+from pypy.rlib.objectmodel import we_are_translated, specialize
from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
if rffi.sizeof(lltype.UniChar) == 4:
@@ -714,6 +714,65 @@
return res
# ____________________________________________________________
+# Charmap
+
+ERROR_CHAR = u'\ufffe'
+
+@specialize.argtype(5)
+def str_decode_charmap(s, size, errors, final=False,
+ errorhandler=None, mapping=None):
+ "mapping can be a rpython dictionary, or a dict-like object."
+
+ # Default to Latin-1
+ if mapping is None:
+ return str_decode_latin_1(s, size, errors, final=final,
+ errorhandler=errorhandler)
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_decode
+ if size == 0:
+ return u'', 0
+
+ pos = 0
+ result = UnicodeBuilder(size)
+ while pos < size:
+ ch = s[pos]
+
+ c = mapping.get(ch, ERROR_CHAR)
+ if c == ERROR_CHAR:
+ r, pos = errorhandler(errors, "charmap",
+ "character maps to <undefined>",
+ s, pos, pos + 1)
+ result.append(r)
+ continue
+ result.append(c)
+ pos += 1
+ return result.build(), pos
+
+def unicode_encode_charmap(s, size, errors, errorhandler=None,
+ mapping=None):
+ if mapping is None:
+ return unicode_encode_latin_1(s, size, errors,
+ errorhandler=errorhandler)
+
+ if size == 0:
+ return ''
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+
+ c = mapping.get(ch, '')
+ if len(c) == 0:
+ r, pos = errorhandler(errors, "charmap",
+ "character maps to <undefined>",
+ s, pos, pos + 1)
+ result.append(r)
+ continue
+ result.append(c)
+ pos += 1
+ return result.build()
+
+# ____________________________________________________________
# Unicode escape
hexdigits = "0123456789ABCDEFabcdef"
More information about the Pypy-commit
mailing list