[pypy-svn] r75655 - in pypy/branch/interplevel-codecs/pypy: module/_codecs module/_codecs/test rlib

afa at codespeak.net afa at codespeak.net
Tue Jun 29 13:20:26 CEST 2010


Author: afa
Date: Tue Jun 29 13:20:24 2010
New Revision: 75655

Modified:
   pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
   pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
   pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
   pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
   pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
Log:
rewrite the "charmap" codec at interp-level


Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py	Tue Jun 29 13:20:24 2010
@@ -6,7 +6,6 @@
     appleveldefs = {
          '__doc__' :  'app_codecs.__doc__',
          '__name__' :  'app_codecs.__name__',
-         'charmap_encode' :  'app_codecs.charmap_encode',
          'escape_decode' :  'app_codecs.escape_decode',
          'escape_encode' :  'app_codecs.escape_encode',
          'unicode_internal_decode' :  'app_codecs.unicode_internal_decode',
@@ -40,6 +39,7 @@
          'charbuffer_encode': 'interp_codecs.buffer_encode',
          'readbuffer_encode': 'interp_codecs.buffer_encode',
          'charmap_decode'   : 'interp_codecs.charmap_decode',
+         'charmap_encode'   : 'interp_codecs.charmap_encode',
          'unicode_escape_decode'     :  'interp_codecs.unicode_escape_decode',
          'unicode_escape_encode'     :  'interp_codecs.unicode_escape_encode',
          'raw_unicode_escape_decode' :  'interp_codecs.raw_unicode_escape_decode',

Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py	Tue Jun 29 13:20:24 2010
@@ -36,9 +36,6 @@
 
 """
 
-# XXX move some of these functions to RPython (like charmap_encode,
-# charmap_build) to make them faster
-
 def escape_encode( obj, errors='strict'):
     """None
     """
@@ -46,13 +43,6 @@
     v = s[1:-1]
     return v, len(v)
 
-def charmap_encode(obj, errors='strict', mapping=None):
-    """None
-    """
-    res = PyUnicode_EncodeCharmap(obj, mapping, errors)
-    res = ''.join(res)
-    return res, len(res)
-
 def unicode_internal_encode( obj, errors='strict'):
     """None
     """
@@ -191,54 +181,6 @@
         raise TypeError("encoding error handler must return (unicode, int) tuple, not %s" % repr(res))
 
 
-
-def charmapencode_output(c, mapping):
-
-    rep = mapping[c]
-    if isinstance(rep, int) or isinstance(rep, long):
-        if rep < 256:
-            return chr(rep)
-        else:
-            raise TypeError("character mapping must be in range(256)")
-    elif isinstance(rep, str):
-        return rep
-    elif rep == None:
-        raise KeyError("character maps to <undefined>")
-    else:
-        raise TypeError("character mapping must return integer, None or str")
-
-def PyUnicode_EncodeCharmap(p, mapping='latin-1', errors='strict'):
-
-##    /* the following variable is used for caching string comparisons
-##     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
-##     * 3=ignore, 4=xmlcharrefreplace */
-
-#    /* Default to Latin-1 */
-    if mapping == None:
-        import _codecs
-        return _codecs.latin_1_encode(p, errors)[0]
-    size = len(p)
-    if (size == 0):
-        return ''
-    inpos = 0
-    res = []
-    while (inpos<size):
-        #/* try to encode it */
-        try:
-            x = charmapencode_output(ord(p[inpos]), mapping)
-            res += x
-        except KeyError:
-            x = unicode_call_errorhandler(errors, "charmap",
-            "character maps to <undefined>", p, inpos, inpos+1, False)
-            try:
-                res += [charmapencode_output(ord(y), mapping) for y in x[0]]
-            except KeyError:
-                raise UnicodeEncodeError("charmap", p, inpos, inpos+1,
-                                        "character maps to <undefined>")
-        inpos += 1
-    return res
-
-
 def charmap_build(somestring):
     m = {}
     num = 0

Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py	Tue Jun 29 13:20:24 2010
@@ -414,80 +414,128 @@
                            space.wrap(byteorder)])
 utf_16_ex_decode.unwrap_spec = [ObjSpace, str, str, int, W_Root]
 
-def _extract_from_mapping(space, mapping_w, w_mapping, ch):
-    if mapping_w is not None:
+# ____________________________________________________________
+# Charmap
+
+class Charmap_Decode:
+    def __init__(self, space, w_mapping):
+        self.space = space
+        self.w_mapping = w_mapping
+
+        # fast path for all the stuff in the encodings module
+        if space.is_true(space.isinstance(w_mapping, space.w_tuple)):
+            self.mapping_w = space.fixedview(w_mapping)
+        else:
+            self.mapping_w = None
+
+    def get(self, ch, errorchar):
+        space = self.space
+
+        # get the character from the mapping
+        if self.mapping_w is not None:
+            w_ch = self.mapping_w[ord(ch)]
+        else:
+            try:
+                w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
+            except OperationError, e:
+                if not e.match(space, space.w_LookupError):
+                    raise
+                return errorchar
+
+        # Charmap may return a unicode string
         try:
-            return mapping_w[ord(ch)]
-        except IndexError:
-            pass
-    else:
+            x = space.unicode_w(w_ch)
+        except OperationError, e:
+            if not e.match(space, space.w_TypeError):
+                raise
+        else:
+            return x
+
+        # Charmap may return a number
+        try:
+            x = space.int_w(w_ch)
+        except OperationError:
+            if not e.match(space, space.w_TypeError):
+                raise
+        else:
+            return unichr(x)
+
+        # Charmap may return None
+        if space.is_w(w_ch, space.w_None):
+            return errorchar
+
+        raise OperationError(space.w_TypeError, space.wrap("invalid mapping"))
+
+class Charmap_Encode:
+    def __init__(self, space, w_mapping):
+        self.space = space
+        self.w_mapping = w_mapping
+
+    def get(self, ch, errorchar):
+        space = self.space
+
+        # get the character from the mapping
         try:
-            return space.getitem(w_mapping, space.newint(ord(ch)))
+            w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
         except OperationError, e:
-            if (not e.match(space, space.w_KeyError) and
-                not e.match(space, space.w_IndexError)):
+            if not e.match(space, space.w_LookupError):
                 raise
-            pass
+            return errorchar
 
-def _append_unicode(space, builder, w_x):
-    try:
-        x = space.unicode_w(w_x)
-    except OperationError, e:
-        if not e.match(space, space.w_TypeError):
-            raise
-    else:
-        if x != u"\ufffe":
-            builder.append(x)
-            return True
-        return False
-    try:
-        x = space.int_w(w_x)
-    except OperationError:
-        if not e.match(space, space.w_TypeError):
-            raise
-    else:
-        if x < 65536:
-            builder.append(unichr(x))
+        # Charmap may return a string
+        try:
+            x = space.realstr_w(w_ch)
+        except OperationError, e:
+            if not e.match(space, space.w_TypeError):
+                raise
         else:
-            raise OperationError(space.w_TypeError, space.wrap("character mapping must be in range(65536)"))
-        return True
-    if not space.is_true(w_x):
-        return False
-    else:
-        raise OperationError(space.w_TypeError, space.w_None)
+            return x
+
+        # Charmap may return a number
+        try:
+            x = space.int_w(w_ch)
+        except OperationError:
+            if not e.match(space, space.w_TypeError):
+                raise
+        else:
+            return chr(x)
+
+        # Charmap may return None
+        if space.is_w(w_ch, space.w_None):
+            return errorchar
 
+        raise OperationError(space.w_TypeError, space.wrap("invalid mapping"))
 
-def charmap_decode(space, s, errors="strict", w_mapping=None):
-    size = len(s)
-    # Default to Latin-1
-    if space.is_true(space.is_(w_mapping, space.w_None)):
-        return latin_1_decode(space, s, errors, space.w_False)
 
-    if (size == 0):
+@unwrap_spec(ObjSpace, str, str, W_Root)
+def charmap_decode(space, string, errors="strict", w_mapping=None):
+    if len(string) == 0:
         return space.newtuple([space.wrap(u''), space.wrap(0)])
-    
-    # fast path for all the stuff in the encodings module
-    if space.is_true(space.isinstance(w_mapping, space.w_tuple)):
-        mapping_w = space.fixedview(w_mapping)
+
+    if space.is_w(w_mapping, space.w_None):
+        mapping = None
     else:
-        mapping_w = None
+        mapping = Charmap_Decode(space, w_mapping)
 
-    builder = UnicodeBuilder(size)
-    inpos = 0
-    while (inpos < len(s)):
-        #/* Get mapping_w (char ordinal -> integer, Unicode char or None) */
-        ch = s[inpos]
-        w_x = _extract_from_mapping(space, mapping_w, w_mapping, ch)
-        if w_x is not None and _append_unicode(space, builder, w_x):
-            inpos += 1
-            continue
-        state = space.fromcache(CodecState)
-        next, inpos = state.decode_error_handler(errors, "charmap",
-                   "character maps to <undefined>", s, inpos, inpos+1)
-        builder.append(next)
-    res = builder.build()
-    return space.newtuple([space.wrap(res), space.wrap(size)])
-charmap_decode.unwrap_spec = [ObjSpace, str, str, W_Root]
+    final = True
+    state = space.fromcache(CodecState)
+    result, consumed = runicode.str_decode_charmap(
+        string, len(string), errors,
+        final, state.decode_error_handler, mapping)
+    return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
+@unwrap_spec(ObjSpace, unicode, str, W_Root)
+def charmap_encode(space, uni, errors="strict", w_mapping=None):
+    if space.is_w(w_mapping, space.w_None):
+        mapping = None
+    else:
+        mapping = Charmap_Encode(space, w_mapping)
+
+    state = space.fromcache(CodecState)
+    result = runicode.unicode_encode_charmap(
+        uni, len(uni), errors,
+        state.encode_error_handler, mapping)
+    return space.newtuple([space.wrap(result), space.wrap(len(uni))])
 
 # ____________________________________________________________
 # Unicode escape

Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py	Tue Jun 29 13:20:24 2010
@@ -1,6 +1,5 @@
 import autopath
 from pypy.conftest import gettestobjspace
-from pypy.module._codecs.app_codecs import charmap_encode
 
 
 class AppTestCodecs:
@@ -377,6 +376,9 @@
 
     def test_charmap_decode_1(self):
         import codecs
+        assert codecs.charmap_encode(u'xxx') == ('xxx', 3)
+        assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 3)
+
         res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
         assert res == (u"ab\ufffd", 3)
         res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
@@ -556,9 +558,3 @@
         assert u'caf\xe9'.encode('mbcs') == 'caf\xe9'
         assert u'\u040a'.encode('mbcs') == '?' # some cyrillic letter
         assert 'cafx\e9'.decode('mbcs') == u'cafx\e9'
-
-
-class TestDirect:
-    def test_charmap_encode(self):
-        assert charmap_encode(u'xxx') == ('xxx', 3)
-        assert charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) ==  ('XXXXXX', 6)

Modified: pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/rlib/runicode.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/rlib/runicode.py	Tue Jun 29 13:20:24 2010
@@ -1,7 +1,7 @@
 import sys
 from pypy.rlib.bitmanipulation import splitter
 from pypy.rpython.lltypesystem import lltype, rffi
-from pypy.rlib.objectmodel import we_are_translated
+from pypy.rlib.objectmodel import we_are_translated, specialize
 from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
 
 if rffi.sizeof(lltype.UniChar) == 4:
@@ -714,6 +714,65 @@
     return res
 
 # ____________________________________________________________
+# Charmap
+
+ERROR_CHAR = u'\ufffe'
+
+@specialize.argtype(5)
+def str_decode_charmap(s, size, errors, final=False,
+                       errorhandler=None, mapping=None):
+    "mapping can be a rpython dictionary, or a dict-like object."
+
+    # Default to Latin-1
+    if mapping is None:
+        return str_decode_latin_1(s, size, errors, final=final,
+                                  errorhandler=errorhandler)
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_decode
+    if size == 0:
+        return u'', 0
+
+    pos = 0
+    result = UnicodeBuilder(size)
+    while pos < size:
+        ch = s[pos]
+
+        c = mapping.get(ch, ERROR_CHAR)
+        if c == ERROR_CHAR:
+            r, pos = errorhandler(errors, "charmap",
+                                  "character maps to <undefined>",
+                                  s,  pos, pos + 1)
+            result.append(r)
+            continue
+        result.append(c)
+        pos += 1
+    return result.build(), pos
+
+def unicode_encode_charmap(s, size, errors, errorhandler=None,
+                           mapping=None):
+    if mapping is None:
+        return unicode_encode_latin_1(s, size, errors,
+                                      errorhandler=errorhandler)
+
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+
+        c = mapping.get(ch, '')
+        if len(c) == 0:
+            r, pos = errorhandler(errors, "charmap",
+                                  "character maps to <undefined>",
+                                  s, pos, pos + 1)
+            result.append(r)
+            continue
+        result.append(c)
+        pos += 1
+    return result.build()
+
+# ____________________________________________________________
 # Unicode escape
 
 hexdigits = "0123456789ABCDEFabcdef"



More information about the Pypy-commit mailing list