[pypy-svn] r71038 - in pypy/trunk/pypy/module/_codecs: . test

cfbolz at codespeak.net cfbolz at codespeak.net
Mon Feb 1 18:10:10 CET 2010


Author: cfbolz
Date: Mon Feb  1 18:10:08 2010
New Revision: 71038

Modified:
   pypy/trunk/pypy/module/_codecs/__init__.py
   pypy/trunk/pypy/module/_codecs/app_codecs.py
   pypy/trunk/pypy/module/_codecs/interp_codecs.py
   pypy/trunk/pypy/module/_codecs/test/test_codecs.py
Log:
Move charmap_decode from app-level to interp-level (chosen first because
html5lib uses it a lot). More of these codecs should be rewritten the same way.


Modified: pypy/trunk/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/__init__.py	(original)
+++ pypy/trunk/pypy/module/_codecs/__init__.py	Mon Feb  1 18:10:08 2010
@@ -5,7 +5,6 @@
     appleveldefs = {
          '__doc__' :  'app_codecs.__doc__',
          '__name__' :  'app_codecs.__name__',
-         'charmap_decode' :  'app_codecs.charmap_decode',
          'charmap_encode' :  'app_codecs.charmap_encode',
          'escape_decode' :  'app_codecs.escape_decode',
          'escape_encode' :  'app_codecs.escape_encode',
@@ -44,6 +43,7 @@
          'utf_16_ex_decode' : 'interp_codecs.utf_16_ex_decode',
          'charbuffer_encode': 'interp_codecs.buffer_encode',
          'readbuffer_encode': 'interp_codecs.buffer_encode',
+         'charmap_decode'   : 'interp_codecs.charmap_decode',
     }
 
     def __init__(self, space, *args):

Modified: pypy/trunk/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/app_codecs.py	(original)
+++ pypy/trunk/pypy/module/_codecs/app_codecs.py	Mon Feb  1 18:10:08 2010
@@ -39,7 +39,9 @@
 Copyright (c) Corporation for National Research Initiatives.
 
 """
-#from unicodecodec import *
+
+# XXX move some of these functions to RPython (like charmap_encode,
+# charmap_build) to make them faster
 
 import sys
 
@@ -201,13 +203,6 @@
     res = ''.join(res)    
     return res, len(data)
 
-def charmap_decode( data, errors='strict', mapping=None):
-    """None
-    """
-    res = PyUnicode_DecodeCharmap(data, mapping, errors)
-    res = u''.join(res)
-    return res, len(data)
-
 
 def utf_7_encode( obj, errors='strict'):
     """None
@@ -841,44 +836,6 @@
         inpos += 1
     return res
 
-def PyUnicode_DecodeCharmap(s, mapping, errors):
-
-    size = len(s)
-##    /* Default to Latin-1 */
-    if mapping is None:
-        import _codecs
-        return _codecs.latin_1_decode(s, errors)[0]
-
-    if (size == 0):
-        return u''
-    p = []
-    inpos = 0
-    while (inpos< len(s)):
-        
-        #/* Get mapping (char ordinal -> integer, Unicode char or None) */
-        ch = s[inpos]
-        try:
-            x = mapping[ord(ch)]
-            if isinstance(x, int):
-                if x < 65536:
-                    p += unichr(x)
-                else:
-                    raise TypeError("character mapping must be in range(65536)")
-            elif isinstance(x, unicode):
-                if x == u"\ufffe":
-                    raise KeyError
-                p += x
-            elif not x:
-                raise KeyError
-            else:
-                raise TypeError
-            inpos += 1
-        except (KeyError, IndexError):
-            next, inpos = unicode_call_errorhandler(errors, "charmap",
-                       "character maps to <undefined>", s, inpos, inpos+1)
-            p += next
-            inpos
-    return p
 
 def PyUnicode_DecodeRawUnicodeEscape(s, size, errors):
 

Modified: pypy/trunk/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/interp_codecs.py	(original)
+++ pypy/trunk/pypy/module/_codecs/interp_codecs.py	Mon Feb  1 18:10:08 2010
@@ -1,6 +1,7 @@
 from pypy.interpreter.error import OperationError, operationerrfmt
 from pypy.interpreter.gateway import ObjSpace, NoneNotWrapped
 from pypy.interpreter.baseobjspace import W_Root
+from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
 
 class CodecState(object):
     def __init__(self, space):
@@ -272,3 +273,77 @@
                            space.wrap(byteorder)])
 utf_16_ex_decode.unwrap_spec = [ObjSpace, str, str, int, W_Root]
 
+def _extract_from_mapping(space, mapping_w, w_mapping, ch):
+    # Return the mapping entry for ord(ch), or None when there is none.
+    if mapping_w is None:
+        try:
+            return space.getitem(w_mapping, space.newint(ord(ch)))
+        except OperationError, e:
+            if not (e.match(space, space.w_KeyError) or
+                    e.match(space, space.w_IndexError)):
+                raise
+            return None
+    try:
+        return mapping_w[ord(ch)]
+    except IndexError:
+        return None
+
+def _append_unicode(space, builder, w_x):
+    # Append mapping entry w_x to builder.  True: appended; False: entry
+    # is undefined (u"\ufffe" or a false value).  Other entries: TypeError.
+    try:
+        x = space.unicode_w(w_x)
+    except OperationError, e:
+        if not e.match(space, space.w_TypeError):
+            raise
+    else:
+        if x == u"\ufffe":
+            return False
+        builder.append(x)
+        return True
+    try:
+        x = space.int_w(w_x)
+    except OperationError, e:    # bind e here, else the stale one is tested
+        if not e.match(space, space.w_TypeError):
+            raise
+    else:
+        if not 0 <= x < 65536:
+            raise OperationError(space.w_TypeError, space.wrap("character mapping must be in range(65536)"))
+        builder.append(unichr(x))
+        return True
+    if space.is_true(w_x):
+        raise OperationError(space.w_TypeError, space.w_None)
+    return False
+
+
+def charmap_decode(space, s, errors="strict", w_mapping=None):
+    # Decode byte string s through w_mapping (byte ordinal -> unicode, int
+    # or None; a None mapping means Latin-1).  Returns (unicode, len(s)).
+    size = len(s)
+    if space.is_true(space.is_(w_mapping, space.w_None)):
+        return latin_1_decode(space, s, errors, space.w_False)
+
+    if size == 0:
+        # every return path must yield a (result, consumed) tuple
+        return space.newtuple([space.wrap(u''), space.wrap(0)])
+
+    # fast path for all the stuff in the encodings module
+    if space.is_true(space.isinstance(w_mapping, space.w_tuple)):
+        mapping_w = space.fixedview(w_mapping)
+    else:
+        mapping_w = None
+
+    builder = UnicodeBuilder(size)
+    inpos = 0
+    while inpos < size:
+        w_x = _extract_from_mapping(space, mapping_w, w_mapping, s[inpos])
+        if w_x is not None and _append_unicode(space, builder, w_x):
+            inpos += 1
+            continue
+        # undefined entry: append the handler's result, resume where it says
+        state = space.fromcache(CodecState)
+        next, inpos = state.decode_error_handler(errors, "charmap",
+                   "character maps to <undefined>", s, inpos, inpos+1)
+        builder.append(next)
+    return space.newtuple([space.wrap(builder.build()), space.wrap(size)])
+charmap_decode.unwrap_spec = [ObjSpace, str, str, W_Root]

Modified: pypy/trunk/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/test/test_codecs.py	(original)
+++ pypy/trunk/pypy/module/_codecs/test/test_codecs.py	Mon Feb  1 18:10:08 2010
@@ -1,7 +1,7 @@
 import autopath
 from pypy.conftest import gettestobjspace
 from pypy.module._codecs.app_codecs import unicode_escape_encode,\
-     charmap_encode, charmap_decode, unicode_escape_decode
+     charmap_encode, unicode_escape_decode
 
 
 class AppTestCodecs:
@@ -116,6 +116,14 @@
              
         raises (ValueError, test.decode,'string-escape')
 
+    def test_charmap_decode(self):
+        from _codecs import charmap_decode   # default, dict and tuple maps
+        assert charmap_decode('xxx') == ('xxx', 3)
+        assert charmap_decode('xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3)
+        table = tuple(map(unichr, range(256)))
+        assert charmap_decode('xxx\xff', 'strict', table) == (u'xxx\xff', 4)
+
+
 class AppTestPartialEvaluation:
 
     def test_partial_utf8(self):
@@ -551,10 +559,6 @@
         assert charmap_encode(u'xxx') == ('xxx', 3)
         assert charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) ==  ('XXXXXX', 6)
 
-    def test_charmap_decode(self):
-        assert charmap_decode('xxx') == ('xxx', 3)
-        assert charmap_decode('xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3)
-
     def test_unicode_escape(self):
         assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
         assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)



More information about the Pypy-commit mailing list