[pypy-svn] r61947 - in pypy/trunk/pypy/module: _codecs _codecs/test unicodedata

Mon Feb 16 13:47:29 CET 2009

Author: afa
Date: Mon Feb 16 13:47:28 2009
New Revision: 61947

Modified:
   pypy/trunk/pypy/module/_codecs/app_codecs.py
   pypy/trunk/pypy/module/_codecs/test/test_codecs.py
   pypy/trunk/pypy/module/unicodedata/__init__.py
   pypy/trunk/pypy/module/unicodedata/interp_ucd.py
Log:
Allow u'\N{NAME}' to return characters beyond the BMP on narrow unicode builds.
Adds a test and the fix; this should also fix test_unicodedata on Windows.

Python 2 is inconsistent here: on narrow builds it allows
u'\N{CJK UNIFIED IDEOGRAPH-20000}' (producing a surrogate pair) but not
unichr(0x20000). Things will be better with Python 3.


Modified: pypy/trunk/pypy/module/_codecs/app_codecs.py
==============================================================================

--- pypy/trunk/pypy/module/_codecs/app_codecs.py	(original)
+++ pypy/trunk/pypy/module/_codecs/app_codecs.py	Mon Feb 16 13:47:28 2009
@@ -749,13 +749,19 @@
                         message = "unknown Unicode character name"
                         st = s[pos+1:look]
                         try:
-                            chr = unicodedata.lookup("%s" % st)
+                            ch = unicodedata._get_code("%s" % st)
                         except KeyError, e:
                             x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1)
+                            p += x[0]
+                            pos = x[1]
                         else:
-                            x = chr, look + 1 
-                        p += x[0]
-                        pos = x[1]
+                            pos = look + 1
+                            if ch <= sys.maxunicode:
+                                p += unichr(ch)
+                            else:
+                                ch -= 0x10000L
+                                p += unichr(0xD800 + (ch >> 10))
+                                p += unichr(0xDC00 +  (ch & 0x03FF))
                     else:        
                         x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1)
                 else:        

Modified: pypy/trunk/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/test/test_codecs.py	(original)
+++ pypy/trunk/pypy/module/_codecs/test/test_codecs.py	Mon Feb 16 13:47:28 2009
@@ -44,6 +44,7 @@
         assert  unicode('\\N{SPACE}\\N{SPACE}','unicode-escape') == u"  " 
         assert  unicode('\\N{SPACE}a\\N{SPACE}','unicode-escape') == u" a " 
         assert "\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx"
+        assert 1 <= len(u"\N{CJK UNIFIED IDEOGRAPH-20000}") <= 2
 
     def test_literals(self):
         raises(UnicodeError, eval, 'u\'\\Uffffffff\'')

Modified: pypy/trunk/pypy/module/unicodedata/__init__.py
==============================================================================
--- pypy/trunk/pypy/module/unicodedata/__init__.py	(original)
+++ pypy/trunk/pypy/module/unicodedata/__init__.py	Mon Feb 16 13:47:28 2009
@@ -13,6 +13,6 @@
     }
     for name in '''lookup name decimal digit numeric category bidirectional
                    east_asian_width combining mirrored decomposition
-                   normalize'''.split():
+                   normalize _get_code'''.split():
         interpleveldefs[name] = '''space.getattr(space.wrap(interp_ucd.ucd),
                                    space.wrap("%s"))''' % name

Modified: pypy/trunk/pypy/module/unicodedata/interp_ucd.py
==============================================================================
--- pypy/trunk/pypy/module/unicodedata/interp_ucd.py	(original)
+++ pypy/trunk/pypy/module/unicodedata/interp_ucd.py	Mon Feb 16 13:47:28 2009
@@ -48,17 +48,20 @@
         
         self.version = unicodedb.version
         
-    def lookup(self, space, name):
+    def _get_code(self, space, name):
         try:
             code = self._lookup(name.upper())
         except KeyError:
             msg = space.mod(space.wrap("undefined character name '%s'"), space.wrap(name))
             raise OperationError(space.w_KeyError, msg)
+        return space.wrap(code)
+    _get_code.unwrap_spec = ['self', ObjSpace, str]
+    
+    def lookup(self, space, name):
         return space.call_function(space.builtin.get('unichr'),
-                                   space.wrap(code))
-
+                                   self._get_code(space, name))
     lookup.unwrap_spec = ['self', ObjSpace, str]
-    
+
     def name(self, space, w_unichr, w_default=NoneNotWrapped):
         code = unichr_to_code_w(space, w_unichr)
         try: