[pypy-commit] pypy default: support the 6.0.0 db: add its cjk_interval and handle east asian width ranges

Wed Mar 20 01:49:40 CET 2013

Author: Philip Jenvey <pjenvey at underboss.org>
Branch: 
Changeset: r62525:5606e1cd236f
Date: 2013-03-19 16:30 -0700
http://bitbucket.org/pypy/pypy/changeset/5606e1cd236f/

Log:	Support the Unicode 6.0.0 database: add its cjk_interval, and handle
	East Asian Width ranges that overlap the general UnicodeData-x.x.x.txt
	ranges.

diff --git a/rpython/rlib/unicodedata/generate_unicodedb.py b/rpython/rlib/unicodedata/generate_unicodedb.py
--- a/rpython/rlib/unicodedata/generate_unicodedb.py
+++ b/rpython/rlib/unicodedata/generate_unicodedb.py
@@ -152,6 +152,12 @@
         for char in range(first, last+1):
             table[char].linebreak = True
 
+    # Expand ranges
+    for (first, last), char in ranges.iteritems():
+        for code in range(first, last + 1):
+            assert table[code] is None, 'Multiply defined character %04X' % code
+            table[code] = char
+
     # Read east asian width
     for line in east_asian_width_file:
         line = line.split('#', 1)[0].strip()
@@ -160,22 +166,15 @@
         code, width = line.split(';')
         if '..' in code:
             first, last = map(lambda x:int(x,16), code.split('..'))
-            try:
-                ranges[(first, last)].east_asian_width = width
-            except KeyError:
-                ch = Unicodechar(['0000', None, 'Cn'] + [''] * 12)
-                ch.east_asian_width = width
-                ranges[(first, last)] = ch
+            for code in range(first, last + 1):
+                uc = table[code]
+                if uc is None:
+                    uc = table[code] = Unicodechar(['0000', None,
+                                                    'Cn'] + [''] * 12)
+                uc.east_asian_width = width
         else:
             table[int(code, 16)].east_asian_width = width
 
-    # Expand ranges
-    for (first, last), char in ranges.iteritems():
-        for code in range(first, last + 1):
-            assert table[code] is None, 'Multiply defined character %04X' % code
-
-            table[code] = char
-
     # Read Derived Core Properties:
     for line in derived_core_properties_file:
         line = line.split('#', 1)[0].strip()
@@ -446,11 +445,17 @@
         cjk_interval = ("(0x3400 <= code <= 0x4DB5 or"
                         " 0x4E00 <= code <= 0x9FBB or"
                         " 0x20000 <= code <= 0x2A6D6)")
+    elif version < "6":
+        cjk_interval = ("(0x3400 <= code <= 0x4DB5 or"
+                        " 0x4E00 <= code <= 0x9FCB or"
+                        " 0x20000 <= code <= 0x2A6D6 or"
+                        " 0x2A700 <= code <= 0x2B734)")
     else:
         cjk_interval = ("(0x3400 <= code <= 0x4DB5 or"
                         " 0x4E00 <= code <= 0x9FCB or"
                         " 0x20000 <= code <= 0x2A6D6 or"
-                        " 0x2A700 <= code <= 0x2B734)")
+                        " 0x2A700 <= code <= 0x2B734 or"
+                        " 0x2B740 <= code <= 0x2B81D)")
 
     write_character_names(outfile, table, base_mod)