[pypy-commit] pypy py3.3: Refactor generate_unicodedb a bit.
amauryfa
noreply at buildbot.pypy.org
Mon Mar 16 01:16:14 CET 2015
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: py3.3
Changeset: r76389:8144addb7b8d
Date: 2015-03-13 14:48 +0100
http://bitbucket.org/pypy/pypy/changeset/8144addb7b8d/
Log: Refactor generate_unicodedb a bit. More objects.
diff --git a/rpython/rlib/unicodedata/generate_unicodedb.py b/rpython/rlib/unicodedata/generate_unicodedb.py
--- a/rpython/rlib/unicodedata/generate_unicodedb.py
+++ b/rpython/rlib/unicodedata/generate_unicodedb.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python
import sys, os
+import itertools
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
MAXUNICODE = 0x10FFFF # the value of sys.maxunicode of wide Python builds
@@ -18,7 +20,7 @@
def __str__(self):
return repr(self)
-class Unicodechar:
+class UnicodeChar:
def __init__(self, data=None):
if data is None:
return
@@ -74,37 +76,74 @@
self.title = int(data[14], 16)
def copy(self):
- uc = Unicodechar()
+ uc = UnicodeChar()
uc.__dict__.update(self.__dict__)
return uc
-def get_compat_decomposition(table, code):
- if not table[code].decomposition:
- return [code]
- if not table[code].compat_decomp:
- result = []
- for decomp in table[code].decomposition:
- result.extend(get_compat_decomposition(table, decomp))
- table[code].compat_decomp = result
- return table[code].compat_decomp
+class UnicodeData(object):
+ def __init__(self):
+ self.table = [None] * (MAXUNICODE + 1)
-def get_canonical_decomposition(table, code):
- if not table[code].decomposition or table[code].isCompatibility:
- return [code]
- if not table[code].canonical_decomp:
- result = []
- for decomp in table[code].decomposition:
- result.extend(get_canonical_decomposition(table, decomp))
- table[code].canonical_decomp = result
- return table[code].canonical_decomp
+ def add_char(self, code, char):
+ assert self.table[code] is None, (
+ 'Multiply defined character %04X' % code)
+ if isinstance(char, list):
+ char = UnicodeChar(char)
+ self.table[code] = char
+ return char
-def read_unicodedata(unicodedata_file, exclusions_file, east_asian_width_file,
- unihan_file=None, linebreak_file=None,
- derived_core_properties_file=None):
+ def all_codes(self):
+ return range(len(self.table))
+
+ def enum_chars(self):
+ for code in range(len(self.table)):
+ yield code, self.table[code]
+
+ def get_char(self, code):
+ return self.table[code]
+
+ def clone_char(self, code):
+ clone = self.table[code] = self.table[code].copy()
+ return clone
+
+ def set_excluded(self, code):
+ self.table[code].excluded = True
+
+ def set_linebreak(self, code):
+ self.table[code].linebreak = True
+
+ def set_east_asian_width(self, code, width):
+ self.table[code].east_asian_width = width
+
+ def add_property(self, code, p):
+ self.table[code].properties += (p,)
+
+ def get_compat_decomposition(self, code):
+ if not self.table[code].decomposition:
+ return [code]
+ if not self.table[code].compat_decomp:
+ result = []
+ for decomp in self.table[code].decomposition:
+ result.extend(self.get_compat_decomposition(decomp))
+ self.table[code].compat_decomp = result
+ return self.table[code].compat_decomp
+
+ def get_canonical_decomposition(self, code):
+ if (not self.table[code].decomposition or
+ self.table[code].isCompatibility):
+ return [code]
+ if not self.table[code].canonical_decomp:
+ result = []
+ for decomp in self.table[code].decomposition:
+ result.extend(self.get_canonical_decomposition(decomp))
+ self.table[code].canonical_decomp = result
+ return self.table[code].canonical_decomp
+
+def read_unicodedata(files):
rangeFirst = {}
rangeLast = {}
- table = [None] * (MAXUNICODE + 1)
- for line in unicodedata_file:
+ table = UnicodeData()
+ for line in files['data']:
line = line.split('#', 1)[0].strip()
if not line:
continue
@@ -119,26 +158,23 @@
rangeLast[name] = code
continue
code = int(data[0], 16)
- u = Unicodechar(data)
- assert table[code] is None, 'Multiply defined character %04X' % code
- table[code] = u
+ table.add_char(code, data)
# Collect ranges
ranges = {}
for name, (start, data) in rangeFirst.iteritems():
end = rangeLast[name]
- unichar = Unicodechar(['0000', None] + data[2:])
- ranges[(start, end)] = unichar
+ ranges[(start, end)] = ['0000', None] + data[2:]
# Read exclusions
- for line in exclusions_file:
+ for line in files['exclusions']:
line = line.split('#', 1)[0].strip()
if not line:
continue
- table[int(line, 16)].excluded = True
+ table.set_excluded(int(line, 16))
# Read line breaks
- for line in linebreak_file:
+ for line in files['linebreak']:
line = line.split('#', 1)[0].strip()
if not line:
continue
@@ -150,16 +186,15 @@
else:
first, last = [int(c, 16) for c in data[0].split('..')]
for char in range(first, last+1):
- table[char].linebreak = True
+ table.set_linebreak(char)
# Expand ranges
- for (first, last), char in ranges.iteritems():
+ for (first, last), data in ranges.iteritems():
for code in range(first, last + 1):
- assert table[code] is None, 'Multiply defined character %04X' % code
- table[code] = char
+ table.add_char(code, data)
# Read east asian width
- for line in east_asian_width_file:
+ for line in files['east_asian_width']:
line = line.split('#', 1)[0].strip()
if not line:
continue
@@ -167,16 +202,16 @@
if '..' in code:
first, last = map(lambda x:int(x,16), code.split('..'))
for code in range(first, last + 1):
- uc = table[code]
- if uc is None:
- uc = table[code] = Unicodechar(['0000', None,
- 'Cn'] + [''] * 12)
+ uc = table.get_char(code)
+ if not uc:
+ uc = table.add_char(code, ['0000', None,
+ 'Cn'] + [''] * 12)
uc.east_asian_width = width
else:
- table[int(code, 16)].east_asian_width = width
+ table.set_east_asian_width(int(code, 16), width)
# Read Derived Core Properties:
- for line in derived_core_properties_file:
+ for line in files['derived_core_properties']:
line = line.split('#', 1)[0].strip()
if not line:
continue
@@ -190,27 +225,25 @@
else:
chars = [int(r, 16)]
for char in chars:
- if not table[char]:
+ if not table.get_char(char):
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
continue
- table[char].properties += (p,)
+ table.add_property(char, p)
- defaultChar = Unicodechar(['0000', None, 'Cn'] + [''] * 12)
- for code in range(len(table)):
- if table[code] is None:
- table[code] = defaultChar
+ defaultChar = UnicodeChar(['0000', None, 'Cn'] + [''] * 12)
+ for code, char in table.enum_chars():
+ if not char:
+ table.add_char(code, defaultChar)
- extra_numeric = read_unihan(unihan_file)
+ extra_numeric = read_unihan(files['unihan'])
for code, value in extra_numeric.iteritems():
- uc = table[code].copy()
- uc.numeric = value
- table[code] = uc
+ table.clone_char(code).numeric = value
# Compute full decompositions.
- for code in range(len(table)):
- get_canonical_decomposition(table, code)
- get_compat_decomposition(table, code)
+ for code, char in table.enum_chars():
+ table.get_canonical_decomposition(code)
+ table.get_compat_decomposition(code)
return table
@@ -288,8 +321,8 @@
# Create the records
db_records = {}
- for code in range(len(table)):
- char = table[code]
+ for code in table.all_codes():
+ char = table.get_char(code)
flags = 0
if char.category == "Zs" or char.bidirectional in ("WS", "B", "S"):
flags |= IS_SPACE
@@ -328,9 +361,11 @@
print >> outfile, '_db_pgtbl = ('
pages = []
line = []
- for i in range(0, len(table), pgsize):
+ groups = [iter(table.enum_chars())] * pgsize
+ for group in itertools.izip_longest(*groups):
result = []
- for char in table[i:i + pgsize]:
+ for code, char in group:
+ if not char: continue
result.append(chr(db_records.index(char.db_record)))
categorytbl = ''.join(result)
try:
@@ -380,7 +415,9 @@
import triegenerator
- names = dict((table[code].name,code) for code in range(len(table)) if table[code].name)
+ names = dict((table.get_char(code).name, code)
+ for code in table.all_codes()
+ if table.get_char(code).name)
sorted_names_codes = sorted(names.iteritems())
if base_mod is None:
@@ -563,13 +600,13 @@
decimal = {}
digit = {}
numeric = {}
- for code in range(len(table)):
- if table[code].decimal is not None:
- decimal[code] = table[code].decimal
- if table[code].digit is not None:
- digit[code] = table[code].digit
- if table[code].numeric is not None:
- numeric[code] = table[code].numeric
+ for code, char in table.enum_chars():
+ if char.decimal is not None:
+ decimal[code] = char.decimal
+ if char.digit is not None:
+ digit[code] = char.digit
+ if char.numeric is not None:
+ numeric[code] = char.numeric
writeDict(outfile, '_decimal', decimal, base_mod)
writeDict(outfile, '_digit', digit, base_mod)
@@ -606,13 +643,13 @@
toupper = {}
tolower = {}
totitle = {}
- for code in range(len(table)):
- if table[code].upper:
- toupper[code] = table[code].upper
- if table[code].lower:
- tolower[code] = table[code].lower
- if table[code].title:
- totitle[code] = table[code].title
+ for code, char in table.enum_chars():
+ if char.upper:
+ toupper[code] = char.upper
+ if char.lower:
+ tolower[code] = char.lower
+ if char.title:
+ totitle[code] = char.title
writeDict(outfile, '_toupper', toupper, base_mod)
writeDict(outfile, '_tolower', tolower, base_mod)
writeDict(outfile, '_totitle', totitle, base_mod)
@@ -646,9 +683,9 @@
'''
# Decomposition
decomposition = {}
- for code in range(len(table)):
- if table[code].raw_decomposition:
- decomposition[code] = table[code].raw_decomposition
+ for code, char in table.enum_chars():
+ if char.raw_decomposition:
+ decomposition[code] = char.raw_decomposition
writeDict(outfile, '_raw_decomposition', decomposition, base_mod)
print >> outfile, '''
def decomposition(code):
@@ -662,13 +699,12 @@
'''
# Collect the composition pairs.
compositions = []
- for code in range(len(table)):
- unichar = table[code]
+ for code, unichar in table.enum_chars():
if (not unichar.decomposition or
unichar.isCompatibility or
unichar.excluded or
len(unichar.decomposition) != 2 or
- table[unichar.decomposition[0]].combining):
+ table.get_char(unichar.decomposition[0]).combining):
continue
left, right = unichar.decomposition
compositions.append((left, right, code))
@@ -680,15 +716,15 @@
print >> outfile
decomposition = {}
- for code in range(len(table)):
- if table[code].canonical_decomp:
- decomposition[code] = table[code].canonical_decomp
+ for code, char in table.enum_chars():
+ if char.canonical_decomp:
+ decomposition[code] = char.canonical_decomp
writeDict(outfile, '_canon_decomposition', decomposition, base_mod)
decomposition = {}
- for code in range(len(table)):
- if table[code].compat_decomp:
- decomposition[code] = table[code].compat_decomp
+ for code, char in table.enum_chars():
+ if char.compat_decomp:
+ decomposition[code] = char.compat_decomp
writeDict(outfile, '_compat_decomposition', decomposition, base_mod)
print >> outfile, '''
def canon_decomposition(code):
@@ -726,16 +762,23 @@
if options.output:
outfile = open(options.output + '.py', "w")
- infile = open('UnicodeData-%s.txt' % options.unidata_version)
- exclusions = open('CompositionExclusions-%s.txt' % options.unidata_version)
- east_asian_width = open('EastAsianWidth-%s.txt' % options.unidata_version)
- unihan = open('UnihanNumeric-%s.txt' % options.unidata_version)
- linebreak = open('LineBreak-%s.txt' % options.unidata_version)
- derived_core_properties = open('DerivedCoreProperties-%s.txt' %
- options.unidata_version)
- table = read_unicodedata(infile, exclusions, east_asian_width, unihan,
- linebreak, derived_core_properties)
+ filenames = dict(
+ data='UnicodeData-%(version)s.txt',
+ exclusions='CompositionExclusions-%(version)s.txt',
+ east_asian_width='EastAsianWidth-%(version)s.txt',
+ unihan='UnihanNumeric-%(version)s.txt',
+ linebreak='LineBreak-%(version)s.txt',
+ derived_core_properties='DerivedCoreProperties-%(version)s.txt',
+ name_aliases='NameAliases-%(version)s.txt',
+ named_sequences = 'NamedSequences-%(version)s.txt',
+ )
+ filenames = dict((name, filename % dict(version=options.unidata_version))
+ for (name, filename) in filenames.items())
+ files = dict((name, open(filename))
+ for (name, filename) in filenames.items())
+
+ table = read_unicodedata(files)
print >> outfile, '# UNICODE CHARACTER DATABASE'
print >> outfile, '# This file was generated with the command:'
print >> outfile, '# ', ' '.join(sys.argv)
More information about the pypy-commit
mailing list