[Python-Dev] Unicode charnames impl.
Andrew M. Kuchling
akuchlin@mems-exchange.org
Fri, 24 Mar 2000 16:46:28 -0500 (EST)
Here's a strawman codec for doing the \N{NULL} thing. Questions:
0) Is the code below correct?
1) What the heck would this encoding be called?
2) What does .encode() do? (Right now it escapes \N as
\N{BACKSLASH}N.)
3) How can we store all those names? The resulting dictionary makes a
361K .py file; Python dumps core trying to parse it. (Another bug...)
4) What do you do with the error \N{...... (no closing right bracket)?
Right now it stops at that point, and never advances any farther.
Maybe it should assume it's an error if there's no } within the
next 200 chars or some similar limit?
5) Do we need StreamReader/Writer classes, too?
I've also added a script that parses the names out of the NamesList.txt
file at ftp://ftp.unicode.org/Public/UNIDATA/.
--amk
namecodec.py:
=============
import codecs
#from _namedict import namedict
# Small hand-written name table used by NameCodec: each key is an upper-case
# character name, each value the integer code point it stands for.
namedict = {
    'NULL': 0,
    'START OF HEADING': 1,
    'BACKSLASH': ord('\\'),
}
# unichr() exists only on Python 2; Python 3 spells the same function chr().
try:
    _unichr = unichr
except NameError:
    _unichr = chr


class NameCodec(codecs.Codec):
    r"""Strawman codec that expands \N{NAME} escapes via the ``namedict`` table.

    ``namedict`` is a module-level mapping from character name to integer
    code point; it is looked up at call time.
    """

    def encode(self, input, errors='strict'):
        r"""Escape every ``\N`` in *input* as ``\N{BACKSLASH}N``.

        Decoding the result turns each ``\N{BACKSLASH}`` back into a single
        backslash, so the original text round-trips.  *errors* is accepted
        but not used.

        Returns the escaped string.

        NOTE(review): ``codecs.Codec.encode`` is documented to return an
        ``(output, length consumed)`` tuple; this method returns the bare
        string.  The module's ``__main__`` self-test relies on that, so the
        return shape is deliberately left unchanged here.
        """
        # XXX what should this do?  Escape the sequence \N as
        # '\N{BACKSLASH}N'?
        return input.replace('\\N', '\\N{BACKSLASH}N')

    def decode(self, input, errors='strict'):
        r"""Replace each ``\N{NAME}`` in *input* with the named character.

        *errors* selects what happens when NAME is not in ``namedict``:
        ``'strict'`` raises, ``'replace'`` substitutes U+FFFD, ``'ignore'``
        drops the escape.  Any other value currently also drops the escape,
        because no branch matches it.

        Returns ``(consumed, output)``.  ``consumed`` is ``len(input)``
        unless an escape has no closing ``}``; in that case decoding stops
        at the start of that escape, ``consumed`` is its index, and
        ``output`` holds everything decoded before it.

        Raises ValueError for an unknown name when *errors* is ``'strict'``.
        """
        # Collect the pieces and join once instead of growing a string with
        # repeated concatenation.
        pieces = []
        last = 0
        index = input.find(u'\\N{')
        while index != -1:
            # Literal text between the previous escape and this one.
            pieces.append(input[last:index])
            r_bracket = input.find(u'}', index)
            if r_bracket == -1:
                # No closing bracket; report how far we got and stop.
                return index, u''.join(pieces)
            name = input[index + 3:r_bracket]
            code = namedict.get(name)
            if code is not None:
                pieces.append(_unichr(code))
            elif errors == 'strict':
                raise ValueError('Unknown character name %s' % repr(name))
            elif errors == 'replace':
                pieces.append(u'\uFFFD')
            # errors == 'ignore' (and anything else): emit nothing.
            last = r_bracket + 1
            index = input.find(u'\\N{', last)
        # No further \N{ found: the tail is plain text.
        pieces.append(input[last:])
        return len(input), u''.join(pieces)
if __name__ == '__main__':
    # Self-test: decode two samples (the second ends in an unterminated
    # escape), then check that encode() followed by decode() gives back the
    # original text.
    c = NameCodec()
    for s in [r'b\lah blah \N{NULL} asdf',
              r'b\l\N{START OF HEADING}\N{NU']:
        _, s2 = c.decode(s)
        # print(repr(x)) with a single argument prints the same text
        # whether print is a statement or a function.
        print(repr(s2))
        s3 = c.encode(s)
        _, s4 = c.decode(s3)
        print(repr(s3))
        assert s4 == s
    # Unknown character name under the two non-strict error modes.
    print(repr(c.decode(r'blah blah \N{NULLsadf} asdf', errors='replace')))
    print(repr(c.decode(r'blah blah \N{NULLsadf} asdf', errors='ignore')))
makenamelist.py
===============
# Hack to extract character names from NamesList.txt
# Output the repr() of the resulting dictionary
import re, sys, string
# Code-point line: 4-6 hex digits, a tab, then the character name.  The
# repeat count is inside the group so the whole number is captured, not
# just its last digit.
_CHAR_LINE = re.compile(r'([0-9a-fA-F]{4,6})\t(.*?)\s*$')
# Alias line: tab, '=', then comma-separated names.  The name group is
# non-greedy so that an optional trailing ';...' part is left out of it.
_ALIAS_LINE = re.compile(r'\t=\s*(.*?)\s*(?:;.*)?$')
# Names on code-point lines that are never entered into the table.
_SKIPPED_NAMES = ('<CONTROL>', '<NOT A CHARACTER>')

namedict = {}


def parse_namelist(lines):
    """Build a {NAME: code point} dictionary from NamesList.txt lines.

    *lines* is any iterable of text lines (an open file works).  A line
    starting with a hex code point and a tab sets the current character and
    records its upper-cased name.  A following line of the form
    ``<tab>= a, b`` records each comma-separated alias, upper-cased and
    stripped, for that same character.  All other lines are ignored.

    Alias lines seen before any code-point line, and empty names, are
    skipped.  Later entries overwrite earlier ones with the same name.

    Returns the new dictionary.
    """
    names = {}
    last_char = None
    for line in lines:
        m = _CHAR_LINE.match(line)
        if m is not None:
            last_char = int(m.group(1), 16)
            name = m.group(2).upper()
            if name and name not in _SKIPPED_NAMES:
                names[name] = last_char
            continue
        m = _ALIAS_LINE.match(line)
        if m is not None and last_char is not None:
            for alias in m.group(1).upper().split(','):
                alias = alias.strip()
                if alias:
                    names[alias] = last_char
    return names


if __name__ == '__main__':
    namedict.update(parse_namelist(sys.stdin))
    # XXX and do what with this dictionary?
    print(namedict)