[pypy-svn] r48603 - in pypy/branch/more-unicode-improvements/pypy/rlib: . test
cfbolz at codespeak.net
cfbolz at codespeak.net
Mon Nov 12 18:00:33 CET 2007
Author: cfbolz
Date: Mon Nov 12 18:00:33 2007
New Revision: 48603
Modified:
pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
Log:
port encoders to RPython
Modified: pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py (original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py Mon Nov 12 18:00:33 2007
@@ -11,7 +11,12 @@
"%s can't decode byte %s in position %s: %s" % (
encoding, s[startingpos], startingpos, msg))
else:
- XXX
+ raise UnicodeEncodeError(
+ "%s can't encode byte %s in position %s: %s" % (
+ encoding, s[startingpos], startingpos, msg))
+
+# ____________________________________________________________
+# unicode decoding
utf8_code_length = [
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -290,3 +295,144 @@
s, pos, pos + 1)
p += r
return u"".join(p), pos
+
+
+# ____________________________________________________________
+# unicode encoding
+
+
+def unicode_encode_utf8(s, size, errors, errorhandler=raise_unicode_exception):
+    """Encode the first `size` characters of the unicode string `s` as UTF-8.
+
+    A high surrogate that is immediately followed by a low surrogate is
+    combined into one code point and written as a single four-byte
+    sequence.  An isolated surrogate is written as a three-byte sequence.
+
+    `errors` and `errorhandler` are accepted so that the signature matches
+    the other encoders; this function never consults them.
+
+    Returns the encoded byte string.
+    """
+    assert size >= 0
+    p = []
+    i = 0
+    while i < size:
+        ch = ord(s[i])
+        i += 1
+        if ch < 0x80:
+            # Encode ASCII
+            p.append(chr(ch))
+        elif ch < 0x0800:
+            # Encode Latin-1
+            p.append(chr(0xc0 | (ch >> 6)))
+            p.append(chr(0x80 | (ch & 0x3f)))
+        elif ch < 0x10000:
+            # Encode UCS2 Unicode ordinals
+            # Special case: check for high surrogate
+            if 0xD800 <= ch and ch <= 0xDBFF and i != size:
+                ch2 = ord(s[i])
+                # Check for low surrogate and combine the two to
+                # form a UCS4 value
+                if 0xDC00 <= ch2 and ch2 <= 0xDFFF:
+                    ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+                    i += 1
+                    # Encode the combined code point, not the bare high
+                    # surrogate.
+                    _encodeUCS4(p, ch3)
+                    continue
+            # Fall through: handles isolated high surrogates
+            p.append(chr(0xe0 | (ch >> 12)))
+            p.append(chr(0x80 | ((ch >> 6) & 0x3f)))
+            p.append(chr(0x80 | (ch & 0x3f)))
+        else:
+            _encodeUCS4(p, ch)
+    return "".join(p)
+
+def _encodeUCS4(p, ch):
+    """Append the four-byte UTF-8 sequence for the integer `ch` to `p`.
+
+    `p` is the list of one-character byte strings being accumulated by
+    the caller; it is modified in place and nothing is returned.
+    """
+    # Lead byte: marker 11110xxx plus the bits above bit 17.
+    p.append(chr(0xf0 | (ch >> 18)))
+    # Three continuation bytes, six payload bits each, high bits first.
+    for shift in (12, 6, 0):
+        p.append(chr(0x80 | ((ch >> shift) & 0x3f)))
+
+
+def unicode_encode_ucs1_helper(p, size, errors,
+                               errorhandler=raise_unicode_exception, limit=256):
+    """Encode the first `size` characters of `p` to a one-byte-per-char string.
+
+    Characters whose ordinal is below `limit` are copied through as single
+    bytes.  Each maximal run of characters at or above `limit` is handed
+    to `errorhandler(errors, encoding, reason, p, start, end, False)`.
+    The handler's result is indexed as `(replacement, resume_position)`:
+    the replacement is appended to the output and encoding continues at
+    the resume position.
+
+    A `limit` of 256 is reported to the handler as "latin-1"; any other
+    value is reported as "ascii".
+
+    Returns the encoded byte string ('' when `size` is 0).
+    """
+    if limit == 256:
+        reason = "ordinal not in range(256)"
+        encoding = "latin-1"
+    else:
+        reason = "ordinal not in range(128)"
+        encoding = "ascii"
+
+    if size == 0:
+        return ''
+    res = []
+    pos = 0
+    # Honour `size` like the other encoders instead of always walking
+    # the whole of `p`.
+    while pos < size:
+        ch = p[pos]
+
+        if ord(ch) < limit:
+            res.append(chr(ord(ch)))
+            pos += 1
+        else:
+            # startpos for collecting unencodable chars
+            collstart = pos
+            collend = pos + 1
+            while collend < size and ord(p[collend]) >= limit:
+                collend += 1
+            x = errorhandler(errors, encoding, reason, p,
+                             collstart, collend, False)
+            res.append(str(x[0]))
+            pos = x[1]
+
+    return "".join(res)
+
+def unicode_encode_latin1(p, size, errors,
+                          errorhandler=raise_unicode_exception):
+    """Encode the unicode string `p` as Latin-1 (ordinals below 256).
+
+    `errorhandler` is forwarded to `unicode_encode_ucs1_helper` and is
+    invoked for characters that do not fit.  Returns the byte string.
+    """
+    # The limit must go in the fifth position; passing it fourth would
+    # replace the error handler with an integer.
+    return unicode_encode_ucs1_helper(p, size, errors, errorhandler, 256)
+
+def unicode_encode_ascii(p, size, errors,
+                         errorhandler=raise_unicode_exception):
+    """Encode the unicode string `p` as ASCII (ordinals below 128).
+
+    `errorhandler` is forwarded to `unicode_encode_ucs1_helper` and is
+    invoked for characters that do not fit.  Returns the byte string.
+    """
+    # The limit must go in the fifth position; passing it fourth would
+    # replace the error handler with an integer and leave the limit at
+    # its default of 256.
+    return unicode_encode_ucs1_helper(p, size, errors, errorhandler, 128)
+
+
+def _STORECHAR(p, CH, byteorder):
+    """Append the low 16 bits of `CH` to the list `p` as two byte strings.
+
+    With `byteorder == 'little'` the low byte is appended first; for any
+    other value the high byte is appended first.
+    """
+    low_byte = chr(CH & 0xff)
+    high_byte = chr((CH >> 8) & 0xff)
+    if byteorder != 'little':
+        p.append(high_byte)
+        p.append(low_byte)
+        return
+    p.append(low_byte)
+    p.append(high_byte)
+
+def unicode_encode_utf16_helper(s, size, errors,
+                                errorhandler=raise_unicode_exception,
+                                byteorder='little'):
+    """Encode the first `size` characters of the unicode string `s` as UTF-16.
+
+    With `byteorder == 'native'` a byte order mark (0xFEFF) is written
+    first and the rest is encoded in `sys.byteorder`.  Otherwise
+    `byteorder` is passed straight to `_STORECHAR` and no mark is written.
+    Ordinals of 0x10000 and above are written as a surrogate pair.
+
+    `errors` and `errorhandler` are accepted so that the signature matches
+    the other encoders; this function never consults them.
+
+    Returns the encoded byte string.  For empty input in native byte
+    order that is just the byte order mark.
+    """
+    p = []
+    if byteorder == 'native':
+        _STORECHAR(p, 0xFEFF, sys.byteorder)
+        byteorder = sys.byteorder
+
+    # No early return for size == 0: that would throw away a byte order
+    # mark that has already been stored above.
+    i = 0
+    while i < size:
+        ch = ord(s[i])
+        i += 1
+        ch2 = 0
+        if ch >= 0x10000:
+            # Split into high (ch) and low (ch2) surrogates.
+            ch2 = 0xDC00 | ((ch - 0x10000) & 0x3FF)
+            ch = 0xD800 | ((ch - 0x10000) >> 10)
+
+        _STORECHAR(p, ch, byteorder)
+        if ch2:
+            _STORECHAR(p, ch2, byteorder)
+
+    return "".join(p)
+
+def unicode_encode_utf16(s, size, errors,
+                         errorhandler=raise_unicode_exception):
+    """Encode `s` as UTF-16 by delegating to the helper with "native"."""
+    encoded = unicode_encode_utf16_helper(s, size, errors, errorhandler,
+                                          "native")
+    return encoded
+
+
+def unicode_encode_utf16be(s, size, errors,
+                           errorhandler=raise_unicode_exception):
+    """Encode `s` as UTF-16 by delegating to the helper with "big"."""
+    encoded = unicode_encode_utf16_helper(s, size, errors, errorhandler,
+                                          "big")
+    return encoded
+
+
+def unicode_encode_utf16le(s, size, errors,
+                           errorhandler=raise_unicode_exception):
+    """Encode `s` as UTF-16 by delegating to the helper with "little"."""
+    encoded = unicode_encode_utf16_helper(s, size, errors, errorhandler,
+                                          "little")
+    return encoded
Modified: pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py (original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py Mon Nov 12 18:00:33 2007
@@ -17,9 +17,23 @@
assert consumed == len(s)
self.typeequals(trueresult, result)
+    def checkencode(self, s, encoding):
+        """Check that runicode's encoder for `encoding` matches the builtin.
+
+        The encoder is looked up on `runicode` as `unicode_encode_<name>`,
+        where <name> is `encoding` with the dashes removed.  If `s` is a
+        unicode string, the expected value is `s.encode(encoding)`.  If it
+        is a byte string, it is decoded first and the encoder is expected
+        to reproduce the original bytes.
+        """
+        encoder = getattr(runicode,
+                          "unicode_encode_%s" % encoding.replace("-", ""))
+        if isinstance(s, unicode):
+            trueresult = s.encode(encoding)
+        else:
+            trueresult = s
+            s = s.decode(encoding)
+        # The third argument of the encoders is the error policy name,
+        # not a boolean flag.
+        result = encoder(s, len(s), "strict")
+        self.typeequals(trueresult, result)
+
class TestDecoding(UnicodeTests):
+ # XXX test bom recognition in utf-16
+ # XXX test proper error handling
+
def test_all_ascii(self):
for i in range(128):
for encoding in "utf8 latin1 ascii".split():
@@ -30,6 +44,7 @@
for encoding in "utf8 latin1 utf16 utf-16-be utf-16-le".split():
self.checkdecode(unichr(i), encoding)
+
def test_random(self):
for i in range(10000):
uni = unichr(random.randrange(sys.maxunicode))
@@ -40,3 +55,24 @@
for s in ["\xd7\x90", "\xd6\x96", "\xeb\x96\x95", "\xf0\x90\x91\x93"]:
self.checkdecode(s, "utf8")
+
+class TestEncoding(UnicodeTests):
+    """Compare the runicode encoders against the builtin codecs."""
+
+    def test_all_ascii(self):
+        # Every ASCII character, through every codec that can represent it.
+        for i in range(128):
+            for encoding in "utf8 latin1 ascii".split():
+                self.checkencode(unichr(i), encoding)
+
+    def test_all_first_256(self):
+        # Every Latin-1 character; ascii is left out because it cannot
+        # represent 128..255.
+        for i in range(256):
+            for encoding in "utf8 latin1 utf16 utf-16-be utf-16-le".split():
+                self.checkencode(unichr(i), encoding)
+
+    def test_random(self):
+        for i in range(10000):
+            uni = unichr(random.randrange(sys.maxunicode))
+            for encoding in "utf8 utf16 utf-16-be utf-16-le".split():
+                # Encode the randomly drawn character, not the loop index.
+                self.checkencode(uni, encoding)
+
+    def test_single_chars_utf8(self):
+        # Byte strings: checkencode decodes them and expects the encoder
+        # to reproduce the same bytes.
+        for s in ["\xd7\x90", "\xd6\x96", "\xeb\x96\x95", "\xf0\x90\x91\x93"]:
+            self.checkencode(s, "utf8")
More information about the Pypy-commit
mailing list