[pypy-svn] r77420 - in pypy/branch/fast-forward/pypy/rlib: . test
afa at codespeak.net
afa at codespeak.net
Tue Sep 28 00:14:02 CEST 2010
Author: afa
Date: Tue Sep 28 00:14:00 2010
New Revision: 77420
Modified:
pypy/branch/fast-forward/pypy/rlib/runicode.py
pypy/branch/fast-forward/pypy/rlib/test/test_runicode.py
Log:
implement utf-32 encoders and decoders
Modified: pypy/branch/fast-forward/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/fast-forward/pypy/rlib/runicode.py (original)
+++ pypy/branch/fast-forward/pypy/rlib/runicode.py Tue Sep 28 00:14:00 2010
@@ -270,7 +270,6 @@
if errorhandler is None:
errorhandler = raise_unicode_exception_decode
bo = 0
- consumed = 0
if BYTEORDER == 'little':
ihi = 1
@@ -419,6 +418,162 @@
# ____________________________________________________________
+# utf-32
+
+def str_decode_utf_32(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
+ errorhandler, "native")
+ return result, length
+
+def str_decode_utf_32_be(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
+ errorhandler, "big")
+ return result, length
+
+def str_decode_utf_32_le(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
+ errorhandler, "little")
+ return result, length
+
+def str_decode_utf_32_helper(s, size, errors, final=True,
+ errorhandler=None,
+ byteorder="native"):
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_decode
+ bo = 0
+
+ if BYTEORDER == 'little':
+ iorder = [0, 1, 2, 3]
+ else:
+ iorder = [3, 2, 1, 0]
+
+ # Check for BOM marks (U+FEFF) in the input and adjust current
+ # byte order setting accordingly. In native mode, the leading BOM
+ # mark is skipped, in all other modes, it is copied to the output
+ # stream as-is (giving a ZWNBSP character).
+ pos = 0
+ if byteorder == 'native':
+ if size >= 4:
+ bom = ((ord(s[iorder[3]]) << 24) | (ord(s[iorder[2]]) << 16) |
+ (ord(s[iorder[1]]) << 8) | ord(s[iorder[0]]))
+ if BYTEORDER == 'little':
+ if bom == 0x0000FEFF:
+ pos += 4
+ bo = -1
+ elif bom == 0xFFFE0000:
+ pos += 4
+ bo = 1
+ else:
+ if bom == 0x0000FEFF:
+ pos += 2
+ bo = 1
+ elif bom == 0xFFFE0000:
+ pos += 2
+ bo = -1
+ elif byteorder == 'little':
+ bo = -1
+ else:
+ bo = 1
+ if size == 0:
+ return u'', 0, bo
+ if bo == -1:
+ # force little endian
+ iorder = [0, 1, 2, 3]
+
+ elif bo == 1:
+ # force big endian
+ iorder = [3, 2, 1, 0]
+
+ result = UnicodeBuilder(size // 4)
+
+ while pos < size:
+ # remaining bytes at the end? (size should be divisible by 4)
+ if len(s) - pos < 4:
+ if not final:
+ break
+ r, pos = errorhandler(errors, 'utf-32', "truncated data",
+ s, pos, len(s))
+ result.append(r)
+ if len(s) - pos < 4:
+ break
+ continue
+ ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
+ (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
+ if ch >= 0x110000:
+ r, pos = errorhandler(errors, 'utf-32', "codepoint not in range(0x110000)",
+ s, pos, len(s))
+ result.append(r)
+ continue
+
+ if MAXUNICODE < 65536 and ch >= 0x10000:
+ ch -= 0x10000L
+ result.append(unichr(0xD800 + (ch >> 10)))
+ result.append(unichr(0xDC00 + (ch & 0x03FF)))
+ else:
+ result.append(UNICHR(ch))
+ pos += 4
+ return result.build(), pos, bo
+
+def _STORECHAR32(result, CH, byteorder):
+ c0 = chr(((CH) >> 24) & 0xff)
+ c1 = chr(((CH) >> 16) & 0xff)
+ c2 = chr(((CH) >> 8) & 0xff)
+ c3 = chr((CH) & 0xff)
+ if byteorder == 'little':
+ result.append(c3)
+ result.append(c2)
+ result.append(c1)
+ result.append(c0)
+ else:
+ result.append(c0)
+ result.append(c1)
+ result.append(c2)
+ result.append(c3)
+
+def unicode_encode_utf_32_helper(s, size, errors,
+ errorhandler=None,
+ byteorder='little'):
+ if size == 0:
+ return ""
+
+ result = StringBuilder(size * 4 + 4)
+ if byteorder == 'native':
+ _STORECHAR32(result, 0xFEFF, BYTEORDER)
+ byteorder = BYTEORDER
+
+ i = 0
+ while i < size:
+ ch = ord(s[i])
+ i += 1
+ ch2 = 0
+ if MAXUNICODE < 65536 and 0xD800 <= ch <= 0xDBFF and i < size:
+ ch2 = ord(s[i])
+ if 0xDC00 <= ch2 <= 0xDFFF:
+ ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+ i += 1
+ _STORECHAR32(result, ch, byteorder)
+
+ return result.build()
+
+def unicode_encode_utf_32(s, size, errors,
+ errorhandler=None):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "native")
+
+
+def unicode_encode_utf_32_be(s, size, errors,
+ errorhandler=None):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "big")
+
+
+def unicode_encode_utf_32_le(s, size, errors,
+ errorhandler=None):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "little")
+
+
+# ____________________________________________________________
# utf-7
## indicate whether a UTF-7 character is special i.e. cannot be directly
Modified: pypy/branch/fast-forward/pypy/rlib/test/test_runicode.py
==============================================================================
--- pypy/branch/fast-forward/pypy/rlib/test/test_runicode.py (original)
+++ pypy/branch/fast-forward/pypy/rlib/test/test_runicode.py Tue Sep 28 00:14:00 2010
@@ -76,7 +76,7 @@
assert start == startingpos
assert stop == endingpos
return u"42424242", stop
- return "", endingpos
+ return u"", endingpos
decoder = self.getdecoder(encoding)
if addstuff:
s += "some rest in ascii"
@@ -99,12 +99,14 @@
def test_all_first_256(self):
for i in range(256):
- for encoding in "utf-8 latin-1 utf-16 utf-16-be utf-16-le".split():
+ for encoding in ("utf-8 latin-1 utf-16 utf-16-be utf-16-le "
+ "utf-32 utf-32-be utf-32-le").split():
self.checkdecode(unichr(i), encoding)
def test_first_10000(self):
for i in range(10000):
- for encoding in "utf-8 utf-16 utf-16-be utf-16-le".split():
+ for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ "utf-32 utf-32-be utf-32-le").split():
self.checkdecode(unichr(i), encoding)
def test_random(self):
@@ -113,13 +115,15 @@
if 0xd800 <= v <= 0xdfff:
continue
uni = unichr(v)
- for encoding in "utf-8 utf-16 utf-16-be utf-16-le".split():
- self.checkdecode(uni, encoding)
+ for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ "utf-32 utf-32-be utf-32-le").split():
+ self.checkdecode(uni, encoding)
def test_maxunicode(self):
uni = unichr(sys.maxunicode)
- for encoding in "utf-8 utf-16 utf-16-be utf-16-le".split():
- self.checkdecode(uni, encoding)
+ for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ "utf-32 utf-32-be utf-32-le").split():
+ self.checkdecode(uni, encoding)
def test_single_chars_utf8(self):
for s in ["\xd7\x90", "\xd6\x96", "\xeb\x96\x95", "\xf0\x90\x91\x93"]:
@@ -179,12 +183,14 @@
def test_all_first_256(self):
for i in range(256):
- for encoding in "utf-8 latin-1 utf-16 utf-16-be utf-16-le".split():
+ for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ "utf-32 utf-32-be utf-32-le").split():
self.checkencode(unichr(i), encoding)
def test_first_10000(self):
for i in range(10000):
- for encoding in "utf-8 utf-16 utf-16-be utf-16-le".split():
+ for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ "utf-32 utf-32-be utf-32-le").split():
self.checkencode(unichr(i), encoding)
def test_random(self):
@@ -193,12 +199,14 @@
if 0xd800 <= v <= 0xdfff:
continue
uni = unichr(v)
- for encoding in "utf-8 utf-16 utf-16-be utf-16-le".split():
+ for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ "utf-32 utf-32-be utf-32-le").split():
self.checkencode(uni, encoding)
def test_maxunicode(self):
uni = unichr(sys.maxunicode)
- for encoding in "utf-8 utf-16 utf-16-be utf-16-le".split():
+ for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ "utf-32 utf-32-be utf-32-le").split():
self.checkencode(uni, encoding)
def test_single_chars_utf8(self):
More information about the Pypy-commit
mailing list