[Jython-checkins] jython (merge default -> default): Merge codec work to trunk
jeff.allen
jython-checkins at python.org
Sat Jan 5 18:57:02 CET 2013
http://hg.python.org/jython/rev/e3f1125c222d
changeset: 6939:e3f1125c222d
parent: 6933:318673e3de37
parent: 6938:c0072e7f0c90
user: Jeff Allen <ja...py at farowl.co.uk>
date: Sat Jan 05 17:47:50 2013 +0000
summary:
Merge codec work to trunk
files:
Lib/test/test_codecs.py | 1324 ++++++++++++++-
Lib/test/test_unicode.py | 13 +-
src/org/python/core/codecs.java | 1242 +++++++++----
src/org/python/modules/_codecs.java | 480 +++--
4 files changed, 2445 insertions(+), 614 deletions(-)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1,15 +1,426 @@
+# test_codecs.py from CPython 2.7, modified for Jython
from test import test_support
import unittest
import codecs
-import StringIO
+import locale
+import sys, StringIO
+if not test_support.is_jython:
+ import _testcapi
-class UTF16Test(unittest.TestCase):
+class Queue(object):
+ """
+ queue: write bytes at one end, read bytes from the other end
+ """
+ def __init__(self):
+ self._buffer = ""
+
+ def write(self, chars):
+ self._buffer += chars
+
+ def read(self, size=-1):
+ if size<0:
+ s = self._buffer
+ self._buffer = ""
+ return s
+ else:
+ s = self._buffer[:size]
+ self._buffer = self._buffer[size:]
+ return s
+
+class ReadTest(unittest.TestCase):
+ def check_partial(self, input, partialresults):
+ # get a StreamReader for the encoding and feed the bytestring version
+ # of input to the reader byte by byte. Read everything available from
+ # the StreamReader and check that the results equal the appropriate
+ # entries from partialresults.
+ q = Queue()
+ r = codecs.getreader(self.encoding)(q)
+ result = u""
+ for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
+ q.write(c)
+ result += r.read()
+ self.assertEqual(result, partialresult)
+ # check that there's nothing left in the buffers
+ self.assertEqual(r.read(), u"")
+ self.assertEqual(r.bytebuffer, "")
+ self.assertEqual(r.charbuffer, u"")
+
+ # do the check again, this time using a incremental decoder
+ d = codecs.getincrementaldecoder(self.encoding)()
+ result = u""
+ for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
+ result += d.decode(c)
+ self.assertEqual(result, partialresult)
+ # check that there's nothing left in the buffers
+ self.assertEqual(d.decode("", True), u"")
+ self.assertEqual(d.buffer, "")
+
+ # Check whether the reset method works properly
+ d.reset()
+ result = u""
+ for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
+ result += d.decode(c)
+ self.assertEqual(result, partialresult)
+ # check that there's nothing left in the buffers
+ self.assertEqual(d.decode("", True), u"")
+ self.assertEqual(d.buffer, "")
+
+ # check iterdecode()
+ encoded = input.encode(self.encoding)
+ self.assertEqual(
+ input,
+ u"".join(codecs.iterdecode(encoded, self.encoding))
+ )
+
+ def test_readline(self):
+ def getreader(input):
+ stream = StringIO.StringIO(input.encode(self.encoding))
+ return codecs.getreader(self.encoding)(stream)
+
+ def readalllines(input, keepends=True, size=None):
+ reader = getreader(input)
+ lines = []
+ while True:
+ line = reader.readline(size=size, keepends=keepends)
+ if not line:
+ break
+ lines.append(line)
+ return "|".join(lines)
+
+ s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
+ sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
+ sexpectednoends = u"foo|bar|baz|spam|eggs"
+ self.assertEqual(readalllines(s, True), sexpected)
+ self.assertEqual(readalllines(s, False), sexpectednoends)
+ self.assertEqual(readalllines(s, True, 10), sexpected)
+ self.assertEqual(readalllines(s, False, 10), sexpectednoends)
+
+ # Test long lines (multiple calls to read() in readline())
+ vw = []
+ vwo = []
+ for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
+ vw.append((i*200)*u"\u3042" + lineend)
+ vwo.append((i*200)*u"\u3042")
+ self.assertEqual(readalllines("".join(vw), True), "".join(vw))
+ self.assertEqual(readalllines("".join(vw), False),"".join(vwo))
+
+ # Test lines where the first read might end with \r, so the
+ # reader has to look ahead whether this is a lone \r or a \r\n
+ for size in xrange(80):
+ for lineend in u"\n \r\n \r \u2028".split():
+ s = 10*(size*u"a" + lineend + u"xxx\n")
+ reader = getreader(s)
+ for i in xrange(10):
+ self.assertEqual(
+ reader.readline(keepends=True),
+ size*u"a" + lineend,
+ )
+ reader = getreader(s)
+ for i in xrange(10):
+ self.assertEqual(
+ reader.readline(keepends=False),
+ size*u"a",
+ )
+
+ def test_bug1175396(self):
+ s = [
+ '<%!--===================================================\r\n',
+ ' BLOG index page: show recent articles,\r\n',
+ ' today\'s articles, or articles of a specific date.\r\n',
+ '========================================================--%>\r\n',
+ '<%@inputencoding="ISO-8859-1"%>\r\n',
+ '<%@pagetemplate=TEMPLATE.y%>\r\n',
+ '<%@import=import frog.util, frog%>\r\n',
+ '<%@import=import frog.objects%>\r\n',
+ '<%@import=from frog.storageerrors import StorageError%>\r\n',
+ '<%\r\n',
+ '\r\n',
+ 'import logging\r\n',
+ 'log=logging.getLogger("Snakelets.logger")\r\n',
+ '\r\n',
+ '\r\n',
+ 'user=self.SessionCtx.user\r\n',
+ 'storageEngine=self.SessionCtx.storageEngine\r\n',
+ '\r\n',
+ '\r\n',
+ 'def readArticlesFromDate(date, count=None):\r\n',
+ ' entryids=storageEngine.listBlogEntries(date)\r\n',
+ ' entryids.reverse() # descending\r\n',
+ ' if count:\r\n',
+ ' entryids=entryids[:count]\r\n',
+ ' try:\r\n',
+ ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
+ ' except StorageError,x:\r\n',
+ ' log.error("Error loading articles: "+str(x))\r\n',
+ ' self.abort("cannot load articles")\r\n',
+ '\r\n',
+ 'showdate=None\r\n',
+ '\r\n',
+ 'arg=self.Request.getArg()\r\n',
+ 'if arg=="today":\r\n',
+ ' #-------------------- TODAY\'S ARTICLES\r\n',
+ ' self.write("<h2>Today\'s articles</h2>")\r\n',
+ ' showdate = frog.util.isodatestr() \r\n',
+ ' entries = readArticlesFromDate(showdate)\r\n',
+ 'elif arg=="active":\r\n',
+ ' #-------------------- ACTIVE ARTICLES redirect\r\n',
+ ' self.Yredirect("active.y")\r\n',
+ 'elif arg=="login":\r\n',
+ ' #-------------------- LOGIN PAGE redirect\r\n',
+ ' self.Yredirect("login.y")\r\n',
+ 'elif arg=="date":\r\n',
+ ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
+ ' showdate = self.Request.getParameter("date")\r\n',
+ ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
+ ' entries = readArticlesFromDate(showdate)\r\n',
+ 'else:\r\n',
+ ' #-------------------- RECENT ARTICLES\r\n',
+ ' self.write("<h2>Recent articles</h2>")\r\n',
+ ' dates=storageEngine.listBlogEntryDates()\r\n',
+ ' if dates:\r\n',
+ ' entries=[]\r\n',
+ ' SHOWAMOUNT=10\r\n',
+ ' for showdate in dates:\r\n',
+ ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
+ ' if len(entries)>=SHOWAMOUNT:\r\n',
+ ' break\r\n',
+ ' \r\n',
+ ]
+ stream = StringIO.StringIO("".join(s).encode(self.encoding))
+ reader = codecs.getreader(self.encoding)(stream)
+ for (i, line) in enumerate(reader):
+ self.assertEqual(line, s[i])
+
+ def test_readlinequeue(self):
+ q = Queue()
+ writer = codecs.getwriter(self.encoding)(q)
+ reader = codecs.getreader(self.encoding)(q)
+
+ # No lineends
+ writer.write(u"foo\r")
+ self.assertEqual(reader.readline(keepends=False), u"foo")
+ writer.write(u"\nbar\r")
+ self.assertEqual(reader.readline(keepends=False), u"")
+ self.assertEqual(reader.readline(keepends=False), u"bar")
+ writer.write(u"baz")
+ self.assertEqual(reader.readline(keepends=False), u"baz")
+ self.assertEqual(reader.readline(keepends=False), u"")
+
+ # Lineends
+ writer.write(u"foo\r")
+ self.assertEqual(reader.readline(keepends=True), u"foo\r")
+ writer.write(u"\nbar\r")
+ self.assertEqual(reader.readline(keepends=True), u"\n")
+ self.assertEqual(reader.readline(keepends=True), u"bar\r")
+ writer.write(u"baz")
+ self.assertEqual(reader.readline(keepends=True), u"baz")
+ self.assertEqual(reader.readline(keepends=True), u"")
+ writer.write(u"foo\r\n")
+ self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
+
+ def test_bug1098990_a(self):
+ s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
+ s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
+ s3 = u"next line.\r\n"
+
+ s = (s1+s2+s3).encode(self.encoding)
+ stream = StringIO.StringIO(s)
+ reader = codecs.getreader(self.encoding)(stream)
+ self.assertEqual(reader.readline(), s1)
+ self.assertEqual(reader.readline(), s2)
+ self.assertEqual(reader.readline(), s3)
+ self.assertEqual(reader.readline(), u"")
+
+ def test_bug1098990_b(self):
+ s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
+ s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
+ s3 = u"stillokay:bbbbxx\r\n"
+ s4 = u"broken!!!!badbad\r\n"
+ s5 = u"againokay.\r\n"
+
+ s = (s1+s2+s3+s4+s5).encode(self.encoding)
+ stream = StringIO.StringIO(s)
+ reader = codecs.getreader(self.encoding)(stream)
+ self.assertEqual(reader.readline(), s1)
+ self.assertEqual(reader.readline(), s2)
+ self.assertEqual(reader.readline(), s3)
+ self.assertEqual(reader.readline(), s4)
+ self.assertEqual(reader.readline(), s5)
+ self.assertEqual(reader.readline(), u"")
+
+@unittest.skipIf(test_support.is_jython, "FIXME: Jython issue 2000 missing support for UTF-32")
+class UTF32Test(ReadTest):
+ encoding = "utf-32"
+
+ spamle = ('\xff\xfe\x00\x00'
+ 's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
+ 's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
+ spambe = ('\x00\x00\xfe\xff'
+ '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
+ '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
+
+ def test_only_one_bom(self):
+ _,_,reader,writer = codecs.lookup(self.encoding)
+ # encode some stream
+ s = StringIO.StringIO()
+ f = writer(s)
+ f.write(u"spam")
+ f.write(u"spam")
+ d = s.getvalue()
+ # check whether there is exactly one BOM in it
+ self.assertTrue(d == self.spamle or d == self.spambe)
+ # try to read it back
+ s = StringIO.StringIO(d)
+ f = reader(s)
+ self.assertEqual(f.read(), u"spamspam")
+
+ def test_badbom(self):
+ s = StringIO.StringIO(4*"\xff")
+ f = codecs.getreader(self.encoding)(s)
+ self.assertRaises(UnicodeError, f.read)
+
+ s = StringIO.StringIO(8*"\xff")
+ f = codecs.getreader(self.encoding)(s)
+ self.assertRaises(UnicodeError, f.read)
+
+ def test_partial(self):
+ self.check_partial(
+ u"\x00\xff\u0100\uffff",
+ [
+ u"", # first byte of BOM read
+ u"", # second byte of BOM read
+ u"", # third byte of BOM read
+ u"", # fourth byte of BOM read => byteorder known
+ u"",
+ u"",
+ u"",
+ u"\x00",
+ u"\x00",
+ u"\x00",
+ u"\x00",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_handlers(self):
+ self.assertEqual((u'\ufffd', 1),
+ codecs.utf_32_decode('\x01', 'replace', True))
+ self.assertEqual((u'', 1),
+ codecs.utf_32_decode('\x01', 'ignore', True))
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
+ "\xff", "strict", True)
+
+ def test_issue8941(self):
+ # Issue #8941: insufficient result allocation when decoding into
+ # surrogate pairs on UCS-2 builds.
+ encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
+ self.assertEqual(u'\U00010000' * 1024,
+ codecs.utf_32_decode(encoded_le)[0])
+ encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
+ self.assertEqual(u'\U00010000' * 1024,
+ codecs.utf_32_decode(encoded_be)[0])
+
+@unittest.skipIf(test_support.is_jython, "FIXME: Jython issue 2000 missing support for UTF-32")
+class UTF32LETest(ReadTest):
+ encoding = "utf-32-le"
+
+ def test_partial(self):
+ self.check_partial(
+ u"\x00\xff\u0100\uffff",
+ [
+ u"",
+ u"",
+ u"",
+ u"\x00",
+ u"\x00",
+ u"\x00",
+ u"\x00",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_simple(self):
+ self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
+ "\xff", "strict", True)
+
+ def test_issue8941(self):
+ # Issue #8941: insufficient result allocation when decoding into
+ # surrogate pairs on UCS-2 builds.
+ encoded = '\x00\x00\x01\x00' * 1024
+ self.assertEqual(u'\U00010000' * 1024,
+ codecs.utf_32_le_decode(encoded)[0])
+
+@unittest.skipIf(test_support.is_jython, "FIXME: Jython issue 2000 missing support for UTF-32")
+class UTF32BETest(ReadTest):
+ encoding = "utf-32-be"
+
+ def test_partial(self):
+ self.check_partial(
+ u"\x00\xff\u0100\uffff",
+ [
+ u"",
+ u"",
+ u"",
+ u"\x00",
+ u"\x00",
+ u"\x00",
+ u"\x00",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_simple(self):
+ self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
+ "\xff", "strict", True)
+
+ def test_issue8941(self):
+ # Issue #8941: insufficient result allocation when decoding into
+ # surrogate pairs on UCS-2 builds.
+ encoded = '\x00\x01\x00\x00' * 1024
+ self.assertEqual(u'\U00010000' * 1024,
+ codecs.utf_32_be_decode(encoded)[0])
+
+
+class UTF16Test(ReadTest):
+ encoding = "utf-16"
spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
def test_only_one_bom(self):
- _,_,reader,writer = codecs.lookup("utf-16")
+ _,_,reader,writer = codecs.lookup(self.encoding)
# encode some stream
s = StringIO.StringIO()
f = writer(s)
@@ -17,15 +428,259 @@
f.write(u"spam")
d = s.getvalue()
# check whether there is exactly one BOM in it
- self.assert_(d == self.spamle or d == self.spambe)
+ self.assertTrue(d == self.spamle or d == self.spambe)
# try to read it back
s = StringIO.StringIO(d)
f = reader(s)
- self.assertEquals(f.read(), u"spamspam")
+ self.assertEqual(f.read(), u"spamspam")
+
+ def test_badbom(self):
+ s = StringIO.StringIO("\xff\xff")
+ f = codecs.getreader(self.encoding)(s)
+ self.assertRaises(UnicodeError, f.read)
+
+ s = StringIO.StringIO("\xff\xff\xff\xff")
+ f = codecs.getreader(self.encoding)(s)
+ self.assertRaises(UnicodeError, f.read)
+
+ def test_partial(self):
+ self.check_partial(
+ u"\x00\xff\u0100\uffff",
+ [
+ u"", # first byte of BOM read
+ u"", # second byte of BOM read => byteorder known
+ u"",
+ u"\x00",
+ u"\x00",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_handlers(self):
+ self.assertEqual((u'\ufffd', 1),
+ codecs.utf_16_decode('\x01', 'replace', True))
+ self.assertEqual((u'', 1),
+ codecs.utf_16_decode('\x01', 'ignore', True))
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
+
+ def test_bug691291(self):
+ # Files are always opened in binary mode, even if no binary mode was
+ # specified. This means that no automatic conversion of '\n' is done
+ # on reading and writing.
+ s1 = u'Hello\r\nworld\r\n'
+
+ s = s1.encode(self.encoding)
+ self.addCleanup(test_support.unlink, test_support.TESTFN)
+ with open(test_support.TESTFN, 'wb') as fp:
+ fp.write(s)
+ with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
+ self.assertEqual(reader.read(), s1)
+
+class UTF16LETest(ReadTest):
+ encoding = "utf-16-le"
+
+ def test_partial(self):
+ self.check_partial(
+ u"\x00\xff\u0100\uffff",
+ [
+ u"",
+ u"\x00",
+ u"\x00",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
+
+class UTF16BETest(ReadTest):
+ encoding = "utf-16-be"
+
+ def test_partial(self):
+ self.check_partial(
+ u"\x00\xff\u0100\uffff",
+ [
+ u"",
+ u"\x00",
+ u"\x00",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
+
+class UTF8Test(ReadTest):
+ encoding = "utf-8"
+
+ def test_partial(self):
+ self.check_partial(
+ u"\x00\xff\u07ff\u0800\uffff",
+ [
+ u"\x00",
+ u"\x00",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff\u07ff",
+ u"\x00\xff\u07ff",
+ u"\x00\xff\u07ff",
+ u"\x00\xff\u07ff\u0800",
+ u"\x00\xff\u07ff\u0800",
+ u"\x00\xff\u07ff\u0800",
+ u"\x00\xff\u07ff\u0800\uffff",
+ ]
+ )
+
+class UTF7Test(ReadTest):
+ encoding = "utf-7"
+
+ def test_partial(self):
+ self.check_partial(
+ u"a+-b",
+ [
+ u"a",
+ u"a",
+ u"a+",
+ u"a+-",
+ u"a+-b",
+ ]
+ )
+
+class UTF16ExTest(unittest.TestCase):
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
+
+ def test_bad_args(self):
+ self.assertRaises(TypeError, codecs.utf_16_ex_decode)
+
+@unittest.skipIf(test_support.is_jython, "FIXME: Jython has no _codecs.readbuffer_encode method")
+class ReadBufferTest(unittest.TestCase):
+
+ def test_array(self):
+ import array
+ self.assertEqual(
+ codecs.readbuffer_encode(array.array("c", "spam")),
+ ("spam", 4)
+ )
+
+ def test_empty(self):
+ self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
+
+ def test_bad_args(self):
+ self.assertRaises(TypeError, codecs.readbuffer_encode)
+ self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
+
+@unittest.skipIf(test_support.is_jython, "FIXME: Jython has no _codecs.charbuffer_encode method")
+class CharBufferTest(unittest.TestCase):
+
+ def test_string(self):
+ self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
+
+ def test_empty(self):
+ self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
+
+ def test_bad_args(self):
+ self.assertRaises(TypeError, codecs.charbuffer_encode)
+ self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
+
+class UTF8SigTest(ReadTest):
+ encoding = "utf-8-sig"
+
+ def test_partial(self):
+ self.check_partial(
+ u"\ufeff\x00\xff\u07ff\u0800\uffff",
+ [
+ u"",
+ u"",
+ u"", # First BOM has been read and skipped
+ u"",
+ u"",
+ u"\ufeff", # Second BOM has been read and emitted
+ u"\ufeff\x00", # "\x00" read and emitted
+ u"\ufeff\x00", # First byte of encoded u"\xff" read
+ u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
+ u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
+ u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
+ u"\ufeff\x00\xff\u07ff",
+ u"\ufeff\x00\xff\u07ff",
+ u"\ufeff\x00\xff\u07ff\u0800",
+ u"\ufeff\x00\xff\u07ff\u0800",
+ u"\ufeff\x00\xff\u07ff\u0800",
+ u"\ufeff\x00\xff\u07ff\u0800\uffff",
+ ]
+ )
+
+ def test_bug1601501(self):
+ # SF bug #1601501: check that the codec works with a buffer
+ unicode("\xef\xbb\xbf", "utf-8-sig")
+
+ def test_bom(self):
+ d = codecs.getincrementaldecoder("utf-8-sig")()
+ s = u"spam"
+ self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
+
+ def test_stream_bom(self):
+ unistring = u"ABC\u00A1\u2200XYZ"
+ bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
+
+ reader = codecs.getreader("utf-8-sig")
+ for sizehint in [None] + range(1, 11) + \
+ [64, 128, 256, 512, 1024]:
+ istream = reader(StringIO.StringIO(bytestring))
+ ostream = StringIO.StringIO()
+ while 1:
+ if sizehint is not None:
+ data = istream.read(sizehint)
+ else:
+ data = istream.read()
+
+ if not data:
+ break
+ ostream.write(data)
+
+ got = ostream.getvalue()
+ self.assertEqual(got, unistring)
+
+ def test_stream_bare(self):
+ unistring = u"ABC\u00A1\u2200XYZ"
+ bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
+
+ reader = codecs.getreader("utf-8-sig")
+ for sizehint in [None] + range(1, 11) + \
+ [64, 128, 256, 512, 1024]:
+ istream = reader(StringIO.StringIO(bytestring))
+ ostream = StringIO.StringIO()
+ while 1:
+ if sizehint is not None:
+ data = istream.read(sizehint)
+ else:
+ data = istream.read()
+
+ if not data:
+ break
+ ostream.write(data)
+
+ got = ostream.getvalue()
+ self.assertEqual(got, unistring)
class EscapeDecodeTest(unittest.TestCase):
- def test_empty_escape_decode(self):
- self.assertEquals(codecs.escape_decode(""), ("", 0))
+ def test_empty(self):
+ self.assertEqual(codecs.escape_decode(""), ("", 0))
class RecodingTest(unittest.TestCase):
def test_recoding(self):
@@ -101,7 +756,6 @@
u"\u0056\u0069\u1EC7\u0074",
"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
-
#(L) 3<nen>B<gumi><kinpachi><sensei>
(u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
"3B-ww4c5e180e575a65lsy2b"),
@@ -153,11 +807,69 @@
# code produces only lower case. Converting just puny to
# lower is also insufficient, since some of the input characters
# are upper case.
- self.assertEquals(uni.encode("punycode").lower(), puny.lower())
+ self.assertEqual(uni.encode("punycode").lower(), puny.lower())
def test_decode(self):
for uni, puny in punycode_testcases:
- self.assertEquals(uni, puny.decode("punycode"))
+ self.assertEqual(uni, puny.decode("punycode"))
+
+@unittest.skipIf(test_support.is_jython, "FIXME: equates to UTF-32BE in Jython")
+class UnicodeInternalTest(unittest.TestCase):
+ def test_bug1251300(self):
+ # Decoding with unicode_internal used to not correctly handle "code
+ # points" above 0x10ffff on UCS-4 builds.
+ if sys.maxunicode > 0xffff:
+ ok = [
+ ("\x00\x10\xff\xff", u"\U0010ffff"),
+ ("\x00\x00\x01\x01", u"\U00000101"),
+ ("", u""),
+ ]
+ not_ok = [
+ "\x7f\xff\xff\xff",
+ "\x80\x00\x00\x00",
+ "\x81\x00\x00\x00",
+ "\x00",
+ "\x00\x00\x00\x00\x00",
+ ]
+ for internal, uni in ok:
+ if sys.byteorder == "little":
+ internal = "".join(reversed(internal))
+ self.assertEqual(uni, internal.decode("unicode_internal"))
+ for internal in not_ok:
+ if sys.byteorder == "little":
+ internal = "".join(reversed(internal))
+ self.assertRaises(UnicodeDecodeError, internal.decode,
+ "unicode_internal")
+
+ def test_decode_error_attributes(self):
+ if sys.maxunicode > 0xffff:
+ try:
+ "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
+ except UnicodeDecodeError, ex:
+ self.assertEqual("unicode_internal", ex.encoding)
+ self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
+ self.assertEqual(4, ex.start)
+ self.assertEqual(8, ex.end)
+ else:
+ self.fail("UnicodeDecodeError not raised")
+
+ def test_decode_callback(self):
+ if sys.maxunicode > 0xffff:
+ codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
+ decoder = codecs.getdecoder("unicode_internal")
+ ab = u"ab".encode("unicode_internal")
+ ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
+ "UnicodeInternalTest")
+ self.assertEqual((u"ab", 12), ignored)
+
+ def test_encode_length(self):
+ # Issue 3739
+ encoder = codecs.getencoder("unicode_internal")
+ self.assertEqual(encoder(u"a")[1], 1)
+ self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
+
+ encoder = codecs.getencoder("string-escape")
+ self.assertEqual(encoder(r'\x00')[1], 4)
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
@@ -313,6 +1025,7 @@
]
+@unittest.skipIf(test_support.is_jython, "FIXME: incomplete unicodedata module")
class NameprepTest(unittest.TestCase):
def test_nameprep(self):
from encodings.idna import nameprep
@@ -328,19 +1041,602 @@
else:
prepped = unicode(prepped, "utf-8")
try:
- self.assertEquals(nameprep(orig), prepped)
+ self.assertEqual(nameprep(orig), prepped)
except Exception,e:
raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
+@unittest.skipIf(test_support.is_jython, "FIXME: Jython issue 2000 missing support for IDNA")
+class IDNACodecTest(unittest.TestCase):
+ def test_builtin_decode(self):
+ self.assertEqual(unicode("python.org", "idna"), u"python.org")
+ self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
+ self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
+ self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
+
+ def test_builtin_encode(self):
+ self.assertEqual(u"python.org".encode("idna"), "python.org")
+ self.assertEqual("python.org.".encode("idna"), "python.org.")
+ self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
+ self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
+
+ def test_stream(self):
+ import StringIO
+ r = codecs.getreader("idna")(StringIO.StringIO("abc"))
+ r.read(3)
+ self.assertEqual(r.read(), u"")
+
+ def test_incremental_decode(self):
+ self.assertEqual(
+ "".join(codecs.iterdecode("python.org", "idna")),
+ u"python.org"
+ )
+ self.assertEqual(
+ "".join(codecs.iterdecode("python.org.", "idna")),
+ u"python.org."
+ )
+ self.assertEqual(
+ "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
+ u"pyth\xf6n.org."
+ )
+ self.assertEqual(
+ "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
+ u"pyth\xf6n.org."
+ )
+
+ decoder = codecs.getincrementaldecoder("idna")()
+ self.assertEqual(decoder.decode("xn--xam", ), u"")
+ self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
+ self.assertEqual(decoder.decode(u"rg"), u"")
+ self.assertEqual(decoder.decode(u"", True), u"org")
+
+ decoder.reset()
+ self.assertEqual(decoder.decode("xn--xam", ), u"")
+ self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
+ self.assertEqual(decoder.decode("rg."), u"org.")
+ self.assertEqual(decoder.decode("", True), u"")
+
+ def test_incremental_encode(self):
+ self.assertEqual(
+ "".join(codecs.iterencode(u"python.org", "idna")),
+ "python.org"
+ )
+ self.assertEqual(
+ "".join(codecs.iterencode(u"python.org.", "idna")),
+ "python.org."
+ )
+ self.assertEqual(
+ "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
+ "xn--pythn-mua.org."
+ )
+ self.assertEqual(
+ "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
+ "xn--pythn-mua.org."
+ )
+
+ encoder = codecs.getincrementalencoder("idna")()
+ self.assertEqual(encoder.encode(u"\xe4x"), "")
+ self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
+ self.assertEqual(encoder.encode(u"", True), "org")
+
+ encoder.reset()
+ self.assertEqual(encoder.encode(u"\xe4x"), "")
+ self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
+ self.assertEqual(encoder.encode(u"", True), "")
+
+class CodecsModuleTest(unittest.TestCase):
+
+ @unittest.skipIf(test_support.is_jython, "FIXME: _codecs.decode not implemented")
+ def test_decode(self):
+ self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
+ u'\xe4\xf6\xfc')
+ self.assertRaises(TypeError, codecs.decode)
+ self.assertEqual(codecs.decode('abc'), u'abc')
+ self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
+
+ @unittest.skipIf(test_support.is_jython, "FIXME: _codecs.encode not implemented")
+ def test_encode(self):
+ self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
+ '\xe4\xf6\xfc')
+ self.assertRaises(TypeError, codecs.encode)
+ self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
+ self.assertEqual(codecs.encode(u'abc'), 'abc')
+ self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
+
+ def test_register(self):
+ self.assertRaises(TypeError, codecs.register)
+ self.assertRaises(TypeError, codecs.register, 42)
+
+ def test_lookup(self):
+ self.assertRaises(TypeError, codecs.lookup)
+ self.assertRaises(LookupError, codecs.lookup, "__spam__")
+ self.assertRaises(LookupError, codecs.lookup, " ")
+
+ def test_getencoder(self):
+ self.assertRaises(TypeError, codecs.getencoder)
+ self.assertRaises(LookupError, codecs.getencoder, "__spam__")
+
+ def test_getdecoder(self):
+ self.assertRaises(TypeError, codecs.getdecoder)
+ self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
+
+ def test_getreader(self):
+ self.assertRaises(TypeError, codecs.getreader)
+ self.assertRaises(LookupError, codecs.getreader, "__spam__")
+
+ def test_getwriter(self):
+ self.assertRaises(TypeError, codecs.getwriter)
+ self.assertRaises(LookupError, codecs.getwriter, "__spam__")
+
+ def test_lookup_issue1813(self):
+ # Issue #1813: under Turkish locales, lookup of some codecs failed
+ # because 'I' is lowercased as a dotless "i"
+ oldlocale = locale.getlocale(locale.LC_CTYPE)
+ self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
+ try:
+ locale.setlocale(locale.LC_CTYPE, 'tr_TR')
+ except locale.Error:
+ # Unsupported locale on this system
+ self.skipTest('test needs Turkish locale')
+ c = codecs.lookup('ASCII')
+ self.assertEqual(c.name, 'ascii')
+
+class StreamReaderTest(unittest.TestCase):
+
+ def setUp(self):
+ self.reader = codecs.getreader('utf-8')
+ self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
+
+ def test_readlines(self):
+ f = self.reader(self.stream)
+ self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
+
+class EncodedFileTest(unittest.TestCase):
+
+ def test_basic(self):
+ f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
+ ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
+ self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
+
+ f = StringIO.StringIO()
+ ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
+ ef.write('\xc3\xbc')
+ self.assertEqual(f.getvalue(), '\xfc')
+
+class Str2StrTest(unittest.TestCase):
+
+ def test_read(self):
+ sin = "\x80".encode("base64_codec")
+ reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
+ sout = reader.read()
+ self.assertEqual(sout, "\x80")
+ self.assertIsInstance(sout, str)
+
+ def test_readline(self):
+ sin = "\x80".encode("base64_codec")
+ reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
+ sout = reader.readline()
+ self.assertEqual(sout, "\x80")
+ self.assertIsInstance(sout, str)
+
+all_unicode_encodings = [
+ "ascii",
+ "base64_codec",
+# FIXME: Jython issue 1066: "big5",
+# FIXME: Jython issue 1066: "big5hkscs",
+ "charmap",
+ "cp037",
+ "cp1006",
+ "cp1026",
+ "cp1140",
+ "cp1250",
+ "cp1251",
+ "cp1252",
+ "cp1253",
+ "cp1254",
+ "cp1255",
+ "cp1256",
+ "cp1257",
+ "cp1258",
+ "cp424",
+ "cp437",
+ "cp500",
+ "cp720",
+ "cp737",
+ "cp775",
+ "cp850",
+ "cp852",
+ "cp855",
+ "cp856",
+ "cp857",
+ "cp858",
+ "cp860",
+ "cp861",
+ "cp862",
+ "cp863",
+ "cp864",
+ "cp865",
+ "cp866",
+ "cp869",
+ "cp874",
+ "cp875",
+# FIXME: Jython issue 1066: "cp932",
+# FIXME: Jython issue 1066: "cp949",
+# FIXME: Jython issue 1066: "cp950",
+# FIXME: Jython issue 1066: "euc_jis_2004",
+# FIXME: Jython issue 1066: 'euc_jisx0213',
+# FIXME: Jython issue 1066: 'euc_jp',
+# FIXME: Jython issue 1066: 'euc_kr',
+# FIXME: Jython issue 1066: 'gb18030',
+# FIXME: Jython issue 1066: 'gb2312',
+# FIXME: Jython issue 1066: 'gbk',
+ "hex_codec",
+ "hp_roman8",
+# FIXME: Jython issue 1066: 'hz',
+# FIXME: Jython issue 1066: "idna",
+# FIXME: Jython issue 1066: 'iso2022_jp',
+# FIXME: Jython issue 1066: 'iso2022_jp_1',
+# FIXME: Jython issue 1066: 'iso2022_jp_2',
+# FIXME: Jython issue 1066: 'iso2022_jp_2004',
+# FIXME: Jython issue 1066: 'iso2022_jp_3',
+# FIXME: Jython issue 1066: 'iso2022_jp_ext',
+# FIXME: Jython issue 1066: 'iso2022_kr',
+ "iso8859_1",
+ "iso8859_10",
+ "iso8859_11",
+ "iso8859_13",
+ "iso8859_14",
+ "iso8859_15",
+ "iso8859_16",
+ "iso8859_2",
+ "iso8859_3",
+ "iso8859_4",
+ "iso8859_5",
+ "iso8859_6",
+ "iso8859_7",
+ "iso8859_8",
+ "iso8859_9",
+# FIXME: Jython issue 1066: 'johab',
+ "koi8_r",
+ "koi8_u",
+ "latin_1",
+ "mac_cyrillic",
+ "mac_greek",
+ "mac_iceland",
+ "mac_latin2",
+ "mac_roman",
+ "mac_turkish",
+ "palmos",
+ "ptcp154",
+ "punycode",
+ "raw_unicode_escape",
+ "rot_13",
+# FIXME: Jython issue 1066: 'shift_jis',
+# FIXME: Jython issue 1066: 'shift_jis_2004',
+# FIXME: Jython issue 1066: 'shift_jisx0213',
+ "tis_620",
+ "unicode_escape",
+ "unicode_internal",
+ "utf_16",
+ "utf_16_be",
+ "utf_16_le",
+ "utf_7",
+ "utf_8",
+]
+
+if hasattr(codecs, "mbcs_encode"):
+ all_unicode_encodings.append("mbcs")
+
+# The following encodings work only with str, not unicode
+all_string_encodings = [
+ "quopri_codec",
+ "string_escape",
+ "uu_codec",
+]
+
+# The following encoding is not tested, because it's not supposed
+# to work:
+# "undefined"
+
+# The following encodings don't work in stateful mode
+broken_unicode_with_streams = [
+ "base64_codec",
+ "hex_codec",
+ "punycode",
+ "unicode_internal"
+]
+broken_incremental_coders = broken_unicode_with_streams[:]
+
+# The following encodings only support "strict" mode
+only_strict_mode = [
+ "idna",
+ "zlib_codec",
+ "bz2_codec",
+]
+
+try:
+ import bz2
+except ImportError:
+ pass
+else:
+ all_unicode_encodings.append("bz2_codec")
+ broken_unicode_with_streams.append("bz2_codec")
+
+try:
+ import zlib
+except ImportError:
+ pass
+else:
+ all_unicode_encodings.append("zlib_codec")
+ broken_unicode_with_streams.append("zlib_codec")
+
+class BasicUnicodeTest(unittest.TestCase):
+
+ @unittest.skipIf(test_support.is_jython, "_testcapi module not present in Jython")
+ def test_basics(self):
+ s = u"abc123" # all codecs should be able to encode these
+ for encoding in all_unicode_encodings:
+ name = codecs.lookup(encoding).name
+ if encoding.endswith("_codec"):
+ name += "_codec"
+ elif encoding == "latin_1":
+ name = "latin_1"
+ self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
+ (bytes, size) = codecs.getencoder(encoding)(s)
+ self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
+ (chars, size) = codecs.getdecoder(encoding)(bytes)
+ self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
+
+ if encoding not in broken_unicode_with_streams:
+ # check stream reader/writer
+ q = Queue()
+ writer = codecs.getwriter(encoding)(q)
+ encodedresult = ""
+ for c in s:
+ writer.write(c)
+ encodedresult += q.read()
+ q = Queue()
+ reader = codecs.getreader(encoding)(q)
+ decodedresult = u""
+ for c in encodedresult:
+ q.write(c)
+ decodedresult += reader.read()
+ self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
+
+ if encoding not in broken_incremental_coders:
+ # check incremental decoder/encoder (fetched via the Python
+ # and C API) and iterencode()/iterdecode()
+ try:
+ encoder = codecs.getincrementalencoder(encoding)()
+ cencoder = _testcapi.codec_incrementalencoder(encoding)
+ except LookupError: # no IncrementalEncoder
+ pass
+ else:
+ # check incremental decoder/encoder
+ encodedresult = ""
+ for c in s:
+ encodedresult += encoder.encode(c)
+ encodedresult += encoder.encode(u"", True)
+ decoder = codecs.getincrementaldecoder(encoding)()
+ decodedresult = u""
+ for c in encodedresult:
+ decodedresult += decoder.decode(c)
+ decodedresult += decoder.decode("", True)
+ self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
+
+ # check C API
+ encodedresult = ""
+ for c in s:
+ encodedresult += cencoder.encode(c)
+ encodedresult += cencoder.encode(u"", True)
+ cdecoder = _testcapi.codec_incrementaldecoder(encoding)
+ decodedresult = u""
+ for c in encodedresult:
+ decodedresult += cdecoder.decode(c)
+ decodedresult += cdecoder.decode("", True)
+ self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
+
+ # check iterencode()/iterdecode()
+ result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
+ self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))
+
+ # check iterencode()/iterdecode() with empty string
+ result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
+ self.assertEqual(result, u"")
+
+ if encoding not in only_strict_mode:
+ # check incremental decoder/encoder with errors argument
+ try:
+ encoder = codecs.getincrementalencoder(encoding)("ignore")
+ cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
+ except LookupError: # no IncrementalEncoder
+ pass
+ else:
+ encodedresult = "".join(encoder.encode(c) for c in s)
+ decoder = codecs.getincrementaldecoder(encoding)("ignore")
+ decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
+ self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
+
+ encodedresult = "".join(cencoder.encode(c) for c in s)
+ cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
+ decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
+ self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
+
+ def test_seek(self):
+ # all codecs should be able to encode these
+ s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
+ for encoding in all_unicode_encodings:
+ if encoding == "idna": # FIXME: See SF bug #1163178
+ continue
+ if encoding in broken_unicode_with_streams:
+ continue
+ reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
+ for t in xrange(5):
+ # Test that calling seek resets the internal codec state and buffers
+ reader.seek(0, 0)
+ line = reader.readline()
+ self.assertEqual(s[:len(line)], line)
+
+ def test_bad_decode_args(self):
+ for encoding in all_unicode_encodings:
+ decoder = codecs.getdecoder(encoding)
+ self.assertRaises(TypeError, decoder)
+ if encoding not in ("idna", "punycode"):
+ self.assertRaises(TypeError, decoder, 42)
+
+ def test_bad_encode_args(self):
+ for encoding in all_unicode_encodings:
+ encoder = codecs.getencoder(encoding)
+ self.assertRaises(TypeError, encoder)
+
+ def test_encoding_map_type_initialized(self):
+ from encodings import cp1140
+ # This used to crash, we are only verifying there's no crash.
+ table_type = type(cp1140.encoding_table)
+ self.assertEqual(table_type, table_type)
+
+class BasicStrTest(unittest.TestCase):
+ def test_basics(self):
+ s = "abc123"
+ for encoding in all_string_encodings:
+ (bytes, size) = codecs.getencoder(encoding)(s)
+ self.assertEqual(size, len(s))
+ (chars, size) = codecs.getdecoder(encoding)(bytes)
+ self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
+
+class CharmapTest(unittest.TestCase):
+ def test_decode_with_string_map(self):
+ self.assertEqual(
+ codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
+ (u"abc", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
+ (u"ab\ufffd", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
+ (u"ab\ufffd", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
+ (u"ab", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
+ (u"ab", 3)
+ )
+
+ allbytes = "".join(chr(i) for i in xrange(256))
+ self.assertEqual(
+ codecs.charmap_decode(allbytes, "ignore", u""),
+ (u"", len(allbytes))
+ )
+
+class WithStmtTest(unittest.TestCase):
+ def test_encodedfile(self):
+ f = StringIO.StringIO("\xc3\xbc")
+ with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
+ self.assertEqual(ef.read(), "\xfc")
+
+ def test_streamreaderwriter(self):
+ f = StringIO.StringIO("\xc3\xbc")
+ info = codecs.lookup("utf-8")
+ with codecs.StreamReaderWriter(f, info.streamreader,
+ info.streamwriter, 'strict') as srw:
+ self.assertEqual(srw.read(), u"\xfc")
+
+
+class BomTest(unittest.TestCase):
+ def test_seek0(self):
+ data = u"1234567890"
+ tests = ("utf-16",
+ "utf-16-le",
+ "utf-16-be",
+ # FIXME: Jython does not support:"utf-32",
+ # FIXME: Jython does not support:"utf-32-le",
+ # FIXME: Jython does not support:"utf-32-be",
+ )
+ self.addCleanup(test_support.unlink, test_support.TESTFN)
+ for encoding in tests:
+ # Check if the BOM is written only once
+ with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
+ f.write(data)
+ f.write(data)
+ f.seek(0)
+ self.assertEqual(f.read(), data * 2)
+ f.seek(0)
+ self.assertEqual(f.read(), data * 2)
+
+ # Check that the BOM is written after a seek(0)
+ with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
+ f.write(data[0])
+ self.assertNotEqual(f.tell(), 0)
+ f.seek(0)
+ f.write(data)
+ f.seek(0)
+ self.assertEqual(f.read(), data)
+
+ # (StreamWriter) Check that the BOM is written after a seek(0)
+ with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
+ f.writer.write(data[0])
+ self.assertNotEqual(f.writer.tell(), 0)
+ f.writer.seek(0)
+ f.writer.write(data)
+ f.seek(0)
+ self.assertEqual(f.read(), data)
+
+ # Check that the BOM is not written after a seek() at a position
+ # different than the start
+ with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
+ f.write(data)
+ f.seek(f.tell())
+ f.write(data)
+ f.seek(0)
+ self.assertEqual(f.read(), data * 2)
+
+ # (StreamWriter) Check that the BOM is not written after a seek()
+ # at a position different than the start
+ with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
+ f.writer.write(data)
+ f.writer.seek(f.writer.tell())
+ f.writer.write(data)
+ f.seek(0)
+ self.assertEqual(f.read(), data * 2)
+
+
def test_main():
test_support.run_unittest(
+ UTF32Test,
+ UTF32LETest,
+ UTF32BETest,
UTF16Test,
+ UTF16LETest,
+ UTF16BETest,
+ UTF8Test,
+ UTF8SigTest,
+ UTF7Test,
+ UTF16ExTest,
+ ReadBufferTest,
+ CharBufferTest,
EscapeDecodeTest,
RecodingTest,
PunycodeTest,
-# Jython transition 2.3
-# Missing the stringprep module. http://jython.org/bugs/1758320
-# NameprepTest
+ UnicodeInternalTest,
+ NameprepTest,
+ IDNACodecTest,
+ CodecsModuleTest,
+ StreamReaderTest,
+ EncodedFileTest,
+ Str2StrTest,
+ BasicUnicodeTest,
+ BasicStrTest,
+ CharmapTest,
+ WithStmtTest,
+ BomTest,
)
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -464,7 +464,6 @@
self.assertRaises(TypeError, unicode, 42, 42, 42)
- @unittest.skip("FIXME: broken")
def test_codecs_utf7(self):
utfTests = [
(u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
@@ -484,14 +483,17 @@
for (x, y) in utfTests:
self.assertEqual(x.encode('utf-7'), y)
- # surrogates not supported
+ # Lone/misordered surrogates are an error
self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
- self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
+ # Jython (and some CPython versions): two misplaced surrogates => two replacements
+ self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd\ufffd')
+ # self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
def test_codecs_utf8(self):
self.assertEqual(u''.encode('utf-8'), '')
self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
+ # Jython will not compile Unicode literals with surrogate units
#self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
#self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
#self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80')
@@ -528,6 +530,7 @@
# * strict decoding testing for all of the
# UTF8_ERROR cases in PyUnicode_DecodeUTF8
+ @unittest.skipIf(test_support.is_jython, "IDNA codec missing in Jython (issue 1153)")
def test_codecs_idna(self):
# Test whether trailing dot is preserved
self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")
@@ -587,7 +590,6 @@
# Error handling (PyUnicode_EncodeDecimal())
self.assertRaises(UnicodeError, int, u"\u0200")
- @unittest.skip("FIXME: broken")
def test_codecs(self):
# Encoding
self.assertEqual(u'hello'.encode('ascii'), 'hello')
@@ -714,9 +716,6 @@
self.assertEqual(x, y)
def test_main():
- if test_support.is_jython:
- # http://bugs.jython.org/issue1153
- del UnicodeTest.test_codecs_idna
test_support.run_unittest(UnicodeTest)
if __name__ == "__main__":
diff --git a/src/org/python/core/codecs.java b/src/org/python/core/codecs.java
--- a/src/org/python/core/codecs.java
+++ b/src/org/python/core/codecs.java
@@ -1,21 +1,28 @@
/*
- * Copyright 2000 Finn Bock
+ * Copyright (c)2013 Jython Developers. Original Java version copyright 2000 Finn Bock.
*
- * This program contains material copyrighted by:
- * Copyright (c) Corporation for National Research Initiatives.
- * Originally written by Marc-Andre Lemburg (mal at lemburg.com).
+ * This program contains material copyrighted by: Copyright (c) Corporation for National Research
+ * Initiatives. Originally written by Marc-Andre Lemburg (mal at lemburg.com).
*/
package org.python.core;
import java.nio.charset.Charset;
-
import java.util.ArrayList;
import java.util.Iterator;
import org.python.core.util.StringUtil;
/**
- * Contains the implementation of the builtin codecs.
+ * This class implements the codec registry and utility methods supporting codecs, such as those
+ * providing the standard replacement strategies ("ignore", "backslashreplace", etc.). The _codecs
+ * module relies heavily on apparatus implemented here, and therefore so does the Python
+ * <code>codecs</code> module (in <code>Lib/codecs.py</code>). It corresponds approximately to
+ * CPython's <code>Python/codecs.c</code>.
+ * <p>
+ * The class also contains the inner methods of the standard Unicode codecs, available for
+ * transcoding of text at the Java level. These also are exposed through the <code>_codecs</code>
+ * module. In CPython, the implementation are found in <code>Objects/unicodeobject.c</code>.
+ *
* @since Jython 2.0
*/
public class codecs {
@@ -48,8 +55,8 @@
}
PyObject handler = errorHandlers.__finditem__(handlerName.intern());
if (handler == null) {
- throw new PyException(Py.LookupError,
- "unknown error handler name '" + handlerName + "'");
+ throw new PyException(Py.LookupError, "unknown error handler name '" + handlerName
+ + "'");
}
return handler;
}
@@ -80,7 +87,7 @@
if (searchPath.__len__() == 0) {
throw new PyException(Py.LookupError,
- "no codec search functions registered: can't find encoding '" + encoding + "'");
+ "no codec search functions registered: can't find encoding '" + encoding + "'");
}
for (PyObject func : searchPath.asIterable()) {
@@ -100,6 +107,7 @@
private static String normalizestring(String string) {
return string.toLowerCase().replace(' ', '-');
}
+
private static boolean import_encodings_called;
private static void import_encodings() {
@@ -140,11 +148,11 @@
// If we couldn't find an encoding, see if we have a builtin
if (encoding.equals("utf-8")) {
return wrapDecodeResult(PyUnicode_DecodeUTF8(v.toString(), errors));
- } else if(encoding.equals("utf-7")) {
+ } else if (encoding.equals("utf-7")) {
return wrapDecodeResult(PyUnicode_DecodeUTF7(v.toString(), errors));
- } else if(encoding.equals("latin-1")) {
+ } else if (encoding.equals("latin-1")) {
return wrapDecodeResult(PyUnicode_DecodeLatin1(v.toString(), v.__len__(),
- errors));
+ errors));
}
}
throw ex;
@@ -166,8 +174,7 @@
return new PyUnicode(result, true);
}
- public static String encode(PyString v, String encoding,
- String errors) {
+ public static String encode(PyString v, String encoding, String errors) {
if (encoding == null) {
encoding = getDefaultEncoding();
} else {
@@ -178,8 +185,10 @@
errors = errors.intern();
}
- /* Shortcuts for common default encodings. latin-1 must not use the
- * lookup registry for the encodings module to work correctly */
+ /*
+ * Shortcuts for common default encodings. latin-1 must not use the lookup registry for the
+ * encodings module to work correctly
+ */
if (encoding.equals("latin-1")) {
return PyUnicode_EncodeLatin1(v.toString(), v.__len__(), errors);
} else if (encoding.equals("ascii")) {
@@ -195,7 +204,7 @@
// If we couldn't find an encoding, see if we have a builtin
if (encoding.equals("utf-8")) {
return PyUnicode_EncodeUTF8(v.toString(), errors);
- } else if(encoding.equals("utf-7")) {
+ } else if (encoding.equals("utf-7")) {
return codecs.PyUnicode_EncodeUTF7(v.toString(), false, false, errors);
}
}
@@ -244,9 +253,9 @@
}
private static boolean isUnicodeError(PyObject exc) {
- return Py.isInstance(exc, Py.UnicodeDecodeError) ||
- Py.isInstance(exc, Py.UnicodeEncodeError) ||
- Py.isInstance(exc, Py.UnicodeTranslateError);
+ return Py.isInstance(exc, Py.UnicodeDecodeError)
+ || Py.isInstance(exc, Py.UnicodeEncodeError)
+ || Py.isInstance(exc, Py.UnicodeTranslateError);
}
public static PyObject replace_errors(PyObject[] args, String[] kws) {
@@ -257,12 +266,10 @@
return new PyTuple(new PyUnicode("?"), Py.newInteger(end));
} else if (Py.isInstance(exc, Py.UnicodeDecodeError)) {
int end = exceptions.getEnd(exc, false);
- return new PyTuple(new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER),
- Py.newInteger(end));
+ return new PyTuple(new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER), Py.newInteger(end));
} else if (Py.isInstance(exc, Py.UnicodeTranslateError)) {
int end = exceptions.getEnd(exc, true);
- return new PyTuple(new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER),
- Py.newInteger(end));
+ return new PyTuple(new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER), Py.newInteger(end));
}
throw wrong_exception_type(exc);
}
@@ -273,8 +280,8 @@
if (!Py.isInstance(exc, Py.UnicodeEncodeError)) {
throw wrong_exception_type(exc);
}
- int start = ((PyInteger) exc.__getattr__("start")).getValue();
- int end = ((PyInteger) exc.__getattr__("end")).getValue();
+ int start = ((PyInteger)exc.__getattr__("start")).getValue();
+ int end = ((PyInteger)exc.__getattr__("end")).getValue();
String object = exc.__getattr__("object").toString();
StringBuilder replacement = new StringBuilder();
xmlcharrefreplace_internal(start, end, object, replacement);
@@ -287,7 +294,8 @@
return replacement;
}
- private static void xmlcharrefreplace_internal(int start, int end, String object, StringBuilder replacement) {
+ private static void xmlcharrefreplace_internal(int start, int end, String object,
+ StringBuilder replacement) {
for (int i = start; i < end; i++) {
replacement.append("&#");
char cur = object.charAt(i);
@@ -316,7 +324,7 @@
base = 1000000;
}
while (digits-- > 0) {
- replacement.append((char) ('0' + cur / base));
+ replacement.append((char)('0' + cur / base));
cur %= base;
base /= 10;
}
@@ -327,12 +335,14 @@
private static PyException wrong_exception_type(PyObject exc) {
PyObject excClass = exc.__getattr__("__class__");
PyObject className = excClass.__getattr__("__name__");
- return new PyException(Py.TypeError, "Don't know how to handle " + className + " in error callback");
+ return new PyException(Py.TypeError, "Don't know how to handle " + className
+ + " in error callback");
}
- static char hexdigits[] = {
+
+ static char hexdigits[] = {//@formatter:off
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
- };
+ }; //@formatter:on
public static PyObject backslashreplace_errors(PyObject[] args, String[] kws) {
ArgParser ap = new ArgParser("backslashreplace_errors", args, kws, "exc");
@@ -340,8 +350,8 @@
if (!Py.isInstance(exc, Py.UnicodeEncodeError)) {
throw wrong_exception_type(exc);
}
- int start = ((PyInteger) exc.__getattr__("start")).getValue();
- int end = ((PyInteger) exc.__getattr__("end")).getValue();
+ int start = ((PyInteger)exc.__getattr__("start")).getValue();
+ int end = ((PyInteger)exc.__getattr__("end")).getValue();
String object = exc.__getattr__("object").toString();
StringBuilder replacement = new StringBuilder();
backslashreplace_internal(start, end, object, replacement);
@@ -354,8 +364,10 @@
return replacement;
}
- private static void backslashreplace_internal(int start, int end, String object, StringBuilder replacement) {
- for (Iterator<Integer> iter = new StringSubsequenceIterator(object, start, end, 1); iter.hasNext();) {
+ private static void backslashreplace_internal(int start, int end, String object,
+ StringBuilder replacement) {
+ for (Iterator<Integer> iter = new StringSubsequenceIterator(object, start, end, 1); iter
+ .hasNext();) {
int c = iter.next();
replacement.append('\\');
if (c >= 0x00010000) {
@@ -386,284 +398,652 @@
searchPath = new PyList();
searchCache = new PyStringMap();
errorHandlers = new PyStringMap();
- String[] builtinErrorHandlers = new String[]{"strict",
- IGNORE,
- REPLACE,
- XMLCHARREFREPLACE,
- BACKSLASHREPLACE
- };
+ String[] builtinErrorHandlers =
+ new String[] {"strict", IGNORE, REPLACE, XMLCHARREFREPLACE, BACKSLASHREPLACE};
for (String builtinErrorHandler : builtinErrorHandlers) {
- register_error(builtinErrorHandler, Py.newJavaFunc(codecs.class,
- builtinErrorHandler + "_errors"));
+ register_error(builtinErrorHandler,
+ Py.newJavaFunc(codecs.class, builtinErrorHandler + "_errors"));
}
import_encodings();
}
}
+
/* --- UTF-7 Codec -------------------------------------------------------- */
- /* see RFC2152 for details */
- public static char utf7_special[] = {
- /*
- * indicate whether a UTF-7 character is special i.e. cannot be directly
- * encoded: 0 - not special 1 - special 2 - whitespace (optional) 3 -
- * RFC2152 Set O (optional)
- */
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
- 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
- 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1
- ,
+ /*
+ * This codec was converted to Java from the CPython v2.7.3 final. See RFC2152 for details of
+ * the encoding scheme. We encode conservatively and decode liberally.
+ */
+ /* //@formatter:off
+ * The UTF-7 encoder treats ASCII characters differently according to whether they are Set D,
+ * Set O, Whitespace, or special (i.e. none of the above). See RFC2152. This array identifies
+ * these different sets:
+ * 0 : "Set D"
+ * alphanumeric and '(),-./:?
+ * 1 : "Set O"
+ * !"#$%&*;<=>@[]^_`{|}
+ * 2 : "whitespace"
+ * ht nl cr sp
+ * 3 : special (must be base64 encoded)
+ * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
+ */
+ private static final byte[] utf7_category = {
+ /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
+ /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ /* sp ! " # $ % & ' ( ) * + , - . / */
+ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
+ /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+ /* @ A B C D E F G H I J K L M N O */
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* P Q R S T U V W X Y Z [ \ ] ^ _ */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
+ /* ` a b c d e f g h i j k l m n o */
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* p q r s t u v w x y z { | } ~ del */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
+ };//@formatter:on
- };
+ /**
+ * Determine whether this character should be encoded as itself. The answer depends on whether
+ * we are encoding set O (optional special characters) as itself, and also on whether we are
+ * encoding whitespace as itself. RFC2152 makes it clear that the answers to these questions
+ * vary between applications, so this code needs to be flexible.
+ *
+ * @param c code point of the character
+ * @param directO true if characters in "set O" may be encoded as themselves
+ * @param directWS true if whitespace characters may be encoded as themselves
+ * @return
+ */
+ private static boolean ENCODE_DIRECT(int c, boolean directO, boolean directWS) {
- private static boolean SPECIAL(char c, boolean encodeO, boolean encodeWS){
- return (c>127 || utf7_special[(c)] == 1) ||
- (encodeWS && (utf7_special[(c)] == 2)) ||
- (encodeO && (utf7_special[(c)] == 3));
+ if (c >= 128 || c < 0) {
+ return false; // Character not in table is always special
+ } else {
+ switch (utf7_category[c]) {
+ case 0: // This is a regular character
+ return true;
+ case 1: // This is a whitespace character
+ return directWS;
+ case 2: // This is an optional special character
+ return directO;
+ default: // This is always a special character (including '+')
+ return false;
+ }
+ }
}
- private static final String B64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
- private static char B64(int n) {
+ /** Look-up for the Base64 encoded byte [0..0x3f] */
+ private static final String B64_CHARS =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+ /** What is the Base64 encoded byte for (the bottom 6 bits of) n? */
+ private static char TO_BASE64(int n) {
return B64_CHARS.charAt(n & 0x3f);
}
- private static boolean B64CHAR(char c) {
- return B64_CHARS.indexOf(c) != -1;
+ /**
+ * Is c the code point of a Base64 character? And if so, what is the 6-bit quantity to be
+ * decoded from c? Return the 6-bit equivalent of c in a Base64 segment, -1 if it cannot be used
+ * in a Base64 segment, and -2 for the special case of '-' ending the segment.
+ */
+ private static int FROM_BASE64(int c) {
+ return (c >= 128) ? -1 : BASE64_VALUE[c];
}
- private static int UB64(char c) {
- return ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4);
+ /**
+ * Look-up table to convert ASCII byte to 6-bit Base64 value, -1 if not Base64, and -2 if
+ * special terminator '-'.
+ */
+ private static final byte[] BASE64_VALUE = {//@formatter:off
+ // nul soh stx etx eot enq ack bel bs ht nl vt np cr so si
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ // dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ // sp ! " # $ % & ' ( ) * + , - . /
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
+ // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
+ // @ A B C D E F G H I J K L M N O
+ -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ // P Q R S T U V W X Y Z [ \ ] ^ _
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
+ // ` a b c d e f g h i j k l m n o
+ -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ // p q r s t u v w x y z { | } ~ del
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
+ };//@formatter:on
+
+ /**
+ * Enumeration of the error causes during decoding of the Base64 segment of UTF-7
+ */
+ static enum UTF7Error {
+ NONE("No error"), // No error
+ PADDING("non-zero padding bits in shift sequence"), // Error when at end
+ PARTIAL("partial character in shift sequence"), // Error when at end
+ TRUNCATED("second surrogate missing at end of shift sequence"), // Error when at end
+ MISSING("second surrogate missing"), // Lead surrogate followed by another, or BMP
+ TRAIL("unexpected second surrogate"); // Trail surrogate not preceded by lead
+
+ /** Suitable error message */
+ final String msg;
+
+ private UTF7Error(String msg) {
+ this.msg = msg;
+ }
}
- // note that we follow CPython 2.5 exactly here - it does not support surrogates,
- // but has to process as-if they are there for replacement purposes
- // fortunately no one really cares about utf-7
- public static String PyUnicode_DecodeUTF7(String str, String errors) {
- int s = 0;
- int e = str.length();
- boolean inShift = false;
- int bitsInCharsleft = 0;
- long charsleft = 0;
- boolean surrogate = false;
- StringBuilder unicode = new StringBuilder(e);
- while (s < e) {
- // restart:
- char ch = str.charAt(s);
- if (inShift) {
- if ((ch == '-') || !B64CHAR(ch)) {
- inShift = false;
- s++;
- while (bitsInCharsleft >= 16) {
- bitsInCharsleft -= 16;
- char outCh = (char) ((charsleft >> bitsInCharsleft) & 0xffff);
- if (surrogate) {
- s = codecs.insertReplacementAndGetResume(unicode,
- errors,
- "utf-7",
- str,
- s,
- s + 1,
- "code pairs are not supported");
- surrogate = false;
- } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {
- surrogate = true;
- } else {
- unicode.append(outCh);
+ /**
+ * Decode (perhaps partially) a sequence of bytes representing the UTF-7 encoded form of a
+ * Unicode string and return the (Jython internal representation of) the unicode object, and
+ * amount of input consumed. The only state we preserve is our read position, i.e. how many
+ * characters we have consumed. So if the input ends part way through a Base64 sequence the data
+ * reported as consumed is only that up to and not including the Base64 start marker ('+').
+ * Performance will be poor (quadratic cost) on runs of Base64 data long enough to exceed the
+ * input quantum in incremental decoding. The returned Java String is a UTF-16 representation of
+ * the Unicode result, in line with Java conventions. Unicode characters above the BMP are
+ * represented as surrogate pairs.
+ *
+ * @param bytes input represented as String (Jython PyString convention)
+ * @param errors error policy name (e.g. "ignore", "replace")
+ * @param consumed returns number of bytes consumed in element 0, or is null if a "final" call
+ * @return unicode result (as UTF-16 Java String)
+ */
+ public static String PyUnicode_DecodeUTF7Stateful(String bytes, String errors, int[] consumed) {
+ int s; // Index in the input bytes
+ boolean inBase64 = false; // Whether s is currently in a Base64 segment
+ long base64buffer = 0; // Stored bits buffer during Base64 decoding
+ int base64bits = 0; // Number of valid bits buffered during Base64 decoding
+ int startInBytes = 0; // Place in input bytes where most recent Base64 segment begins
+ int syncInBytes = 0; // Place in input bytes where stored bits buffer last empty
+ int startInUnicode = 0; // Place in output unicode where most recent Base64 segment begins
+
+ int size = bytes.length();
+ StringBuilder unicode = new StringBuilder(size);
+
+ for (s = 0; s < size; s++) { // In error cases s may skip forwards in bytes
+
+ // Next byte to process
+ int b = bytes.charAt(s);
+
+ if (b >= 128) {
+ // The input was supposed to be 7-bit clean
+ s = insertReplacementAndGetResume(unicode, errors, "utf-7", //
+ bytes, s, s + 1, "unexpected special character") - 1;
+
+ } else if (inBase64) {
+ // We are currently processing a Base64 section
+
+ if (base64bits == 0) {
+ // Mark this point as latest easy error recovery point (bits buffer empty)
+ syncInBytes = s;
+ }
+
+ int sixBits = FROM_BASE64(b); // returns -ve if not Base64
+ if (sixBits >= 0) {
+ // And we continue processing a Base64 section
+ base64buffer = (base64buffer << 6) | sixBits;
+ base64bits += 6;
+
+ if (base64bits >= 32) {
+ // We have enough bits for a code point
+ base64bits = emitCodePoints(unicode, base64buffer, base64bits);
+
+ if (base64bits >= 32) {
+ // We stopped prematurely. Why?
+ UTF7Error error = emitCodePointsDiagnosis(base64buffer, base64bits);
+ // Difficult to know exactly what input characters to blame
+ s = insertReplacementAndGetResume(unicode, errors, "utf-7", //
+ bytes, syncInBytes, s + 1, error.msg) - 1;
+ // Discard one UTF-16 output and hope for the best
+ base64bits -= 16;
}
+
}
- if (bitsInCharsleft >= 6) {
+
+ } else {
+ // We are now leaving a Base64 section
+ inBase64 = false;
+
+ // We should have a whole number of code points and < 6 bits zero padding
+ if (base64bits > 0) {
+ // Try to emit them all
+ base64bits = emitCodePoints(unicode, base64buffer, base64bits);
+ // Now check for errors
+ UTF7Error error = emitCodePointsDiagnosis(base64buffer, base64bits);
+ if (error != UTF7Error.NONE) {
+ // Difficult to know exactly what input characters to blame
+ s = insertReplacementAndGetResume(unicode, errors, "utf-7", //
+ bytes, s, s + 1, error.msg) - 1;
+ }
+ // We are, in any case, discarding whatever is in the buffer
+ base64bits = 0;
+ }
+
+ if (b == '-') {
/*
- * The shift sequence has a partial character in it. If
- * bitsleft < 6 then we could just classify it as
- * padding but that is not the case here
* '-' signals the end of Base64. The byte is simply absorbed, but in the
+ * special case where it is the first byte of the Base64 segment, the
+ * zero-length segment '+-' actually encodes "+".
*/
- s = insertReplacementAndGetResume(unicode,
- errors,
- "utf-7",
- str,
- s,
- s + 1,
- "partial character in shift sequence");
- }
- /*
- * According to RFC2152 the remaining bits should be zero.
- * We choose to signal an error/insert a replacement
- * character here so indicate the potential of a misencoded
- * character.
- */
- if (bitsInCharsleft > 0 && ((charsleft << 5 - bitsInCharsleft) & 0x1f) > 0) {
- s = insertReplacementAndGetResume(unicode,
- errors,
- "utf-7",
- str,
- s,
- s + 1,
- "non-zero padding bits in shift sequence");
- }
- if (ch == '-') {
- if ((s < e) && (str.charAt(s) == '-')) {
- unicode.append('-');
- inShift = true;
+ if (s == startInBytes + 1) {
+ unicode.append('+');
}
- } else if (SPECIAL(ch, false, false)) {
- s = insertReplacementAndGetResume(unicode,
- errors,
- "utf-7",
- str,
- s,
- s + 1,
- "unexpected special character");
} else {
- unicode.append(ch);
- }
- } else {
- charsleft = (charsleft << 6) | UB64(ch);
- bitsInCharsleft += 6;
- s++;
- while (bitsInCharsleft >= 16) {
- bitsInCharsleft -= 16;
- char outCh = (char) ((charsleft >> bitsInCharsleft) & 0xffff);
- if (surrogate) {
- s = codecs.insertReplacementAndGetResume(unicode,
- errors,
- "utf-7",
- str,
- s,
- s + 1,
- "code pairs are not supported");
- } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {
- surrogate = true;
- } else {
- unicode.append(outCh);
- }
+ /*
+ * This b is a US-ASCII byte for some character.
+ */
+ unicode.appendCodePoint(b);
}
}
- } else if (ch == '+') {
- s++;
- if (s < e && str.charAt(s) == '-') {
- s++;
- unicode.append('+');
- } else {
- inShift = true;
- bitsInCharsleft = 0;
- }
- } else if (SPECIAL(ch, false, false)) {
- s = insertReplacementAndGetResume(unicode,
- errors,
- "utf-7",
- str,
- s,
- s + 1,
- "unexpected special character");
+
+ } else if (b == '+') {
+ /*
+ * We are not currently processing a Base64 section, but this starts one. Remember
+ * where it starts, in the input bytes and the output unicode so that, if we hit the
+ * end of input before it ends, we can leave it unprocessed for next time.
+ */
+ startInBytes = s;
+ startInUnicode = unicode.length();
+
+ // Initialise the Base64 decoder
+ base64bits = 0;
+ inBase64 = true;
+
} else {
- unicode.append(ch);
- s++;
- }
- if (inShift && s == e) {
- s = insertReplacementAndGetResume(unicode,
- errors,
- "utf-7",
- str,
- s,
- s,
- "unterminated shift sequence");
+ /*
+ * This b is a US-ASCII byte for some character. We are permissive on decoding; the
+ * only ASCII byte not decoding to itself is the + which begins a base64 string.
+ */
+ unicode.appendCodePoint(b);
}
}
+
+ /*
+ * We hit the end of the input. If we were part way through some Base64 processing, since we
+ * don't store all that state (inBase64, base64bits, base64buffer) the strategy is to back
+ * up the input pointer to the '-' that started the current Base64 segment.
+ */
+ if (inBase64) {
+ // Restore state to beginning of last Base64 sequence
+ s = startInBytes;
+ unicode.setLength(startInUnicode);
+ }
+
+ if (consumed != null) {
+ // Not a final call, so report how much consumed in the consumed argument
+ consumed[0] = s;
+ } else if (s < size) {
+ // This was final but we didn't exhaust the input: that's an error.
+ s = insertReplacementAndGetResume(unicode, errors, "utf-7", //
+ bytes, startInBytes, size, "unterminated shift sequence");
+ }
+
return unicode.toString();
}
- public static String PyUnicode_EncodeUTF7(String str,
- boolean encodeSetO,
- boolean encodeWhiteSpace,
- String errors) {
- int size = str.length();
+ /**
+ * Decode completely a sequence of bytes representing the UTF-7 encoded form of a Unicode string
+ * and return the (Jython internal representation of) the unicode object. The returned Java
+ * String is a UTF-16 representation of the Unicode result, in line with Java conventions.
+ * Unicode characters above the BMP are represented as surrogate pairs.
+ *
+ * @param bytes input represented as String (Jython PyString convention)
+ * @param errors error policy name (e.g. "ignore", "replace")
+ * @return unicode result (as UTF-16 Java String)
+ */
+ public static String PyUnicode_DecodeUTF7(String bytes, String errors) {
+ return PyUnicode_DecodeUTF7Stateful(bytes, errors, null);
+ }
- if (size == 0) {
- return "";
+ /**
+ * Helper for {@link #PyUnicode_DecodeUTF7Stateful(String, String, int[])} to emit characters
+ * that accumulated as UTF-16 code units in the bits of a long integer (from Base64 decoding,
+ * say). The buffer variable may hold any number of bits (up to its 64-bit capacity). The number
+ * of valid bits is given by argument <code>n</code> and they are the <code>n</code> least
+ * significant of the buffer.
+ * <p>
+ * Only complete Unicode characters are emitted, which are obtained by consuming 16 bits (when
+ * those bits identify a BMP character), or 32 bits (when those bits form a surrogate pair).
+ * Consumed bits are not cleared from the buffer (it is passed by value), and there is no need
+ * for the client to clear them, but the method returns the new number of valid bits n1, which
+ * are in the least significant positions (that is, bits <code>n1-1</code> to <code>0</code>).
+ *
+ * If the method returns with 32 or more bits unconsumed, it has encountered an invalid sequence
+ * of bits: the leading bits will then either be an "unaccompanied" trail surrogate, or a lead
+ * surrogate not followed by a trail surrogate.
+ *
+ * @param v output UTF-16 sequence
+ * @param buffer holding the bits
+ * @param n the number of bits held (<=64)
+ * @return the number of bits not emitted (<32 unless error)
+ */
+ private static int emitCodePoints(StringBuilder v, long buffer, int n) {
+
+ // Emit code points until too few in the buffer to process.
+ while (n >= 16) {
+
+ /*
+ * Get the top 16 bits of the buffer to bottom of an int. Note no 0xffff mask as bits to
+ * left of bit-15 are harmless
+ */
+ int unit = (int)(buffer >>> (n - 16));
+ boolean unitIsSurrogate = ((unit & 0xF800) == 0xD800);
+
+ if (!unitIsSurrogate) {
+ // This (or rather its bottom 16 bits) is a BMP codepoint: easy
+ v.append((char)unit);
+ n -= 16;
+
+ } else if (n >= 32) {
+ // This a surrogate unit and we have enough bits for the whole code point.
+ if ((unit & 0x0400) == 0) {
+ // This is a lead surrogate as expected ... get the trail surrogate.
+ int unit2 = (int)(buffer >>> (n - 32));
+ if ((unit2 & 0xFC00) == 0xD800) {
+ // And this is the trail surrogate we expected
+ v.appendCodePoint(0x10000 + ((unit & 0x3ff) << 10) + (unit2 & 0x3ff));
+ n -= 32;
+ } else {
+ // But this isn't a trail surrogate: jam at >=32
+ return n;
+ }
+ } else {
+ // This is an unaccompanied trail surrogate: jam at >=32
+ return n;
+ }
+
+ } else {
+ // This a non-BMP code point but we don't have enough bits to deal with it yet
+ return n;
+ }
+
}
- boolean inShift = false;
- int bitsleft = 0;
- int charsleft = 0;
- StringBuilder v = new StringBuilder();
+ return n;
+ }
- for (int i = 0; i < size; ++i) {
- char ch = str.charAt(i);
+ /**
+ * Helper for {@link #PyUnicode_DecodeUTF7Stateful(String, String, int[])} to diagnose what went
+ * wrong in {@link #emitCodePoints(StringBuilder, long, int)}. When called with fewer than 32
+ * bits in the buffer, it assumes we are in the run-down of processing at the end of the
+ * decoder, where partial output characters are an error. For 32 bits or more, it duplicates
+ * some logic, but is called only during abnormal processing. The return is:
+ * <table>
+ * <tr>
+ * <td>NONE</td>
+ * <td>No error</td>
+ * </tr>
+ * <tr>
+ * <td>PADDING</td>
+ * <td>non-zero padding bits in shift sequence</td>
+ * <td>(error if at end of shift sequence)</td>
+ * </tr>
+ * <tr>
+ * <td>PARTIAL</td>
+ * <td>partial character in shift sequence</td>
+ * <td>(error if at end of shift sequence)</td>
+ * </tr>
+ * <tr>
+ * <td>TRUNCATED</td>
+ * <td>second surrogate missing at end of shift sequence</td>
+ * </tr>
+ * <tr>
+ * <td>MISSING</td>
+ * <td>second surrogate missing</td>
+ * </tr>
+ * <tr>
+ * <td>TRAIL</td>
+ * <td>unexpected second surrogate</td>
+ * </tr>
+ * </table>
+ * <p>
+ * We are compatible with CPython in using the term "second surrogate" in error messages rather
+ * than "trail surrogate" (which is used in the code).
+ * <p>
+ * Note that CPython (see Issue13333) allows this codec to decode lone surrogates into the
+ * internal data of unicode objects. It is difficult to reconcile this with the v3.3
+ * statement that "Strings contain Unicode characters", but that reconciliation is probably to
+ * be found in PEP383, not implemented in Jython.
+ *
+ * @param buffer holding the bits
+ * @param n the number of bits held (<=64)
+ * @return the diagnosis
+ */
+ private static UTF7Error emitCodePointsDiagnosis(long buffer, int n) {
- if (!inShift) {
- if (ch == '+') {
- v.append('+');
- v.append('-');
- } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
- charsleft = ch;
- bitsleft = 16;
- v.append('+');
- while (bitsleft >= 6) {
- v.append(B64(charsleft >> (bitsleft - 6)));
- bitsleft -= 6;
+ if (n >= 16) {
+ /*
+ * Get the top 16 bits of the buffer to bottom of an int. Note no 0xffff mask as bits to
+ * left of bit-15 are harmless
+ */
+ int unit = (int)(buffer >>> (n - 16));
+ boolean unitIsSurrogate = ((unit & 0xF800) == 0xD800);
+
+ if (!unitIsSurrogate) {
+ // No problem. In practice, we should never land here.
+ return UTF7Error.NONE;
+
+ } else if (n >= 32) {
+
+ if ((unit & 0x0400) == 0) {
+ // This is a lead surrogate, which is valid: check the next 16 bits.
+ int unit2 = ((int)(buffer >>> (n - 32))) & 0xffff;
+ if ((unit2 & 0xFC00) == 0xD800) {
+ // Not trail surrogate: that's the problem
+ return UTF7Error.MISSING;
+ } else {
+ // Hmm ... why was I called?
+ return UTF7Error.NONE;
}
- inShift = bitsleft > 0;
+
} else {
- v.append(ch);
+ // This is an unexpected trail surrogate
+ return UTF7Error.TRAIL;
}
+
} else {
- if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
- v.append(B64(charsleft << (6 - bitsleft)));
- charsleft = 0;
- bitsleft = 0;
- /* Characters not in the BASE64 set implicitly unshift the sequence
- so no '-' is required, except if the character is itself a '-' */
- if (B64CHAR(ch) || ch == '-') {
+ // Note that 32 > n >= 16, so we are at the end of decoding
+
+ if ((unit & 0x0400) == 0) {
+ /*
+ * This is a lead surrogate, but since decoding stopped we must have reached the
+ * end of a Base64 segment without the trail surrogate appearing.
+ */
+ return UTF7Error.TRUNCATED;
+
+ } else {
+ // This is an unexpected trail surrogate
+ return UTF7Error.TRAIL;
+ }
+ }
+
+ } else if (n >= 6) {
+ // Fewer than 16 bits: at end of decoding with Base64 characters left over
+ return UTF7Error.PARTIAL;
+
+ } else {
+ // Fewer than 6 bits, which should all be zero. Make a mask to extract them.
+ int validBits = (1 << n) - 1;
+ int padding = ((int)buffer) & validBits;
+ if (padding != 0) {
+ // At end of decoding with non-zero padding
+ return UTF7Error.PADDING;
+ } else {
+ // Any bits left are zero: that's ok then.
+ return UTF7Error.NONE;
+ }
+ }
+ }
+
+ /**
+ * Encode a UTF-16 Java String as UTF-7 bytes represented by the low bytes of the characters in
+ * a String. (String representation for byte data is chosen so that it may immediately become a
+ * PyString.)
+ *
+ * This method differs from the CPython equivalent (in <code>Object/unicodeobject.c</code>)
+ * which works with an array of point codes that are, in a wide build, Unicode code points.
+ *
+ * @param unicode
+ * @param base64SetO
+ * @param base64WhiteSpace
+ * @param errors
+ * @return
+ */
+ public static String PyUnicode_EncodeUTF7(String unicode, boolean base64SetO,
+ boolean base64WhiteSpace, String errors) {
+ boolean inBase64 = false;
+ int base64bits = 0;
+ long base64buffer = 0;
+
+ int size = unicode.length();
+
+ // Output bytes here: sized for ASCII + a few non-BMP characters
+ // We use a StringBuilder and return a String, but we are really storing encoded bytes
+ StringBuilder v = new StringBuilder(size + size / 8 + 10);
+
+ for (int i = 0; i < size; i++) {
+
+ // Next UTF-16 code unit to process
+ int ch = unicode.charAt(i);
+
+ /*
+ * Decide what to output and prepare for it. Mainly, decide whether to represent this
+ * UTF-16 code unit in Base64 or US-ASCII, and switch modes, with output, accordingly.
+ */
+ if (inBase64) {
+ // Currently we are in Base64 encoding: should we switch out?
+ if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
+ /*
+ * The next character is one for which we do not need to be in Base64, so pad
+ * out to 6n the Base64 bits we currently have buffered and emit them. Then
+ * switch to US-ASCII.
+ */
+ emitBase64Padded(v, base64buffer, base64bits);
+ inBase64 = false;
+
+ if (FROM_BASE64(ch) != -1) {
+ // Character is in the Base64 set, or is a '-': must signal end explicitly.
v.append('-');
}
- inShift = false;
- v.append(ch);
- } else {
- bitsleft += 16;
- charsleft = (charsleft << 16) | ch;
- while (bitsleft >= 6) {
- v.append(B64(charsleft >> (bitsleft - 6)));
- bitsleft -= 6;
- }
- /* If the next character is special then we dont' need to terminate
- the shift sequence. If the next character is not a BASE64 character
- or '-' then the shift sequence will be terminated implicitly and we
- don't have to insert a '-'. */
+ }
- if (bitsleft == 0) {
- if (i + 1 < size) {
- char ch2 = str.charAt(i + 1);
-
- if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
-
- } else if (B64CHAR(ch2) || ch2 == '-') {
- v.append('-');
- inShift = false;
- } else {
- inShift = false;
- }
-
- } else {
- v.append('-');
- inShift = false;
- }
- }
+ } else {
+ // Not currently in Base64 encoding: should we switch in?
+ if (ch == '+') {
+ // Special case for + since it would otherwise flag a start
+ v.append('+');
+ ch = '-'; // Comes out as +-
+ } else if (!ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
+ /*
+ * The next character is one for which we need to be in Base64, so switch to it
+ * and emit the Base64 start marker and initialise the coder.
+ */
+ v.append('+');
+ inBase64 = true;
+ base64bits = 0;
}
}
+
+ /*
+ * We have decided what to do (US-ASCII or Base64) but we haven't done it yet.
+ */
+ if (!inBase64) {
+ // We decided to encode the current character as US-ASCII and are in that mode
+ v.append((char)ch);
+
+ } else {
+ // We decided to encode the current character as Base64 and are in that mode
+ /*
+ * In the present implementation the characters are supplied as a UTF-16 Java
+ * String. The UTF-7 approach to characters beyond the BMP is to encode the
+ * surrogate pair as two 16-bit pseudo-characters, which is how Jython represents it
+ * already, so the first part is already done for us by accessing the internal
+ * representation.
+ */
+ // XXX see issue #2002: we should only count surrogate pairs as one character
+ // if ((ch & 0xFC00)==0xC800) { count++; }
+
+ if (base64bits > 48) {
+ // No room for the next 16 bits: emit all we have
+ base64bits = emitBase64(v, base64buffer, base64bits);
+ }
+ base64bits += 16;
+ base64buffer = (base64buffer << 16) + ch;
+ }
}
- if (bitsleft > 0) {
- v.append(B64(charsleft << (6 - bitsleft)));
+
+ /*
+ * We've run out of input to encode. If we are currently in US-ASCII mode, we can just stop.
+ * If we are in Base64 mode, we have to come to a clean stop, since there is no opportunity
+ * to store this fact as state for next time (and there may be no next time).
+ */
+ if (inBase64) {
+ /*
+ * Currently we are in Base64 encoding and must switch out. Pad out to 6n the bits we
+ * currently have buffered and emit them. We don't know what might come next so emit a
+ * '-' to round out the segment.
+ */
+ emitBase64Padded(v, base64buffer, base64bits);
v.append('-');
}
+
return v.toString();
}
+
+ /**
+ * Helper for {@link #PyUnicode_EncodeUTF7(String, boolean, boolean, String)} to emit 6-bit
+ * Base64 code units as bytes to the output. The buffer variable may hold any number of bits (up
+ * to its 64-bit capacity). The number of valid bits is given by argument <code>n</code> and
+ * they are the <code>n</code> least significant of the buffer. Bits will be emitted in groups
+ * of 6, represented by their Base64 character, starting with the 6 most-significant valid bits
+ * of the buffer (that is, bits <code>n-6</code> to <code>n-1</code>). The buffer is not cleared
+ * (it is passed by value), but the method returns the new number of valid bits n1, which are in
+ * the least significant positions (that is, bits <code>n1-1</code> to <code>0</code>).
+ *
+ * @param v output byte array
+ * @param buffer holding the bits
+ * @param n the number of bits held (<=64)
+ * @return the number of bits (<6) not emitted
+ */
+ private static int emitBase64(StringBuilder v, long buffer, int n) {
+ while (n >= 6) {
+ n -= 6;
+ long sixBits = buffer >>> n;
+ char b64byte = TO_BASE64((int)sixBits);
+ v.append(b64byte);
+ }
+ return n;
+ }
+
+ /**
+ * Helper for {@link #PyUnicode_EncodeUTF7(String, boolean, boolean, String)} to emit 6-bit
+ * Base64 code units as bytes to the output. The buffer variable may hold any number of bits (up
+ * to 60 bits). The number of valid bits is given by argument <code>n</code> and they are the
+ * <code>n</code> least significant of the buffer. The buffer will be padded, by shifting in
+ * zeros at the least significant end, until the number of valid bits is a multiple of 6.
+ * Bits will then be emitted in groups of 6, represented by their Base64 character, starting
+ * with the 6 most-significant valid bits of the buffer (that is, bits <code>n-6</code> to
+ * <code>n-1</code>). The buffer is not cleared (it is passed by value), but can be considered
+ * empty.
+ *
+ * @param v output byte array
+ * @param buffer holding the bits
+ * @param n the number of bits held (<=60)
+ */
+ private static void emitBase64Padded(StringBuilder v, long buffer, int n) {
+ if (n > 0) {
+ int npad = 5 - (n + 5) % 6; // smallest such that (n+npad) mod 6 == 0
+ emitBase64(v, buffer << npad, n + npad); // == 0 as a result of the padding
+ }
+ }
+
/* --- UTF-8 Codec ---------------------------------------------------- */
- private static byte utf8_code_length[] = {
+
+ private static byte utf8_code_length[] = {//@formatter:off
/* Map UTF-8 encoded prefix byte to sequence length. zero means
illegal prefix. see RFC 2279 for details */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -682,8 +1062,7 @@
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
- };
-
+ }; //@formatter:on
// TODO: need to modify to use a codepoint approach (which is almost the case now,
// ch is an
@@ -701,12 +1080,13 @@
int ch = str.charAt(i);
if (ch < 0x80) {
- unicode.append((char) ch);
+ unicode.append((char)ch);
i++;
continue;
}
if (ch > 0xFF) {
- i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "ordinal not in range(255)");
+ i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
+ i, i + 1, "ordinal not in range(255)");
continue;
}
@@ -716,27 +1096,31 @@
if (consumed != null) {
break;
}
- i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "unexpected end of data");
+ i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
+ i, i + 1, "unexpected end of data");
continue;
}
-
switch (n) {
case 0:
- i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "unexpected code byte");
+ i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
+ i, i + 1, "unexpected code byte");
continue;
case 1:
- i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "internal error");
+ i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
+ i, i + 1, "internal error");
continue;
case 2:
char ch1 = str.charAt(i + 1);
if ((ch1 & 0xc0) != 0x80) {
- i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 2, "invalid data");
+ i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
+ i, i + 2, "invalid data");
continue;
}
ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f);
if (ch < 0x80) {
- i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 2, "illegal encoding");
+ i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
+ i, i + 2, "illegal encoding");
continue;
} else {
unicode.appendCodePoint(ch);
@@ -747,12 +1131,14 @@
ch1 = str.charAt(i + 1);
char ch2 = str.charAt(i + 2);
if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) {
- i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 3, "invalid data");
+ i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
+ i, i + 3, "invalid data");
continue;
}
ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f);
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
- i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 3, "illegal encoding");
+ i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
+ i, i + 3, "illegal encoding");
continue;
} else {
unicode.appendCodePoint(ch);
@@ -763,20 +1149,18 @@
ch1 = str.charAt(i + 1);
ch2 = str.charAt(i + 2);
char ch3 = str.charAt(i + 3);
- if ((ch1 & 0xc0) != 0x80 ||
- (ch2 & 0xc0) != 0x80 ||
- (ch3 & 0xc0) != 0x80) {
- i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 4, "invalid data");
+ if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80 || (ch3 & 0xc0) != 0x80) {
+ i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
+ i, i + 4, "invalid data");
continue;
}
- ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) +
+ ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) + //
((ch2 & 0x3f) << 6) + (ch3 & 0x3f);
- /* validate and convert to UTF-16 */
- if ((ch < 0x10000) || /* minimum value allowed for 4
- byte encoding */
- (ch > 0x10ffff)) { /* maximum value allowed for
- UTF-16 */
- i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 4, "illegal encoding");
+ // validate and convert to UTF-16
+ if ((ch < 0x10000) || // minimum value allowed for 4 byte encoding
+ (ch > 0x10ffff)) { // maximum value allowed for UTF-16
+ i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
+ i, i + 4, "illegal encoding");
continue;
}
@@ -785,8 +1169,9 @@
default:
// TODO: support
- /* Other sizes are only needed for UCS-4 */
- i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + n, "unsupported Unicode code range");
+ /* Other sizes are only needed for UCS-4 */
+ i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, //
+ i, i + n, "unsupported Unicode code range");
continue;
}
i += n;
@@ -811,7 +1196,8 @@
return PyUnicode_DecodeIntLimited(str, size, errors, "latin-1", 256);
}
- private static String PyUnicode_DecodeIntLimited(String str, int size, String errors, String encoding, int limit) {
+ private static String PyUnicode_DecodeIntLimited(String str, int size, String errors,
+ String encoding, int limit) {
StringBuilder v = new StringBuilder(size);
String reason = "ordinal not in range(" + limit + ")";
@@ -820,31 +1206,24 @@
if (ch < limit) {
v.append(ch);
} else {
- i = insertReplacementAndGetResume(v, errors,
- encoding,
- str,
- i,
- i + 1,
- reason) - 1;
+ i = insertReplacementAndGetResume(v, errors, encoding, str, i, i + 1, reason) - 1;
}
}
return v.toString();
}
- public static String PyUnicode_EncodeASCII(String str, int size,
- String errors) {
+ public static String PyUnicode_EncodeASCII(String str, int size, String errors) {
return PyUnicode_EncodeIntLimited(str, size, errors, "ascii", 128);
}
- public static String PyUnicode_EncodeLatin1(String str, int size,
- String errors) {
+ public static String PyUnicode_EncodeLatin1(String str, int size, String errors) {
return PyUnicode_EncodeIntLimited(str, size, errors, "latin-1", 256);
}
- private static String PyUnicode_EncodeIntLimited(String str, int size,
- String errors, String encoding, int limit) {
+ private static String PyUnicode_EncodeIntLimited(String str, int size, String errors,
+ String encoding, int limit) {
String reason = "ordinal not in range(" + limit + ")";
StringBuilder v = new StringBuilder(size);
for (int i = 0; i < size; i++) {
@@ -876,12 +1255,7 @@
continue;
}
}
- PyObject replacement = encoding_error(errors,
- encoding,
- str,
- i,
- nextGood,
- reason);
+ PyObject replacement = encoding_error(errors, encoding, str, i, nextGood, reason);
String replStr = replacement.__getitem__(0).toString();
for (int j = 0; j < replStr.length(); j++) {
if (replStr.charAt(j) >= limit) {
@@ -897,26 +1271,15 @@
return v.toString();
}
- public static int calcNewPosition(int size, PyObject errorTuple) {
- int newPosition = ((PyInteger) errorTuple.__getitem__(1)).getValue();
- if (newPosition < 0) {
- newPosition = size + newPosition;
- }
- if (newPosition > size || newPosition < 0) {
- throw Py.IndexError(newPosition + " out of bounds of encoded string");
- }
- return newPosition;
- }
/* --- RawUnicodeEscape Codec ---------------------------------------- */
private static char[] hexdigit = "0123456789ABCDEF".toCharArray();
// The modified flag is used by cPickle.
- public static String PyUnicode_EncodeRawUnicodeEscape(String str, String errors,
- boolean modifed) {
+ public static String
+ PyUnicode_EncodeRawUnicodeEscape(String str, String errors, boolean modifed) {
StringBuilder v = new StringBuilder(str.length());
- for (Iterator<Integer> iter = new PyUnicode(str).newSubsequenceIterator();
- iter.hasNext();) {
+ for (Iterator<Integer> iter = new PyUnicode(str).newSubsequenceIterator(); iter.hasNext();) {
int codePoint = iter.next();
if (codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
// Map 32-bit characters to '\\Uxxxxxxxx'
@@ -992,8 +1355,8 @@
codePoint = ((codePoint << 4) & ~0xF) + asDigit;
}
if (asDigit == -1) {
- i = codecs.insertReplacementAndGetResume(v, errors, "rawunicodeescape", str, bs, i,
- "truncated \\uXXXX");
+ i = codecs.insertReplacementAndGetResume(v, errors, "rawunicodeescape", str, //
+ bs, i, "truncated \\uXXXX");
} else {
v.appendCodePoint(codePoint);
}
@@ -1003,6 +1366,7 @@
}
private static class Punycode {
+
// specified by punycode, http://www.ietf.org/rfc/rfc3492.txt
private static final int BASE = 36;
private static final int TMIN = 1;
@@ -1033,8 +1397,7 @@
}
}
- public static String PyUnicode_EncodePunycode(PyUnicode input,
- String errors) {
+ public static String PyUnicode_EncodePunycode(PyUnicode input, String errors) {
int n = Punycode.INITIAL_N;
int delta = 0;
long guard_delta;
@@ -1066,9 +1429,10 @@
}
guard_delta = delta + ((m - n) * (h + 1));
if (guard_delta > Integer.MAX_VALUE) {
- throw Py.UnicodeEncodeError("punycode", input.getString(), codePointIndex, codePointIndex + 1, "overflow");
+ throw Py.UnicodeEncodeError("punycode", input.getString(), codePointIndex,
+ codePointIndex + 1, "overflow");
}
- delta = (int) guard_delta;
+ delta = (int)guard_delta;
n = m;
i = 0;
@@ -1077,14 +1441,16 @@
if (c < n) {
guard_delta = delta + 1;
if (guard_delta > Integer.MAX_VALUE) {
- throw Py.UnicodeEncodeError("punycode", input.getString(), i, i + 1, "overflow");
+ throw Py.UnicodeEncodeError("punycode", input.getString(), i, i + 1,
+ "overflow");
}
- delta = (int) guard_delta;
+ delta = (int)guard_delta;
}
if (c == n) {
int q = delta;
for (int k = Punycode.BASE;; k += Punycode.BASE) {
- int t = k <= bias ? Punycode.TMIN : (k >= bias + Punycode.TMAX ? Punycode.TMAX : k - bias);
+ int t = k <= bias ? Punycode.TMIN : //
+ (k >= bias + Punycode.TMAX ? Punycode.TMAX : k - bias);
if (q < t) {
break;
}
@@ -1134,8 +1500,9 @@
if (guard_i > Integer.MAX_VALUE) {
throw Py.UnicodeDecodeError("punycode", input, j, j + 1, "overflow");
}
- i = (int) guard_i;
- int t = k <= bias ? Punycode.TMIN : (k >= bias + Punycode.TMAX ? Punycode.TMAX : k - bias);
+ i = (int)guard_i;
+ int t = k <= bias ? Punycode.TMIN : //
+ (k >= bias + Punycode.TMAX ? Punycode.TMAX : k - bias);
if (digit < t) {
break;
}
@@ -1153,41 +1520,38 @@
return new PyUnicode(ucs4);
}
- public static String PyUnicode_EncodeIDNA(PyUnicode input,
- String errors) {
+ public static String PyUnicode_EncodeIDNA(PyUnicode input, String errors) {
throw new UnsupportedOperationException();
-
-// 1. If the sequence contains any code points outside the ASCII range
-// (0..7F) then proceed to step 2, otherwise skip to step 3.
-//
-// 2. Perform the steps specified in [NAMEPREP] and fail if there is an
-// error. The AllowUnassigned flag is used in [NAMEPREP].
-// this basically enails changing out space, etc.
-//
-// 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
-//
-// (a) Verify the absence of non-LDH ASCII code points; that is, the
-// absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
-//
-// (b) Verify the absence of leading and trailing hyphen-minus; that
-// is, the absence of U+002D at the beginning and end of the
-// sequence.
-//
-// 4. If the sequence contains any code points outside the ASCII range
-// (0..7F) then proceed to step 5, otherwise skip to step 8.
-//
-// 5. Verify that the sequence does NOT begin with the ACE prefix.
-//
-// 6. Encode the sequence using the encoding algorithm in [PUNYCODE] and
-// fail if there is an error.
-//
-// 7. Prepend the ACE prefix.
-//
-// 8. Verify that the number of code points is in the range 1 to 63
-// inclusive.
-
+ // 1. If the sequence contains any code points outside the ASCII range
+ // (0..7F) then proceed to step 2, otherwise skip to step 3.
+ //
+ // 2. Perform the steps specified in [NAMEPREP] and fail if there is an
+ // error. The AllowUnassigned flag is used in [NAMEPREP].
+ // this basically entails changing out space, etc.
+ //
+ // 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
+ //
+ // (a) Verify the absence of non-LDH ASCII code points; that is, the
+ // absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
+ //
+ // (b) Verify the absence of leading and trailing hyphen-minus; that
+ // is, the absence of U+002D at the beginning and end of the
+ // sequence.
+ //
+ // 4. If the sequence contains any code points outside the ASCII range
+ // (0..7F) then proceed to step 5, otherwise skip to step 8.
+ //
+ // 5. Verify that the sequence does NOT begin with the ACE prefix.
+ //
+ // 6. Encode the sequence using the encoding algorithm in [PUNYCODE] and
+ // fail if there is an error.
+ //
+ // 7. Prepend the ACE prefix.
+ //
+ // 8. Verify that the number of code points is in the range 1 to 63
+ // inclusive.
}
public static PyUnicode PyUnicode_DecodeIDNA(String input, String errors) {
@@ -1195,85 +1559,127 @@
}
/* --- Utility methods -------------------------------------------- */
- public static PyObject encoding_error(String errors,
- String encoding,
- String toEncode,
- int start,
- int end,
- String reason) {
+ public static PyObject encoding_error(String errors, String encoding, String toEncode,
+ int start, int end, String reason) {
PyObject errorHandler = lookup_error(errors);
- PyException exc = Py.UnicodeEncodeError(encoding,
- toEncode,
- start,
- end,
- reason);
+ PyException exc = Py.UnicodeEncodeError(encoding, toEncode, start, end, reason);
exc.normalize();
- PyObject replacement = errorHandler.__call__(new PyObject[]{exc.value});
+ PyObject replacement = errorHandler.__call__(new PyObject[] {exc.value});
checkErrorHandlerReturn(errors, replacement);
return replacement;
}
- public static int insertReplacementAndGetResume(StringBuilder partialDecode,
- String errors,
- String encoding,
- String toDecode,
- int start,
- int end,
- String reason) {
+ /**
+ * Handle errors encountered during decoding, adjusting the output buffer contents and
+ * returning the correct position to resume decoding (if the handler does not simply raise an
+ * exception).
+ *
+ * @param partialDecode output buffer of unicode (as UTF-16) that the codec is building
+ * @param errors name of the error policy (or null meaning "strict")
+ * @param encoding name of encoding that encountered the error
+ * @param toDecode bytes being decoded
+ * @param start index of first byte it couldn't decode
+ * @param end index+1 of last byte it couldn't decode (usually becomes the resume point)
+ * @param reason contribution to error message if any
+ * @return the resume position: index of next byte to decode
+ */
+ public static int insertReplacementAndGetResume(StringBuilder partialDecode, String errors,
+ String encoding, String toDecode, int start, int end, String reason) {
+
+ // Handle the two special cases "ignore" and "replace" locally
if (errors != null) {
if (errors.equals(IGNORE)) {
+ // Just skip to the first non-problem byte
return end;
} else if (errors.equals(REPLACE)) {
- while (start < end) {
- partialDecode.appendCodePoint(Py_UNICODE_REPLACEMENT_CHARACTER);
- start++;
- }
+ // Insert *one* Unicode replacement character and skip
+ partialDecode.appendCodePoint(Py_UNICODE_REPLACEMENT_CHARACTER);
return end;
}
}
- PyObject replacement = decoding_error(errors,
- encoding,
- toDecode,
- start,
- end,
- reason);
- checkErrorHandlerReturn(errors, replacement);
- partialDecode.append(replacement.__getitem__(0).toString());
- return calcNewPosition(toDecode.length(), replacement);
+
+ // If errors not one of those, invoke the generic mechanism
+ PyObject replacementSpec = decoding_error(errors, encoding, toDecode, start, end, reason);
+ checkErrorHandlerReturn(errors, replacementSpec);
+
+ // Deliver the replacement unicode text to the output buffer
+ partialDecode.append(replacementSpec.__getitem__(0).toString());
+
+ // Return the index in toDecode at which we should resume
+ return calcNewPosition(toDecode.length(), replacementSpec);
}
- public static PyObject decoding_error(String errors,
- String encoding,
- String toEncode,
- int start,
- int end,
- String reason) {
+ /**
+ * Invoke a user-defined error-handling mechanism, for errors encountered during decoding, as
+ * registered through {@link #register_error(String, PyObject)}. The return value is the return
+ * from the error handler indicating the replacement codec output and the position at which
+ * to resume decoding. It invokes the mechanism described in PEP-293.
+ *
+ * @param errors name of the error policy (or null meaning "strict")
+ * @param encoding name of encoding that encountered the error
+ * @param toDecode bytes being decoded
+ * @param start index of first byte it couldn't decode
+ * @param end index+1 of last byte it couldn't decode (usually becomes the resume point)
+ * @param reason contribution to error message if any
+ * @return must be a tuple <code>(replacement_unicode, resume_index)</code>
+ */
+ public static PyObject decoding_error(String errors, String encoding, String toDecode,
+ int start, int end, String reason) {
+ // Retrieve handler registered through register_error(). null is equivalent to "strict".
PyObject errorHandler = lookup_error(errors);
- PyException exc = Py.UnicodeDecodeError(encoding,
- toEncode,
- start,
- end,
- reason);
+ // Construct an exception to hand to the error handler
+ PyException exc = Py.UnicodeDecodeError(encoding, toDecode, start, end, reason);
exc.normalize();
- return errorHandler.__call__(new PyObject[]{exc.value});
+ // And invoke the handler.
+ return errorHandler.__call__(new PyObject[] {exc.value});
}
- private static void checkErrorHandlerReturn(String errors,
- PyObject replacement) {
- if (!(replacement instanceof PyTuple) || replacement.__len__() != 2 || !(replacement.__getitem__(0) instanceof PyBaseString) || !(replacement.__getitem__(1) instanceof PyInteger)) {
- throw new PyException(Py.TypeError, "error_handler " + errors + " must return a tuple of (replacement, new position)");
+ /**
+ * Check that the error handler returned a tuple
+ * <code>(replacement_unicode, resume_index)</code>.
+ *
+ * @param errors name of the error policy (or null meaning "strict")
+ * @param replacementSpec from error handler
+ */
+ private static void checkErrorHandlerReturn(String errors, PyObject replacementSpec) {
+ if (!(replacementSpec instanceof PyTuple) || replacementSpec.__len__() != 2
+ || !(replacementSpec.__getitem__(0) instanceof PyBaseString)
+ || !(replacementSpec.__getitem__(1) instanceof PyInteger)) {
+ throw new PyException(Py.TypeError, "error_handler " + errors
+ + " must return a tuple of (replacement, new position)");
}
}
+
+ /**
+ * Given the return from some codec error handler (invoked while decoding), which specifies a
+ * resume position, and the length of buffer being decoded, check and interpret the resume
+ * position. Negative indexes in the error handler return are interpreted as "from the end". If
+ * the result would be out of bounds in the bytes being decoded, an exception is raised.
+ *
+ * @param size of byte buffer being decoded
+ * @param errorTuple returned from error handler
+ * @return absolute resume position.
+ */
+ public static int calcNewPosition(int size, PyObject errorTuple) {
+ int newPosition = ((PyInteger)errorTuple.__getitem__(1)).getValue();
+ if (newPosition < 0) {
+ newPosition = size + newPosition;
+ }
+ if (newPosition > size || newPosition < 0) {
+ throw Py.IndexError(newPosition + " out of bounds of encoded string");
+ }
+ return newPosition;
+ }
}
class StringSubsequenceIterator implements Iterator {
private final String s;
- private int current, k, start, stop, step;
+ private int current, k, start, stop, step;
StringSubsequenceIterator(String s, int start, int stop, int step) {
-// System.out.println("s=" + s.length() + ",start=" + start + ",stop=" + stop);
+ // System.out.println("s=" + s.length() + ",start=" + start + ",stop=" + stop);
this.s = s;
k = 0;
current = start;
@@ -1281,13 +1687,14 @@
this.stop = stop;
this.step = step;
- // this bounds checking is necessary to convert between use of code units elsewhere, and codepoints here
- // it would be nice if it were unnecessary!
+ /*
+ * this bounds checking is necessary to convert between use of code units elsewhere, and
+ * codepoints here it would be nice if it were unnecessary!
+ */
int count = getCodePointCount(s);
if (start >= count) {
this.stop = -1;
- }
- else if (stop >= count) {
+ } else if (stop >= count) {
this.stop = count;
}
@@ -1304,10 +1711,12 @@
return s.codePointCount(0, s.length());
}
+ @Override
public boolean hasNext() {
return current < stop;
}
+ @Override
public Object next() {
int codePoint = nextCodePoint();
current += 1;
@@ -1320,7 +1729,7 @@
private int nextCodePoint() {
int U;
-// System.out.println("k=" + k);
+ // System.out.println("k=" + k);
int W1 = s.charAt(k);
if (W1 >= 0xD800 && W1 < 0xDC00) {
int W2 = s.charAt(k + 1);
@@ -1333,6 +1742,7 @@
return U;
}
+ @Override
public void remove() {
throw new UnsupportedOperationException("Not supported on String objects (immutable)");
}
diff --git a/src/org/python/modules/_codecs.java b/src/org/python/modules/_codecs.java
--- a/src/org/python/modules/_codecs.java
+++ b/src/org/python/modules/_codecs.java
@@ -1,9 +1,8 @@
/*
- * Copyright 2000 Finn Bock
+ * Copyright (c)2013 Jython Developers. Original Java version copyright 2000 Finn Bock.
*
- * This program contains material copyrighted by:
- * Copyright (c) Corporation for National Research Initiatives.
- * Originally written by Marc-Andre Lemburg (mal at lemburg.com).
+ * This program contains material copyrighted by: Copyright (c) Corporation for National Research
+ * Initiatives. Originally written by Marc-Andre Lemburg (mal at lemburg.com).
*/
package org.python.modules;
@@ -23,6 +22,14 @@
import org.python.core.codecs;
import org.python.expose.ExposedType;
+/**
+ * This class corresponds to the Python _codecs module, which in turn lends its functions to the
+ * codecs module (in Lib/codecs.py). It exposes the implementing functions of several codec families
+ * called out in the Python codecs library Lib/encodings/*.py, where it is usually claimed that they
+ * are bound "as C functions". Obviously, C stands for "compiled" in this context, rather than
+ * dependence on a particular implementation language. Actual transcoding methods often come from
+ * the related {@link codecs} class.
+ */
public class _codecs {
public static void register(PyObject search_function) {
@@ -45,8 +52,30 @@
return EncodingMap.buildEncodingMap(map);
}
- private static PyTuple decode_tuple(String s, int len) {
- return new PyTuple(new PyUnicode(s), Py.newInteger(len));
+ /**
+ * Convenience method to construct the return value of decoders, providing the Unicode result as
+ * a String, and the number of bytes consumed.
+ *
+ * @param u the unicode result as a UTF-16 Java String
+ * @param bytesConsumed the number of bytes consumed
+ * @return the tuple (unicode(u), bytesConsumed)
+ */
+ private static PyTuple decode_tuple(String u, int bytesConsumed) {
+ return new PyTuple(new PyUnicode(u), Py.newInteger(bytesConsumed));
+ }
+
+ /**
+ * Convenience method to construct the return value of decoders, providing the Unicode result
+ * as a String, and the number of bytes consumed in decoding as either a single-element array or
+ * an int to be used if the array argument is null.
+ *
+ * @param u the unicode result as a UTF-16 Java String
+ * @param consumed if not null, element [0] is the number of bytes consumed
+ * @param defaultBytesConsumed if consumed==null, use this as the number of bytes consumed
+ * @return the tuple (unicode(u), bytesConsumed)
+ */
+ private static PyTuple decode_tuple(String u, int[] consumed, int defaultBytesConsumed) {
+ return decode_tuple(u, consumed != null ? consumed[0] : defaultBytesConsumed);
}
private static PyTuple decode_tuple_str(String s, int len) {
@@ -57,7 +86,6 @@
return new PyTuple(new PyString(s), Py.newInteger(len));
}
-
/* --- UTF-8 Codec --------------------------------------------------- */
public static PyTuple utf_8_decode(String str) {
return utf_8_decode(str, null);
@@ -69,8 +97,8 @@
public static PyTuple utf_8_decode(String str, String errors, boolean final_) {
int[] consumed = final_ ? null : new int[1];
- return decode_tuple(codecs.PyUnicode_DecodeUTF8Stateful(str, errors, consumed),
- final_ ? str.length() : consumed[0]);
+ return decode_tuple(codecs.PyUnicode_DecodeUTF8Stateful(str, errors, consumed), final_
+ ? str.length() : consumed[0]);
}
public static PyTuple utf_8_encode(String str) {
@@ -82,15 +110,19 @@
return encode_tuple(codecs.PyUnicode_EncodeUTF8(str, errors), size);
}
-
/* --- UTF-7 Codec --------------------------------------------------- */
- public static PyTuple utf_7_decode(String str) {
- return utf_7_decode(str, null);
+ public static PyTuple utf_7_decode(String bytes) {
+ return utf_7_decode(bytes, null);
}
- public static PyTuple utf_7_decode(String str, String errors) {
- int size = str.length();
- return decode_tuple(codecs.PyUnicode_DecodeUTF7(str, errors), size);
+ public static PyTuple utf_7_decode(String bytes, String errors) {
+ return utf_7_decode(bytes, null, false);
+ }
+
+ public static PyTuple utf_7_decode(String bytes, String errors, boolean finalFlag) {
+ int[] consumed = finalFlag ? null : new int[1];
+ String decoded = codecs.PyUnicode_DecodeUTF7Stateful(bytes, errors, consumed);
+ return decode_tuple(decoded, consumed, bytes.length());
}
public static PyTuple utf_7_encode(String str) {
@@ -102,16 +134,14 @@
return encode_tuple(codecs.PyUnicode_EncodeUTF7(str, false, false, errors), size);
}
+ /* --- string-escape Codec -------------------------------------------- */
public static PyTuple escape_decode(String str) {
return escape_decode(str, null);
}
public static PyTuple escape_decode(String str, String errors) {
- return decode_tuple_str(PyString.decode_UnicodeEscape(str,
- 0,
- str.length(),
- errors,
- true), str.length());
+ return decode_tuple_str(PyString.decode_UnicodeEscape(str, 0, str.length(), errors, true),
+ str.length());
}
public static PyTuple escape_encode(String str) {
@@ -123,63 +153,118 @@
}
/* --- Character Mapping Codec --------------------------------------- */
- public static PyTuple charmap_decode(String str,
- String errors,
- PyObject mapping) {
- return charmap_decode(str, errors, mapping, false);
+
+ /**
+ * Equivalent to <code>charmap_decode(bytes, null, null)</code>. This method is here so the
+ * error and mapping arguments can be optional at the Python level.
+ *
+ * @param bytes sequence of bytes to decode
+ * @return decoded string and number of bytes consumed
+ */
+ public static PyTuple charmap_decode(String bytes) {
+ return charmap_decode(bytes, null, null);
}
- public static PyTuple charmap_decode(String str,
- String errors,
- PyObject mapping, boolean ignoreUnmapped) {
+ /**
+ * Equivalent to <code>charmap_decode(bytes, errors, null)</code>. This method is here so the
+ * error argument can be optional at the Python level.
+ *
+ * @param bytes sequence of bytes to decode
+ * @param errors error policy
+ * @return decoded string and number of bytes consumed
+ */
+ public static PyTuple charmap_decode(String bytes, String errors) {
+ return charmap_decode(bytes, errors, null);
+ }
+ /**
+ * Decode a sequence of bytes into Unicode characters via a mapping supplied as a container to
+ * be indexed by the byte values (as unsigned integers). If the mapping is null or None, decode
+ * with latin-1 (essentially treating bytes as character codes directly).
+ *
+ * @param bytes sequence of bytes to decode
+ * @param errors error policy
+ * @param mapping to convert bytes to characters
+ * @return decoded string and number of bytes consumed
+ */
+ public static PyTuple charmap_decode(String str, String errors, PyObject mapping) {
+ if (mapping == null || mapping == Py.None) {
+ // Default to Latin-1
+ return latin_1_decode(str, errors);
+ } else {
+ return charmap_decode(str, errors, mapping, false);
+ }
+ }
- int size = str.length();
+ /**
+ * Decode a sequence of bytes into Unicode characters via a mapping supplied as a container to
+ * be indexed by the byte values (as unsigned integers).
+ *
+ * @param bytes sequence of bytes to decode
+ * @param errors error policy
+ * @param mapping to convert bytes to characters
+ * @param ignoreUnmapped if true, pass unmapped byte values as character codes [0..256)
+ * @return decoded string and number of bytes consumed
+ */
+ public static PyTuple charmap_decode(String bytes, String errors, PyObject mapping,
+ boolean ignoreUnmapped) {
+ // XXX bytes: would prefer to accept any object with buffer API
+ int size = bytes.length();
StringBuilder v = new StringBuilder(size);
+
for (int i = 0; i < size; i++) {
- char ch = str.charAt(i);
- if (ch > 0xFF) {
- i = codecs.insertReplacementAndGetResume(v,
- errors,
- "charmap",
- str,
- i,
- i + 1,
- "ordinal not in range(255)") - 1;
+
+ // Process the i.th input byte
+ int b = bytes.charAt(i);
+ if (b > 0xff) {
+ i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
+ i, i + 1, "ordinal not in range(255)") - 1;
continue;
}
- PyObject w = Py.newInteger(ch);
+
+ // Map the byte to an output character code (or possibly string)
+ PyObject w = Py.newInteger(b);
PyObject x = mapping.__finditem__(w);
+
+ // Apply to the output
if (x == null) {
+ // Error case: mapping not found
if (ignoreUnmapped) {
- v.append(ch);
+ v.appendCodePoint(b);
} else {
- i = codecs.insertReplacementAndGetResume(v, errors, "charmap", str, i, i + 1, "no mapping found") - 1;
+ i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
+ i, i + 1, "no mapping found") - 1;
}
- continue;
- }
- /* Apply mapping */
- if (x instanceof PyInteger) {
- int value = ((PyInteger) x).getValue();
+
+ } else if (x instanceof PyInteger) {
+ // Mapping was to an int: treat as character code
+ int value = ((PyInteger)x).getValue();
if (value < 0 || value > PySystemState.maxunicode) {
- throw Py.TypeError("character mapping must return " + "integer greater than 0 and less than sys.maxunicode");
+ throw Py.TypeError("character mapping must return "
+ + "integer greater than 0 and less than sys.maxunicode");
}
- v.append((char) value);
+ v.appendCodePoint(value);
+
} else if (x == Py.None) {
- i = codecs.insertReplacementAndGetResume(v,
- errors,
- "charmap",
- str,
- i,
- i + 1,
- "character maps to <undefined>") - 1;
+ i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
+ i, i + 1, "character maps to <undefined>") - 1;
+
} else if (x instanceof PyString) {
- v.append(x.toString());
+ String s = x.toString();
+ if (s.charAt(0) == 0xfffe) {
+ // Invalid indicates "undefined" see C-API PyUnicode_DecodeCharmap()
+ i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
+ i, i + 1, "character maps to <undefined>") - 1;
+ } else {
+ v.append(s);
+ }
+
} else {
/* wrong return value */
throw Py.TypeError("character mapping must return " + "integer, None or str");
}
}
+
return decode_tuple(v.toString(), size);
}
@@ -203,7 +288,7 @@
int value = result.asInt();
if (value < 0 || value > PySystemState.maxunicode) {
throw Py.TypeError(String.format("character mapping must be in range(0x%x)",
- PySystemState.maxunicode + 1));
+ PySystemState.maxunicode + 1));
}
buf.appendCodePoint(value);
} else if (result instanceof PyUnicode) {
@@ -216,102 +301,174 @@
return new PyUnicode(buf.toString());
}
- public static PyTuple charmap_encode(String str, String errors,
- PyObject mapping) {
- //Default to Latin-1
- if (mapping == null) {
- return latin_1_encode(str, errors);
- }
- return charmap_encode_internal(str, errors, mapping, new StringBuilder(str.length()), true);
+ /**
+ * Equivalent to <code>charmap_encode(str, null, null)</code>. This method is here so the error
+ * and mapping arguments can be optional at the Python level.
+ *
+ * @param str to be encoded
+ * @return (encoded data, size(str)) as a pair
+ */
+ public static PyTuple charmap_encode(String str) {
+ return charmap_encode(str, null, null);
}
- private static PyTuple charmap_encode_internal(String str,
- String errors,
- PyObject mapping,
- StringBuilder v,
- boolean letLookupHandleError) {
+ /**
+ * Equivalent to <code>charmap_encode(str, errors, null)</code>. This method is here so the
+ * mapping can be optional at the Python level.
+ *
+ * @param str to be encoded
+ * @param errors error policy name (e.g. "ignore")
+ * @return (encoded data, size(str)) as a pair
+ */
+ public static PyTuple charmap_encode(String str, String errors) {
+ return charmap_encode(str, errors, null);
+ }
+
+ /**
+ * Encoder based on an optional character mapping. This mapping is either an
+ * <code>EncodingMap</code> of 256 entries, or an arbitrary container indexable with integers
+ * using <code>__finditem__</code> and yielding byte strings. If the mapping is null, latin-1
+ * (effectively a mapping of character code to the numerically-equal byte) is used.
+ *
+ * @param str to be encoded
+ * @param errors error policy name (e.g. "ignore")
+ * @param mapping from character code to output byte (or string)
+ * @return (encoded data, size(str)) as a pair
+ */
+ public static PyTuple charmap_encode(String str, String errors, PyObject mapping) {
+ if (mapping == null || mapping == Py.None) {
+ // Default to Latin-1
+ return latin_1_encode(str, errors);
+ } else {
+ return charmap_encode_internal(str, errors, mapping, new StringBuilder(str.length()),
+ true);
+ }
+ }
+
+ /**
+ * Helper to implement the several variants of <code>charmap_encode</code>, given an optional
+ * mapping. This mapping is either an <code>EncodingMap</code> of 256 entries, or an arbitrary
+ * container indexable with integers using <code>__finditem__</code> and yielding byte strings.
+ *
+ * @param str to be encoded
+ * @param errors error policy name (e.g. "ignore")
+ * @param mapping from character code to output byte (or string)
+ * @param v to contain the encoded bytes
+ * @param letLookupHandleError if true, an unmapped character is passed to the error-handling
+ * mechanism for substitution; if false, it raises a UnicodeEncodeError directly
+ * @return (encoded data, size(str)) as a pair
+ */
+ private static PyTuple charmap_encode_internal(String str, String errors, PyObject mapping,
+ StringBuilder v, boolean letLookupHandleError) {
+
EncodingMap encodingMap = mapping instanceof EncodingMap ? (EncodingMap)mapping : null;
int size = str.length();
+
for (int i = 0; i < size; i++) {
+
+ // Map the i.th character of str to some value
char ch = str.charAt(i);
PyObject x;
if (encodingMap != null) {
+ // The mapping given was an EncodingMap [0,256) => non-negative int
int result = encodingMap.lookup(ch);
- if (result == -1) {
- x = null;
- } else {
- x = Py.newInteger(result);
- }
+ x = (result == -1) ? null : Py.newInteger(result);
} else {
+ // The mapping was a map or similar: non-negative int -> object
x = mapping.__finditem__(Py.newInteger(ch));
}
+
+ // And map this object to an output character
if (x == null) {
+ // Error during lookup
if (letLookupHandleError) {
+ // Some kind of substitute can be placed in the output
i = handleBadMapping(str, errors, mapping, v, size, i);
} else {
- throw Py.UnicodeEncodeError("charmap",
- str,
- i,
- i + 1,
+ // Hard error
+ throw Py.UnicodeEncodeError("charmap", str, i, i + 1,
"character maps to <undefined>");
}
+
} else if (x instanceof PyInteger) {
- int value = ((PyInteger) x).getValue();
+ // Look-up had integer result: output as byte value
+ int value = ((PyInteger)x).getValue();
if (value < 0 || value > 255) {
throw Py.TypeError("character mapping must be in range(256)");
}
- v.append((char) value);
+ v.append((char)value);
+
} else if (x instanceof PyString && !(x instanceof PyUnicode)) {
+ // Look-up had str or unicode result: output as Java String
+ // XXX: (Py3k) Look-up had bytes or str result: output as ... this is a problem
v.append(x.toString());
+
} else if (x instanceof PyNone) {
i = handleBadMapping(str, errors, mapping, v, size, i);
+
} else {
/* wrong return value */
throw Py.TypeError("character mapping must return " + "integer, None or str");
}
}
+
return encode_tuple(v.toString(), size);
}
- private static int handleBadMapping(String str,
- String errors,
- PyObject mapping,
- StringBuilder v,
- int size,
- int i) {
+ /**
+ * Helper for {@link #charmap_encode_internal(String, String, PyObject, StringBuilder, boolean)}
+ * called when we need some kind of substitute in the output for an invalid input.
+ *
+ * @param str to be encoded
+ * @param errors error policy name (e.g. "ignore")
+ * @param mapping from character code to output byte (or string)
+ * @param v to contain the encoded bytes
+ * @param size of str
+ * @param i index in str of current (and problematic) character
+ * @return index of last character of problematic section
+ */
+ private static int handleBadMapping(String str, String errors, PyObject mapping,
+ StringBuilder v, int size, int i) {
+
+ // If error policy specified, execute it
if (errors != null) {
+
if (errors.equals(codecs.IGNORE)) {
return i;
+
} else if (errors.equals(codecs.REPLACE)) {
- charmap_encode_internal("?", errors, mapping, v, false);
+ String replStr = "?";
+ charmap_encode_internal(replStr, errors, mapping, v, false);
return i;
+
} else if (errors.equals(codecs.XMLCHARREFREPLACE)) {
- charmap_encode_internal(codecs.xmlcharrefreplace(i, i + 1, str).toString(), errors, mapping, v, false);
+ String replStr = codecs.xmlcharrefreplace(i, i + 1, str).toString();
+ charmap_encode_internal(replStr, errors, mapping, v, false);
return i;
+
} else if (errors.equals(codecs.BACKSLASHREPLACE)) {
- charmap_encode_internal(codecs.backslashreplace(i, i + 1, str).toString(), errors, mapping, v, false);
+ String replStr = codecs.backslashreplace(i, i + 1, str).toString();
+ charmap_encode_internal(replStr, errors, mapping, v, false);
return i;
}
}
- PyObject replacement = codecs.encoding_error(errors,
- "charmap",
- str,
- i,
- i + 1,
- "character maps to <undefined>");
+
+ // Default behaviour (error==null or does not match known case)
+ String msg = "character maps to <undefined>";
+ PyObject replacement = codecs.encoding_error(errors, "charmap", str, i, i + 1, msg);
String replStr = replacement.__getitem__(0).toString();
charmap_encode_internal(replStr, errors, mapping, v, false);
+
return codecs.calcNewPosition(size, replacement) - 1;
}
+ /* --- ascii Codec ---------------------------------------------- */
public static PyTuple ascii_decode(String str) {
return ascii_decode(str, null);
}
public static PyTuple ascii_decode(String str, String errors) {
int size = str.length();
- return decode_tuple(codecs.PyUnicode_DecodeASCII(str, size, errors),
- size);
+ return decode_tuple(codecs.PyUnicode_DecodeASCII(str, size, errors), size);
}
public static PyTuple ascii_encode(String str) {
@@ -320,11 +477,9 @@
public static PyTuple ascii_encode(String str, String errors) {
int size = str.length();
- return encode_tuple(codecs.PyUnicode_EncodeASCII(str, size, errors),
- size);
+ return encode_tuple(codecs.PyUnicode_EncodeASCII(str, size, errors), size);
}
-
/* --- Latin-1 Codec -------------------------------------------- */
public static PyTuple latin_1_decode(String str) {
return latin_1_decode(str, null);
@@ -332,8 +487,7 @@
public static PyTuple latin_1_decode(String str, String errors) {
int size = str.length();
- return decode_tuple(codecs.PyUnicode_DecodeLatin1(str, size, errors),
- size);
+ return decode_tuple(codecs.PyUnicode_DecodeLatin1(str, size, errors), size);
}
public static PyTuple latin_1_encode(String str) {
@@ -345,7 +499,6 @@
return encode_tuple(codecs.PyUnicode_EncodeLatin1(str, size, errors), size);
}
-
/* --- UTF16 Codec -------------------------------------------- */
public static PyTuple utf_16_encode(String str) {
return utf_16_encode(str, null);
@@ -355,10 +508,8 @@
return encode_tuple(encode_UTF16(str, errors, 0), str.length());
}
- public static PyTuple utf_16_encode(String str, String errors,
- int byteorder) {
- return encode_tuple(encode_UTF16(str, errors, byteorder),
- str.length());
+ public static PyTuple utf_16_encode(String str, String errors, int byteorder) {
+ return encode_tuple(encode_UTF16(str, errors, byteorder), str.length());
}
public static PyTuple utf_16_le_encode(String str) {
@@ -397,7 +548,7 @@
}
return v.toString();
}
-
+
public static PyTuple utf_16_decode(String str) {
return utf_16_decode(str, null);
}
@@ -407,10 +558,10 @@
}
public static PyTuple utf_16_decode(String str, String errors, boolean final_) {
- int[] bo = new int[] { 0 };
+ int[] bo = new int[] {0};
int[] consumed = final_ ? null : new int[1];
- return decode_tuple(decode_UTF16(str, errors, bo, consumed),
- final_ ? str.length() : consumed[0]);
+ return decode_tuple(decode_UTF16(str, errors, bo, consumed), final_ ? str.length()
+ : consumed[0]);
}
public static PyTuple utf_16_le_decode(String str) {
@@ -420,27 +571,27 @@
public static PyTuple utf_16_le_decode(String str, String errors) {
return utf_16_le_decode(str, errors, false);
}
-
+
public static PyTuple utf_16_le_decode(String str, String errors, boolean final_) {
- int[] bo = new int[] { -1 };
+ int[] bo = new int[] {-1};
int[] consumed = final_ ? null : new int[1];
- return decode_tuple(decode_UTF16(str, errors, bo, consumed),
- final_ ? str.length() : consumed[0]);
+ return decode_tuple(decode_UTF16(str, errors, bo, consumed), final_ ? str.length()
+ : consumed[0]);
}
public static PyTuple utf_16_be_decode(String str) {
return utf_16_be_decode(str, null);
}
-
+
public static PyTuple utf_16_be_decode(String str, String errors) {
return utf_16_be_decode(str, errors, false);
}
public static PyTuple utf_16_be_decode(String str, String errors, boolean final_) {
- int[] bo = new int[] { 1 };
+ int[] bo = new int[] {1};
int[] consumed = final_ ? null : new int[1];
- return decode_tuple(decode_UTF16(str, errors, bo, consumed),
- final_ ? str.length() : consumed[0]);
+ return decode_tuple(decode_UTF16(str, errors, bo, consumed), final_ ? str.length()
+ : consumed[0]);
}
public static PyTuple utf_16_ex_decode(String str) {
@@ -454,27 +605,21 @@
public static PyTuple utf_16_ex_decode(String str, String errors, int byteorder) {
return utf_16_ex_decode(str, errors, byteorder, false);
}
-
- public static PyTuple utf_16_ex_decode(String str, String errors, int byteorder,
- boolean final_) {
- int[] bo = new int[] { 0 };
+
+ public static PyTuple
+ utf_16_ex_decode(String str, String errors, int byteorder, boolean final_) {
+ int[] bo = new int[] {0};
int[] consumed = final_ ? null : new int[1];
String decoded = decode_UTF16(str, errors, bo, consumed);
- return new PyTuple(Py.newString(decoded),
- Py.newInteger(final_ ? str.length() : consumed[0]),
- Py.newInteger(bo[0]));
+ return new PyTuple(new PyUnicode(decoded), Py.newInteger(final_ ? str.length()
+ : consumed[0]), Py.newInteger(bo[0]));
}
- private static String decode_UTF16(String str,
- String errors,
- int[] byteorder) {
+ private static String decode_UTF16(String str, String errors, int[] byteorder) {
return decode_UTF16(str, errors, byteorder, null);
}
- private static String decode_UTF16(String str,
- String errors,
- int[] byteorder,
- int[] consumed) {
+ private static String decode_UTF16(String str, String errors, int[] byteorder, int[] consumed) {
int bo = 0;
if (byteorder != null) {
bo = byteorder[0];
@@ -488,13 +633,8 @@
if (consumed != null) {
break;
}
- i = codecs.insertReplacementAndGetResume(v,
- errors,
- "utf-16",
- str,
- i,
- i + 1,
- "truncated data");
+ i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, //
+ i, i + 1, "truncated data");
continue;
}
char ch2 = str.charAt(i + 1);
@@ -530,23 +670,13 @@
v.appendCodePoint(U);
continue;
}
- i = codecs.insertReplacementAndGetResume(v,
- errors,
- "utf-16",
- str,
- i,
- i + 1,
- "illegal UTF-16 surrogate");
+ i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, //
+ i, i + 1, "illegal UTF-16 surrogate");
continue;
}
- i = codecs.insertReplacementAndGetResume(v,
- errors,
- "utf-16",
- str,
- i,
- i + 1,
- "illegal encoding");
+ i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, //
+ i, i + 1, "illegal encoding");
}
if (byteorder != null) {
byteorder[0] = bo;
@@ -562,10 +692,8 @@
return raw_unicode_escape_encode(str, null);
}
- public static PyTuple raw_unicode_escape_encode(String str,
- String errors) {
- return encode_tuple(codecs.PyUnicode_EncodeRawUnicodeEscape(str,
- errors, false),
+ public static PyTuple raw_unicode_escape_encode(String str, String errors) {
+ return encode_tuple(codecs.PyUnicode_EncodeRawUnicodeEscape(str, errors, false),
str.length());
}
@@ -573,21 +701,17 @@
return raw_unicode_escape_decode(str, null);
}
- public static PyTuple raw_unicode_escape_decode(String str,
- String errors) {
- return decode_tuple(codecs.PyUnicode_DecodeRawUnicodeEscape(str,
- errors),
- str.length());
+ public static PyTuple raw_unicode_escape_decode(String str, String errors) {
+ return decode_tuple(codecs.PyUnicode_DecodeRawUnicodeEscape(str, errors), str.length());
}
- /* --- UnicodeEscape Codec -------------------------------------------- */
+ /* --- unicode-escape Codec ------------------------------------------- */
public static PyTuple unicode_escape_encode(String str) {
return unicode_escape_encode(str, null);
}
public static PyTuple unicode_escape_encode(String str, String errors) {
- return encode_tuple(PyString.encode_UnicodeEscape(str, false),
- str.length());
+ return encode_tuple(PyString.encode_UnicodeEscape(str, false), str.length());
}
public static PyTuple unicode_escape_decode(String str) {
@@ -596,14 +720,21 @@
public static PyTuple unicode_escape_decode(String str, String errors) {
int n = str.length();
- return decode_tuple(PyString.decode_UnicodeEscape(str,
- 0,
- n,
- errors,
- true), n);
+ return decode_tuple(PyString.decode_UnicodeEscape(str, 0, n, errors, true), n);
}
/* --- UnicodeInternal Codec ------------------------------------------ */
+ // XXX Should deprecate unicode-internal codec and delegate to UTF-32BE (when we have one)
+ /*
+ * This codec is supposed to deal with an encoded form equal to the internal representation of
+ * the unicode object considered as bytes in memory. This was confusing in CPython as it varied
+ * with machine architecture (width and endian-ness). In Jython, the most compatible choice
+ * would be UTF-32BE since unicode objects report their length as if UCS-4 and
+ * sys.byteorder=='big'. The codec is deprecated in v3.3 as irrelevant, or impossible, in view
+ * of the flexible string representation (which Jython emulates in its own way).
+ *
+ * See http://mail.python.org/pipermail/python-dev/2011-November/114415.html
+ */
public static PyTuple unicode_internal_encode(String str) {
return unicode_internal_encode(str, null);
}
@@ -623,19 +754,15 @@
/**
* Optimized charmap encoder mapping.
*
- * Uses a trie structure instead of a dictionary; the speedup primarily comes from not
- * creating integer objects in the process. The trie is created by inverting the
- * encoding map.
+ * Uses a trie structure instead of a dictionary; the speedup primarily comes from not creating
+ * integer objects in the process. The trie is created by inverting the encoding map.
*/
@ExposedType(name = "EncodingMap", isBaseType = false)
public static class EncodingMap extends PyObject {
char[] level1;
-
char[] level23;
-
int count2;
-
int count3;
private EncodingMap(char[] level1, char[] level23, int count2, int count3) {
@@ -770,4 +897,3 @@
}
}
}
-
--
Repository URL: http://hg.python.org/jython
More information about the Jython-checkins
mailing list