[Jython-checkins] jython: Fixed bug in UTF-7 decoder (surrogate pairs)
jeff.allen
jython-checkins at python.org
Mon Jan 14 10:08:17 CET 2013
http://hg.python.org/jython/rev/b49adef87315
changeset: 6947:b49adef87315
parent: 6938:c0072e7f0c90
user: Jeff Allen <ja...py at farowl.co.uk>
date: Mon Jan 07 23:42:56 2013 +0000
summary:
Fixed bug in UTF-7 decoder (surrogate pairs)
Also added a test to test_codecs.py to exercise decoding pairs.
files:
Lib/test/test_codecs.py | 37 +++++++++++++++++++++
src/org/python/core/codecs.java | 26 +++++++-------
2 files changed, 50 insertions(+), 13 deletions(-)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -560,6 +560,43 @@
]
)
+ # Jython extra (test supplementary characters)
+ @unittest.skipIf(not test_support.is_jython, "Jython supports surrogate pairs")
+ def test_partial_supp(self):
+ # Check the encoding is what we think it is
+ ustr = u"x\U00023456.\u0177\U00023456\u017az"
+ bstr = b'x+2E3cVg.+AXfYTdxWAXo-z'
+ self.assertEqual(ustr.encode(self.encoding), bstr)
+
+ self.check_partial(
+ ustr,
+ [
+ u"x",
+ u"x", # '+' added: begins Base64
+ u"x",
+ u"x",
+ u"x",
+ u"x",
+ u"x",
+ u"x",
+ u"x\U00023456.", # '.' added: ends Base64
+ u"x\U00023456.", # '+' added: begins Base64
+ u"x\U00023456.",
+ u"x\U00023456.",
+ u"x\U00023456.",
+ u"x\U00023456.",
+ u"x\U00023456.",
+ u"x\U00023456.",
+ u"x\U00023456.",
+ u"x\U00023456.",
+ u"x\U00023456.",
+ u"x\U00023456.",
+ u"x\U00023456.",
+ u"x\U00023456.\u0177\U00023456\u017a", # '-' added: ends Base64
+ u"x\U00023456.\u0177\U00023456\u017az",
+ ]
+ )
+
class UTF16ExTest(unittest.TestCase):
def test_errors(self):
diff --git a/src/org/python/core/codecs.java b/src/org/python/core/codecs.java
--- a/src/org/python/core/codecs.java
+++ b/src/org/python/core/codecs.java
@@ -540,10 +540,10 @@
* Decode (perhaps partially) a sequence of bytes representing the UTF-7 encoded form of a
* Unicode string and return the (Jython internal representation of) the unicode object, and
* amount of input consumed. The only state we preserve is our read position, i.e. how many
- * characters we have consumed. So if the input ends part way through a Base64 sequence the data
- * reported as consumed is only that up to and not including the Base64 start marker ('+').
+ * bytes we have consumed. So if the input ends part way through a Base64 sequence the data
+ * reported as consumed is just that up to and not including the Base64 start marker ('+').
* Performance will be poor (quadratic cost) on runs of Base64 data long enough to exceed the
- * input quantum in incremental decoding. The retruned Java String is a UTF-16 representation of
+ * input quantum in incremental decoding. The returned Java String is a UTF-16 representation of
* the Unicode result, in line with Java conventions. Unicode characters above the BMP are
* represented as surrogate pairs.
*
@@ -743,7 +743,7 @@
if ((unit & 0x0400) == 0) {
// This is a lead surrogate as expected ... get the trail surrogate.
int unit2 = (int)(buffer >>> (n - 32));
- if ((unit2 & 0xFC00) == 0xD800) {
+ if ((unit2 & 0xFC00) == 0xDC00) {
// And this is the trail surrogate we expected
v.appendCodePoint(0x10000 + ((unit & 0x3ff) << 10) + (unit2 & 0x3ff));
n -= 32;
@@ -832,12 +832,12 @@
if ((unit & 0x0400) == 0) {
// This is a lead surrogate, which is valid: check the next 16 bits.
int unit2 = ((int)(buffer >>> (n - 32))) & 0xffff;
- if ((unit2 & 0xFC00) == 0xD800) {
+ if ((unit2 & 0xFC00) == 0xDC00) {
+ // Hmm ... why was I called?
+ return UTF7Error.NONE;
+ } else {
// Not trail surrogate: that's the problem
return UTF7Error.MISSING;
- } else {
- // Hmm ... why was I called?
- return UTF7Error.NONE;
}
} else {
@@ -885,7 +885,7 @@
* PyString.)
*
* This method differs from the CPython equivalent (in <code>Object/unicodeobject.c</code>)
- * which works with an array of point codes that are, in a wide build, Unicode code points.
+ * which works with an array of code points that are, in a wide build, Unicode code points.
*
* @param unicode
* @param base64SetO
@@ -965,7 +965,7 @@
* representation.
*/
// XXX see issue #2002: we should only count surrogate pairs as one character
- // if ((ch & 0xFC00)==0xC800) { count++; }
+ // if ((ch & 0xFC00)==0xD800) { count++; }
if (base64bits > 48) {
// No room for the next 16 bits: emit all we have
@@ -1570,8 +1570,8 @@
}
/**
- * Handler errors encountered during decoding, adjusting the output buffer contents and
- * returning the correct position to resume decoding (if the handler does not siomply raise an
+ * Handler for errors encountered during decoding, adjusting the output buffer contents and
+ * returning the correct position to resume decoding (if the handler does not simply raise an
* exception).
*
* @param partialDecode output buffer of unicode (as UTF-16) that the codec is building
@@ -1613,7 +1613,7 @@
* Invoke a user-defined error-handling mechanism, for errors encountered during decoding, as
* registered through {@link #register_error(String, PyObject)}. The return value is the return
* from the error handler indicating the replacement codec output and the the position at which
- * to resume decoding. invokes the mechanism described in PEP-293.
+ * to resume decoding. Invokes the mechanism described in PEP-293.
*
* @param errors name of the error policy (or null meaning "strict")
* @param encoding name of encoding that encountered the error
--
Repository URL: http://hg.python.org/jython
More information about the Jython-checkins mailing list