[Jython-checkins] jython: Fixed bug in UTF-7 decoder (surrogate pairs)

Mon Jan 14 10:08:17 CET 2013

http://hg.python.org/jython/rev/b49adef87315
changeset:   6947:b49adef87315
parent:      6938:c0072e7f0c90
user:        Jeff Allen <ja...py at farowl.co.uk>
date:        Mon Jan 07 23:42:56 2013 +0000
summary:
  Fixed bug in UTF-7 decoder (surrogate pairs)
Also added a test to test_codecs.py to exercise decoding pairs.

files:
  Lib/test/test_codecs.py         |  37 +++++++++++++++++++++
  src/org/python/core/codecs.java |  26 +++++++-------
  2 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -560,6 +560,43 @@
             ]
         )
 
+    # Jython extra (test supplementary characters)
+    @unittest.skipIf(not test_support.is_jython, "Jython supports surrogate pairs")
+    def test_partial_supp(self):
+        # Check the encoding is what we think it is
+        ustr = u"x\U00023456.\u0177\U00023456\u017az"
+        bstr = b'x+2E3cVg.+AXfYTdxWAXo-z'
+        self.assertEqual(ustr.encode(self.encoding), bstr)
+
+        self.check_partial(
+            ustr,
+            [
+                u"x",
+                u"x",   # '+' added: begins Base64
+                u"x",
+                u"x",
+                u"x",
+                u"x",
+                u"x",
+                u"x",
+                u"x\U00023456.",    # '.' added: ends Base64
+                u"x\U00023456.",    # '+' added: begins Base64
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.\u0177\U00023456\u017a",  # '-' added: ends Base64
+                u"x\U00023456.\u0177\U00023456\u017az",
+            ]
+        )
+
 class UTF16ExTest(unittest.TestCase):
 
     def test_errors(self):
diff --git a/src/org/python/core/codecs.java b/src/org/python/core/codecs.java
--- a/src/org/python/core/codecs.java
+++ b/src/org/python/core/codecs.java
@@ -540,10 +540,10 @@
      * Decode (perhaps partially) a sequence of bytes representing the UTF-7 encoded form of a
      * Unicode string and return the (Jython internal representation of) the unicode object, and
      * amount of input consumed. The only state we preserve is our read position, i.e. how many
-     * characters we have consumed. So if the input ends part way through a Base64 sequence the data
-     * reported as consumed is only that up to and not including the Base64 start marker ('+').
+     * bytes we have consumed. So if the input ends part way through a Base64 sequence the data
+     * reported as consumed is just that up to and not including the Base64 start marker ('+').
      * Performance will be poor (quadratic cost) on runs of Base64 data long enough to exceed the
-     * input quantum in incremental decoding. The retruned Java String is a UTF-16 representation of
+     * input quantum in incremental decoding. The returned Java String is a UTF-16 representation of
      * the Unicode result, in line with Java conventions. Unicode characters above the BMP are
      * represented as surrogate pairs.
      *
@@ -743,7 +743,7 @@
                 if ((unit & 0x0400) == 0) {
                     // This is a lead surrogate as expected ... get the trail surrogate.
                     int unit2 = (int)(buffer >>> (n - 32));
-                    if ((unit2 & 0xFC00) == 0xD800) {
+                    if ((unit2 & 0xFC00) == 0xDC00) {
                         // And this is the trail surrogate we expected
                         v.appendCodePoint(0x10000 + ((unit & 0x3ff) << 10) + (unit2 & 0x3ff));
                         n -= 32;
@@ -832,12 +832,12 @@
                 if ((unit & 0x0400) == 0) {
                     // This is a lead surrogate, which is valid: check the next 16 bits.
                     int unit2 = ((int)(buffer >>> (n - 32))) & 0xffff;
-                    if ((unit2 & 0xFC00) == 0xD800) {
+                    if ((unit2 & 0xFC00) == 0xDC00) {
+                        // Hmm ... why was I called?
+                        return UTF7Error.NONE;
+                    } else {
                         // Not trail surrogate: that's the problem
                         return UTF7Error.MISSING;
-                    } else {
-                        // Hmm ... why was I called?
-                        return UTF7Error.NONE;
                     }
 
                 } else {
@@ -885,7 +885,7 @@
      * PyString.)
      *
      * This method differs from the CPython equivalent (in <code>Object/unicodeobject.c</code>)
-     * which works with an array of point codes that are, in a wide build, Unicode code points.
+     * which works with an array of code points that are, in a wide build, Unicode code points.
      *
      * @param unicode
      * @param base64SetO
@@ -965,7 +965,7 @@
                  * representation.
                  */
                 // XXX see issue #2002: we should only count surrogate pairs as one character
-                // if ((ch & 0xFC00)==0xC800) { count++; }
+                // if ((ch & 0xFC00)==0xD800) { count++; }
 
                 if (base64bits > 48) {
                     // No room for the next 16 bits: emit all we have
@@ -1570,8 +1570,8 @@
     }
 
     /**
-     * Handler errors encountered during decoding, adjusting the output buffer contents and
-     * returning the correct position to resume decoding (if the handler does not siomply raise an
+     * Handler for errors encountered during decoding, adjusting the output buffer contents and
+     * returning the correct position to resume decoding (if the handler does not simply raise an
      * exception).
      *
      * @param partialDecode output buffer of unicode (as UTF-16) that the codec is building
@@ -1613,7 +1613,7 @@
      * Invoke a user-defined error-handling mechanism, for errors encountered during decoding, as
      * registered through {@link #register_error(String, PyObject)}. The return value is the return
      * from the error handler indicating the replacement codec output and the the position at which
-     * to resume decoding. invokes the mechanism described in PEP-293.
+     * to resume decoding. Invokes the mechanism described in PEP-293.
      *
      * @param errors name of the error policy (or null meaning "strict")
      * @param encoding name of encoding that encountered the error

-- 
Repository URL: http://hg.python.org/jython