[Jython-checkins] jython: Forbid lone surrogates in PyUnicode. Consequent refactoring.

Wed Sep 17 00:55:26 CEST 2014

http://hg.python.org/jython/rev/4187e256ae1e
changeset:   7386:4187e256ae1e
user:        Jeff Allen <ja.py at farowl.co.uk>
date:        Mon Sep 15 22:33:07 2014 +0100
summary:
  Forbid lone surrogates in PyUnicode. Consequent refactoring.

Surrogate code units are not (easily) compatible with a PyUnicode
implementation that uses a UTF-16 interpretation of String. This change
produces test failures in test_unicodedata, but they should go away when
merged with work on unichr.

files:
  Lib/test/test_unicode_jy.py        |   37 +++++
  src/org/python/core/PyUnicode.java |  118 ++++++++++------
  2 files changed, 112 insertions(+), 43 deletions(-)

diff --git a/Lib/test/test_unicode_jy.py b/Lib/test/test_unicode_jy.py
--- a/Lib/test/test_unicode_jy.py
+++ b/Lib/test/test_unicode_jy.py
@@ -11,6 +11,7 @@
 import unittest
 from StringIO import StringIO
 from test import test_support
+from java.lang import StringBuilder
 
 class UnicodeTestCase(unittest.TestCase):
 
@@ -360,6 +361,42 @@
                 m.insert(t)
                 check_rfind_str(m, t)
 
+    def test_surrogate_validation(self):
+
+        def insert_sb(text, c1, c2):
+            # Insert code points c1, c2 in the text, as a Java StringBuilder
+            sb = StringBuilder()
+            # c1 at the quarter point
+            p1 = len(mat) // 4
+            for c in mat.text[:p1]:
+                sb.appendCodePoint(ord(c))
+            sb.appendCodePoint(c1)
+            # c2 at the three-quarter point
+            p2 = 3 * p1
+            for c in mat.text[p1:p2]:
+                sb.appendCodePoint(ord(c))
+            sb.appendCodePoint(c2)
+            # Rest of text
+            for c in mat.text[p2:]:
+                sb.appendCodePoint(ord(c))
+            return sb
+
+        # Test that lone surrogates are rejected
+        for surr in [0xdc81, 0xdc00, 0xdfff, 0xd800, 0xdbff]:
+            for mat in self.material:
+
+                # Java StringBuilder with two private-use characters:
+                sb = insert_sb(mat.text, 0xe000, 0xf000)
+                # Check this is acceptable
+                #print repr(unicode(sb))
+                self.assertEqual(len(unicode(sb)), len(mat)+2)
+
+                # Java StringBuilder with private-use and lone surrogate:
+                sb = insert_sb(mat.text, 0xe000, surr)
+                # Check this is detected
+                #print repr(unicode(sb))
+                self.assertRaises(ValueError, unicode, sb)
+
 
 class UnicodeFormatTestCase(unittest.TestCase):
 
diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java
--- a/src/org/python/core/PyUnicode.java
+++ b/src/org/python/core/PyUnicode.java
@@ -336,11 +336,12 @@
         int p; // Index of the current UTF-16 code unit.
 
         /*
-         * We scan to the first supplementary character in a simple loop. If we hit the end before
-         * we find one, no count array will be necessary and we'll use BASIC.
+         * We scan to the first surrogate code unit, in a simple loop. If we hit the end before we
+         * find one, no count array will be necessary and we'll use BASIC. If we find a surrogate it
+         * may be half a supplementary character, or a lone surrogate: we'll find out later.
          */
         for (p = 0; p < n; p++) {
-            if (Character.isHighSurrogate(string.charAt(p))) {
+            if (Character.isSurrogate(string.charAt(p))) {
                 break;
             }
         }
@@ -371,20 +372,15 @@
             /*
              * To get the generation of count[] going efficiently, we need to advance the next whole
              * block. The next loop will complete processing of the block containing the first
-             * supplementary character. Note that in all these loops, if we exit on p==n, the count
-             * for the last partial; block is known from p-q and we take care of that right at the
-             * end of this method.
+             * supplementary character. Note that in all these loops, if we exit because p reaches a
+             * limit, the count for the last partial block is known from p-q and we take care of
+             * that right at the end of this method. The limit of these loops is n-1, so if we spot
+             * a lead surrogate, then we may access the low-surrogate confident that p+1<n.
              */
-            while (p < n) {
+            while (p < n - 1) {
 
-                if (Character.isHighSurrogate(string.charAt(p++))) {
-                    // Integrity checks (also advances p past the trailing surrogate)
-                    if (p == n || !Character.isLowSurrogate(string.charAt(p++))) {
-                        // End of string follows or trailing surrogate does not : oops.
-                        throw unpairedLeadSurrogate(n, p - 1);
-                    }
-                }
-
+                // Catch supplementary characters and lone surrogate code units.
+                p += calcAdvance(string, p);
                 // Advance the code point index
                 q += 1;
 
@@ -401,14 +397,10 @@
              * at least one whole block to go when p+2*M<n.
              */
             while (p + 2 * Supplementary.M < n) {
+
                 for (int i = 0; i < Supplementary.M; i++) {
-                    if (Character.isHighSurrogate(string.charAt(p++))) {
-                        // Integrity checks (also advances p past the trailing surrogate)
-                        if (!Character.isLowSurrogate(string.charAt(p++))) {
-                            // A trailing surrogate does not follow : oops.
-                            throw unpairedLeadSurrogate(n, p);
-                        }
-                    }
+                    // Catch supplementary characters and lone surrogate code units.
+                    p += calcAdvance(string, p);
                 }
 
                 // Advance the code point index one whole block
@@ -419,19 +411,12 @@
             }
 
             /*
-             * We take the remaining UTF-16 code units more carefully, as we can not be sure when
-             * the end of the string will come.
+             * Process the remaining UTF-16 code units, except possibly the last.
              */
-            while (p < n) {
+            while (p < n - 1) {
 
-                if (Character.isHighSurrogate(string.charAt(p++))) {
-                    // Integrity checks (also advances p past the trailing surrogate)
-                    if (p == n || !Character.isLowSurrogate(string.charAt(p++))) {
-                        // End of string follows or trailing surrogate does not : oops.
-                        throw unpairedLeadSurrogate(n, p);
-                    }
-                }
-
+                // Catch supplementary characters and lone surrogate code units.
+                p += calcAdvance(string, p);
                 // Advance the code point index
                 q += 1;
 
@@ -442,6 +427,20 @@
             }
 
             /*
+             * There may be just one UTF-16 unit left (if the last thing processed was not a
+             * surrogate pair).
+             */
+            if (p < n) {
+                // We are at the last UTF-16 unit in string. Any surrogate here is an error.
+                char c = string.charAt(p++);
+                if (Character.isSurrogate(c)) {
+                    throw unpairedSurrogate(p - 1, c);
+                }
+                // Advance the code point index
+                q += 1;
+            }
+
+            /*
              * There may still be some elements of count[] we haven't set, so we fill to the end
              * with the total count. This also takes care of an incomplete final block.
              */
@@ -455,20 +454,53 @@
     }
 
     /**
+     * Called at each code point index, returns 2 if this is a surrogate pair, 1 otherwise, and
+     * detects lone surrogates as an error. The return is the amount to advance the UTF-16 index. An
+     * exception is raised if at <code>p</code> we find a lead surrogate without a trailing one
+     * following, or a trailing surrogate directly. It should not be called on the final code unit,
+     * when <code>p==string.length()-1</code>, since it may check the next code unit as well.
+     *
+     * @param string of UTF-16 code units
+     * @param p index into that string
+     * @return 2 if a surrogate pair stands at <code>p</code>, 1 if not
+     * @throws PyException(ValueError) if a lone surrogate stands at <code>p</code>.
+     */
+    private static int calcAdvance(String string, int p) throws PyException {
+
+        // Catch supplementary characters and lone surrogate code units.
+        char c = string.charAt(p);
+
+        if (c >= Character.MIN_SURROGATE) {
+            if (c < Character.MIN_LOW_SURROGATE) {
+                // This is a lead surrogate.
+                if (Character.isLowSurrogate(string.charAt(p + 1))) {
+                    // Required trailing surrogate follows, so step over both.
+                    return 2;
+                } else {
+                    // Required trailing surrogate missing.
+                    throw unpairedSurrogate(p, c);
+                }
+
+            } else if (c <= Character.MAX_SURROGATE) {
+                // This is a lone trailing surrogate
+                throw unpairedSurrogate(p, c);
+
+            } // else this is a private use or special character in 0xE000 to 0xFFFF.
+
+        }
+        return 1;
+    }
+
+    /**
      * Return a ready-to-throw exception indicating an unpaired surrogate.
      *
-     * @param n the UTF-16 length of the array being scanned
-     * @param p pointer within that array
+     * @param p index within the UTF-16 sequence of the problematic code unit
+     * @param c the code unit
      * @return an exception
      */
-    private static PyException unpairedLeadSurrogate(int n, int p) {
-        String msg;
-        if (p + 1 >= n) {
-            msg = "unpaired lead-surrogate at end of string/array";
-        } else {
-            String fmt = "unpaired lead-surrogate at code unit %d";
-            msg = String.format(fmt, p);
-        }
+    private static PyException unpairedSurrogate(int p, int c) {
+        String fmt = "unpaired surrogate %#4x at code unit %d";
+        String msg = String.format(fmt, c, p);
         return Py.ValueError(msg);
     }
 

-- 
Repository URL: http://hg.python.org/jython