[Jython-checkins] jython: Forbid lone surrogates in PyUnicode. Consequent refactoring.
jeff.allen
jython-checkins at python.org
Wed Sep 17 00:55:26 CEST 2014
http://hg.python.org/jython/rev/4187e256ae1e
changeset: 7386:4187e256ae1e
user: Jeff Allen <ja.py at farowl.co.uk>
date: Mon Sep 15 22:33:07 2014 +0100
summary:
Forbid lone surrogates in PyUnicode. Consequent refactoring.
Surrogate code units are not (easily) compatible with a PyUnicode
implementation that uses a UTF-16 interpretation of String. This change
produces test failures in test_unicodedata, but they should go away when
merged with work on unichr.
files:
Lib/test/test_unicode_jy.py | 37 +++++
src/org/python/core/PyUnicode.java | 118 ++++++++++------
2 files changed, 112 insertions(+), 43 deletions(-)
diff --git a/Lib/test/test_unicode_jy.py b/Lib/test/test_unicode_jy.py
--- a/Lib/test/test_unicode_jy.py
+++ b/Lib/test/test_unicode_jy.py
@@ -11,6 +11,7 @@
import unittest
from StringIO import StringIO
from test import test_support
+from java.lang import StringBuilder
class UnicodeTestCase(unittest.TestCase):
@@ -360,6 +361,42 @@
m.insert(t)
check_rfind_str(m, t)
+ def test_surrogate_validation(self):
+
+ def insert_sb(text, c1, c2):
+ # Insert code points c1, c2 in the text, as a Java StringBuilder
+ sb = StringBuilder()
+ # c1 at the quarter point
+ p1 = len(mat) // 4
+ for c in mat.text[:p1]:
+ sb.appendCodePoint(ord(c))
+ sb.appendCodePoint(c1)
+ # c2 at the three-quarter point
+ p2 = 3 * p1
+ for c in mat.text[p1:p2]:
+ sb.appendCodePoint(ord(c))
+ sb.appendCodePoint(c2)
+ # Rest of text
+ for c in mat.text[p2:]:
+ sb.appendCodePoint(ord(c))
+ return sb
+
+ # Test that lone surrogates are rejected
+ for surr in [0xdc81, 0xdc00, 0xdfff, 0xd800, 0xdbff]:
+ for mat in self.material:
+
+ # Java StringBuilder with two private-use characters:
+ sb = insert_sb(mat.text, 0xe000, 0xf000)
+ # Check this is acceptable
+ #print repr(unicode(sb))
+ self.assertEqual(len(unicode(sb)), len(mat)+2)
+
+ # Java StringBuilder with private-use and lone surrogate:
+ sb = insert_sb(mat.text, 0xe000, surr)
+ # Check this is detected
+ #print repr(unicode(sb))
+ self.assertRaises(ValueError, unicode, sb)
+
class UnicodeFormatTestCase(unittest.TestCase):
diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java
--- a/src/org/python/core/PyUnicode.java
+++ b/src/org/python/core/PyUnicode.java
@@ -336,11 +336,12 @@
int p; // Index of the current UTF-16 code unit.
/*
- * We scan to the first supplementary character in a simple loop. If we hit the end before
- * we find one, no count array will be necessary and we'll use BASIC.
+ * We scan to the first surrogate code unit, in a simple loop. If we hit the end before we
+ * find one, no count array will be necessary and we'll use BASIC. If we find a surrogate it
+ * may be half a supplementary character, or a lone surrogate: we'll find out later.
*/
for (p = 0; p < n; p++) {
- if (Character.isHighSurrogate(string.charAt(p))) {
+ if (Character.isSurrogate(string.charAt(p))) {
break;
}
}
@@ -371,20 +372,15 @@
/*
* To get the generation of count[] going efficiently, we need to advance the next whole
* block. The next loop will complete processing of the block containing the first
- * supplementary character. Note that in all these loops, if we exit on p==n, the count
- * for the last partial; block is known from p-q and we take care of that right at the
- * end of this method.
+ * supplementary character. Note that in all these loops, if we exit because p reaches a
+ * limit, the count for the last partial block is known from p-q and we take care of
+ * that right at the end of this method. The limit of these loops is n-1, so if we spot
+ * a lead surrogate, the we may access the low-surrogate confident that p+1<n.
*/
- while (p < n) {
+ while (p < n - 1) {
- if (Character.isHighSurrogate(string.charAt(p++))) {
- // Integrity checks (also advances p past the trailing surrogate)
- if (p == n || !Character.isLowSurrogate(string.charAt(p++))) {
- // End of string follows or trailing surrogate does not : oops.
- throw unpairedLeadSurrogate(n, p - 1);
- }
- }
-
+ // Catch supplementary characters and lone surrogate code units.
+ p += calcAdvance(string, p);
// Advance the code point index
q += 1;
@@ -401,14 +397,10 @@
* at least one whole block to go when p+2*M<n.
*/
while (p + 2 * Supplementary.M < n) {
+
for (int i = 0; i < Supplementary.M; i++) {
- if (Character.isHighSurrogate(string.charAt(p++))) {
- // Integrity checks (also advances p past the trailing surrogate)
- if (!Character.isLowSurrogate(string.charAt(p++))) {
- // A trailing surrogate does not follow : oops.
- throw unpairedLeadSurrogate(n, p);
- }
- }
+ // Catch supplementary characters and lone surrogate code units.
+ p += calcAdvance(string, p);
}
// Advance the code point index one whole block
@@ -419,19 +411,12 @@
}
/*
- * We take the remaining UTF-16 code units more carefully, as we can not be sure when
- * the end of the string will come.
+ * Process the remaining UTF-16 code units, except possibly the last.
*/
- while (p < n) {
+ while (p < n - 1) {
- if (Character.isHighSurrogate(string.charAt(p++))) {
- // Integrity checks (also advances p past the trailing surrogate)
- if (p == n || !Character.isLowSurrogate(string.charAt(p++))) {
- // End of string follows or trailing surrogate does not : oops.
- throw unpairedLeadSurrogate(n, p);
- }
- }
-
+ // Catch supplementary characters and lone surrogate code units.
+ p += calcAdvance(string, p);
// Advance the code point index
q += 1;
@@ -442,6 +427,20 @@
}
/*
+ * There may be just one UTF-16 unit left (if the last thing processed was not a
+ * surrogate pair).
+ */
+ if (p < n) {
+ // We are at the last UTF-16 unit in string. Any surrogate here is an error.
+ char c = string.charAt(p++);
+ if (Character.isSurrogate(c)) {
+ throw unpairedSurrogate(p - 1, c);
+ }
+ // Advance the code point index
+ q += 1;
+ }
+
+ /*
* There may still be some elements of count[] we haven't set, so we fill to the end
* with the total count. This also takes care of an incomplete final block.
*/
@@ -455,20 +454,53 @@
}
/**
+ * Called at each code point index, returns 2 if this is a surrogate pair, 1 otherwise, and
+ * detects lone surrogates as an error. The return is the amount to advance the UTF-16 index. An
+ * exception is raised if at <code>p</code> we find a lead surrogate without a trailing one
+ * following, or a trailing surrogate directly. It should not be called on the final code unit,
+ * when <code>p==string.length()-1</code>, since it may check the next code unit as well.
+ *
+ * @param string of UTF-16 code units
+ * @param p index into that string
+ * @return 2 if a surrogate pair stands at <code>p</code>, 1 if not
+ * @throws PyException(ValueError) if a lone surrogate stands at <code>p</code>.
+ */
+ private static int calcAdvance(String string, int p) throws PyException {
+
+ // Catch supplementary characters and lone surrogate code units.
+ char c = string.charAt(p);
+
+ if (c >= Character.MIN_SURROGATE) {
+ if (c < Character.MIN_LOW_SURROGATE) {
+ // This is a lead surrogate.
+ if (Character.isLowSurrogate(string.charAt(p + 1))) {
+ // Required trailing surrogate follows, so step over both.
+ return 2;
+ } else {
+ // Required trailing surrogate missing.
+ throw unpairedSurrogate(p, c);
+ }
+
+ } else if (c <= Character.MAX_SURROGATE) {
+ // This is a lone trailing surrogate
+ throw unpairedSurrogate(p, c);
+
+ } // else this is a private use or special character in 0xE000 to 0xFFFF.
+
+ }
+ return 1;
+ }
+
+ /**
* Return a ready-to-throw exception indicating an unpaired surrogate.
*
- * @param n the UTF-16 length of the array being scanned
- * @param p pointer within that array
+ * @param p index within that sequence of the problematic code unit
+ * @param c the code unit
* @return an exception
*/
- private static PyException unpairedLeadSurrogate(int n, int p) {
- String msg;
- if (p + 1 >= n) {
- msg = "unpaired lead-surrogate at end of string/array";
- } else {
- String fmt = "unpaired lead-surrogate at code unit %d";
- msg = String.format(fmt, p);
- }
+ private static PyException unpairedSurrogate(int p, int c) {
+ String fmt = "unpaired surrogate %#4x at code unit %d";
+ String msg = String.format(fmt, c, p);
return Py.ValueError(msg);
}
--
Repository URL: http://hg.python.org/jython
More information about the Jython-checkins
mailing list