[Jython-checkins] jython (merge default -> default): Merge #2364 fixes
jeff.allen
jython-checkins at python.org
Fri Sep 11 00:58:40 CEST 2015
https://hg.python.org/jython/rev/7d55b82d5842
changeset: 7730:7d55b82d5842
parent: 7724:45a9d8f613b9
parent: 7729:a77dad1d7050
user: Jeff Allen <ja.py at farowl.co.uk>
date: Thu Sep 10 23:56:36 2015 +0100
summary:
Merge #2364 fixes
files:
Lib/test/test_bytes.py | 18 +-
Lib/test/test_bytes_jy.py | 63 +++
NEWS | 1 +
src/org/python/core/BaseBytes.java | 343 ++++++++++------
src/org/python/core/PyModule.java | 8 +-
src/org/python/core/PyString.java | 264 ++++++------
src/org/python/core/PyUnicode.java | 140 +++---
7 files changed, 472 insertions(+), 365 deletions(-)
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -357,11 +357,7 @@
self.assertEqual(b.split(None, 2), [b'arf', b'barf'])
for b in (b'a\x1Cb', b'a\x1Db', b'a\x1Eb', b'a\x1Fb'):
b = self.type2test(b)
- if not test.test_support.is_jython:
- self.assertEqual(b.split(), [b])
- else:
- # \x1c .. \x1f are whitespace Jython (which follows Java)
- self.assertEqual(b.split(), [b'a', b'b'])
+ self.assertEqual(b.split(), [b])
self.assertEqual(self.type2test(b' a bb c ').split(None, 0), [b'a bb c '])
self.assertEqual(self.type2test(b' a bb c ').split(None, 1), [b'a', b'bb c '])
self.assertEqual(self.type2test(b' a bb c ').split(None, 2), [b'a', b'bb', b'c '])
@@ -372,11 +368,7 @@
def test_split_unicodewhitespace(self):
b = self.type2test(b"\x09\x0A\x0B\x0C\x0D\x1C\x1D\x1E\x1F")
- if not test.test_support.is_jython:
- self.assertEqual(b.split(), [b'\x1c\x1d\x1e\x1f'])
- else:
- # \x1c .. \x1f are whitespace Jython
- self.assertEqual(b.split(), [])
+ self.assertEqual(b.split(), [b'\x1c\x1d\x1e\x1f'])
def test_rsplit(self):
b = self.type2test(b'mississippi')
@@ -401,11 +393,7 @@
def test_rsplit_unicodewhitespace(self):
b = self.type2test(b"\x09\x0A\x0B\x0C\x0D\x1C\x1D\x1E\x1F")
- if not test.test_support.is_jython:
- self.assertEqual(b.rsplit(), [b'\x1c\x1d\x1e\x1f'])
- else:
- # \x1c .. \x1f are whitespace Jython
- self.assertEqual(b.rsplit(), [])
+ self.assertEqual(b.rsplit(), [b'\x1c\x1d\x1e\x1f'])
def test_partition(self):
b = self.type2test(b'mississippi')
diff --git a/Lib/test/test_bytes_jy.py b/Lib/test/test_bytes_jy.py
--- a/Lib/test/test_bytes_jy.py
+++ b/Lib/test/test_bytes_jy.py
@@ -51,6 +51,69 @@
for n in range(-1, 3) :
irepeat_export(b'', n)
+ # The following test_is* tests supplement string_tests for non-ascii examples.
+ # The principle is to choose some character codes that are letters, digits
+ # or spaces in Unicode but not in ASCII and check they are *not* categorised
+ # as such in a byte context.
+
+ def checkequal(self, expected, obj, methodname, *args):
+ "check that object.method() returns expected result"
+ for B in (bytearray,): # (bytes, bytearray):
+ obj = B(obj)
+ realresult = getattr(obj, methodname)()
+ grumble = "%r.%s() returned %s" % (obj, methodname, realresult)
+ self.assertIs(expected, realresult, grumble)
+ # print grumble, 'x' if realresult != expected else '.'
+
+ LOWER = b'\xe0\xe7\xe9\xff' # Lowercase in Latin-1 but not ascii
+ UPPER = b'\xc0\xc7\xc9\xdd' # Uppercase in Latin-1 but not ascii
+ DIGIT = b'\xb9\xb2\xb3' # sup 1, 2, 3: numeric in Python (not Java)
+ SPACE = b'\x85\xa0' # NEXT LINE, NBSP: space in Python (not Java)
+
+ def test_isalpha(self):
+ for c in self.UPPER + self.LOWER:
+ self.checkequal(False, c, 'isalpha')
+ self.checkequal(False, b'a' + c + b'Z', 'isalpha')
+
+ def test_isdigit(self):
+ for c in self.DIGIT:
+ self.checkequal(False, c, 'isdigit')
+ self.checkequal(False, b'1' + c + b'3', 'isdigit')
+
+ def test_islower(self):
+ for c in self.LOWER:
+ self.checkequal(False, c, 'islower')
+ for c in self.UPPER:
+ self.checkequal(True, b'a' + c + b'z', 'islower')
+
+ def test_isupper(self):
+ for c in self.UPPER:
+ self.checkequal(False, c, 'isupper')
+ for c in self.LOWER:
+ self.checkequal(True, b'A' + c + b'Z', 'isupper')
+
+ def test_isspace(self):
+ for c in self.SPACE:
+ self.checkequal(False, c, 'isspace')
+ self.checkequal(False, b'\t' + c + b' ', 'isspace')
+
+ def test_isalnum(self):
+ for c in self.UPPER + self.LOWER + self.DIGIT:
+ self.checkequal(False, c, 'isalnum')
+ self.checkequal(False, b'a' + c + b'3', 'isalnum')
+
+ def test_istitle(self):
+ for c in self.UPPER:
+ # c should be an un-cased character (effectively a space)
+ self.checkequal(False, c, 'istitle')
+ self.checkequal(True, b'A' + c + b'Titlecased Line', 'istitle')
+ self.checkequal(True, b'A' + c + b' Titlecased Line', 'istitle')
+ self.checkequal(True, b'A ' + c + b'Titlecased Line', 'istitle')
+ for c in self.LOWER:
+ # c should be an un-cased character (effectively a space)
+ self.checkequal(True, b'A' + c + b'Titlecased Line', 'istitle')
+ self.checkequal(True, b'A ' + c + b' Titlecased Line', 'istitle')
+
def test_main():
test.test_support.run_unittest(
diff --git a/NEWS b/NEWS
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,7 @@
- [ 2158, 2259 ] Fixed behaviour of relative from ... import *
- [ 1879 ] -m command now executes scripts from inside a jar file
- [ 2058 ] ClasspathPyImporter implements PEP 302 get_data (and others)
+ - [ 2364 ] bytearray and str: isalpha(), isupper() etc. now match Python 2
Jython 2.7
same as 2.7rc3
diff --git a/src/org/python/core/BaseBytes.java b/src/org/python/core/BaseBytes.java
--- a/src/org/python/core/BaseBytes.java
+++ b/src/org/python/core/BaseBytes.java
@@ -1737,9 +1737,9 @@
*/
protected int lstripIndex() {
int limit = offset + size;
- // Run up the storage until non-whitespace (or hit end)t
+ // Run up the storage until non-whitespace (or hit end)
for (int left = offset; left < limit; left++) {
- if (!Character.isWhitespace(storage[left] & 0xff)) {
+ if (!isspace(storage[left])) {
return left - offset;
}
}
@@ -1777,7 +1777,7 @@
protected int rstripIndex() {
// Run down the storage until next is non-whitespace (or hit start)
for (int right = offset + size; right > offset; --right) {
- if (!Character.isWhitespace(storage[right - 1] & 0xff)) {
+ if (!isspace(storage[right - 1])) {
return right - offset;
}
}
@@ -2604,7 +2604,7 @@
// Scan backwards over trailing whitespace
for (q = offset + size; q > offset; --q) {
- if (!Character.isWhitespace(storage[q - 1] & 0xff)) {
+ if (!isspace(storage[q - 1])) {
break;
}
}
@@ -2617,7 +2617,7 @@
// Delimit the word whose last byte is storage[q-1]
// Skip p backwards over the non-whitespace
for (p = q; p > offset; --p) {
- if (Character.isWhitespace(storage[p - 1] & 0xff)) {
+ if (isspace(storage[p - 1])) {
break;
}
}
@@ -2626,7 +2626,7 @@
result.add(0, word);
// Skip q backwards over the whitespace
for (q = p; q > offset; --q) {
- if (!Character.isWhitespace(storage[q - 1] & 0xff)) {
+ if (!isspace(storage[q - 1])) {
break;
}
}
@@ -2795,7 +2795,7 @@
int p, q; // Indexes of unsplit text and whitespace
// Scan over leading whitespace
- for (p = offset; p < limit && Character.isWhitespace(storage[p] & 0xff); p++) {
+ for (p = offset; p < limit && isspace(storage[p]); p++) {
; // continue
}
@@ -2807,13 +2807,13 @@
// Delimit a word at p
// storage[p] is not whitespace or at the limit: it is the start of a word
// Skip q over the non-whitespace at p
- for (q = p; q < limit && !Character.isWhitespace(storage[q] & 0xff); q++) {
+ for (q = p; q < limit && !isspace(storage[q]); q++) {
; // continue
}
// storage[q] is whitespace or it is at the limit
result.append(getslice(p - offset, q - offset));
// Skip p over the whitespace at q
- for (p = q; p < limit && Character.isWhitespace(storage[p] & 0xff); p++) {
+ for (p = q; p < limit && isspace(storage[p]); p++) {
; // continue
}
}
@@ -3089,11 +3089,68 @@
// Character class operations
//
+
+ // Bit to twiddle (XOR) for lowercase letter to uppercase and vice-versa.
+ private static final int SWAP_CASE = 0x20;
+
+ // Bit masks and sets to use with the byte classification table
+ private static final byte UPPER = 0b1;
+ private static final byte LOWER = 0b10;
+ private static final byte DIGIT = 0b100;
+ private static final byte SPACE = 0b1000;
+ private static final byte ALPHA = UPPER | LOWER;
+ private static final byte ALNUM = ALPHA | DIGIT;
+
+ // Character (byte) classification table.
+ private static final byte[] ctype = new byte[256];
+ static {
+ for (int c = 'A'; c <= 'Z'; c++) {
+ ctype[0x80 + c] = UPPER;
+ ctype[0x80 + SWAP_CASE + c] = LOWER;
+ }
+ for (int c = '0'; c <= '9'; c++) {
+ ctype[0x80 + c] = DIGIT;
+ }
+ for (char c : " \t\n\u000b\f\r".toCharArray()) {
+ ctype[0x80 + c] = SPACE;
+ }
+ }
+
+ /** @return 'A'<= b <='Z'. */
+ static final boolean isupper(byte b) {
+ return (ctype[0x80 + b] & UPPER) != 0;
+ }
+
+ /** @return 'a'<= b <='z'. */
+ static final boolean islower(byte b) {
+ return (ctype[0x80 + b] & LOWER) != 0;
+ }
+
+ /** @return 'A'<= b <='Z' or 'a'<= b <='z'. */
+ static final boolean isalpha(byte b) {
+ return (ctype[0x80 + b] & ALPHA) != 0;
+ }
+
+ /** @return '0'<= b <='9'. */
+ static final boolean isdigit(byte b) {
+ return (ctype[0x80 + b] & DIGIT) != 0;
+ }
+
+ /** @return 'A'<= b <='Z' or 'a'<= b <='z' or '0'<= b <='9'. */
+ static final boolean isalnum(byte b) {
+ return (ctype[0x80 + b] & ALNUM) != 0;
+ }
+
+ /** @return b in ' \t\n\v\f\r' */
+ static final boolean isspace(byte b) {
+ return (ctype[0x80 + b] & SPACE) != 0;
+ }
+
/**
- * Java API equivalent of Python <code>isalnum()</code>. This method treats the bytes as Unicode
- * pont codes and is consistent with Java's {@link Character#isLetterOrDigit(char)}.
+ * Java API equivalent of Python <code>isalnum()</code>. This method treats the bytes as
+ * US-ASCII code points.
*
- * @return true if all bytes in the array are point codes for alphanumerics and there is at
+ * @return true if all bytes in the array are code points for alphanumerics and there is at
* least one byte, false otherwise.
*/
public boolean isalnum() {
@@ -3103,27 +3160,28 @@
/**
* Ready-to-expose implementation of Python <code>isalnum()</code>.
*
- * @return true if all bytes in the array are point codes for alphanumerics and there is at
+ * @return true if all bytes in the array are code points for alphanumerics and there is at
* least one byte, false otherwise.
*/
final boolean basebytes_isalnum() {
- if (size <= 0) {
- // Treat empty string as special case
- return false;
+ if (size == 1) {
+ // Special case strings of length one (i.e. characters)
+ return isalnum(storage[offset]);
} else {
- // Test the bytes
+ // Work through the bytes, stopping early if the test is false.
for (int i = 0; i < size; i++) {
- if (!Character.isLetterOrDigit(charAt(i))) {
+ if (!isalnum(storage[offset + i])) {
return false;
}
}
- return true;
+ // Result is true if we reached the end (and there were some bytes)
+ return size > 0;
}
}
/**
- * Java API equivalent of Python <code>isalpha()</code>. This method treats the bytes as Unicode
- * pont codes and is consistent with Java's {@link Character#isLetter(char)}.
+ * Java API equivalent of Python <code>isalpha()</code>. This method treats the bytes as
+ * US-ASCII code points.
*
* @return true if all bytes in the array are alphabetic and there is at least one byte, false
* otherwise
@@ -3139,25 +3197,26 @@
* otherwise
*/
final boolean basebytes_isalpha() {
- if (size <= 0) {
- // Treat empty string as special case
- return false;
+ if (size == 1) {
+ // Special case strings of length one (i.e. characters)
+ return isalpha(storage[offset]);
} else {
- // Test the bytes
+ // Work through the bytes, stopping early if the test is false.
for (int i = 0; i < size; i++) {
- if (!Character.isLetter(charAt(i))) {
+ if (!isalpha(storage[offset + i])) {
return false;
}
}
- return true;
+ // Result is true if we reached the end (and there were some bytes)
+ return size > 0;
}
}
/**
- * Java API equivalent of Python <code>isdigit()</code>. This method treats the bytes as Unicode
- * pont codes and is consistent with Java's {@link Character#isDigit(char)}.
+ * Java API equivalent of Python <code>isdigit()</code>. This method treats the bytes as
+ * US-ASCII code points.
*
- * @return true if all bytes in the array are point codes for digits and there is at least one
+ * @return true if all bytes in the array are code points for digits and there is at least one
* byte, false otherwise.
*/
public boolean isdigit() {
@@ -3167,29 +3226,30 @@
/**
* Ready-to-expose implementation of Python <code>isdigit()</code>.
*
- * @return true if all bytes in the array are point codes for digits and there is at least one
+ * @return true if all bytes in the array are code points for digits and there is at least one
* byte, false otherwise.
*/
final boolean basebytes_isdigit() {
- if (size <= 0) {
- // Treat empty string as special case
- return false;
+ if (size == 1) {
+ // Special case strings of length one (i.e. characters)
+ return isdigit(storage[offset]);
} else {
- // Test the bytes
+ // Work through the bytes, stopping early if the test is false.
for (int i = 0; i < size; i++) {
- if (!Character.isDigit(charAt(i))) {
+ if (!isdigit(storage[offset + i])) {
return false;
}
}
- return true;
+ // Result is true if we reached the end (and there were some bytes)
+ return size > 0;
}
}
/**
- * Java API equivalent of Python <code>islower()</code>. This method treats the bytes as Unicode
- * pont codes and is consistent with Java's {@link Character#isLowerCase(char)}.
+ * Java API equivalent of Python <code>islower()</code>. This method treats the bytes as
+ * US-ASCII code points.
*
- * @return true if all cased bytes in the array are point codes for lowercase characters and
+ * @return true if all cased bytes in the array are code points for lowercase characters and
* there is at least one cased byte, false otherwise.
*/
public boolean islower() {
@@ -3199,31 +3259,46 @@
/**
* Ready-to-expose implementation of Python <code>islower()</code>.
*
- * @return true if all cased bytes in the array are point codes for lowercase characters and
+ * @return true if all cased bytes in the array are code points for lowercase characters and
* there is at least one cased byte, false otherwise.
*/
final boolean basebytes_islower() {
- boolean hasCased = false;
- // Test the bytes
- for (int i = 0; i < size; i++) {
- char c = charAt(i);
- if (Character.isUpperCase(c)) {
+ if (size == 1) {
+ // Special case strings of length one (i.e. characters)
+ return islower(storage[offset]);
+
+ } else {
+ int i;
+ byte c = 0;
+ // Test the bytes until a cased byte is encountered
+ for (i = 0; i < size; i++) {
+ if (isalpha(c = storage[offset + i])) {
+ break;
+ }
+ }
+
+ if (i == size || isupper(c)) {
+ // We reached the end without finding a cased byte, or it was upper case.
return false;
- } else if (hasCased) {
- continue; // Don't need to keep checking for cased characters
- } else if (Character.isLowerCase(c)) {
- hasCased = true;
}
+
+ // Continue to end or until an upper case byte is encountered
+ for (i = i + 1; i < size; i++) {
+ if (isupper(storage[offset + i])) {
+ return false;
+ }
+ }
+
+ // Found no upper case bytes, and at least one lower case byte.
+ return true;
}
- // Found no upper case bytes, but did we find any cased bytes at all?
- return hasCased;
}
/**
- * Java API equivalent of Python <code>isspace()</code>. This method treats the bytes as Unicode
- * pont codes and is consistent with Java's {@link Character#isWhitespace(char)}.
+ * Java API equivalent of Python <code>isspace()</code>. This method treats the bytes as
+ * US-ASCII code points.
*
- * @return true if all the bytes in the array are point codes for whitespace characters and
+ * @return true if all the bytes in the array are code points for whitespace characters and
* there is at least one byte, false otherwise.
*/
public boolean isspace() {
@@ -3233,28 +3308,28 @@
/**
* Ready-to-expose implementation of Python <code>isspace()</code>.
*
- * @return true if all the bytes in the array are point codes for whitespace characters and
+ * @return true if all the bytes in the array are code points for whitespace characters and
* there is at least one byte, false otherwise.
*/
final boolean basebytes_isspace() {
- if (size <= 0) {
- // Treat empty string as special case
- return false;
+ if (size == 1) {
+ // Special case strings of length one (i.e. characters)
+ return isspace(storage[offset]);
} else {
- // Test the bytes
+ // Work through the bytes, stopping early if the test is false.
for (int i = 0; i < size; i++) {
- if (!Character.isWhitespace(charAt(i))) {
+ if (!isspace(storage[offset + i])) {
return false;
}
}
- return true;
+ // Result is true if we reached the end (and there were some bytes)
+ return size > 0;
}
}
/**
- * Java API equivalent of Python <code>istitle()</code>. This method treats the bytes as Unicode
- * pont codes and is consistent with Java's {@link Character#isUpperCase(char)} and
- * {@link Character#isLowerCase(char)}.
+ * Java API equivalent of Python <code>istitle()</code>. This method treats the bytes as
+ * US-ASCII code points.
*
* @return true if the string is a titlecased string and there is at least one cased byte, for
* example uppercase characters may only follow uncased bytes and lowercase characters
@@ -3279,8 +3354,8 @@
// 2 = in a word (hence have have seen cased character)
for (int i = 0; i < size; i++) {
- char c = charAt(i);
- if (Character.isUpperCase(c)) {
+ byte c = storage[offset+i];
+ if (isupper(c)) {
if (state == 2) {
// Violation: can't continue a word in upper case
return false;
@@ -3288,7 +3363,7 @@
// Validly in a word
state = 2;
}
- } else if (Character.isLowerCase(c)) {
+ } else if (islower(c)) {
if (state != 2) {
// Violation: can't start a word in lower case
return false;
@@ -3305,10 +3380,10 @@
}
/**
- * Java API equivalent of Python <code>isupper()</code>. This method treats the bytes as Unicode
- * pont codes and is consistent with Java's {@link Character#isUpperCase(char)}.
+ * Java API equivalent of Python <code>isupper()</code>. This method treats the bytes as
+ * US-ASCII code points.
*
- * @return true if all cased bytes in the array are point codes for uppercase characters and
+ * @return true if all cased bytes in the array are code points for uppercase characters and
* there is at least one cased byte, false otherwise.
*/
public boolean isupper() {
@@ -3318,24 +3393,39 @@
/**
* Ready-to-expose implementation of Python <code>isupper()</code>.
*
- * @return true if all cased bytes in the array are point codes for uppercase characters and
+ * @return true if all cased bytes in the array are code points for uppercase characters and
* there is at least one cased byte, false otherwise.
*/
final boolean basebytes_isupper() {
- boolean hasCased = false;
- // Test the bytes
- for (int i = 0; i < size; i++) {
- char c = charAt(i);
- if (Character.isLowerCase(c)) {
+ if (size == 1) {
+ // Special case strings of length one (i.e. characters)
+ return isupper(storage[offset]);
+
+ } else {
+ int i;
+ byte c = 0;
+ // Test the bytes until a cased byte is encountered
+ for (i = 0; i < size; i++) {
+ if (isalpha(c = storage[offset + i])) {
+ break;
+ }
+ }
+
+ if (i == size || islower(c)) {
+ // We reached the end without finding a cased byte, or it was lower case.
return false;
- } else if (hasCased) {
- continue; // Don't need to keep checking for cased characters
- } else if (Character.isUpperCase(c)) {
- hasCased = true;
}
+
+ // Continue to end or until a lower case byte is encountered
+ for (i = i + 1; i < size; i++) {
+ if (islower(storage[offset + i])) {
+ return false;
+ }
+ }
+
+ // Found no lower case bytes, and at least one upper case byte.
+ return true;
}
- // Found no lower case bytes, but did we find any cased bytes at all?
- return hasCased;
}
//
@@ -3344,9 +3434,8 @@
/**
* Java API equivalent of Python <code>capitalize()</code>. This method treats the bytes as
- * Unicode pont codes and is consistent with Java's {@link Character#toUpperCase(char)} and
- * {@link Character#toLowerCase(char)}. The <code>BaseBytes</code> returned by this method has
- * the same actual type as <code>this/self</code>.
+ * US-ASCII code points. The <code>BaseBytes</code> returned by this method has the same actual
+ * type as <code>this/self</code>.
*
* @return a copy of the array with its first character capitalized and the rest lowercased.
*/
@@ -3367,21 +3456,21 @@
if (size > 0) {
// Treat first character
- char c = charAt(0);
- if (Character.isLowerCase(c)) {
- c = Character.toUpperCase(c);
+ byte c = storage[offset];
+ if (islower(c)) {
+ c ^= SWAP_CASE; // 'a' -> 'A', etc.
}
// Put the adjusted character in the output as a byte
- builder.append((byte)c);
+ builder.append(c);
// Treat the rest
for (int i = 1; i < size; i++) {
- c = charAt(i);
- if (Character.isUpperCase(c)) {
- c = Character.toLowerCase(c);
+ c = storage[offset+i];
+ if (isupper(c)) {
+ c ^= SWAP_CASE; // 'A' -> 'a', etc.
}
// Put the adjusted character in the output as a byte
- builder.append((byte)c);
+ builder.append(c);
}
}
@@ -3389,9 +3478,8 @@
}
/**
- * Java API equivalent of Python <code>lower()</code>. This method treats the bytes as Unicode
- * pont codes and is consistent with Java's {@link Character#toLowerCase(char)}. The
- * <code>BaseBytes</code> returned by this method has the same actual type as
+ * Java API equivalent of Python <code>lower()</code>. This method treats the bytes as US-ASCII
+ * code points. The <code>BaseBytes</code> returned by this method has the same actual type as
* <code>this/self</code>.
*
* @return a copy of the array with all the cased characters converted to lowercase.
@@ -3411,12 +3499,12 @@
Builder builder = getBuilder(size);
for (int i = 0; i < size; i++) {
- char c = charAt(i);
- if (Character.isUpperCase(c)) {
- c = Character.toLowerCase(c);
+ byte c = storage[offset+i];
+ if (isupper(c)) {
+ c ^= SWAP_CASE; // 'A' -> 'a', etc.
}
// Put the adjusted character in the output as a byte
- builder.append((byte)c);
+ builder.append(c);
}
return builder.getResult();
@@ -3424,9 +3512,8 @@
/**
* Java API equivalent of Python <code>swapcase()</code>. This method treats the bytes as
- * Unicode pont codes and is consistent with Java's {@link Character#toUpperCase(char)} and
- * {@link Character#toLowerCase(char)}. The <code>BaseBytes</code> returned by this method has
- * the same actual type as <code>this/self</code>.
+ * US-ASCII code points. The <code>BaseBytes</code> returned by this method has the same actual
+ * type as <code>this/self</code>.
*
* @return a copy of the array with uppercase characters converted to lowercase and vice versa.
*/
@@ -3445,14 +3532,12 @@
Builder builder = getBuilder(size);
for (int i = 0; i < size; i++) {
- char c = charAt(i);
- if (Character.isUpperCase(c)) {
- c = Character.toLowerCase(c);
- } else if (Character.isLowerCase(c)) {
- c = Character.toUpperCase(c);
+ byte c = storage[offset+i];
+ if (isalpha(c)) {
+ c ^= SWAP_CASE; // 'a' -> 'A', 'A' -> 'a', etc.
}
// Put the adjusted character in the output as a byte
- builder.append((byte)c);
+ builder.append(c);
}
return builder.getResult();
@@ -3485,27 +3570,27 @@
boolean inWord = false; // We begin, not in a word (sequence of cased characters)
for (int i = 0; i < size; i++) {
- char c = charAt(i);
+ byte c = storage[offset+i];
if (!inWord) {
// When we are not in a word ...
- if (Character.isLowerCase(c)) {
- c = Character.toUpperCase(c); // ... a lowercase letter must be upcased
+ if (islower(c)) {
+ c ^= SWAP_CASE; // ... a lowercase letter must be upcased
inWord = true; // and it starts a word.
- } else if (Character.isUpperCase(c)) {
+ } else if (isupper(c)) {
inWord = true; // ... an uppercase letter just starts the word
}
} else {
// When we are in a word ...
- if (Character.isUpperCase(c)) {
- c = Character.toLowerCase(c); // ... an uppercase letter must be downcased
- } else if (!Character.isLowerCase(c)) {
+ if (isupper(c)) {
+ c ^= SWAP_CASE; // ... an uppercase letter must be downcased
+ } else if (!islower(c)) {
inWord = false; // ... and a non-letter ends the word
}
}
// Put the adjusted character in the output as a byte
- builder.append((byte)c);
+ builder.append(c);
}
return builder.getResult();
}
@@ -3533,12 +3618,12 @@
Builder builder = getBuilder(size);
for (int i = 0; i < size; i++) {
- char c = charAt(i);
- if (Character.isLowerCase(c)) {
- c = Character.toUpperCase(c);
+ byte c = storage[offset+i];
+ if (islower(c)) {
+ c ^= SWAP_CASE; // 'a' -> 'A' etc.
}
// Put the adjusted character in the output as a byte
- builder.append((byte)c);
+ builder.append(c);
}
return builder.getResult();
@@ -3575,18 +3660,6 @@
}
/**
- * Return the Python byte (in range 0 to 255 inclusive) at the given index, interpreted as an
- * unsigned point code, without checking the index.
- *
- * @param index of value in byte array
- * @return the char value at the index
- * @throws IndexOutOfBoundsException if outside storage array
- */
- private final char charAt(int index) throws IndexOutOfBoundsException {
- return (char)(0xff & storage[index + offset]);
- }
-
- /**
* Helper to implement {@link #repeat(int)}. Use something like:
*
* <pre>
@@ -3638,7 +3711,7 @@
/**
* Almost ready-to-expose Python <code>__repr__()</code>, based on treating the bytes as point
- * codes. The value added by this method is conversion of non-printing point codes to
+ * codes. The value added by this method is conversion of non-printing code points to
* hexadecimal escapes in printable ASCII, and bracketed by the given before and after strings.
* These are used to get the required presentation:
*
diff --git a/src/org/python/core/PyModule.java b/src/org/python/core/PyModule.java
--- a/src/org/python/core/PyModule.java
+++ b/src/org/python/core/PyModule.java
@@ -106,15 +106,11 @@
PyObject modules = Py.getSystemState().modules;
PyObject attr = modules.__finditem__(fullName);
- if (path == Py.None) {
- // XXX: disabled
- //attr = imp.loadFromClassLoader(fullName,
- // Py.getSystemState().getClassLoader());
- } else if (path instanceof PyList) {
+ if (path instanceof PyList) {
if (attr == null) {
attr = imp.find_module(name, fullName, (PyList)path);
}
- } else {
+ } else if (path != Py.None) {
throw Py.TypeError("__path__ must be list or None");
}
diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java
--- a/src/org/python/core/PyString.java
+++ b/src/org/python/core/PyString.java
@@ -6,10 +6,10 @@
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.List;
+import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import java.util.List;
-import java.util.Locale;
import org.python.core.buffer.BaseBuffer;
import org.python.core.buffer.SimpleStringBuffer;
@@ -2662,8 +2662,8 @@
* Return the (lazily) compiled regular expression for a Python complex number. This is used
* within the regular expression patterns that define a priori acceptable strings in the complex
* constructors. The expression contributes five named capture groups a, b, x, y and j. x and y
- * are the two floats encountered, and if j is present, one of them is the imaginary part.
- * a and b are the optional parentheses. They must either both be present or both omitted.
+ * are the two floats encountered, and if j is present, one of them is the imaginary part. a and
+ * b are the optional parentheses. They must either both be present or both omitted.
*/
private static synchronized Pattern getComplexPattern() {
if (complexPattern == null) {
@@ -3602,77 +3602,99 @@
@ExposedMethod(doc = BuiltinDocs.str_islower_doc)
final boolean str_islower() {
- int n = getString().length();
-
- /* Shortcut for single character strings */
+ String s = getString();
+ int n = s.length();
+
if (n == 1) {
- return Character.isLowerCase(getString().charAt(0));
+ // Special case single character strings.
+ return _islower(s.charAt(0));
}
boolean cased = false;
for (int i = 0; i < n; i++) {
- char ch = getString().charAt(i);
-
- if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
+ char ch = s.charAt(i);
+ if (_isupper(ch)) {
return false;
- } else if (!cased && Character.isLowerCase(ch)) {
+ } else if (!cased && _islower(ch)) {
cased = true;
}
}
return cased;
}
+ private boolean _islower(char ch) {
+ if (ch < 256) {
+ return BaseBytes.islower((byte)ch);
+ } else {
+ // This is an internal error. Really, the test should be unnecessary.
+ throw new java.lang.IllegalArgumentException("non-byte character in PyString");
+ }
+ }
+
public boolean isupper() {
return str_isupper();
}
@ExposedMethod(doc = BuiltinDocs.str_isupper_doc)
final boolean str_isupper() {
- int n = getString().length();
-
- /* Shortcut for single character strings */
+ String s = getString();
+ int n = s.length();
+
if (n == 1) {
- return Character.isUpperCase(getString().charAt(0));
+ // Special case single character strings.
+ return _isupper(s.charAt(0));
}
boolean cased = false;
for (int i = 0; i < n; i++) {
- char ch = getString().charAt(i);
-
- if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
+ char ch = s.charAt(i);
+ if (_islower(ch)) {
return false;
- } else if (!cased && Character.isUpperCase(ch)) {
+ } else if (!cased && _isupper(ch)) {
cased = true;
}
}
return cased;
}
+ private boolean _isupper(char ch) {
+ if (ch < 256) {
+ return BaseBytes.isupper((byte)ch);
+ } else {
+ // This is an internal error. Really, the test should be unnecessary.
+ throw new java.lang.IllegalArgumentException("non-byte character in PyString");
+ }
+ }
+
public boolean isalpha() {
return str_isalpha();
}
@ExposedMethod(doc = BuiltinDocs.str_isalpha_doc)
final boolean str_isalpha() {
- int n = getString().length();
-
- /* Shortcut for single character strings */
+ String s = getString();
+ int n = s.length();
+
if (n == 1) {
- return Character.isLetter(getString().charAt(0));
+ // Special case single character strings.
+ return _isalpha(s.charAt(0));
}
- if (n == 0) {
- return false;
- }
-
for (int i = 0; i < n; i++) {
- char ch = getString().charAt(i);
-
- if (!Character.isLetter(ch)) {
+ if (!_isalpha(s.charAt(i))) {
return false;
}
}
- return true;
+ return n > 0;
+ }
+
+ private boolean _isalpha(char ch) {
+ if (ch < 256) {
+ return BaseBytes.isalpha((byte)ch);
+ } else {
+ // This is an internal error. Really, the test should be unnecessary.
+ throw new java.lang.IllegalArgumentException("non-byte character in PyString");
+ }
}
public boolean isalnum() {
@@ -3681,33 +3703,30 @@
@ExposedMethod(doc = BuiltinDocs.str_isalnum_doc)
final boolean str_isalnum() {
- int n = getString().length();
-
- /* Shortcut for single character strings */
+ String s = getString();
+ int n = s.length();
+
if (n == 1) {
- return _isalnum(getString().charAt(0));
+ // Special case single character strings.
+ return _isalnum(s.charAt(0));
}
- if (n == 0) {
- return false;
- }
-
for (int i = 0; i < n; i++) {
- char ch = getString().charAt(i);
-
- if (!_isalnum(ch)) {
+ if (!_isalnum(s.charAt(i))) {
return false;
}
}
- return true;
+ return n > 0;
}
private boolean _isalnum(char ch) {
- // This can ever be entirely compatible with CPython. In CPython
- // The type is not used, the numeric property is determined from
- // the presense of digit, decimal or numeric fields. These fields
- // are not available in exactly the same way in java.
- return Character.isLetterOrDigit(ch) || Character.getType(ch) == Character.LETTER_NUMBER;
+ // This is now entirely compatible with CPython, as long as only bytes are stored.
+ if (ch < 256) {
+ return BaseBytes.isalnum((byte)ch);
+ } else {
+ // This is an internal error. Really, the test should be unnecessary.
+ throw new java.lang.IllegalArgumentException("non-byte character in PyString");
+ }
}
public boolean isdecimal() {
@@ -3715,59 +3734,44 @@
}
@ExposedMethod(doc = BuiltinDocs.unicode_isdecimal_doc)
- final boolean str_isdecimal() {
- int n = getString().length();
-
- /* Shortcut for single character strings */
+ final boolean str_isdecimal() { // XXX this ought not to exist in str (in Python 2)
+ return str_isdigit();
+ }
+
+ private boolean _isdecimal(char ch) {
+ // Java's character types do not correspond exactly to CPython's decimal property.
+ return Character.getType(ch) == Character.DECIMAL_DIGIT_NUMBER;
+ }
+
+ public boolean isdigit() {
+ return str_isdigit();
+ }
+
+ @ExposedMethod(doc = BuiltinDocs.str_isdigit_doc)
+ final boolean str_isdigit() {
+ String s = getString();
+ int n = s.length();
+
if (n == 1) {
- char ch = getString().charAt(0);
- return _isdecimal(ch);
+ // Special case single character strings.
+ return _isdigit(s.charAt(0));
}
- if (n == 0) {
- return false;
- }
-
for (int i = 0; i < n; i++) {
- char ch = getString().charAt(i);
-
- if (!_isdecimal(ch)) {
+ if (!_isdigit(s.charAt(i))) {
return false;
}
}
- return true;
- }
-
- private boolean _isdecimal(char ch) {
- // See the comment in _isalnum. Here it is even worse.
- return Character.getType(ch) == Character.DECIMAL_DIGIT_NUMBER;
- }
-
- public boolean isdigit() {
- return str_isdigit();
- }
-
- @ExposedMethod(doc = BuiltinDocs.str_isdigit_doc)
- final boolean str_isdigit() {
- int n = getString().length();
-
- /* Shortcut for single character strings */
- if (n == 1) {
- return Character.isDigit(getString().charAt(0));
+ return n > 0;
+ }
+
+ private boolean _isdigit(char ch) {
+ if (ch < 256) {
+ return BaseBytes.isdigit((byte)ch);
+ } else {
+ // This is an internal error. Really, the test should be unnecessary.
+ throw new java.lang.IllegalArgumentException("non-byte character in PyString");
}
-
- if (n == 0) {
- return false;
- }
-
- for (int i = 0; i < n; i++) {
- char ch = getString().charAt(i);
-
- if (!Character.isDigit(ch)) {
- return false;
- }
- }
- return true;
}
public boolean isnumeric() {
@@ -3775,31 +3779,8 @@
}
@ExposedMethod(doc = BuiltinDocs.unicode_isnumeric_doc)
- final boolean str_isnumeric() {
- int n = getString().length();
-
- /* Shortcut for single character strings */
- if (n == 1) {
- return _isnumeric(getString().charAt(0));
- }
-
- if (n == 0) {
- return false;
- }
-
- for (int i = 0; i < n; i++) {
- char ch = getString().charAt(i);
- if (!_isnumeric(ch)) {
- return false;
- }
- }
- return true;
- }
-
- private boolean _isnumeric(char ch) {
- int type = Character.getType(ch);
- return type == Character.DECIMAL_DIGIT_NUMBER || type == Character.LETTER_NUMBER
- || type == Character.OTHER_NUMBER;
+ final boolean str_isnumeric() { // XXX this ought not to exist in str (in Python 2)
+ return str_isdigit();
}
public boolean istitle() {
@@ -3808,26 +3789,25 @@
@ExposedMethod(doc = BuiltinDocs.str_istitle_doc)
final boolean str_istitle() {
- int n = getString().length();
-
- /* Shortcut for single character strings */
+ String s = getString();
+ int n = s.length();
+
if (n == 1) {
- return Character.isTitleCase(getString().charAt(0))
- || Character.isUpperCase(getString().charAt(0));
+ // Special case single character strings.
+ return _isupper(s.charAt(0));
}
boolean cased = false;
boolean previous_is_cased = false;
for (int i = 0; i < n; i++) {
- char ch = getString().charAt(i);
-
- if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
+ char ch = s.charAt(i);
+ if (_isupper(ch)) {
if (previous_is_cased) {
return false;
}
previous_is_cased = true;
cased = true;
- } else if (Character.isLowerCase(ch)) {
+ } else if (_islower(ch)) {
if (!previous_is_cased) {
return false;
}
@@ -3846,25 +3826,29 @@
@ExposedMethod(doc = BuiltinDocs.str_isspace_doc)
final boolean str_isspace() {
- int n = getString().length();
-
- /* Shortcut for single character strings */
+ String s = getString();
+ int n = s.length();
+
if (n == 1) {
- return Character.isWhitespace(getString().charAt(0));
+ // Special case single character strings.
+ return _isspace(s.charAt(0));
}
- if (n == 0) {
- return false;
- }
-
for (int i = 0; i < n; i++) {
- char ch = getString().charAt(i);
-
- if (!Character.isWhitespace(ch)) {
+ if (!_isspace(s.charAt(i))) {
return false;
}
}
- return true;
+ return n > 0;
+ }
+
+ private boolean _isspace(char ch) {
+ if (ch < 256) {
+ return BaseBytes.isspace((byte)ch);
+ } else {
+ // This is an internal error. Really, the test should be unnecessary.
+ throw new java.lang.IllegalArgumentException("non-byte character in PyString");
+ }
}
public boolean isunicode() {
diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java
--- a/src/org/python/core/PyUnicode.java
+++ b/src/org/python/core/PyUnicode.java
@@ -9,6 +9,7 @@
import java.util.Set;
import com.google.common.base.CharMatcher;
+
import org.python.core.stringlib.FieldNameIterator;
import org.python.core.stringlib.MarkupIterator;
import org.python.expose.ExposedMethod;
@@ -582,7 +583,9 @@
}
public static String checkEncoding(String s) {
- if (s == null || CharMatcher.ASCII.matchesAllOf(s)) { return s; }
+ if (s == null || CharMatcher.ASCII.matchesAllOf(s)) {
+ return s;
+ }
return codecs.PyUnicode_EncodeASCII(s, s.length(), null);
}
@@ -739,19 +742,21 @@
return Py.makeCharacter(codepoint, true);
}
+ @Override
public int getInt(int i) {
return getString().codePointAt(translator.utf16Index(i));
}
- private class SubsequenceIteratorImpl implements Iterator {
+ /**
+ * An iterator returning code points from this string, for use when it is not all basic plane.
+ */
+ private class SubsequenceIteratorImpl extends SubsequenceIteratorBasic {
- private int current, k, stop, step;
+ private int k; // UTF-16 index (of current)
SubsequenceIteratorImpl(int start, int stop, int step) {
- current = start;
+ super(start, stop, step);
k = translator.utf16Index(current);
- this.stop = stop;
- this.step = step;
}
SubsequenceIteratorImpl() {
@@ -759,22 +764,7 @@
}
@Override
- public boolean hasNext() {
- return current < stop;
- }
-
- @Override
- public Object next() {
- int codePoint = nextCodePoint();
- current += 1;
- for (int j = 1; j < step && hasNext(); j++) {
- nextCodePoint();
- current += 1;
- }
- return codePoint;
- }
-
- private int nextCodePoint() {
+ protected int nextCodePoint() {
int U;
int W1 = getString().charAt(k);
if (W1 >= 0xD800 && W1 < 0xDC00) {
@@ -785,8 +775,45 @@
U = W1;
k += 1;
}
+ current += 1;
return U;
}
+ }
+
+ /**
+ * An iterator returning code points from this string, for use when it is all basic plane.
+ */
+ private class SubsequenceIteratorBasic implements Iterator<Integer> {
+
+ protected int current, stop, step; // Character indexes
+
+ SubsequenceIteratorBasic(int start, int stop, int step) {
+ current = start;
+ this.stop = stop;
+ this.step = step;
+ }
+
+ SubsequenceIteratorBasic() {
+ this(0, getCodePointCount(), 1);
+ }
+
+ @Override
+ public boolean hasNext() {
+ return current < stop;
+ }
+
+ @Override
+ public Integer next() {
+ int codePoint = nextCodePoint();
+ for (int j = 1; j < step && hasNext(); j++) {
+ nextCodePoint();
+ }
+ return codePoint;
+ }
+
+ protected int nextCodePoint() {
+ return getString().charAt(current++);
+ }
@Override
public void remove() {
@@ -845,16 +872,31 @@
}
// XXX: Parameterize SubsequenceIteratorImpl and friends (and make them Iterable)
+ /** Get an iterator over the code point sequence. */
public Iterator<Integer> newSubsequenceIterator() {
- return new SubsequenceIteratorImpl();
+ if (isBasicPlane()) {
+ return new SubsequenceIteratorBasic();
+ } else {
+ return new SubsequenceIteratorImpl();
+ }
}
+ /** Get an iterator over a slice of the code point sequence. */
public Iterator<Integer> newSubsequenceIterator(int start, int stop, int step) {
- if (step < 0) {
- return new SteppedIterator(step * -1, new ReversedIterator(new SubsequenceIteratorImpl(
- stop + 1, start + 1, 1)));
+ if (isBasicPlane()) {
+ if (step < 0) {
+ return new SteppedIterator(step * -1, new ReversedIterator(
+ new SubsequenceIteratorBasic(stop + 1, start + 1, 1)));
+ } else {
+ return new SubsequenceIteratorBasic(start, stop, step);
+ }
} else {
- return new SubsequenceIteratorImpl(start, stop, step);
+ if (step < 0) {
+ return new SteppedIterator(step * -1, new ReversedIterator(
+ new SubsequenceIteratorImpl(stop + 1, start + 1, 1)));
+ } else {
+ return new SubsequenceIteratorImpl(start, stop, step);
+ }
}
}
@@ -948,9 +990,6 @@
@ExposedMethod(doc = BuiltinDocs.unicode_title_doc)
final PyObject unicode_title() {
- if (isBasicPlane()) {
- return new PyUnicode(str_title());
- }
StringBuilder buffer = new StringBuilder(getString().length());
boolean previous_is_cased = false;
for (Iterator<Integer> iter = newSubsequenceIterator(); iter.hasNext();) {
@@ -973,9 +1012,6 @@
@ExposedMethod(doc = BuiltinDocs.unicode_swapcase_doc)
final PyObject unicode_swapcase() {
- if (isBasicPlane()) {
- return new PyUnicode(str_swapcase());
- }
StringBuilder buffer = new StringBuilder(getString().length());
for (Iterator<Integer> iter = newSubsequenceIterator(); iter.hasNext();) {
int codePoint = iter.next();
@@ -1416,9 +1452,6 @@
@ExposedMethod(defaults = "false", doc = BuiltinDocs.unicode___getslice___doc)
final PyList unicode_splitlines(boolean keepends) {
- if (isBasicPlane()) {
- return str_splitlines(keepends);
- }
return new PyList(new LineSplitIterator(keepends));
}
@@ -1582,9 +1615,6 @@
if (getString().length() == 0) {
return this;
}
- if (isBasicPlane()) {
- return new PyUnicode(str_capitalize());
- }
StringBuilder buffer = new StringBuilder(getString().length());
boolean first = true;
for (Iterator<Integer> iter = newSubsequenceIterator(); iter.hasNext();) {
@@ -1671,13 +1701,8 @@
return _codecs.translateCharmap(this, "ignore", table);
}
- // these tests need to be UTF-16 aware because they are character-by-character tests,
- // so we can only use equivalent str_XXX tests if we are in basic plane
@ExposedMethod(doc = BuiltinDocs.unicode_islower_doc)
final boolean unicode_islower() {
- if (isBasicPlane()) {
- return str_islower();
- }
boolean cased = false;
for (Iterator<Integer> iter = newSubsequenceIterator(); iter.hasNext();) {
int codepoint = iter.next();
@@ -1692,9 +1717,6 @@
@ExposedMethod(doc = BuiltinDocs.unicode_isupper_doc)
final boolean unicode_isupper() {
- if (isBasicPlane()) {
- return str_isupper();
- }
boolean cased = false;
for (Iterator<Integer> iter = newSubsequenceIterator(); iter.hasNext();) {
int codepoint = iter.next();
@@ -1709,9 +1731,6 @@
@ExposedMethod(doc = BuiltinDocs.unicode_isalpha_doc)
final boolean unicode_isalpha() {
- if (isBasicPlane()) {
- return str_isalpha();
- }
if (getCodePointCount() == 0) {
return false;
}
@@ -1725,15 +1744,13 @@
@ExposedMethod(doc = BuiltinDocs.unicode_isalnum_doc)
final boolean unicode_isalnum() {
- if (isBasicPlane()) {
- return str_isalnum();
- }
if (getCodePointCount() == 0) {
return false;
}
for (Iterator<Integer> iter = newSubsequenceIterator(); iter.hasNext();) {
int codePoint = iter.next();
- if (!(Character.isLetterOrDigit(codePoint) || Character.getType(codePoint) == Character.LETTER_NUMBER)) {
+ if (!(Character.isLetterOrDigit(codePoint) || //
+ Character.getType(codePoint) == Character.LETTER_NUMBER)) {
return false;
}
}
@@ -1742,9 +1759,6 @@
@ExposedMethod(doc = BuiltinDocs.unicode_isdecimal_doc)
final boolean unicode_isdecimal() {
- if (isBasicPlane()) {
- return str_isdecimal();
- }
if (getCodePointCount() == 0) {
return false;
}
@@ -1758,9 +1772,6 @@
@ExposedMethod(doc = BuiltinDocs.unicode_isdigit_doc)
final boolean unicode_isdigit() {
- if (isBasicPlane()) {
- return str_isdigit();
- }
if (getCodePointCount() == 0) {
return false;
}
@@ -1774,9 +1785,6 @@
@ExposedMethod(doc = BuiltinDocs.unicode_isnumeric_doc)
final boolean unicode_isnumeric() {
- if (isBasicPlane()) {
- return str_isnumeric();
- }
if (getCodePointCount() == 0) {
return false;
}
@@ -1792,9 +1800,6 @@
@ExposedMethod(doc = BuiltinDocs.unicode_istitle_doc)
final boolean unicode_istitle() {
- if (isBasicPlane()) {
- return str_istitle();
- }
if (getCodePointCount() == 0) {
return false;
}
@@ -1823,9 +1828,6 @@
@ExposedMethod(doc = BuiltinDocs.unicode_isspace_doc)
final boolean unicode_isspace() {
- if (isBasicPlane()) {
- return str_isspace();
- }
if (getCodePointCount() == 0) {
return false;
}
--
Repository URL: https://hg.python.org/jython
More information about the Jython-checkins
mailing list