[Jython-checkins] jython: str character operations become ASCII. Completes fix for #2364.

jeff.allen jython-checkins at python.org
Fri Sep 11 00:58:40 CEST 2015


https://hg.python.org/jython/rev/a77dad1d7050
changeset:   7729:a77dad1d7050
user:        Jeff Allen <ja.py at farowl.co.uk>
date:        Thu Sep 10 23:14:16 2015 +0100
summary:
  str character operations become ASCII. Completes fix for #2364.

PyString isalpha, islower, isdigit and so on now use the character
classification methods from BaseBytes, resulting in a pure ASCII
interpretation. (Other methods may still retain some Unicode behaviour.)
Run-time checks for non-byte characters are in place, just in case.

files:
  NEWS                              |    1 +
  src/org/python/core/PyString.java |  264 ++++++++---------
  2 files changed, 125 insertions(+), 140 deletions(-)


diff --git a/NEWS b/NEWS
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,7 @@
    - [ 2158, 2259 ] Fixed behaviour of relative from ... import *
    - [ 1879 ] -m command now executes scripts from inside a jar file 
    - [ 2058 ] ClasspathPyImporter implements PEP 302 get_data (and others)
+   - [ 2364 ] bytearray and str: isalpha(), isupper() etc. now match Python 2
 
 Jython 2.7
   same as 2.7rc3
diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java
--- a/src/org/python/core/PyString.java
+++ b/src/org/python/core/PyString.java
@@ -6,10 +6,10 @@
 import java.math.BigInteger;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.List;
+import java.util.Locale;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import java.util.List;
-import java.util.Locale;
 
 import org.python.core.buffer.BaseBuffer;
 import org.python.core.buffer.SimpleStringBuffer;
@@ -2662,8 +2662,8 @@
      * Return the (lazily) compiled regular expression for a Python complex number. This is used
      * within the regular expression patterns that define a priori acceptable strings in the complex
      * constructors. The expression contributes five named capture groups a, b, x, y and j. x and y
-     * are the two floats encountered, and if j is present, one of them is the imaginary part.
-     * a and b are the optional parentheses. They must either both be present or both omitted.
+     * are the two floats encountered, and if j is present, one of them is the imaginary part. a and
+     * b are the optional parentheses. They must either both be present or both omitted.
      */
     private static synchronized Pattern getComplexPattern() {
         if (complexPattern == null) {
@@ -3602,77 +3602,99 @@
 
     @ExposedMethod(doc = BuiltinDocs.str_islower_doc)
     final boolean str_islower() {
-        int n = getString().length();
-
-        /* Shortcut for single character strings */
+        String s = getString();
+        int n = s.length();
+
         if (n == 1) {
-            return Character.isLowerCase(getString().charAt(0));
+            // Special case single character strings.
+            return _islower(s.charAt(0));
         }
 
         boolean cased = false;
         for (int i = 0; i < n; i++) {
-            char ch = getString().charAt(i);
-
-            if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
+            char ch = s.charAt(i);
+            if (_isupper(ch)) {
                 return false;
-            } else if (!cased && Character.isLowerCase(ch)) {
+            } else if (!cased && _islower(ch)) {
                 cased = true;
             }
         }
         return cased;
     }
 
+    private boolean _islower(char ch) {
+        if (ch < 256) {
+            return BaseBytes.islower((byte)ch);
+        } else {
+            // This is an internal error. Really, the test should be unnecessary.
+            throw new java.lang.IllegalArgumentException("non-byte character in PyString");
+        }
+    }
+
     public boolean isupper() {
         return str_isupper();
     }
 
     @ExposedMethod(doc = BuiltinDocs.str_isupper_doc)
     final boolean str_isupper() {
-        int n = getString().length();
-
-        /* Shortcut for single character strings */
+        String s = getString();
+        int n = s.length();
+
         if (n == 1) {
-            return Character.isUpperCase(getString().charAt(0));
+            // Special case single character strings.
+            return _isupper(s.charAt(0));
         }
 
         boolean cased = false;
         for (int i = 0; i < n; i++) {
-            char ch = getString().charAt(i);
-
-            if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
+            char ch = s.charAt(i);
+            if (_islower(ch)) {
                 return false;
-            } else if (!cased && Character.isUpperCase(ch)) {
+            } else if (!cased && _isupper(ch)) {
                 cased = true;
             }
         }
         return cased;
     }
 
+    private boolean _isupper(char ch) {
+        if (ch < 256) {
+            return BaseBytes.isupper((byte)ch);
+        } else {
+            // This is an internal error. Really, the test should be unnecessary.
+            throw new java.lang.IllegalArgumentException("non-byte character in PyString");
+        }
+    }
+
     public boolean isalpha() {
         return str_isalpha();
     }
 
     @ExposedMethod(doc = BuiltinDocs.str_isalpha_doc)
     final boolean str_isalpha() {
-        int n = getString().length();
-
-        /* Shortcut for single character strings */
+        String s = getString();
+        int n = s.length();
+
         if (n == 1) {
-            return Character.isLetter(getString().charAt(0));
+            // Special case single character strings.
+            return _isalpha(s.charAt(0));
         }
 
-        if (n == 0) {
-            return false;
-        }
-
         for (int i = 0; i < n; i++) {
-            char ch = getString().charAt(i);
-
-            if (!Character.isLetter(ch)) {
+            if (!_isalpha(s.charAt(i))) {
                 return false;
             }
         }
-        return true;
+        return n > 0;
+    }
+
+    private boolean _isalpha(char ch) {
+        if (ch < 256) {
+            return BaseBytes.isalpha((byte)ch);
+        } else {
+            // This is an internal error. Really, the test should be unnecessary.
+            throw new java.lang.IllegalArgumentException("non-byte character in PyString");
+        }
     }
 
     public boolean isalnum() {
@@ -3681,33 +3703,30 @@
 
     @ExposedMethod(doc = BuiltinDocs.str_isalnum_doc)
     final boolean str_isalnum() {
-        int n = getString().length();
-
-        /* Shortcut for single character strings */
+        String s = getString();
+        int n = s.length();
+
         if (n == 1) {
-            return _isalnum(getString().charAt(0));
+            // Special case single character strings.
+            return _isalnum(s.charAt(0));
         }
 
-        if (n == 0) {
-            return false;
-        }
-
         for (int i = 0; i < n; i++) {
-            char ch = getString().charAt(i);
-
-            if (!_isalnum(ch)) {
+            if (!_isalnum(s.charAt(i))) {
                 return false;
             }
         }
-        return true;
+        return n > 0;
     }
 
     private boolean _isalnum(char ch) {
-        // This can ever be entirely compatible with CPython. In CPython
-        // The type is not used, the numeric property is determined from
-        // the presense of digit, decimal or numeric fields. These fields
-        // are not available in exactly the same way in java.
-        return Character.isLetterOrDigit(ch) || Character.getType(ch) == Character.LETTER_NUMBER;
+        // This is now entirely compatible with CPython, as long as only bytes are stored.
+        if (ch < 256) {
+            return BaseBytes.isalnum((byte)ch);
+        } else {
+            // This is an internal error. Really, the test should be unnecessary.
+            throw new java.lang.IllegalArgumentException("non-byte character in PyString");
+        }
     }
 
     public boolean isdecimal() {
@@ -3715,59 +3734,44 @@
     }
 
     @ExposedMethod(doc = BuiltinDocs.unicode_isdecimal_doc)
-    final boolean str_isdecimal() {
-        int n = getString().length();
-
-        /* Shortcut for single character strings */
+    final boolean str_isdecimal() { // XXX this ought not to exist in str (in Python 2)
+        return str_isdigit();
+    }
+
+    private boolean _isdecimal(char ch) {
+        // See the comment in _isalnum. Here it is even worse.
+        return Character.getType(ch) == Character.DECIMAL_DIGIT_NUMBER;
+    }
+
+    public boolean isdigit() {
+        return str_isdigit();
+    }
+
+    @ExposedMethod(doc = BuiltinDocs.str_isdigit_doc)
+    final boolean str_isdigit() {
+        String s = getString();
+        int n = s.length();
+
         if (n == 1) {
-            char ch = getString().charAt(0);
-            return _isdecimal(ch);
+            // Special case single character strings.
+            return _isdigit(s.charAt(0));
         }
 
-        if (n == 0) {
-            return false;
-        }
-
         for (int i = 0; i < n; i++) {
-            char ch = getString().charAt(i);
-
-            if (!_isdecimal(ch)) {
+            if (!_isdigit(s.charAt(i))) {
                 return false;
             }
         }
-        return true;
-    }
-
-    private boolean _isdecimal(char ch) {
-        // See the comment in _isalnum. Here it is even worse.
-        return Character.getType(ch) == Character.DECIMAL_DIGIT_NUMBER;
-    }
-
-    public boolean isdigit() {
-        return str_isdigit();
-    }
-
-    @ExposedMethod(doc = BuiltinDocs.str_isdigit_doc)
-    final boolean str_isdigit() {
-        int n = getString().length();
-
-        /* Shortcut for single character strings */
-        if (n == 1) {
-            return Character.isDigit(getString().charAt(0));
+        return n > 0;
+    }
+
+    private boolean _isdigit(char ch) {
+        if (ch < 256) {
+            return BaseBytes.isdigit((byte)ch);
+        } else {
+            // This is an internal error. Really, the test should be unnecessary.
+            throw new java.lang.IllegalArgumentException("non-byte character in PyString");
         }
-
-        if (n == 0) {
-            return false;
-        }
-
-        for (int i = 0; i < n; i++) {
-            char ch = getString().charAt(i);
-
-            if (!Character.isDigit(ch)) {
-                return false;
-            }
-        }
-        return true;
     }
 
     public boolean isnumeric() {
@@ -3775,31 +3779,8 @@
     }
 
     @ExposedMethod(doc = BuiltinDocs.unicode_isnumeric_doc)
-    final boolean str_isnumeric() {
-        int n = getString().length();
-
-        /* Shortcut for single character strings */
-        if (n == 1) {
-            return _isnumeric(getString().charAt(0));
-        }
-
-        if (n == 0) {
-            return false;
-        }
-
-        for (int i = 0; i < n; i++) {
-            char ch = getString().charAt(i);
-            if (!_isnumeric(ch)) {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    private boolean _isnumeric(char ch) {
-        int type = Character.getType(ch);
-        return type == Character.DECIMAL_DIGIT_NUMBER || type == Character.LETTER_NUMBER
-                || type == Character.OTHER_NUMBER;
+    final boolean str_isnumeric() { // XXX this ought not to exist in str (in Python 2)
+        return str_isdigit();
     }
 
     public boolean istitle() {
@@ -3808,26 +3789,25 @@
 
     @ExposedMethod(doc = BuiltinDocs.str_istitle_doc)
     final boolean str_istitle() {
-        int n = getString().length();
-
-        /* Shortcut for single character strings */
+        String s = getString();
+        int n = s.length();
+
         if (n == 1) {
-            return Character.isTitleCase(getString().charAt(0))
-                    || Character.isUpperCase(getString().charAt(0));
+            // Special case single character strings.
+            return _isupper(s.charAt(0));
         }
 
         boolean cased = false;
         boolean previous_is_cased = false;
         for (int i = 0; i < n; i++) {
-            char ch = getString().charAt(i);
-
-            if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
+            char ch = s.charAt(i);
+            if (_isupper(ch)) {
                 if (previous_is_cased) {
                     return false;
                 }
                 previous_is_cased = true;
                 cased = true;
-            } else if (Character.isLowerCase(ch)) {
+            } else if (_islower(ch)) {
                 if (!previous_is_cased) {
                     return false;
                 }
@@ -3846,25 +3826,29 @@
 
     @ExposedMethod(doc = BuiltinDocs.str_isspace_doc)
     final boolean str_isspace() {
-        int n = getString().length();
-
-        /* Shortcut for single character strings */
+        String s = getString();
+        int n = s.length();
+
         if (n == 1) {
-            return Character.isWhitespace(getString().charAt(0));
+            // Special case single character strings.
+            return _isspace(s.charAt(0));
         }
 
-        if (n == 0) {
-            return false;
-        }
-
         for (int i = 0; i < n; i++) {
-            char ch = getString().charAt(i);
-
-            if (!Character.isWhitespace(ch)) {
+            if (!_isspace(s.charAt(i))) {
                 return false;
             }
         }
-        return true;
+        return n > 0;
+    }
+
+    private boolean _isspace(char ch) {
+        if (ch < 256) {
+            return BaseBytes.isspace((byte)ch);
+        } else {
+            // This is an internal error. Really, the test should be unnecessary.
+            throw new java.lang.IllegalArgumentException("non-byte character in PyString");
+        }
     }
 
     public boolean isunicode() {

-- 
Repository URL: https://hg.python.org/jython


More information about the Jython-checkins mailing list