From jython-checkins at python.org Tue Nov 21 17:39:08 2017 From: jython-checkins at python.org (jeff.allen) Date: Tue, 21 Nov 2017 22:39:08 +0000 Subject: [Jython-checkins] =?utf-8?q?jython=3A_Add_a_note_to_jython=2Epy_?= =?utf-8?q?on_how_to_regenerate_jython=2Eexe?= Message-ID: <20171121223907.85344.5815D349B8685987@mg.python.org> https://hg.python.org/jython/rev/1503edec030b changeset: 8136:1503edec030b user: Jeff Allen date: Mon Oct 23 21:48:11 2017 +0100 summary: Add a note to jython.py on how to regenerate jython.exe This is a follow-up to fixing #2607 and #2620, intended to make a regression less likely. jython.exe was regenerated following the instructions in the devguide as a test. files: src/shell/jython.exe | Bin src/shell/jython.py | 15 ++++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/shell/jython.exe b/src/shell/jython.exe index 9f1235faa2ff2480db1215f775a32a056e82b7fc..8a4abc7af726b5b52902677b534479dcc23c2361 GIT binary patch [stripped] diff --git a/src/shell/jython.py b/src/shell/jython.py --- a/src/shell/jython.py +++ b/src/shell/jython.py @@ -1,11 +1,16 @@ #!/usr/bin/env python2.7 -E # -*- coding: utf-8 -*- -# Launch script for Jython. It may be wrapped as an executable with -# tools like PyInstaller, creating jython.exe, or run directly. The -# installer will make this the default launcher under the name -# bin/jython if CPython 2.7 is available with the above shebang -# invocation. +# Launch script for Jython. It may be run directly (note the shebang line), but +# importantly it supplies python.exe, the launcher we use on Windows. +# +# Each time this file changes, we must regenerate an executable with +# PyInstaller, using the command: +# +# pyinstaller --onefile jython.py +# +# This is best done in a virtual environment (more about this in the Jython +# Developers' Guide). 
import glob import inspect -- Repository URL: https://hg.python.org/jython From jython-checkins at python.org Tue Nov 21 17:39:09 2017 From: jython-checkins at python.org (jeff.allen) Date: Tue, 21 Nov 2017 22:39:09 +0000 Subject: [Jython-checkins] =?utf-8?q?jython=3A_Formatting_only_=28PyUnico?= =?utf-8?q?de=2C_PyString=29?= Message-ID: <20171121223907.85259.89105040FA4FC332@mg.python.org> https://hg.python.org/jython/rev/862e65475e3b changeset: 8137:862e65475e3b user: Jeff Allen date: Wed Nov 01 22:14:06 2017 +0000 summary: Formatting only (PyUnicode, PyString) files: src/org/python/core/PyString.java | 191 ++++++++-------- src/org/python/core/PyUnicode.java | 47 ++-- 2 files changed, 118 insertions(+), 120 deletions(-) diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java --- a/src/org/python/core/PyString.java +++ b/src/org/python/core/PyString.java @@ -7,7 +7,6 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; -import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -168,7 +167,7 @@ S = S.__str__(); if (S instanceof PyUnicode) { // Encoding will raise UnicodeEncodeError if not 7-bit clean. - str = codecs.encode((PyUnicode)S, null, null); + str = codecs.encode((PyUnicode) S, null, null); } else { // Must be str/bytes, and should be 8-bit clean already. 
str = S.toString(); @@ -349,7 +348,7 @@ // Escape quotes and backslash if ((use_quotes && ch == quote) || ch == '\\') { v.append('\\'); - v.append((char)ch); + v.append((char) ch); continue; } /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ @@ -397,7 +396,7 @@ v.append(hexdigit[(ch >> 4) & 0xf]); v.append(hexdigit[ch & 0xf]); } else {/* Copy everything else as-is */ - v.append((char)ch); + v.append((char) ch); } } @@ -432,7 +431,7 @@ } ch = str.charAt(s++); switch (ch) { - /* \x escapes */ + /* \x escapes */ case '\n': break; case '\\': @@ -482,7 +481,7 @@ } x = (x << 3) + Character.digit(ch, 8); } - v.append((char)x); + v.append((char) x); break; case 'x': s = hexescape(v, errors, 2, s, str, end, "truncated \\xXX"); @@ -516,7 +515,7 @@ if (pucnHash == null) { PyObject mod = imp.importName("ucnhash", true); mod = mod.__call__(); - pucnHash = (ucnhashAPI)mod.__tojava__(Object.class); + pucnHash = (ucnhashAPI) mod.__tojava__(Object.class); if (pucnHash.getCchMax() < 0) { throw Py.UnicodeError("Unicode names not loaded"); } @@ -564,8 +563,8 @@ private static int hexescape(StringBuilder partialDecode, String errors, int digits, int hexDigitStart, String str, int size, String errorMessage) { if (hexDigitStart + digits > size) { - return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape", - str, hexDigitStart - 2, size, errorMessage); + return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape", str, + hexDigitStart - 2, size, errorMessage); } int i = 0; int x = 0; @@ -588,8 +587,8 @@ if (storeUnicodeCharacter(x, partialDecode)) { return hexDigitStart + i; } else { - return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape", - str, hexDigitStart - 2, hexDigitStart + i + 1, "illegal Unicode character"); + return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape", str, + hexDigitStart - 2, hexDigitStart + i + 1, "illegal Unicode character"); } } @@ -630,7 
+629,7 @@ return -2; } - int c = getString().compareTo(((PyString)other).getString()); + int c = getString().compareTo(((PyString) other).getString()); return c < 0 ? -1 : c > 0 ? 1 : 0; } @@ -749,12 +748,11 @@ @Override public Object __tojava__(Class c) { if (c.isAssignableFrom(String.class)) { - /* If c is a CharSequence we assume the caller is prepared - * to get maybe not an actual String. In that case we avoid - * conversion so the caller can do special stuff with the - * returned PyString or PyUnicode or whatever. - * (If c is Object.class, the caller usually expects to get - * actually a String) + /* + * If c is a CharSequence we assume the caller is prepared to get maybe not an actual + * String. In that case we avoid conversion so the caller can do special stuff with the + * returned PyString or PyUnicode or whatever. (If c is Object.class, the caller usually + * expects to get actually a String) */ return c == CharSequence.class ? this : getString(); } @@ -854,10 +852,10 @@ private static String asUTF16StringOrNull(PyObject obj) { if (obj instanceof PyString) { // str or unicode object: go directly to the String - return ((PyString)obj).getString(); + return ((PyString) obj).getString(); } else if (obj instanceof BufferProtocol) { // Other object with buffer API: briefly access the buffer - try (PyBuffer buf = ((BufferProtocol)obj).getBuffer(PyBUF.FULL_RO)) { + try (PyBuffer buf = ((BufferProtocol) obj).getBuffer(PyBUF.FULL_RO)) { return buf.toString(); } } else { @@ -963,7 +961,7 @@ count = 0; } int s = getString().length(); - if ((long)s * count > Integer.MAX_VALUE) { + if ((long) s * count > Integer.MAX_VALUE) { // Since Strings store their data in an array, we can't make one // longer than Integer.MAX_VALUE. Without this check we get // NegativeArraySize exceptions when we create the array on the @@ -1104,13 +1102,13 @@ if (n == 1) { // Special-case single byte string char c = s.charAt(0); - return _isupper(c) ? 
String.valueOf((char)(c ^ SWAP_CASE)) : s; + return _isupper(c) ? String.valueOf((char) (c ^ SWAP_CASE)) : s; } else { // Copy chars to buffer, converting to lower-case. char[] buf = new char[n]; for (int i = 0; i < n; i++) { char c = s.charAt(i); - buf[i] = _isupper(c) ? (char)(c ^ SWAP_CASE) : c; + buf[i] = _isupper(c) ? (char) (c ^ SWAP_CASE) : c; } return new String(buf); } @@ -1127,13 +1125,13 @@ if (n == 1) { // Special-case single byte string char c = s.charAt(0); - return _islower(c) ? String.valueOf((char)(c ^ SWAP_CASE)) : s; + return _islower(c) ? String.valueOf((char) (c ^ SWAP_CASE)) : s; } else { // Copy chars to buffer, converting to upper-case. char[] buf = new char[n]; for (int i = 0; i < n; i++) { char c = s.charAt(i); - buf[i] = _islower(c) ? (char)(c ^ SWAP_CASE) : c; + buf[i] = _islower(c) ? (char) (c ^ SWAP_CASE) : c; } return new String(buf); } @@ -1154,12 +1152,12 @@ if (previous_is_cased) { // Should be lower case if (_isupper(ch)) { - chars[i] = (char)(ch ^ SWAP_CASE); + chars[i] = (char) (ch ^ SWAP_CASE); } } else { // Should be upper case if (_islower(ch)) { - chars[i] = (char)(ch ^ SWAP_CASE); + chars[i] = (char) (ch ^ SWAP_CASE); } } // And this was a letter @@ -1183,13 +1181,13 @@ if (n == 1) { // Special-case single byte string char c = s.charAt(0); - return _isalpha(c) ? String.valueOf((char)(c ^ SWAP_CASE)) : s; + return _isalpha(c) ? String.valueOf((char) (c ^ SWAP_CASE)) : s; } else { // Copy chars to buffer, converting lower to upper case, upper to lower case. char[] buf = new char[n]; for (int i = 0; i < n; i++) { char c = s.charAt(i); - buf[i] = _isalpha(c) ? (char)(c ^ SWAP_CASE) : c; + buf[i] = _isalpha(c) ? 
(char) (c ^ SWAP_CASE) : c; } return new String(buf); } @@ -1236,7 +1234,7 @@ final PyObject str_strip(PyObject chars) { if (chars instanceof PyUnicode) { // Promote the problem to a Unicode one - return ((PyUnicode)decode()).unicode_strip(chars); + return ((PyUnicode) decode()).unicode_strip(chars); } else { // It ought to be None, null, some kind of bytes with the buffer API. String stripChars = asStringNullOrError(chars, "strip"); @@ -1406,7 +1404,7 @@ final PyObject str_lstrip(PyObject chars) { if (chars instanceof PyUnicode) { // Promote the problem to a Unicode one - return ((PyUnicode)decode()).unicode_lstrip(chars); + return ((PyUnicode) decode()).unicode_lstrip(chars); } else { // It ought to be None, null, some kind of bytes with the buffer API. String stripChars = asStringNullOrError(chars, "lstrip"); @@ -1495,7 +1493,7 @@ final PyObject str_rstrip(PyObject chars) { if (chars instanceof PyUnicode) { // Promote the problem to a Unicode one - return ((PyUnicode)decode()).unicode_rstrip(chars); + return ((PyUnicode) decode()).unicode_rstrip(chars); } else { // It ought to be None, null, some kind of bytes with the buffer API. String stripChars = asStringNullOrError(chars, "rstrip"); @@ -1616,7 +1614,7 @@ final PyList str_split(PyObject sepObj, int maxsplit) { if (sepObj instanceof PyUnicode) { // Promote the problem to a Unicode one - return ((PyUnicode)decode()).unicode_split(sepObj, maxsplit); + return ((PyUnicode) decode()).unicode_split(sepObj, maxsplit); } else { // It ought to be None, null, some kind of bytes with the buffer API. String sep = asStringNullOrError(sepObj, "split"); @@ -1867,7 +1865,7 @@ final PyList str_rsplit(PyObject sepObj, int maxsplit) { if (sepObj instanceof PyUnicode) { // Promote the problem to a Unicode one - return ((PyUnicode)decode()).unicode_rsplit(sepObj, maxsplit); + return ((PyUnicode) decode()).unicode_rsplit(sepObj, maxsplit); } else { // It ought to be None, null, some kind of bytes with the buffer API. 
String sep = asStringNullOrError(sepObj, "rsplit"); @@ -2066,8 +2064,8 @@ int index = getString().indexOf(sep); if (index != -1) { - return new PyTuple(fromSubstring(0, index), sepObj, fromSubstring( - index + sep.length(), getString().length())); + return new PyTuple(fromSubstring(0, index), sepObj, + fromSubstring(index + sep.length(), getString().length())); } else { return new PyTuple(this, Py.EmptyString, Py.EmptyString); } @@ -2088,8 +2086,8 @@ int index = str.indexOf(sep); if (index != -1) { - return new PyTuple(strObj.fromSubstring(0, index), sepObj, strObj.fromSubstring(index - + sep.length(), str.length())); + return new PyTuple(strObj.fromSubstring(0, index), sepObj, + strObj.fromSubstring(index + sep.length(), str.length())); } else { PyUnicode emptyUnicode = Py.newUnicode(""); return new PyTuple(this, emptyUnicode, emptyUnicode); @@ -2125,8 +2123,8 @@ int index = getString().lastIndexOf(sep); if (index != -1) { - return new PyTuple(fromSubstring(0, index), sepObj, fromSubstring( - index + sep.length(), getString().length())); + return new PyTuple(fromSubstring(0, index), sepObj, + fromSubstring(index + sep.length(), getString().length())); } else { return new PyTuple(Py.EmptyString, Py.EmptyString, this); } @@ -2147,8 +2145,8 @@ int index = str.lastIndexOf(sep); if (index != -1) { - return new PyTuple(strObj.fromSubstring(0, index), sepObj, strObj.fromSubstring(index - + sep.length(), str.length())); + return new PyTuple(strObj.fromSubstring(0, index), sepObj, + strObj.fromSubstring(index + sep.length(), str.length())); } else { PyUnicode emptyUnicode = Py.newUnicode(""); return new PyTuple(emptyUnicode, emptyUnicode, this); @@ -2420,7 +2418,7 @@ final int str_count(PyObject subObj, PyObject start, PyObject end) { if (subObj instanceof PyUnicode) { // Promote the problem to a Unicode one - return ((PyUnicode)decode()).unicode_count(subObj, start, end); + return ((PyUnicode) decode()).unicode_count(subObj, start, end); } else { // It ought to be some 
kind of bytes with the buffer API. String sub = asStringOrError(subObj); @@ -2535,7 +2533,7 @@ final int str_find(PyObject subObj, PyObject start, PyObject end) { if (subObj instanceof PyUnicode) { // Promote the problem to a Unicode one - return ((PyUnicode)decode()).unicode_find(subObj, start, end); + return ((PyUnicode) decode()).unicode_find(subObj, start, end); } else { // It ought to be some kind of bytes with the buffer API. String sub = asStringOrError(subObj); @@ -2640,7 +2638,7 @@ final int str_rfind(PyObject subObj, PyObject start, PyObject end) { if (subObj instanceof PyUnicode) { // Promote the problem to a Unicode one - return ((PyUnicode)decode()).unicode_rfind(subObj, start, end); + return ((PyUnicode) decode()).unicode_rfind(subObj, start, end); } else { // It ought to be some kind of bytes with the buffer API. String sub = asStringOrError(subObj); @@ -2948,7 +2946,8 @@ } // if the base >= 22, then an 'l' or 'L' is a digit! - if (isLong && base < 22 && e > b && (str.charAt(e - 1) == 'L' || str.charAt(e - 1) == 'l')) { + if (isLong && base < 22 && e > b + && (str.charAt(e - 1) == 'L' || str.charAt(e - 1) == 'l')) { e--; } @@ -2982,11 +2981,11 @@ } return bi.intValue(); } catch (NumberFormatException exc) { - throw Py.ValueError("invalid literal for int() with base " + base + ": '" + getString() - + "'"); + throw Py.ValueError( + "invalid literal for int() with base " + base + ": '" + getString() + "'"); } catch (StringIndexOutOfBoundsException exc) { - throw Py.ValueError("invalid literal for int() with base " + base + ": '" + getString() - + "'"); + throw Py.ValueError( + "invalid literal for int() with base " + base + ": '" + getString() + "'"); } } @@ -3011,12 +3010,12 @@ throw Py.UnicodeEncodeError("decimal", "codec can't encode character", 0, 0, "invalid decimal Unicode string"); } else { - throw Py.ValueError("invalid literal for long() with base " + base + ": '" - + getString() + "'"); + throw Py.ValueError( + "invalid literal for long() with 
base " + base + ": '" + getString() + "'"); } } catch (StringIndexOutOfBoundsException exc) { - throw Py.ValueError("invalid literal for long() with base " + base + ": '" - + getString() + "'"); + throw Py.ValueError( + "invalid literal for long() with base " + base + ": '" + getString() + "'"); } } @@ -3131,7 +3130,7 @@ @ExposedMethod(defaults = "8", doc = BuiltinDocs.str_expandtabs_doc) final String str_expandtabs(int tabsize) { String s = getString(); - StringBuilder buf = new StringBuilder((int)(s.length() * 1.5)); + StringBuilder buf = new StringBuilder((int) (s.length() * 1.5)); char[] chars = s.toCharArray(); int n = chars.length; int position = 0; @@ -3169,11 +3168,11 @@ char[] buf = new char[n]; // At least one byte: if lower convert to upper case. char c = s.charAt(0); - buf[0] = _islower(c) ? (char)(c ^ SWAP_CASE) : c; + buf[0] = _islower(c) ? (char) (c ^ SWAP_CASE) : c; // Copy the rest, converting to lower case. for (int i = 1; i < n; i++) { c = s.charAt(i); - buf[i] = _isupper(c) ? (char)(c ^ SWAP_CASE) : c; + buf[i] = _isupper(c) ? (char) (c ^ SWAP_CASE) : c; } return new String(buf); } @@ -3211,7 +3210,7 @@ final PyString str_replace(PyObject oldPieceObj, PyObject newPieceObj, int count) { if (oldPieceObj instanceof PyUnicode || newPieceObj instanceof PyUnicode) { // Promote the problem to a Unicode one - return ((PyUnicode)decode()).unicode_replace(oldPieceObj, newPieceObj, count); + return ((PyUnicode) decode()).unicode_replace(oldPieceObj, newPieceObj, count); } else { // Neither is a PyUnicode: both ought to be some kind of bytes with the buffer API. 
String oldPiece = asStringOrError(oldPieceObj); @@ -3283,7 +3282,7 @@ if (seqLen == 1) { item = seq.pyget(0); if (item.getType() == PyString.TYPE || item.getType() == PyUnicode.TYPE) { - return (PyString)item; + return (PyString) item; } } @@ -3309,20 +3308,20 @@ if (i != 0) { size += sepLen; } - size += ((PyString)item).getString().length(); + size += ((PyString) item).getString().length(); if (size > Integer.MAX_VALUE) { throw Py.OverflowError("join() result is too long for a Python string"); } } // Catenate everything - StringBuilder buf = new StringBuilder((int)size); + StringBuilder buf = new StringBuilder((int) size); for (i = 0; i < seqLen; i++) { item = seq.pyget(i); if (i != 0) { buf.append(getString()); } - buf.append(((PyString)item).getString()); + buf.append(((PyString) item).getString()); } return new PyString(buf.toString(), true); // Guaranteed to be byte-like } @@ -3345,7 +3344,7 @@ if (seqLen == 1) { item = seq.pyget(0); if (item.getType() == PyUnicode.TYPE) { - return (PyUnicode)item; + return (PyUnicode) item; } } @@ -3354,7 +3353,7 @@ if (this instanceof PyUnicode) { sep = getString(); } else { - sep = ((PyUnicode)decode()).getString(); + sep = ((PyUnicode) decode()).getString(); // In case decode()'s codec mutated seq seqLen = seq.__len__(); } @@ -3369,15 +3368,16 @@ item = seq.pyget(i); // Convert item to Unicode if (!(item instanceof PyString)) { - throw Py.TypeError(String.format("sequence item %d: expected string or Unicode," - + " %.80s found", i, item.getType().fastGetName())); + throw Py.TypeError(String.format( + "sequence item %d: expected string or Unicode," + " %.80s found", i, + item.getType().fastGetName())); } if (!(item instanceof PyUnicode)) { - item = ((PyString)item).decode(); + item = ((PyString) item).decode(); // In case decode()'s codec mutated seq seqLen = seq.__len__(); } - itemString = ((PyUnicode)item).getString(); + itemString = ((PyUnicode) item).getString(); if (i != 0) { size += sepLen; @@ -3450,7 +3450,7 @@ } 
else { // Loop will return true if this slice starts with any prefix in the tuple - for (PyObject prefixObj : ((PyTuple)prefix).getArray()) { + for (PyObject prefixObj : ((PyTuple) prefix).getArray()) { // It ought to be PyUnicode or some kind of bytes with the buffer API. String s = asUTF16StringOrError(prefixObj); // If s is non-BMP, and this is a PyString (bytes), result will correctly be false. @@ -3521,7 +3521,7 @@ } else { // Loop will return true if this slice ends with any suffix in the tuple - for (PyObject suffixObj : ((PyTuple)suffix).getArray()) { + for (PyObject suffixObj : ((PyTuple) suffix).getArray()) { // It ought to be PyUnicode or some kind of bytes with the buffer API. String s = asUTF16StringOrError(suffixObj); // If s is non-BMP, and this is a PyString (bytes), result will correctly be false. @@ -3725,7 +3725,7 @@ private boolean _islower(char ch) { if (ch < 256) { - return BaseBytes.islower((byte)ch); + return BaseBytes.islower((byte) ch); } else { // This is an internal error. Really, the test should be unnecessary. throw new java.lang.IllegalArgumentException("non-byte character in PyString"); @@ -3760,7 +3760,7 @@ private boolean _isupper(char ch) { if (ch < 256) { - return BaseBytes.isupper((byte)ch); + return BaseBytes.isupper((byte) ch); } else { // This is an internal error. Really, the test should be unnecessary. throw new java.lang.IllegalArgumentException("non-byte character in PyString"); @@ -3791,7 +3791,7 @@ private boolean _isalpha(char ch) { if (ch < 256) { - return BaseBytes.isalpha((byte)ch); + return BaseBytes.isalpha((byte) ch); } else { // This is an internal error. Really, the test should be unnecessary. throw new java.lang.IllegalArgumentException("non-byte character in PyString"); @@ -3823,7 +3823,7 @@ private boolean _isalnum(char ch) { // This is now entirely compatible with CPython, as long as only bytes are stored. 
if (ch < 256) { - return BaseBytes.isalnum((byte)ch); + return BaseBytes.isalnum((byte) ch); } else { // This is an internal error. Really, the test should be unnecessary. throw new java.lang.IllegalArgumentException("non-byte character in PyString"); @@ -3868,7 +3868,7 @@ private boolean _isdigit(char ch) { if (ch < 256) { - return BaseBytes.isdigit((byte)ch); + return BaseBytes.isdigit((byte) ch); } else { // This is an internal error. Really, the test should be unnecessary. throw new java.lang.IllegalArgumentException("non-byte character in PyString"); @@ -3945,7 +3945,7 @@ private boolean _isspace(char ch) { if (ch < 256) { - return BaseBytes.isspace((byte)ch); + return BaseBytes.isspace((byte) ch); } else { // This is an internal error. Really, the test should be unnecessary. throw new java.lang.IllegalArgumentException("non-byte character in PyString"); @@ -4088,7 +4088,7 @@ // Check for "{}".format(u"abc") if (fieldObj instanceof PyUnicode && !(this instanceof PyUnicode)) { // Down-convert to PyString, at the risk of raising UnicodeEncodingError - fieldObj = ((PyUnicode)fieldObj).__str__(); + fieldObj = ((PyUnicode) fieldObj).__str__(); } // The format_spec may be simple, or contained nested replacement fields. @@ -4156,11 +4156,11 @@ Object key = chunk.value; if (chunk.is_attr) { // key must be a String - obj = obj.__getattr__((String)key); + obj = obj.__getattr__((String) key); } else { if (key instanceof Integer) { // Can this happen? - obj = obj.__getitem__(((Integer)key).intValue()); + obj = obj.__getitem__(((Integer) key).intValue()); } else { obj = obj.__getitem__(new PyString(key.toString())); } @@ -4412,7 +4412,7 @@ if (c == '*') { PyObject o = getarg(); if (o instanceof PyInteger) { - return ((PyInteger)o).getValue(); + return ((PyInteger) o).getValue(); } throw Py.TypeError("* wants int"); } else { @@ -4533,7 +4533,7 @@ if (arg instanceof PyUnicode) { // arg is already acceptable. 
needUnicode = true; - return (PyUnicode)arg; + return (PyUnicode) arg; } else if (needUnicode) { // The string being built is unicode, so we need that version of the arg. @@ -4541,7 +4541,7 @@ } else if (arg instanceof PyString) { // The string being built is not unicode, so arg is already acceptable. - return (PyString)arg; + return (PyString) arg; } else { // The string being built is not unicode, so use __str__ to get a PyString. @@ -4572,11 +4572,10 @@ } else { // Not a tuple, but possibly still some kind of container: use special argIndex values. argIndex = -1; - if (args instanceof AbstractDict - || (!(args instanceof PySequence) && - // See issue 2511: __getitem__ should be looked up directly in the dict, rather - // than going through another __getattr__ call. We achieve this by using - // object___findattr__ instead of generic __findattr__. + if (args instanceof AbstractDict || (!(args instanceof PySequence) && + // See issue 2511: __getitem__ should be looked up directly in the dict, rather + // than going through another __getattr__ call. We achieve this by using + // object___findattr__ instead of generic __findattr__. args.object___findattr__("__getitem__".intern()) != null)) { dict = args; argIndex = -3; @@ -4775,7 +4774,7 @@ needUnicode = true; fi.setBytes(false); } - fi.format(((PyString)arg).getString().codePointAt(0)); + fi.format(((PyString) arg).getString().codePointAt(0)); } } else { @@ -4784,14 +4783,14 @@ // We have to check what we got back. 
if (argAsNumber instanceof PyInteger) { - fi.format(((PyInteger)argAsNumber).getValue()); + fi.format(((PyInteger) argAsNumber).getValue()); } else if (argAsNumber instanceof PyLong) { - fi.format(((PyLong)argAsNumber).getValue()); + fi.format(((PyLong) argAsNumber).getValue()); } else { // It couldn't be converted, raise the error here - throw Py.TypeError("%" + spec.type - + " format: a number is required, not " - + arg.getType().fastGetName()); + throw Py.TypeError( + "%" + spec.type + " format: a number is required, not " + + arg.getType().fastGetName()); } } @@ -4814,11 +4813,11 @@ // We have to check what we got back.. if (argAsFloat instanceof PyFloat) { - ff.format(((PyFloat)argAsFloat).getValue()); + ff.format(((PyFloat) argAsFloat).getValue()); } else { // It couldn't be converted, raise the error here - throw Py.TypeError("float argument required, not " - + arg.getType().fastGetName()); + throw Py.TypeError( + "float argument required, not " + arg.getType().fastGetName()); } break; diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java --- a/src/org/python/core/PyUnicode.java +++ b/src/org/python/core/PyUnicode.java @@ -8,8 +8,6 @@ import java.util.List; import java.util.Set; -import com.google.common.base.CharMatcher; - import org.python.core.stringlib.FieldNameIterator; import org.python.core.stringlib.MarkupIterator; import org.python.expose.ExposedMethod; @@ -19,6 +17,8 @@ import org.python.modules._codecs; import org.python.util.Generic; +import com.google.common.base.CharMatcher; + /** * a builtin python unicode string. 
*/ @@ -592,9 +592,8 @@ @ExposedNew final static PyObject unicode_new(PyNewWrapper new_, boolean init, PyType subtype, PyObject[] args, String[] keywords) { - ArgParser ap = - new ArgParser("unicode", args, keywords, new String[] {"string", "encoding", - "errors"}, 0); + ArgParser ap = new ArgParser("unicode", args, keywords, + new String[] {"string", "encoding", "errors"}, 0); PyObject S = ap.getPyObject(0, null); String encoding = checkEncoding(ap.getString(1, null)); String errors = checkEncoding(ap.getString(2, null)); @@ -603,15 +602,15 @@ return new PyUnicode(""); } if (S instanceof PyUnicode) { - return new PyUnicode(((PyUnicode)S).getString()); + return new PyUnicode(((PyUnicode) S).getString()); } if (S instanceof PyString) { if (S.getType() != PyString.TYPE && encoding == null && errors == null) { return S.__unicode__(); } - PyObject decoded = codecs.decode((PyString)S, encoding, errors); + PyObject decoded = codecs.decode((PyString) S, encoding, errors); if (decoded instanceof PyUnicode) { - return new PyUnicode((PyUnicode)decoded); + return new PyUnicode((PyUnicode) decoded); } else { throw Py.TypeError("decoder did not return an unicode object (type=" + decoded.getType().fastGetName() + ")"); @@ -623,7 +622,7 @@ return new PyUnicodeDerived(subtype, Py.EmptyString); } if (S instanceof PyUnicode) { - return new PyUnicodeDerived(subtype, (PyUnicode)S); + return new PyUnicodeDerived(subtype, (PyUnicode) S); } else { return new PyUnicodeDerived(subtype, S.__str__()); } @@ -910,12 +909,12 @@ */ private PyUnicode coerceToUnicode(PyObject o) { if (o instanceof PyUnicode) { - return (PyUnicode)o; + return (PyUnicode) o; } else if (o instanceof PyString) { - return new PyUnicode(((PyString)o).getString(), true); + return new PyUnicode(((PyString) o).getString(), true); } else if (o instanceof BufferProtocol) { // PyByteArray, PyMemoryView, Py2kBuffer ... 
- try (PyBuffer buf = ((BufferProtocol)o).getBuffer(PyBUF.FULL_RO)) { + try (PyBuffer buf = ((BufferProtocol) o).getBuffer(PyBUF.FULL_RO)) { return new PyUnicode(buf.toString(), true); } } else { @@ -969,9 +968,9 @@ final PyObject unicode___add__(PyObject other) { PyUnicode otherUnicode; if (other instanceof PyUnicode) { - otherUnicode = (PyUnicode)other; + otherUnicode = (PyUnicode) other; } else if (other instanceof PyString) { - otherUnicode = (PyUnicode)((PyString)other).decode(); + otherUnicode = (PyUnicode) ((PyString) other).decode(); } else { return null; } @@ -1094,9 +1093,9 @@ if (o == null) { return null; } else if (o instanceof PyUnicode) { - return (PyUnicode)o; + return (PyUnicode) o; } else if (o instanceof PyString) { - return new PyUnicode(((PyString)o).decode().toString()); + return new PyUnicode(((PyString) o).decode().toString()); } else if (o == Py.None) { return null; } else { @@ -1121,8 +1120,8 @@ } // Not basic plane: have to do real Unicode - return new PyUnicode(new ReversedIterator(new StripIterator(sep, new ReversedIterator<>( - new StripIterator(sep, newSubsequenceIterator()))))); + return new PyUnicode(new ReversedIterator(new StripIterator(sep, + new ReversedIterator<>(new StripIterator(sep, newSubsequenceIterator()))))); } @ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode_lstrip_doc) @@ -1162,8 +1161,8 @@ } // Not basic plane: have to do real Unicode - return new PyUnicode(new ReversedIterator(new StripIterator(sep, - new ReversedIterator<>(newSubsequenceIterator())))); + return new PyUnicode(new ReversedIterator( + new StripIterator(sep, new ReversedIterator<>(newSubsequenceIterator())))); } @Override @@ -1484,8 +1483,8 @@ } int[] indices = super.translateIndices(start, end); // do not convert to utf-16 indices. 
int count = 0; - for (Iterator mainIter = newSubsequenceIterator(indices[0], indices[1], 1); mainIter - .hasNext();) { + for (Iterator mainIter = + newSubsequenceIterator(indices[0], indices[1], 1); mainIter.hasNext();) { int matched = sub.getCodePointCount(); for (Iterator subIter = sub.newSubsequenceIterator(); mainIter.hasNext() && subIter.hasNext();) { @@ -1661,7 +1660,7 @@ SplitIterator iter = newSplitIterator(oldPiece, count); int numSplits = 0; while (iter.hasNext()) { - buffer.append(((PyUnicode)iter.next()).getString()); + buffer.append(((PyUnicode) iter.next()).getString()); if (iter.hasNext()) { buffer.append(newPiece.getString()); } @@ -1750,7 +1749,7 @@ for (Iterator iter = newSubsequenceIterator(); iter.hasNext();) { int codePoint = iter.next(); if (!(Character.isLetterOrDigit(codePoint) || // - Character.getType(codePoint) == Character.LETTER_NUMBER)) { + Character.getType(codePoint) == Character.LETTER_NUMBER)) { return false; } } -- Repository URL: https://hg.python.org/jython From jython-checkins at python.org Tue Nov 21 17:39:10 2017 From: jython-checkins at python.org (jeff.allen) Date: Tue, 21 Nov 2017 22:39:10 +0000 Subject: [Jython-checkins] =?utf-8?q?jython=3A_Respect_default_encoding_w?= =?utf-8?q?hen_coercing_str_to_unicode_=28addresses_=232638=29=2E?= Message-ID: <20171121223908.45462.A964147CB55D5E15@mg.python.org> https://hg.python.org/jython/rev/78482073e91f changeset: 8138:78482073e91f user: Jeff Allen date: Sun Nov 19 08:32:18 2017 +0000 summary: Respect default encoding when coercing str to unicode (addresses #2638). This change corrects the implicit ascii or latin-1 assumption made when accepting arguments in PyUnicode and PyString methods, adds tests to test_unicode_jy, and makes small consequential changes to other modules. The general effect is to allow, and decode, any byte buffer where a str is acceptable. This is more liberal than CPython, except in __eq__ and __ne__ which reproduce CPython limitations. 
It is not certain we have to do that. Further change is needed to support all comparison operations. files: Lib/test/test_bytes.py | 4 +- Lib/test/test_concat_jy.py | 7 +- Lib/test/test_import_jy.py | 10 +- Lib/test/test_unicode_jy.py | 274 ++++++++++ src/org/python/core/PyShadowString.java | 4 +- src/org/python/core/PyString.java | 244 +++++--- src/org/python/core/PyUnicode.java | 319 +++++++++-- src/org/python/core/__builtin__.java | 17 +- src/org/python/core/codecs.java | 2 +- src/org/python/core/imp.java | 22 +- 10 files changed, 726 insertions(+), 177 deletions(-) diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -217,7 +217,9 @@ self.assertEqual(b1 + bytes(b"def"), b"abcdef") self.assertEqual(bytes(b"def") + b1, b"defabc") self.assertRaises(TypeError, lambda: b1 + u"def") - self.assertRaises(TypeError, lambda: u"abc" + b2) + # Jython treats unicode + bytearray the same way as unicode + str + #self.assertRaises(TypeError, lambda: u"abc" + b2) + self.assertEqual(u"def" + b1, u"defabc") # OK in Jython def test_repeat(self): for b in b"abc", self.type2test(b"abc"): diff --git a/Lib/test/test_concat_jy.py b/Lib/test/test_concat_jy.py --- a/Lib/test/test_concat_jy.py +++ b/Lib/test/test_concat_jy.py @@ -23,11 +23,10 @@ resType = unicode res = a.__add__(b) self.assertEquals(type(res), resType, - '%r is a %s, not a %s' % (res, type(res), - resType)) + '%r + %r -> %r is a %s, not a %s' % + (type(a), type(b), res, type(res), resType)) self.assertEquals(res, 'ab', - '%r (%s) != %r (%s)' % (res, type(res), 'ab', - str)) + '%r (%s) != %r (%s)' % (res, type(res), 'ab', str)) class StrUnicodeConcatOverridesTestCase(unittest.TestCase): diff --git a/Lib/test/test_import_jy.py b/Lib/test/test_import_jy.py --- a/Lib/test/test_import_jy.py +++ b/Lib/test/test_import_jy.py @@ -219,11 +219,17 @@ class UnicodeNamesTestCase(unittest.TestCase): + def test_import_non_ascii_module(self): + module = "m?d?l?" 
+ with self.assertRaises(ImportError) as cm: + __import__(module) + def test_import_unicode_module(self): + module = u"m?d?l?" with self.assertRaises(UnicodeEncodeError) as cm: - __import__("m?d?l?") + __import__(module) self.assertEqual(cm.exception.encoding, "ascii") - self.assertEqual(cm.exception.object, "m?d?l?") + self.assertEqual(cm.exception.object, module) self.assertEqual(cm.exception.reason, "ordinal not in range(128)") diff --git a/Lib/test/test_unicode_jy.py b/Lib/test/test_unicode_jy.py --- a/Lib/test/test_unicode_jy.py +++ b/Lib/test/test_unicode_jy.py @@ -900,6 +900,277 @@ self.assertEqual(2, len(s.split()), "no split made in " + repr(s)) self.assertEqual(2, len(s.rsplit()), "no rsplit made in " + repr(s)) +class EncodingContext(object): + """Context manager to save and restore the encoding. + + Use like this: + + with EncodingContext("utf-8"): + self.assertEqual("'caf\xc3\xa9'", u"'caf\xe9'") + """ + + def __init__(self, encoding): + if not hasattr(sys, "setdefaultencoding"): + reload(sys) + self.original_encoding = sys.getdefaultencoding() + sys.setdefaultencoding(encoding) + + def __enter__(self): + return self + + def __exit__(self, *ignore_exc): + sys.setdefaultencoding(self.original_encoding) + + +class DefaultDecodingTestCase(unittest.TestCase): + # Test use of default encoding to coerce str to unicode + + def test_add(self): + ref = u'caf? cr?me' + s1 = ref[:4].encode(self.encoding) + s2 = ref[4:].encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual( s1 + ref[4:], ref) + self.assertEqual( ref[:4] + s2, ref) + + def test_in(self): + ref = u'caf? cr?me' + with EncodingContext(self.encoding): + self.assertTrue(u'?'.encode(self.encoding) in ref) + self.assertTrue(u'f?'.encode(self.encoding) in ref) + # Fails if the string is interpreted as code points. + if self.encoding != 'latin-1': + self.assertFalse('\xc3\xa9' in u'caf\xc3\xa9') + + def test_eq(self): + ref = u'caf? 
cr?me' + b = ref.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertTrue(ref == b) + self.assertTrue(b == ref) + + def test_ne(self): + with EncodingContext(self.encoding): + # Fails if the string is interpreted as code points. + if self.encoding != 'latin-1': + self.assertFalse(u'caf\xc3\xa9'== 'caf\xc3\xa9') + self.assertFalse('caf\xc3\xa9' == u'caf\xc3\xa9') + + def test_count(self): + ref = u'Le caf? des f?es ?gar?es' + with EncodingContext(self.encoding): + self.assertEqual(ref.count(u'?'.encode(self.encoding)), 4) + self.assertEqual(ref.count(u'f?'.encode(self.encoding)), 2) + + def test_endswith(self): + # Set up the test using unicode values and indices + ref = u'caf? cr?me' + s, u, v = ref[-4:], u'?m?', u'??e' + # Encode all this + enc = ref.encode(self.encoding) + u1, v1 = u.encode(self.encoding), v.encode(self.encoding) + s1 = s.encode(self.encoding) + + with EncodingContext(self.encoding): + # Test with single argument + self.assertFalse(ref.endswith(v1)) + self.assertTrue(ref.endswith(s1)) + # Test with a mixed tuple as the argument + self.assertFalse(ref.endswith((u1, u, v1, v))) + self.assertTrue(ref.endswith((u1, s1, v1))) + self.assertTrue(ref.endswith((u1, u, s1, v1, v))) + self.assertFalse(enc.endswith((u1, v1, u, v))) + self.assertTrue(enc.endswith((u, s, v))) + self.assertTrue(enc.endswith((u1, u, s, v1, v))) + + def test_endswith_slice(self): + # Set up the test using unicode values and indices + ref = u'?Un caf? cr?me??' + if len(u'??'.encode(self.encoding))!=2 and not test_support.is_jython: + # CPython fails on str.startswith(unicode, int, int) as it passes + # byte indices to unicode.startswith(unicode, int, int) unchanged. + # It only works if ? and ? encode to single bytes. Easier test: + ref = u'"Un caf? 
cr?me?"' + a, b = 4, -2 + s, u, v = ref[b-4:b], u'?m?', u'??e' + # Encode all this, including the indices + enc = ref.encode(self.encoding) + u1, v1 = u.encode(self.encoding), v.encode(self.encoding) + a1 = len(ref[:a].encode(self.encoding)) + b1 = - len(ref[b:].encode(self.encoding)) + s1 = s.encode(self.encoding) + + with EncodingContext(self.encoding): + # Test the assumption on which the test is based + self.assertEqual(ref[a:b], enc[a1:b1]) + # Test slice with single argument + self.assertFalse(ref.endswith(v1, a, b)) + self.assertTrue(ref.endswith(s1, a, b)) + self.assertFalse(enc.endswith(v1, a1, b1)) + self.assertTrue(enc.endswith(s, a1, b1)) + # CPython would pass: + #self.assertTrue(enc.endswith(s, a, b)) + # Test slice with a mixed tuple as the argument + self.assertFalse(ref.endswith((u1, u, v1, v), a, b)) + self.assertTrue(ref.endswith((u1, s1, v1), a, b)) + self.assertTrue(ref.endswith((u1, u, s1, v1, v), a, b)) + self.assertFalse(enc.endswith((u1, v1, u, v), a1, b1)) + self.assertTrue(enc.endswith((u, s, v), a1, b1)) + self.assertTrue(enc.endswith((u1, u, s, v1, v), a1, b1)) + # CPython would pass: + #self.assertTrue(enc.endswith((u, s, v), a, b)) + #self.assertTrue(enc.endswith((u1, u, s, v1, v), a, b)) + + def test_find(self): + ref = u'caf? cr?me' + sub = u'?'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.find(sub), 7) + + def test_index(self): + ref = u'caf? cr?me' + sub = u'?'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.index(sub), 7) + + def test_lstrip(self): + ref = u"??????du bl? ?" + sep = u'???'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.lstrip(sep), u"du bl? ?") + + def test_partition(self): + ref = u"Des f?es h?b?t?es." 
+ sep1 = u'?'.encode(self.encoding) + sep2 = u'?es'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.partition(sep1), (u"Des f", u"?", u"es h?b?t?es.")) + self.assertEqual(ref.partition(sep2), (u"Des f", u"?es", u" h?b?t?es.")) + + def test_replace(self): + ref = u"?t?." + a = u'?'.encode(self.encoding) + b = u'?'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.replace(a, b), u"?t?.") + self.assertEqual(ref.replace(b, a), u"?t?.") + + def test_rfind(self): + ref = u'caf? cr?me' + sub = u'?'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.rfind(sub), 3) + + def test_rindex(self): + ref = u'caf? cr?me' + sub = u'?'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.index(sub), 3) + + def test_rpartition(self): + ref = u"Des f?es h?b?t?es." + sep1 = u'?'.encode(self.encoding) + sep2 = u'?es'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.rpartition(sep1), (u"Des f?es h?b?t", u"?", u"es.")) + self.assertEqual(ref.rpartition(sep2), (u"Des f?es h?b?t", u"?es", u".")) + + def test_rsplit(self): + ref = u"Des f?es h?b?t?es." + sep1 = u'?'.encode(self.encoding) + sep2 = u'?es'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.rsplit(sep1, 3), [u"Des f?es h", u"b", u"t", u"es."]) + self.assertEqual(ref.rsplit(sep2), [u"Des f", u" h?b?t", u"."]) + + def test_rstrip(self): + ref = u"? du bl???????" + sep = u'???'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.rstrip(sep), u"? du bl?") + + def test_split(self): + ref = u"Des f?es h?b?t?es." 
+ sep1 = u'?'.encode(self.encoding) + sep2 = u'?es'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.split(sep1, 3), [u"Des f", u"es h", u"b", u"t?es."]) + self.assertEqual(ref.split(sep2), [u"Des f", u" h?b?t", u"."]) + + def test_startsswith(self): + # Set up the test using unicode values and indices + ref = u'caf? cr?me' + s, u, v = ref[:4], u'?af', u'caf?' + # Encode all this + enc = ref.encode(self.encoding) + u1, v1 = u.encode(self.encoding), v.encode(self.encoding) + s1 = s.encode(self.encoding) + + with EncodingContext(self.encoding): + self.assertFalse(ref.startswith(v1)) + self.assertTrue(ref.startswith(enc[:5])) + # Test with a mixed tuple as the argument + self.assertFalse(ref.startswith((u1, u, v1, v))) + self.assertTrue(ref.startswith((u1, enc[:5], v1))) + self.assertTrue(ref.startswith((u1, u, enc[:5], v1, v))) + self.assertFalse(enc.startswith((u1, v1, u, v))) + self.assertTrue(enc.startswith((u, ref[:4], v))) + self.assertTrue(enc.startswith((u1, u, ref[:4], v1, v))) + + def test_startsswith_slice(self): + # Set up the test using unicode values and indices + ref = u'?Un caf? cr?me??' + if len(u'??'.encode(self.encoding))!=2 and not test_support.is_jython: + # CPython fails on str.startswith(unicode, int, int) as it passes + # byte indices to unicode.startswith(unicode, int, int) unchanged. + # It only works if ? and ? encode to single bytes. Easier test: + ref = u'"Un caf? cr?me?"' + a, b = 4, -2 + s, u, v = ref[a:a+4], u'?af', u'caf?' 
+ # Encode all this, including the indices + enc = ref.encode(self.encoding) + u1, v1 = u.encode(self.encoding), v.encode(self.encoding) + a1 = len(ref[:a].encode(self.encoding)) + b1 = - len(ref[b:].encode(self.encoding)) + s1 = s.encode(self.encoding) + + with EncodingContext(self.encoding): + # Test the assumption on which the test is based + self.assertEqual(ref[a:b], enc[a1:b1]) + # Test slice with single argument + self.assertFalse(ref.startswith(v, a, b)) + self.assertTrue(ref.startswith(s1, a, b)) + self.assertFalse(enc.startswith(v1, a1, b1)) + self.assertTrue(enc.startswith(s, a1, b1)) + # CPython would pass: + #self.assertTrue(enc.startswith(s, a, b)) + # Test slice with a mixed tuple as the argument + self.assertFalse(ref.startswith((u1, u, v1, v), a, b)) + self.assertTrue(ref.startswith((u1, s1, v1), a, b)) + self.assertTrue(ref.startswith((u1, u, s1, v1, v), a, b)) + self.assertFalse(enc.startswith((u1, v1, u, v), a1, b1)) + self.assertTrue(enc.startswith((u, s, v), a1, b1)) + self.assertTrue(enc.startswith((u1, u, s, v1, v), a1, b1)) + # CPython would pass: + #self.assertTrue(enc.startswith((u, s, v), a, b)) + #self.assertTrue(enc.startswith((u1, u, s, v1, v), a, b)) + + def test_strip(self): + ref = u"??????du bl???????" 
+ sep = u'???'.encode(self.encoding) + with EncodingContext(self.encoding): + self.assertEqual(ref.strip(sep), u"du bl?") + + +class DefaultDecodingLatin1(DefaultDecodingTestCase): + encoding = "latin-1" + +class DefaultDecodingUTF8(DefaultDecodingTestCase): + encoding = "utf-8" + +class DefaultDecodingCp850(DefaultDecodingTestCase): + encoding = "cp850" + def test_main(): test_support.run_unittest( @@ -910,6 +1181,9 @@ UnicodeFormatStrTest, StringModuleUnicodeTest, UnicodeSpaceTest, + DefaultDecodingLatin1, + DefaultDecodingUTF8, + DefaultDecodingCp850, ) diff --git a/src/org/python/core/PyShadowString.java b/src/org/python/core/PyShadowString.java --- a/src/org/python/core/PyShadowString.java +++ b/src/org/python/core/PyShadowString.java @@ -251,7 +251,7 @@ if (!(prefix instanceof PyTuple)) { // It ought to be PyUnicode or some kind of bytes with the buffer API. - String s = asUTF16StringOrError(prefix); + String s = asU16BytesOrError(prefix); // If s is non-BMP, and this is a PyString (bytes), result will correctly be false. return sliceLen >= s.length() && (getString().startsWith(s, start) || shadow.startsWith(s, start)); @@ -259,7 +259,7 @@ // Loop will return true if this slice starts with any prefix in the tuple for (PyObject prefixObj : ((PyTuple)prefix).getArray()) { // It ought to be PyUnicode or some kind of bytes with the buffer API. - String s = asUTF16StringOrError(prefixObj); + String s = asU16BytesOrError(prefixObj); // If s is non-BMP, and this is a PyString (bytes), result will correctly be false. 
if (sliceLen >= s.length() && (getString().startsWith(s, start) || shadow.startsWith(s, start))) { diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java --- a/src/org/python/core/PyString.java +++ b/src/org/python/core/PyString.java @@ -5,6 +5,7 @@ import java.lang.ref.SoftReference; import java.math.BigInteger; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.regex.Matcher; @@ -81,6 +82,10 @@ this(TYPE, buffer.toString()); } + PyString(PyBuffer buffer) { + this(TYPE, buffer.toString()); + } + /** * Local-use constructor in which the client is allowed to guarantee that the * String argument contains only characters in the byte range. We do not then @@ -260,7 +265,7 @@ @Override public PyUnicode __unicode__() { - return new PyUnicode(this); + return new PyUnicode(this); // Decodes with default codec. } @Override @@ -720,8 +725,9 @@ return getString().compareTo(s) >= 0 ? Py.True : Py.False; } + /** Interpret the object as a Java String representing bytes or return null. */ private static String coerce(PyObject o) { - if (o instanceof PyString) { + if (o instanceof PyString && !(o instanceof PyUnicode)) { return o.toString(); } return null; @@ -841,17 +847,19 @@ } /** - * Return a String equivalent to the argument. This is a helper function to those methods that - * accept any byte array type (any object that supports a one-dimensional byte buffer), or - * accept a unicode argument which they interpret from its UTF-16 encoded form (the - * internal representation returned by {@link PyUnicode#getString()}). + * Return a Java String that is the Jython-internal equivalent of the byte-like + * argument (a str or any object that supports a one-dimensional byte buffer). If + * the argument is not acceptable (this includes a unicode argument) return null. 
* * @param obj to coerce to a String * @return coerced value or null if it can't be */ - private static String asUTF16StringOrNull(PyObject obj) { + private static String asU16BytesOrNull(PyObject obj) { if (obj instanceof PyString) { - // str or unicode object: go directly to the String + if (obj instanceof PyUnicode) { + return null; + } + // str but not unicode object: go directly to the String return ((PyString) obj).getString(); } else if (obj instanceof BufferProtocol) { // Other object with buffer API: briefly access the buffer @@ -869,23 +877,11 @@ * not a unicode. * * @param obj to coerce to a String - * @return coerced value or null if it can't be (including unicode) - */ - private static String asStringOrNull(PyObject obj) { - return (obj instanceof PyUnicode) ? null : asUTF16StringOrNull(obj); - } - - /** - * Return a String equivalent to the argument. This is a helper function to those methods that - * accept any byte array type (any object that supports a one-dimensional byte buffer), but - * not a unicode. - * - * @param obj to coerce to a String * @return coerced value * @throws PyException if the coercion fails (including unicode) */ - private static String asStringOrError(PyObject obj) throws PyException { - String ret = (obj instanceof PyUnicode) ? null : asUTF16StringOrNull(obj); + protected static String asU16BytesOrError(PyObject obj) throws PyException { + String ret = asU16BytesOrNull(obj); if (ret != null) { return ret; } else { @@ -906,12 +902,11 @@ * @return coerced value or null * @throws PyException if the coercion fails (including unicode) */ - private static String asStringNullOrError(PyObject obj, String name) throws PyException { - + private static String asU16BytesNullOrError(PyObject obj, String name) throws PyException { if (obj == null || obj == Py.None) { return null; } else { - String ret = (obj instanceof PyUnicode) ? 
null : asUTF16StringOrNull(obj); + String ret = asU16BytesOrNull(obj); if (ret != null) { return ret; } else if (name == null) { @@ -924,26 +919,6 @@ } } - /** - * Return a String equivalent to the argument according to the calling conventions of the - * certain methods of str. Those methods accept as a byte string anything bearing - * the buffer interface, or accept a unicode argument which they interpret from its - * UTF-16 encoded form (the internal representation returned by {@link PyUnicode#getString()}). - * - * @param obj to coerce to a String - * @return coerced value - * @throws PyException if the coercion fails - */ - protected static String asUTF16StringOrError(PyObject obj) { - // PyUnicode accepted here. Care required in the client if obj is not basic plane. - String ret = asUTF16StringOrNull(obj); - if (ret != null) { - return ret; - } else { - throw Py.TypeError("expected str, bytearray, unicode or buffer compatible object"); - } - } - @Override public boolean __contains__(PyObject o) { return str___contains__(o); @@ -951,8 +926,15 @@ @ExposedMethod(doc = BuiltinDocs.str___contains___doc) final boolean str___contains__(PyObject o) { - String other = asUTF16StringOrError(o); - return getString().indexOf(other) >= 0; + String other = asU16BytesOrNull(o); + if (other != null) { + return getString().indexOf(other) >= 0; + } else if (o instanceof PyUnicode) { + return decode().__contains__(o); + } else { + throw Py.TypeError("'in ' requires string as left operand, not " + + (o == null ? Py.None : o).getType().fastGetName()); + } } @Override @@ -1014,12 +996,12 @@ @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.str___add___doc) final PyObject str___add__(PyObject other) { // Expect other to be some kind of byte-like object. - String otherStr = asStringOrNull(other); + String otherStr = asU16BytesOrNull(other); if (otherStr != null) { // Yes it is: concatenate as strings, which are guaranteed byte-like. 
return new PyString(getString().concat(otherStr), true); } else if (other instanceof PyUnicode) { - // Convert self to PyUnicode and escalate the problem + // Escalate the problem to PyUnicode return decode().__add__(other); } else { // Allow PyObject._basic_add to pick up the pieces or raise informative error @@ -1237,7 +1219,7 @@ return ((PyUnicode) decode()).unicode_strip(chars); } else { // It ought to be None, null, some kind of bytes with the buffer API. - String stripChars = asStringNullOrError(chars, "strip"); + String stripChars = asU16BytesNullOrError(chars, "strip"); // Strip specified characters or whitespace if stripChars == null return new PyString(_strip(stripChars), true); } @@ -1407,7 +1389,7 @@ return ((PyUnicode) decode()).unicode_lstrip(chars); } else { // It ought to be None, null, some kind of bytes with the buffer API. - String stripChars = asStringNullOrError(chars, "lstrip"); + String stripChars = asU16BytesNullOrError(chars, "lstrip"); // Strip specified characters or whitespace if stripChars == null return new PyString(_lstrip(stripChars), true); } @@ -1496,7 +1478,7 @@ return ((PyUnicode) decode()).unicode_rstrip(chars); } else { // It ought to be None, null, some kind of bytes with the buffer API. - String stripChars = asStringNullOrError(chars, "rstrip"); + String stripChars = asU16BytesNullOrError(chars, "rstrip"); // Strip specified characters or whitespace if stripChars == null return new PyString(_rstrip(stripChars), true); } @@ -1617,7 +1599,7 @@ return ((PyUnicode) decode()).unicode_split(sepObj, maxsplit); } else { // It ought to be None, null, some kind of bytes with the buffer API. 
- String sep = asStringNullOrError(sepObj, "split"); + String sep = asU16BytesNullOrError(sepObj, "split"); // Split on specified string or whitespace if sep == null return _split(sep, maxsplit); } @@ -1868,7 +1850,7 @@ return ((PyUnicode) decode()).unicode_rsplit(sepObj, maxsplit); } else { // It ought to be None, null, some kind of bytes with the buffer API. - String sep = asStringNullOrError(sepObj, "rsplit"); + String sep = asU16BytesNullOrError(sepObj, "rsplit"); // Split on specified string or whitespace if sep == null return _rsplit(sep, maxsplit); } @@ -2056,7 +2038,7 @@ } else { // It ought to be some kind of bytes with the buffer API. - String sep = asStringOrError(sepObj); + String sep = asU16BytesOrError(sepObj); if (sep.length() == 0) { throw Py.ValueError("empty separator"); @@ -2115,7 +2097,7 @@ } else { // It ought to be some kind of bytes with the buffer API. - String sep = asStringOrError(sepObj); + String sep = asU16BytesOrError(sepObj); if (sep.length() == 0) { throw Py.ValueError("empty separator"); @@ -2418,10 +2400,10 @@ final int str_count(PyObject subObj, PyObject start, PyObject end) { if (subObj instanceof PyUnicode) { // Promote the problem to a Unicode one - return ((PyUnicode) decode()).unicode_count(subObj, start, end); + return asUnicode(start, end).unicode_count(subObj, null, null); } else { // It ought to be some kind of bytes with the buffer API. - String sub = asStringOrError(subObj); + String sub = asU16BytesOrError(subObj); return _count(sub, start, end); } } @@ -2533,10 +2515,11 @@ final int str_find(PyObject subObj, PyObject start, PyObject end) { if (subObj instanceof PyUnicode) { // Promote the problem to a Unicode one + // XXX Questionable: return is a Unicode character index not byte index return ((PyUnicode) decode()).unicode_find(subObj, start, end); } else { - // It ought to be some kind of bytes with the buffer API. - String sub = asStringOrError(subObj); + // It ought to be a bytes-like object. 
+ String sub = asU16BytesOrError(subObj); return _find(sub, start, end); } } @@ -2641,7 +2624,7 @@ return ((PyUnicode) decode()).unicode_rfind(subObj, start, end); } else { // It ought to be some kind of bytes with the buffer API. - String sub = asStringOrError(subObj); + String sub = asU16BytesOrError(subObj); return _rfind(sub, start, end); } } @@ -3213,8 +3196,8 @@ return ((PyUnicode) decode()).unicode_replace(oldPieceObj, newPieceObj, count); } else { // Neither is a PyUnicode: both ought to be some kind of bytes with the buffer API. - String oldPiece = asStringOrError(oldPieceObj); - String newPiece = asStringOrError(newPieceObj); + String oldPiece = asU16BytesOrError(oldPieceObj); + String newPiece = asU16BytesOrError(newPieceObj); return _replace(oldPiece, newPiece, count); } } @@ -3401,7 +3384,7 @@ * false. */ public boolean startswith(PyObject prefix) { - return str_startswith(prefix, null, null); + return startswith(prefix, null, null); } /** @@ -3416,7 +3399,7 @@ * false. */ public boolean startswith(PyObject prefix, PyObject start) { - return str_startswith(prefix, start, null); + return startswith(prefix, start, null); } /** @@ -3438,28 +3421,49 @@ @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.str_startswith_doc) final boolean str_startswith(PyObject prefix, PyObject startObj, PyObject endObj) { + int[] indices = translateIndices(startObj, endObj); int start = indices[0]; int sliceLen = indices[1] - start; if (!(prefix instanceof PyTuple)) { - // It ought to be PyUnicode or some kind of bytes with the buffer API. - String s = asUTF16StringOrError(prefix); - // If s is non-BMP, and this is a PyString (bytes), result will correctly be false. - return sliceLen >= s.length() && getString().startsWith(s, start); + if (prefix instanceof PyUnicode) { + // Promote to a unicode problem on the decoded slice + return asUnicode(startObj, endObj).unicode_startswith(prefix, null, null); + } else { + // It ought to be a bytes-like object. 
+ String s = asU16BytesOrError(prefix); + return sliceLen >= s.length() && getString().startsWith(s, start); + } } else { - // Loop will return true if this slice starts with any prefix in the tuple - for (PyObject prefixObj : ((PyTuple) prefix).getArray()) { - // It ought to be PyUnicode or some kind of bytes with the buffer API. - String s = asUTF16StringOrError(prefixObj); - // If s is non-BMP, and this is a PyString (bytes), result will correctly be false. - if (sliceLen >= s.length() && getString().startsWith(s, start)) { - return true; + // It's a tuple so we have to iterate through the members. + PyObject[] prefixes = ((PyTuple) prefix).getArray(); + String string = getString(); + + // Test with only the bytes prefixes first and save the unicode ones + int unicodeCount = 0; + for (PyObject o : prefixes) { + if (o instanceof PyUnicode) { + // Pack the unicode prefixes to the start of the array without trying them + prefixes[unicodeCount++] = o; + } else { + // It ought to be a bytes-like object. + String s = asU16BytesOrError(o); + if (sliceLen >= s.length() && string.startsWith(s, start)) { + return true; + } } } - // None matched - return false; + + if (unicodeCount == 0) { + // Only bytes prefixes given and nothing matched + return false; + } else { + // There were unicode prefixes: test the decoded slice for them. + PyTuple t = new PyTuple(Arrays.copyOf(prefixes, unicodeCount)); + return asUnicode(startObj, endObj).unicode_startswith(t, null, null); + } } } @@ -3472,7 +3476,7 @@ * false. */ public boolean endswith(PyObject suffix) { - return str_endswith(suffix, null, null); + return endswith(suffix, null, null); } /** @@ -3487,7 +3491,7 @@ * false. 
*/ public boolean endswith(PyObject suffix, PyObject start) { - return str_endswith(suffix, start, null); + return endswith(suffix, start, null); } /** @@ -3511,26 +3515,45 @@ final boolean str_endswith(PyObject suffix, PyObject startObj, PyObject endObj) { int[] indices = translateIndices(startObj, endObj); - String substr = getString().substring(indices[0], indices[1]); if (!(suffix instanceof PyTuple)) { - // It ought to be PyUnicode or some kind of bytes with the buffer API. - String s = asUTF16StringOrError(suffix); - // If s is non-BMP, and this is a PyString (bytes), result will correctly be false. - return substr.endsWith(s); + if (suffix instanceof PyUnicode) { + // Promote to a unicode problem on the decoded slice + return asUnicode(startObj, endObj).unicode_endswith(suffix, null, null); + } else { + // It ought to be a bytes-like object. + String s = asU16BytesOrError(suffix); + return getString().substring(indices[0], indices[1]).endsWith(s); + } } else { - // Loop will return true if this slice ends with any suffix in the tuple - for (PyObject suffixObj : ((PyTuple) suffix).getArray()) { - // It ought to be PyUnicode or some kind of bytes with the buffer API. - String s = asUTF16StringOrError(suffixObj); - // If s is non-BMP, and this is a PyString (bytes), result will correctly be false. - if (substr.endsWith(s)) { - return true; + // It's a tuple so we have to iterate through the members. + PyObject[] suffixes = ((PyTuple) suffix).getArray(); + String string = getString().substring(indices[0], indices[1]); + + // Test with only the bytes suffixes first and save the unicode ones + int unicodeCount = 0; + for (PyObject o : suffixes) { + if (o instanceof PyUnicode) { + // Pack the unicode suffixes to the start of the array without trying them + suffixes[unicodeCount++] = o; + } else { + // It ought to be a bytes-like object. 
+ String s = asU16BytesOrError(o); + if (string.endsWith(s)) { + return true; + } } } - // None matched - return false; + + if (unicodeCount == 0) { + // Only bytes suffixes given and nothing matched + return false; + } else { + // There were unicode suffixes: test the decoded slice for them. + PyTuple t = new PyTuple(Arrays.copyOf(suffixes, unicodeCount)); + return asUnicode(startObj, endObj).unicode_endswith(t, null, null); + } } } @@ -3655,8 +3678,8 @@ @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.str_translate_doc) final String str_translate(PyObject tableObj, PyObject deletecharsObj) { // Accept anythiong withthe buffer API or null - String table = asStringNullOrError(tableObj, null); - String deletechars = asStringNullOrError(deletecharsObj, null); + String table = asU16BytesNullOrError(tableObj, null); + String deletechars = asU16BytesNullOrError(deletecharsObj, null); return _translate(table, deletechars); } @@ -4317,6 +4340,35 @@ public CharSequence subSequence(int start, int end) { return string.subSequence(start, end); } + + /** + * Decode this str object to a unicode, like + * __unicode__() but without the possibility it will be overridden. + * + * @return this as a unicode using the default encoding. + */ + private PyUnicode asUnicode() { + return new PyUnicode(this); + } + + /** + * Decode a slice of this str object to a unicode, using Python slice + * semantics and the default encoding. This supports the many library methods that accept + * slicing as part of the API, in the case where the calculation must be promoted due to a + * unicode argument. + * + * @param startObj start index (or null or None) + * @param endObj end index (or null or None) + * @return the slice as a unicode using the default encoding. 
+ */ + private PyUnicode asUnicode(PyObject startObj, PyObject endObj) { + if (startObj == null && endObj == null) { + return asUnicode(); + } else { + int[] indices = translateIndices(startObj, endObj); + return new PyUnicode(fromSubstring(indices[0], indices[1])); + } + } } diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java --- a/src/org/python/core/PyUnicode.java +++ b/src/org/python/core/PyUnicode.java @@ -715,19 +715,48 @@ return createInstance(buffer.toString()); } - @ExposedMethod(type = MethodType.CMP, doc = BuiltinDocs.unicode___getslice___doc) + @ExposedMethod(type = MethodType.CMP) final int unicode___cmp__(PyObject other) { + // XXX needs proper coercion like __eq__, then UCS-32 code point order :( return str___cmp__(other); } - @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc) - final PyObject unicode___eq__(PyObject other) { - return str___eq__(other); + @Override + public PyObject __eq__(PyObject other) { + return unicode___eq__(other); } - @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc) + @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___eq___doc) + final PyObject unicode___eq__(PyObject other) { + try { + String s = coerceForComparison(other); + if (s == null) { + return null; + } + return getString().equals(s) ? Py.True : Py.False; + } catch (PyException e) { + // Decoding failed: treat as unequal + return Py.False; + } + } + + @Override + public PyObject __ne__(PyObject other) { + return unicode___ne__(other); + } + + @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___ne___doc) final PyObject unicode___ne__(PyObject other) { - return str___ne__(other); + try { + String s = coerceForComparison(other); + if (s == null) { + return null; + } + return getString().equals(s) ? 
Py.False : Py.True; + } catch (PyException e) { + // Decoding failed: treat as unequal + return Py.True; + } } @ExposedMethod(doc = BuiltinDocs.unicode___hash___doc) @@ -900,53 +929,156 @@ } /** - * Helper used many times to "coerce" a method argument into a PyUnicode (which it - * may already be). A null or incoercible argument will raise a - * TypeError. + * Interpret the object as a Java String representing characters as UTF-16, or + * return null if the type does not admit this conversion. From a + * PyUnicode we return its internal string. A byte argument is decoded with the + * default encoding. + * + * @param o the object to coerce + * @return an equivalent String + */ + private static String coerceToStringOrNull(PyObject o) { + if (o instanceof PyUnicode) { + return ((PyUnicode) o).getString(); + } else if (o instanceof PyString) { + return ((PyString) o).decode().toString(); + } else if (o instanceof BufferProtocol) { + // PyByteArray, PyMemoryView, Py2kBuffer ... + // We ought to be able to call codecs.decode on o but see Issue #2164 + try (PyBuffer buf = ((BufferProtocol) o).getBuffer(PyBUF.FULL_RO)) { + PyString s = new PyString(buf); + // For any sensible codec, the return is unicode and toString() is getString(). + return s.decode().toString(); + } + } else { + // o is some type not allowed: + return null; + } + } + + /** + * Interpret the object as a Java String for use in comparison. The return + * represents characters as UTF-16. From a PyUnicode we return its internal string. + * A str and buffer argument is decoded with the default encoding. + * Equivalent to {@link #coerceToStringOrNull(PyObject)} allowing only the types supported in + * (C)Python unicode.__eq__. 
+ * + * @param o the object to coerce + * @return an equivalent String + */ + private static String coerceForComparison(PyObject o) { + if (o instanceof PyUnicode) { + return ((PyUnicode) o).getString(); + } else if (o instanceof PyString) { + return ((PyString) o).decode().toString(); + } else if (o instanceof Py2kBuffer) { + // We ought to be able to call codecs.decode on o but see Issue #2164 + try (PyBuffer buf = ((BufferProtocol) o).getBuffer(PyBUF.FULL_RO)) { + PyString s = new PyString(buf); + // For any sensible codec, the return is unicode and toString() is getString(). + return s.decode().toString(); + } + } else { + // o is some type not allowed: + return null; + } + } + + /** + * Interpret the object as a Java String representing characters as UTF-16, or + * raise an error if the type does not admit this conversion. A byte argument is decoded with + * the default encoding. + * + * @param o the object to coerce + * @return an equivalent String (and never null) + */ + private static String coerceToString(PyObject o) { + String s = coerceToStringOrNull(o); + if (s == null) { + throw errorCoercingToUnicode(o); + } + return s; + } + + /** + * Interpret the object as a Java String representing characters as UTF-16, or + * optionally as null (for a null or None argument if the + * second argument is true). Raise an error if the type does not admit this + * conversion. + * + * @param o the object to coerce + * @param allowNullArgument iff true allow a null or none argument + * @return an equivalent String or null + */ + private static String coerceToString(PyObject o, boolean allowNullArgument) { + if (allowNullArgument && (o == null || o == Py.None)) { + return null; + } else { + return coerceToString(o); + } + } + + /** Construct exception "coercing to Unicode: ..." */ + private static PyException errorCoercingToUnicode(PyObject o) { + return Py.TypeError("coercing to Unicode: need string or buffer, " + + (o == null ? 
Py.None : o).getType().fastGetName() + " found"); + } + + /** + * Interpret the object as a PyUnicode, or return null if the type + * does not admit this conversion. From a PyUnicode we return itself. A byte + * argument is decoded with the default encoding. * * @param o the object to coerce * @return an equivalent PyUnicode (or o itself) */ - private PyUnicode coerceToUnicode(PyObject o) { + private static PyUnicode coerceToUnicodeOrNull(PyObject o) { if (o instanceof PyUnicode) { return (PyUnicode) o; } else if (o instanceof PyString) { - return new PyUnicode(((PyString) o).getString(), true); + // For any sensible codec, the return here is unicode. + PyObject u = ((PyString) o).decode(); + return (u instanceof PyUnicode) ? (PyUnicode) u : new PyUnicode(o.toString()); } else if (o instanceof BufferProtocol) { // PyByteArray, PyMemoryView, Py2kBuffer ... + // We ought to be able to call codecs.decode on o but see Issue #2164 try (PyBuffer buf = ((BufferProtocol) o).getBuffer(PyBUF.FULL_RO)) { - return new PyUnicode(buf.toString(), true); + PyString s = new PyString(buf); + // For any sensible codec, the return is unicode and toString() is getString(). + PyObject u = s.decode(); + return (u instanceof PyUnicode) ? (PyUnicode) u : new PyUnicode(o.toString()); } } else { // o is some type not allowed: - if (o == null) { - // Do something safe and approximately right - o = Py.None; - } - throw Py.TypeError("coercing to Unicode: need string or buffer, " - + o.getType().fastGetName() + " found"); + return null; } } /** - * Helper used many times to "coerce" a method argument into a PyUnicode (which it - * may already be). A null argument or a PyNone causes - * null to be returned. + * Interpret the object as a PyUnicode, or raise a TypeError if the + * type does not admit this conversion. From a PyUnicode we return itself. A byte + * argument is decoded with the default encoding. 
* * @param o the object to coerce - * @return an equivalent PyUnicode (or o itself, or null) + * @return an equivalent PyUnicode (or o itself) */ - private PyUnicode coerceToUnicodeOrNull(PyObject o) { - if (o == null || o == Py.None) { - return null; - } else { - return coerceToUnicode(o); + private static PyUnicode coerceToUnicode(PyObject o) { + PyUnicode u = coerceToUnicodeOrNull(o); + if (u == null) { + throw errorCoercingToUnicode(o); } + return u; + } + + @Override + public boolean __contains__(PyObject o) { + return unicode___contains__(o); } @ExposedMethod(doc = BuiltinDocs.unicode___contains___doc) final boolean unicode___contains__(PyObject o) { - return str___contains__(o); + String other = coerceToString(o); + return getString().indexOf(other) >= 0; } @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc) @@ -966,15 +1098,9 @@ @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___getslice___doc) final PyObject unicode___add__(PyObject other) { - PyUnicode otherUnicode; - if (other instanceof PyUnicode) { - otherUnicode = (PyUnicode) other; - } else if (other instanceof PyString) { - otherUnicode = (PyUnicode) ((PyString) other).decode(); - } else { - return null; - } - return new PyUnicode(getString().concat(otherUnicode.getString())); + // Interpret other as a Java String + String s = coerceToStringOrNull(other); + return s == null ? null : new PyUnicode(getString().concat(s)); } @ExposedMethod(doc = BuiltinDocs.unicode_lower_doc) @@ -1077,25 +1203,25 @@ } } - // compliance requires that we need to support a bit of inconsistency - // compared to other coercion used + // Compliance requires a bit of inconsistency with other coercions used. /** * Helper used in .strip() to "coerce" a method argument into a * PyUnicode (which it may already be). A null argument or a * PyNone causes null to be returned. A buffer type is not acceptable * to (Unicode) .strip(). 
This is the difference from - * {@link #coerceToUnicodeOrNull(PyObject)}. + * {@link #coerceToUnicode(PyObject, boolean)}. * * @param o the object to coerce * @return an equivalent PyUnicode (or o itself, or null) */ - private PyUnicode coerceStripSepToUnicode(PyObject o) { + private static PyUnicode coerceStripSepToUnicode(PyObject o) { if (o == null) { return null; } else if (o instanceof PyUnicode) { return (PyUnicode) o; } else if (o instanceof PyString) { - return new PyUnicode(((PyString) o).decode().toString()); + PyObject u = ((PyString) o).decode(); + return (u instanceof PyUnicode) ? (PyUnicode) u : new PyUnicode(u.toString()); } else if (o == Py.None) { return null; } else { @@ -1431,9 +1557,9 @@ @ExposedMethod(defaults = {"null", "-1"}, doc = BuiltinDocs.unicode_split_doc) final PyList unicode_split(PyObject sepObj, int maxsplit) { - PyUnicode sep = coerceToUnicodeOrNull(sepObj); + String sep = coerceToString(sepObj, true); if (sep != null) { - return _split(sep.getString(), maxsplit); + return _split(sep, maxsplit); } else { return _split(null, maxsplit); } @@ -1441,9 +1567,9 @@ @ExposedMethod(defaults = {"null", "-1"}, doc = BuiltinDocs.unicode_rsplit_doc) final PyList unicode_rsplit(PyObject sepObj, int maxsplit) { - PyUnicode sep = coerceToUnicodeOrNull(sepObj); + String sep = coerceToString(sepObj, true); if (sep != null) { - return _rsplit(sep.getString(), maxsplit); + return _rsplit(sep, maxsplit); } else { return _rsplit(null, maxsplit); } @@ -1452,7 +1578,6 @@ @ExposedMethod(defaults = "false", doc = BuiltinDocs.unicode___getslice___doc) final PyList unicode_splitlines(boolean keepends) { return new PyList(new LineSplitIterator(keepends)); - } @Override @@ -1463,16 +1588,16 @@ @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_index_doc) final int unicode_index(PyObject subObj, PyObject start, PyObject end) { - final PyUnicode sub = coerceToUnicode(subObj); - // Now use the mechanics of the PyString on the UTF-16 of the 
PyUnicode. - return checkIndex(_find(sub.getString(), start, end)); + final String sub = coerceToString(subObj); + // Now use the mechanics of the PyString on the UTF-16. + return checkIndex(_find(sub, start, end)); } @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_index_doc) final int unicode_rindex(PyObject subObj, PyObject start, PyObject end) { - final PyUnicode sub = coerceToUnicode(subObj); - // Now use the mechanics of the PyString on the UTF-16 of the PyUnicode. - return checkIndex(_rfind(sub.getString(), start, end)); + final String sub = coerceToString(subObj); + // Now use the mechanics of the PyString on the UTF-16. + return checkIndex(_rfind(sub, start, end)); } @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_count_doc) @@ -1492,7 +1617,6 @@ break; } matched--; - } if (matched == 0) { count++; @@ -1503,13 +1627,13 @@ @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_find_doc) final int unicode_find(PyObject subObj, PyObject start, PyObject end) { - int found = _find(coerceToUnicode(subObj).getString(), start, end); + int found = _find(coerceToString(subObj), start, end); return found < 0 ? -1 : translator.codePointIndex(found); } @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_rfind_doc) final int unicode_rfind(PyObject subObj, PyObject start, PyObject end) { - int found = _rfind(coerceToUnicode(subObj).getString(), start, end); + int found = _rfind(coerceToString(subObj), start, end); return found < 0 ? -1 : translator.codePointIndex(found); } @@ -1685,14 +1809,89 @@ return unicodeJoin(seq); } + /** + * Equivalent to the Python unicode.startswith method, testing whether a string + * starts with a specified prefix, where a sub-range is specified by [start:end]. + * Arguments start and end are interpreted as in slice notation, with + * null or {@link Py#None} representing "missing". prefix can also be a tuple of + * prefixes to look for. 
+ * + * @param prefix string to check for (or a PyTuple of them). + * @param start start of slice. + * @param end end of slice. + * @return true if this string slice starts with a specified prefix, otherwise + * false. + */ + @Override + public boolean startswith(PyObject prefix, PyObject start, PyObject end) { + return unicode_startswith(prefix, start, end); + } + @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_startswith_doc) - final boolean unicode_startswith(PyObject prefix, PyObject start, PyObject end) { - return str_startswith(prefix, start, end); + final boolean unicode_startswith(PyObject prefix, PyObject startObj, PyObject endObj) { + int[] indices = translateIndices(startObj, endObj); + int start = indices[0]; + int sliceLen = indices[1] - start; + + if (!(prefix instanceof PyTuple)) { + // It ought to be PyUnicode or some kind of bytes with the buffer API to decode. + String s = coerceToString(prefix); + return sliceLen >= s.length() && getString().startsWith(s, start); + + } else { + // Loop will return true if this slice starts with any prefix in the tuple + for (PyObject prefixObj : ((PyTuple) prefix).getArray()) { + // It ought to be PyUnicode or some kind of bytes with the buffer API. + String s = coerceToString(prefixObj); + if (sliceLen >= s.length() && getString().startsWith(s, start)) { + return true; + } + } + // None matched + return false; + } + } + + /** + * Equivalent to the Python unicode.endswith method, testing whether a string ends + * with a specified suffix, where a sub-range is specified by [start:end]. + * Arguments start and end are interpreted as in slice notation, with + * null or {@link Py#None} representing "missing". suffix can also be a tuple of + * suffixes to look for. + * + * @param suffix string to check for (or a PyTuple of them). + * @param start start of slice. + * @param end end of slice. + * @return true if this string slice ends with a specified suffix, otherwise + * false. 
+ */ + @Override + public boolean endswith(PyObject suffix, PyObject start, PyObject end) { + return unicode_endswith(suffix, start, end); } @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_endswith_doc) - final boolean unicode_endswith(PyObject suffix, PyObject start, PyObject end) { - return str_endswith(suffix, start, end); + final boolean unicode_endswith(PyObject suffix, PyObject startObj, PyObject endObj) { + int[] indices = translateIndices(startObj, endObj); + String substr = getString().substring(indices[0], indices[1]); + + if (!(suffix instanceof PyTuple)) { + // It ought to be PyUnicode or some kind of bytes with the buffer API. + String s = coerceToString(suffix); + return substr.endsWith(s); + + } else { + // Loop will return true if this slice ends with any suffix in the tuple + for (PyObject suffixObj : ((PyTuple) suffix).getArray()) { + // It ought to be PyUnicode or some kind of bytes with the buffer API. + String s = coerceToString(suffixObj); + if (substr.endsWith(s)) { + return true; + } + } + // None matched + return false; + } } @ExposedMethod(doc = BuiltinDocs.unicode_translate_doc) diff --git a/src/org/python/core/__builtin__.java b/src/org/python/core/__builtin__.java --- a/src/org/python/core/__builtin__.java +++ b/src/org/python/core/__builtin__.java @@ -1267,17 +1267,22 @@ "is the number of parent directories to search relative to the current module."); } + private static final String[] ARGS = {"name", "globals", "locals", "fromlist", "level"}; + @Override public PyObject __call__(PyObject args[], String keywords[]) { - ArgParser ap = new ArgParser("__import__", args, keywords, - new String[] {"name", "globals", "locals", "fromlist", - "level"}, - 1); - String module = ap.getString(0); + ArgParser ap = new ArgParser("__import__", args, keywords, ARGS, 1); + PyObject module = ap.getPyObject(0); + String name; + if (module instanceof PyUnicode) { + name = ((PyUnicode) module).encode("ascii").toString(); + } else { + 
name = ap.getString(0); + } PyObject globals = ap.getPyObject(1, null); PyObject fromlist = ap.getPyObject(3, Py.EmptyTuple); int level = ap.getInt(4, imp.DEFAULT_LEVEL); - return imp.importName(module.intern(), fromlist == Py.None || fromlist.__len__() == 0, + return imp.importName(name.intern(), fromlist == Py.None || fromlist.__len__() == 0, globals, fromlist, level); } } diff --git a/src/org/python/core/codecs.java b/src/org/python/core/codecs.java --- a/src/org/python/core/codecs.java +++ b/src/org/python/core/codecs.java @@ -126,7 +126,7 @@ } private static PyUnicode wrapDecodeResult(String result) { - return new PyUnicode(result, true); + return new PyUnicode(result); } /** diff --git a/src/org/python/core/imp.java b/src/org/python/core/imp.java --- a/src/org/python/core/imp.java +++ b/src/org/python/core/imp.java @@ -412,7 +412,7 @@ * moduleLocation should be the full uri for c. */ public static PyObject createFromCode(String name, PyCode c, String moduleLocation) { - PyUnicode.checkEncoding(name); + checkName(name); PyModule module = addModule(name); PyBaseCode code = null; @@ -585,7 +585,7 @@ } static PyObject loadFromLoader(PyObject importer, String name) { - PyUnicode.checkEncoding(name); + checkName(name); PyObject load_module = importer.__getattr__("load_module"); ReentrantLock importLock = Py.getSystemState().getImportLock(); importLock.lock(); @@ -714,7 +714,7 @@ * @return the loaded module */ public static PyObject load(String name) { - PyUnicode.checkEncoding(name); + checkName(name); ReentrantLock importLock = Py.getSystemState().getImportLock(); importLock.lock(); try { @@ -986,6 +986,18 @@ } } + /** + * Enforce ASCII module name, as a guard on module names supplied as an argument. The parser + * guarantees the name from an actual import statement is a valid identifier. 
+ */ + private static void checkName(String name) { + for (int i = name.length(); i > 0;) { + if (name.charAt(--i) > 255) { + throw Py.ImportError("No module named " + name); + } + } + } + private static void ensureFromList(PyObject mod, PyObject fromlist, String name) { ensureFromList(mod, fromlist, name, false); } @@ -1029,7 +1041,7 @@ */ public static PyObject importName(String name, boolean top) { checkNotFile(name); - PyUnicode.checkEncoding(name); + checkName(name); ReentrantLock importLock = Py.getSystemState().getImportLock(); importLock.lock(); try { @@ -1050,7 +1062,7 @@ public static PyObject importName(String name, boolean top, PyObject modDict, PyObject fromlist, int level) { checkNotFile(name); - PyUnicode.checkEncoding(name); + checkName(name); ReentrantLock importLock = Py.getSystemState().getImportLock(); importLock.lock(); try { -- Repository URL: https://hg.python.org/jython From jython-checkins at python.org Tue Nov 21 17:39:10 2017 From: jython-checkins at python.org (jeff.allen) Date: Tue, 21 Nov 2017 22:39:10 +0000 Subject: [Jython-checkins] =?utf-8?q?jython=3A_Mixed_comparison_=28unicod?= =?utf-8?q?e=2C_str=29_respects_default_encoding_=28fixes_=232638=29=2E?= Message-ID: <20171121223909.73572.6FA802C02238A4C9@mg.python.org> https://hg.python.org/jython/rev/f71e0b2cfaf7 changeset: 8139:f71e0b2cfaf7 user: Jeff Allen date: Sun Nov 19 19:56:33 2017 +0000 summary: Mixed comparison (unicode, str) respects default encoding (fixes #2638). PyUnicode is given its own implementation of the rich comparison operators rather than inheriting from PyString (which is to treat encoded bytes as latin-1 characters). Corresponding tests are added to test_unicode_jy, where coverage of other byte types is also improved. 
files: Lib/test/test_unicode_jy.py | 466 +++++++++++----- src/org/python/core/PyUnicode.java | 73 ++- 2 files changed, 383 insertions(+), 156 deletions(-) diff --git a/Lib/test/test_unicode_jy.py b/Lib/test/test_unicode_jy.py --- a/Lib/test/test_unicode_jy.py +++ b/Lib/test/test_unicode_jy.py @@ -122,8 +122,8 @@ self.assertRaises(UnicodeDecodeError, '???'.join, [u'foo', u'bar']) def test_file_encoding(self): - '''Ensure file writing doesn't attempt to encode things by default and reading doesn't - decode things by default. This was jython's behavior prior to 2.2.1''' + # Ensure file writing doesn't attempt to encode things by default and reading doesn't + # decode things by default. This was jython's behavior prior to 2.2.1''' EURO_SIGN = u"\u20ac" try: EURO_SIGN.encode() @@ -852,6 +852,7 @@ self.assertRaises(ValueError, fmt.format, u"{0}", 10, 20, i=100) self.assertRaises(ValueError, fmt.format, u"{i}", 10, 20, i=100) + class UnicodeSpaceTest(unittest.TestCase): # Test classification of characters as whitespace (some Jython divergence) @@ -900,6 +901,7 @@ self.assertEqual(2, len(s.split()), "no split made in " + repr(s)) self.assertEqual(2, len(s.rsplit()), "no rsplit made in " + repr(s)) + class EncodingContext(object): """Context manager to save and restore the encoding. @@ -923,70 +925,196 @@ class DefaultDecodingTestCase(unittest.TestCase): - # Test use of default encoding to coerce str to unicode + # Test use of default encoding to coerce byte-like data to unicode + + BYTE_TYPES = (str, buffer, bytearray, memoryview) + BYTE_TYPES_COMPARE = (str, buffer) # Restricted as for CPython __eq__ etc. + + if not test_support.is_jython: + # CPython restricts the acceptable the byte-like types by context + BYTE_TYPES = (str, buffer) + BYTE_TYPES_COMPARE = (str, buffer) + + # Operators def test_add(self): + cs = self.encoding ref = u'caf? 
cr?me' - s1 = ref[:4].encode(self.encoding) - s2 = ref[4:].encode(self.encoding) - with EncodingContext(self.encoding): + s1 = ref[:4].encode(cs) + s2 = ref[4:].encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + b2 = B(s2) + self.assertEqual( ref[:4] + b2, ref) + # Really we're testing that str promotes. Other Bs may not. self.assertEqual( s1 + ref[4:], ref) - self.assertEqual( ref[:4] + s2, ref) def test_in(self): + cs = self.encoding ref = u'caf? cr?me' - with EncodingContext(self.encoding): - self.assertTrue(u'?'.encode(self.encoding) in ref) - self.assertTrue(u'f?'.encode(self.encoding) in ref) - # Fails if the string is interpreted as code points. - if self.encoding != 'latin-1': - self.assertFalse('\xc3\xa9' in u'caf\xc3\xa9') + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + self.assertTrue(B(u'?'.encode(cs)) in ref) + self.assertTrue(B(u'f?'.encode(cs)) in ref) + # Fails if the string is interpreted as code points. + if cs != 'latin-1': + self.assertFalse(B('\xc3\xa9') in u'caf\xc3\xa9') def test_eq(self): - ref = u'caf? cr?me' - b = ref.encode(self.encoding) - with EncodingContext(self.encoding): - self.assertTrue(ref == b) - self.assertTrue(b == ref) + cs = self.encoding + u = u"Un caf\xe9 cr\xe8me." + # Derive a string such that u1 != u and the encoded versions s, s1 + u1 = u.replace('cr', 'm') + s, s1 = u.encode(cs), u1.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES_COMPARE: + #print B, + b, b1 = B(s), B(s1) + self.assertTrue (u == b ) + self.assertTrue (b == u ) + self.assertFalse(u == b1) + self.assertFalse(b1== u ) + # Check not implicitly comparing as latin-1. + if cs != 'latin-1': + b = B('caf\xc3\xa9') + self.assertFalse(u'caf\xc3\xa9'== b) + self.assertFalse(b == u'caf\xc3\xa9') def test_ne(self): - with EncodingContext(self.encoding): - # Fails if the string is interpreted as code points. 
- if self.encoding != 'latin-1': - self.assertFalse(u'caf\xc3\xa9'== 'caf\xc3\xa9') - self.assertFalse('caf\xc3\xa9' == u'caf\xc3\xa9') + cs = self.encoding + u = u"Un caf\xe9 cr\xe8me." + # Derive a string such that u1 != u and the encoded versions s, s1 + u1 = u.replace('cr', 'm') + s, s1 = u.encode(cs), u1.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES_COMPARE: + #print B, + b, b1 = B(s), B(s1) + self.assertTrue (u != b1) + self.assertTrue (b != u1) + self.assertFalse(u != b ) + self.assertFalse(b != u ) + # Check not implicitly comparing as latin-1. + if cs != 'latin-1': + b = B('caf\xc3\xa9') + self.assertTrue(u'caf\xc3\xa9'!= b) + self.assertTrue(b != u'caf\xc3\xa9') + + def test_lt(self): + cs = self.encoding + u = u"Un caf\xe9 cr\xe8me." + # Derive strings such that u0 < u < u1 and their encodings + u0 = u.replace('cr', 'Cr') + u1 = u.replace('.', '?') + s0, s, s1 = u0.encode(cs), u.encode(cs), u1.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES_COMPARE: + #print B, + b0, b, b1 = B(s0), B(s), B(s1) + self.assertTrue (b0 < u ) + self.assertFalse(b < u ) + self.assertFalse(b1 < u ) + self.assertFalse(u < b0) + self.assertFalse(u < b ) + self.assertTrue (u < b1) + + def test_le(self): + cs = self.encoding + u = u"Un caf\xe9 cr\xe8me." + # Derive strings such that u0 < u < u1 and their encodings + u0 = u.replace('cr', 'Cr') + u1 = u.replace('.', '?') + s0, s, s1 = u0.encode(cs), u.encode(cs), u1.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES_COMPARE: + #print B, + b0, b, b1 = B(s0), B(s), B(s1) + self.assertTrue (b0 <= u ) + self.assertTrue (b <= u ) + self.assertFalse(b1 <= u ) + self.assertFalse(u <= b0) + self.assertTrue (u <= b ) + self.assertTrue (u <= b1) + + def test_gt(self): + cs = self.encoding + u = u"Un caf\xe9 cr\xe8me." 
+ # Derive strings such that u0 < u < u1 and their encodings + u0 = u.replace('cr', 'Cr') + u1 = u.replace('.', '?') + s0, s, s1 = u0.encode(cs), u.encode(cs), u1.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES_COMPARE: + #print B, + b0, b, b1 = B(s0), B(s), B(s1) + self.assertTrue (b1 > u ) + self.assertFalse(b > u ) + self.assertFalse(b0 > u ) + self.assertFalse(u > b1) + self.assertFalse(u > b ) + self.assertTrue (u > b0) + + def test_ge(self): + cs = self.encoding + u = u"Un caf\xe9 cr\xe8me." + # Derive strings such that u0 < u < u1 and their encodings + u0 = u.replace('cr', 'Cr') + u1 = u.replace('.', '?') + s0, s, s1 = u0.encode(cs), u.encode(cs), u1.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES_COMPARE: + #print B, + b0, b, b1 = B(s0), B(s), B(s1) + self.assertTrue (b1 >= u ) + self.assertTrue (b >= u ) + self.assertFalse(b0 >= u ) + self.assertFalse(u >= b1) + self.assertTrue (u >= b ) + self.assertTrue (u >= b0) + + + # Methods def test_count(self): + cs = self.encoding ref = u'Le caf? des f?es ?gar?es' - with EncodingContext(self.encoding): - self.assertEqual(ref.count(u'?'.encode(self.encoding)), 4) - self.assertEqual(ref.count(u'f?'.encode(self.encoding)), 2) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + self.assertEqual(ref.count(B(u'?'.encode(cs))), 4) + self.assertEqual(ref.count(B(u'f?'.encode(cs))), 2) def test_endswith(self): + cs = self.encoding # Set up the test using unicode values and indices ref = u'caf? 
cr?me' s, u, v = ref[-4:], u'?m?', u'??e' # Encode all this - enc = ref.encode(self.encoding) - u1, v1 = u.encode(self.encoding), v.encode(self.encoding) - s1 = s.encode(self.encoding) + enc = ref.encode(cs) + s1, u1, v1 = s.encode(cs), u.encode(cs), v.encode(cs) - with EncodingContext(self.encoding): - # Test with single argument - self.assertFalse(ref.endswith(v1)) - self.assertTrue(ref.endswith(s1)) - # Test with a mixed tuple as the argument - self.assertFalse(ref.endswith((u1, u, v1, v))) - self.assertTrue(ref.endswith((u1, s1, v1))) - self.assertTrue(ref.endswith((u1, u, s1, v1, v))) - self.assertFalse(enc.endswith((u1, v1, u, v))) - self.assertTrue(enc.endswith((u, s, v))) - self.assertTrue(enc.endswith((u1, u, s, v1, v))) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + sb, ub, vb = B(s1), B(u1), B(v1) + # Test with single argument + self.assertFalse(ref.endswith(vb)) + self.assertTrue(ref.endswith(sb)) + # Test with a mixed tuple as the argument + self.assertFalse(ref.endswith((ub, u, vb, v))) + self.assertTrue(ref.endswith((ub, sb, vb))) + self.assertTrue(ref.endswith((ub, u, sb, vb, v))) + self.assertFalse(enc.endswith((ub, vb, u, v))) + self.assertTrue(enc.endswith((u, s, v))) + self.assertTrue(enc.endswith((ub, u, s, vb, v))) def test_endswith_slice(self): + cs = self.encoding # Set up the test using unicode values and indices ref = u'?Un caf? cr?me??' - if len(u'??'.encode(self.encoding))!=2 and not test_support.is_jython: + if len(u'??'.encode(cs))!=2 and not test_support.is_jython: # CPython fails on str.startswith(unicode, int, int) as it passes # byte indices to unicode.startswith(unicode, int, int) unchanged. # It only works if ? and ? encode to single bytes. 
Easier test: @@ -994,133 +1122,170 @@ a, b = 4, -2 s, u, v = ref[b-4:b], u'?m?', u'??e' # Encode all this, including the indices - enc = ref.encode(self.encoding) - u1, v1 = u.encode(self.encoding), v.encode(self.encoding) - a1 = len(ref[:a].encode(self.encoding)) - b1 = - len(ref[b:].encode(self.encoding)) - s1 = s.encode(self.encoding) + enc = ref.encode(cs) + u1, v1 = u.encode(cs), v.encode(cs) + a1 = len(ref[:a].encode(cs)) + b1 = - len(ref[b:].encode(cs)) + s1 = s.encode(cs) - with EncodingContext(self.encoding): - # Test the assumption on which the test is based - self.assertEqual(ref[a:b], enc[a1:b1]) - # Test slice with single argument - self.assertFalse(ref.endswith(v1, a, b)) - self.assertTrue(ref.endswith(s1, a, b)) - self.assertFalse(enc.endswith(v1, a1, b1)) - self.assertTrue(enc.endswith(s, a1, b1)) - # CPython would pass: - #self.assertTrue(enc.endswith(s, a, b)) - # Test slice with a mixed tuple as the argument - self.assertFalse(ref.endswith((u1, u, v1, v), a, b)) - self.assertTrue(ref.endswith((u1, s1, v1), a, b)) - self.assertTrue(ref.endswith((u1, u, s1, v1, v), a, b)) - self.assertFalse(enc.endswith((u1, v1, u, v), a1, b1)) - self.assertTrue(enc.endswith((u, s, v), a1, b1)) - self.assertTrue(enc.endswith((u1, u, s, v1, v), a1, b1)) - # CPython would pass: - #self.assertTrue(enc.endswith((u, s, v), a, b)) - #self.assertTrue(enc.endswith((u1, u, s, v1, v), a, b)) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + sb, ub, vb = B(s1), B(u1), B(v1) + # Test the assumption on which the test is based + self.assertEqual(ref[a:b], enc[a1:b1]) + # Test slice with single argument + self.assertFalse(ref.endswith(vb, a, b)) + self.assertTrue(ref.endswith(sb, a, b)) + self.assertFalse(enc.endswith(vb, a1, b1)) + self.assertTrue(enc.endswith(s, a1, b1)) + # CPython would pass: + #self.assertTrue(enc.endswith(s, a, b)) + # Test slice with a mixed tuple as the argument + self.assertFalse(ref.endswith((ub, u, vb, v), a, b)) + 
self.assertTrue(ref.endswith((ub, sb, vb), a, b)) + self.assertTrue(ref.endswith((ub, u, sb, vb, v), a, b)) + self.assertFalse(enc.endswith((ub, vb, u, v), a1, b1)) + self.assertTrue(enc.endswith((u, s, v), a1, b1)) + self.assertTrue(enc.endswith((ub, u, s, vb, v), a1, b1)) + # CPython would pass: + #self.assertTrue(enc.endswith((u, s, v), a, b)) + #self.assertTrue(enc.endswith((ub, u, s, vb, v), a, b)) def test_find(self): + cs = self.encoding ref = u'caf? cr?me' - sub = u'?'.encode(self.encoding) - with EncodingContext(self.encoding): - self.assertEqual(ref.find(sub), 7) + sub = u'?'.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + self.assertEqual(ref.find(B(sub)), 7) def test_index(self): + cs = self.encoding ref = u'caf? cr?me' - sub = u'?'.encode(self.encoding) - with EncodingContext(self.encoding): - self.assertEqual(ref.index(sub), 7) + sub = u'?'.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + self.assertEqual(ref.index(B(sub)), 7) def test_lstrip(self): + cs = self.encoding ref = u"??????du bl? ?" - sep = u'???'.encode(self.encoding) - with EncodingContext(self.encoding): + sep = u'???'.encode(cs) + with EncodingContext(cs): self.assertEqual(ref.lstrip(sep), u"du bl? ?") def test_partition(self): + cs = self.encoding ref = u"Des f?es h?b?t?es." - sep1 = u'?'.encode(self.encoding) - sep2 = u'?es'.encode(self.encoding) - with EncodingContext(self.encoding): - self.assertEqual(ref.partition(sep1), (u"Des f", u"?", u"es h?b?t?es.")) - self.assertEqual(ref.partition(sep2), (u"Des f", u"?es", u" h?b?t?es.")) + sep1 = u'?'.encode(cs) + sep2 = u'?es'.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + self.assertEqual(ref.partition(B(sep1)), (u"Des f", u"?", u"es h?b?t?es.")) + self.assertEqual(ref.partition(B(sep2)), (u"Des f", u"?es", u" h?b?t?es.")) def test_replace(self): + cs = self.encoding ref = u"?t?." 
- a = u'?'.encode(self.encoding) - b = u'?'.encode(self.encoding) - with EncodingContext(self.encoding): - self.assertEqual(ref.replace(a, b), u"?t?.") - self.assertEqual(ref.replace(b, a), u"?t?.") + a = u'?'.encode(cs) + b = u'?'.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + self.assertEqual(ref.replace(B(a), B(b)), u"?t?.") + self.assertEqual(ref.replace(B(b), B(a)), u"?t?.") def test_rfind(self): + cs = self.encoding ref = u'caf? cr?me' - sub = u'?'.encode(self.encoding) - with EncodingContext(self.encoding): - self.assertEqual(ref.rfind(sub), 3) + sub = u'?'.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + self.assertEqual(ref.rfind(B(sub)), 3) def test_rindex(self): + cs = self.encoding ref = u'caf? cr?me' - sub = u'?'.encode(self.encoding) - with EncodingContext(self.encoding): - self.assertEqual(ref.index(sub), 3) + sub = u'?'.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + self.assertEqual(ref.index(B(sub)), 3) def test_rpartition(self): + cs = self.encoding ref = u"Des f?es h?b?t?es." - sep1 = u'?'.encode(self.encoding) - sep2 = u'?es'.encode(self.encoding) - with EncodingContext(self.encoding): - self.assertEqual(ref.rpartition(sep1), (u"Des f?es h?b?t", u"?", u"es.")) - self.assertEqual(ref.rpartition(sep2), (u"Des f?es h?b?t", u"?es", u".")) + sep1 = u'?'.encode(cs) + sep2 = u'?es'.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + self.assertEqual(ref.rpartition(B(sep1)), (u"Des f?es h?b?t", u"?", u"es.")) + self.assertEqual(ref.rpartition(B(sep2)), (u"Des f?es h?b?t", u"?es", u".")) def test_rsplit(self): + cs = self.encoding ref = u"Des f?es h?b?t?es." 
- sep1 = u'?'.encode(self.encoding) - sep2 = u'?es'.encode(self.encoding) - with EncodingContext(self.encoding): - self.assertEqual(ref.rsplit(sep1, 3), [u"Des f?es h", u"b", u"t", u"es."]) - self.assertEqual(ref.rsplit(sep2), [u"Des f", u" h?b?t", u"."]) + sep1 = u'?'.encode(cs) + sep2 = u'?es'.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + self.assertEqual(ref.rsplit(B(sep1), 3), [u"Des f?es h", u"b", u"t", u"es."]) + self.assertEqual(ref.rsplit(B(sep2)), [u"Des f", u" h?b?t", u"."]) def test_rstrip(self): + cs = self.encoding ref = u"? du bl???????" - sep = u'???'.encode(self.encoding) - with EncodingContext(self.encoding): + sep = u'???'.encode(cs) + with EncodingContext(cs): self.assertEqual(ref.rstrip(sep), u"? du bl?") def test_split(self): + cs = self.encoding ref = u"Des f?es h?b?t?es." - sep1 = u'?'.encode(self.encoding) - sep2 = u'?es'.encode(self.encoding) - with EncodingContext(self.encoding): - self.assertEqual(ref.split(sep1, 3), [u"Des f", u"es h", u"b", u"t?es."]) - self.assertEqual(ref.split(sep2), [u"Des f", u" h?b?t", u"."]) + sep1 = u'?'.encode(cs) + sep2 = u'?es'.encode(cs) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + self.assertEqual(ref.split(B(sep1), 3), [u"Des f", u"es h", u"b", u"t?es."]) + self.assertEqual(ref.split(B(sep2)), [u"Des f", u" h?b?t", u"."]) def test_startsswith(self): + cs = self.encoding # Set up the test using unicode values and indices ref = u'caf? cr?me' s, u, v = ref[:4], u'?af', u'caf?' 
# Encode all this - enc = ref.encode(self.encoding) - u1, v1 = u.encode(self.encoding), v.encode(self.encoding) - s1 = s.encode(self.encoding) + enc = ref.encode(cs) + u1, v1 = u.encode(cs), v.encode(cs) + s1 = s.encode(cs) - with EncodingContext(self.encoding): - self.assertFalse(ref.startswith(v1)) - self.assertTrue(ref.startswith(enc[:5])) - # Test with a mixed tuple as the argument - self.assertFalse(ref.startswith((u1, u, v1, v))) - self.assertTrue(ref.startswith((u1, enc[:5], v1))) - self.assertTrue(ref.startswith((u1, u, enc[:5], v1, v))) - self.assertFalse(enc.startswith((u1, v1, u, v))) - self.assertTrue(enc.startswith((u, ref[:4], v))) - self.assertTrue(enc.startswith((u1, u, ref[:4], v1, v))) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + sb, ub, vb, b5 = B(s1), B(u1), B(v1), B(enc[:5]) + self.assertFalse(ref.startswith(vb)) + self.assertTrue(ref.startswith(b5)) + # Test with a mixed tuple as the argument + self.assertFalse(ref.startswith((ub, u, vb, v))) + self.assertTrue(ref.startswith((ub, b5, vb))) + self.assertTrue(ref.startswith((ub, u, b5, vb, v))) + self.assertFalse(enc.startswith((ub, vb, u, v))) + self.assertTrue(enc.startswith((u, ref[:4], v))) + self.assertTrue(enc.startswith((ub, u, ref[:4], vb, v))) def test_startsswith_slice(self): + cs = self.encoding # Set up the test using unicode values and indices ref = u'?Un caf? cr?me??' - if len(u'??'.encode(self.encoding))!=2 and not test_support.is_jython: + if len(u'??'.encode(cs))!=2 and not test_support.is_jython: # CPython fails on str.startswith(unicode, int, int) as it passes # byte indices to unicode.startswith(unicode, int, int) unchanged. # It only works if ? and ? encode to single bytes. Easier test: @@ -1128,40 +1293,43 @@ a, b = 4, -2 s, u, v = ref[a:a+4], u'?af', u'caf?' 
# Encode all this, including the indices - enc = ref.encode(self.encoding) - u1, v1 = u.encode(self.encoding), v.encode(self.encoding) - a1 = len(ref[:a].encode(self.encoding)) - b1 = - len(ref[b:].encode(self.encoding)) - s1 = s.encode(self.encoding) + enc = ref.encode(cs) + u1, v1 = u.encode(cs), v.encode(cs) + a1 = len(ref[:a].encode(cs)) + b1 = - len(ref[b:].encode(cs)) + s1 = s.encode(cs) - with EncodingContext(self.encoding): - # Test the assumption on which the test is based - self.assertEqual(ref[a:b], enc[a1:b1]) - # Test slice with single argument - self.assertFalse(ref.startswith(v, a, b)) - self.assertTrue(ref.startswith(s1, a, b)) - self.assertFalse(enc.startswith(v1, a1, b1)) - self.assertTrue(enc.startswith(s, a1, b1)) - # CPython would pass: - #self.assertTrue(enc.startswith(s, a, b)) - # Test slice with a mixed tuple as the argument - self.assertFalse(ref.startswith((u1, u, v1, v), a, b)) - self.assertTrue(ref.startswith((u1, s1, v1), a, b)) - self.assertTrue(ref.startswith((u1, u, s1, v1, v), a, b)) - self.assertFalse(enc.startswith((u1, v1, u, v), a1, b1)) - self.assertTrue(enc.startswith((u, s, v), a1, b1)) - self.assertTrue(enc.startswith((u1, u, s, v1, v), a1, b1)) - # CPython would pass: - #self.assertTrue(enc.startswith((u, s, v), a, b)) - #self.assertTrue(enc.startswith((u1, u, s, v1, v), a, b)) + with EncodingContext(cs): + for B in self.BYTE_TYPES: + #print B, + sb, ub, vb = B(s1), B(u1), B(v1) + # Test the assumption on which the test is based + self.assertEqual(ref[a:b], enc[a1:b1]) + # Test slice with single argument + self.assertFalse(ref.startswith(v, a, b)) + self.assertTrue(ref.startswith(sb, a, b)) + self.assertFalse(enc.startswith(vb, a1, b1)) + self.assertTrue(enc.startswith(s, a1, b1)) + # CPython would pass: + #self.assertTrue(enc.startswith(s, a, b)) + # Test slice with a mixed tuple as the argument + self.assertFalse(ref.startswith((ub, u, vb, v), a, b)) + self.assertTrue(ref.startswith((ub, sb, vb), a, b)) + 
self.assertTrue(ref.startswith((ub, u, sb, vb, v), a, b)) + self.assertFalse(enc.startswith((ub, vb, u, v), a1, b1)) + self.assertTrue(enc.startswith((u, s, v), a1, b1)) + self.assertTrue(enc.startswith((ub, u, s, vb, v), a1, b1)) + # CPython would pass: + #self.assertTrue(enc.startswith((u, s, v), a, b)) + #self.assertTrue(enc.startswith((ub, u, s, vb, v), a, b)) def test_strip(self): + cs = self.encoding ref = u"??????du bl???????" - sep = u'???'.encode(self.encoding) - with EncodingContext(self.encoding): + sep = u'???'.encode(cs) + with EncodingContext(cs): self.assertEqual(ref.strip(sep), u"du bl?") - class DefaultDecodingLatin1(DefaultDecodingTestCase): encoding = "latin-1" diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java --- a/src/org/python/core/PyUnicode.java +++ b/src/org/python/core/PyUnicode.java @@ -759,6 +759,62 @@ } } + @Override + public PyObject __lt__(PyObject other) { + return unicode___lt__(other); + } + + @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___lt___doc) + final PyObject unicode___lt__(PyObject other) { + String s = coerceForComparison(other); + if (s == null) { + return null; + } + return getString().compareTo(s) < 0 ? Py.True : Py.False; + } + + @Override + public PyObject __le__(PyObject other) { + return unicode___le__(other); + } + + @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___le___doc) + final PyObject unicode___le__(PyObject other) { + String s = coerceForComparison(other); + if (s == null) { + return null; + } + return getString().compareTo(s) <= 0 ? Py.True : Py.False; + } + + @Override + public PyObject __gt__(PyObject other) { + return unicode___gt__(other); + } + + @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___gt___doc) + final PyObject unicode___gt__(PyObject other) { + String s = coerceForComparison(other); + if (s == null) { + return null; + } + return getString().compareTo(s) > 0 ? 
Py.True : Py.False; + } + + @Override + public PyObject __ge__(PyObject other) { + return unicode___ge__(other); + } + + @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.unicode___ge___doc) + final PyObject unicode___ge__(PyObject other) { + String s = coerceForComparison(other); + if (s == null) { + return null; + } + return getString().compareTo(s) >= 0 ? Py.True : Py.False; + } + @ExposedMethod(doc = BuiltinDocs.unicode___hash___doc) final int unicode___hash__() { return str___hash__(); @@ -960,8 +1016,10 @@ * Interpret the object as a Java String for use in comparison. The return * represents characters as UTF-16. From a PyUnicode we return its internal string. * A str and buffer argument is decoded with the default encoding. - * Equivalent to {@link #coerceToStringOrNull(PyObject)} allowing only the types supported in - * (C)Python unicode.__eq__. + *

+ * This method could be replaced by {@link #coerceToStringOrNull(PyObject)} if we were content + * to allowing a wider range of types to be supported in comparison operations than (C)Python + * unicode.__eq__. * * @param o the object to coerce * @return an equivalent String @@ -1212,9 +1270,10 @@ * {@link #coerceToUnicode(PyObject, boolean)}. * * @param o the object to coerce + * @param name of method * @return an equivalent PyUnicode (or o itself, or null) */ - private static PyUnicode coerceStripSepToUnicode(PyObject o) { + private static PyUnicode coerceStripSepToUnicode(PyObject o, String name) { if (o == null) { return null; } else if (o instanceof PyUnicode) { @@ -1225,14 +1284,14 @@ } else if (o == Py.None) { return null; } else { - throw Py.TypeError("strip arg must be None, unicode or str"); + throw Py.TypeError(name + " arg must be None, unicode or str"); } } @ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode_strip_doc) final PyObject unicode_strip(PyObject sepObj) { - PyUnicode sep = coerceStripSepToUnicode(sepObj); + PyUnicode sep = coerceStripSepToUnicode(sepObj, "strip"); if (isBasicPlane()) { // this contains only basic plane characters @@ -1253,7 +1312,7 @@ @ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode_lstrip_doc) final PyObject unicode_lstrip(PyObject sepObj) { - PyUnicode sep = coerceStripSepToUnicode(sepObj); + PyUnicode sep = coerceStripSepToUnicode(sepObj, "lstrip"); if (isBasicPlane()) { // this contains only basic plane characters @@ -1273,7 +1332,7 @@ @ExposedMethod(defaults = "null", doc = BuiltinDocs.unicode_rstrip_doc) final PyObject unicode_rstrip(PyObject sepObj) { - PyUnicode sep = coerceStripSepToUnicode(sepObj); + PyUnicode sep = coerceStripSepToUnicode(sepObj, "rstrip"); if (isBasicPlane()) { // this contains only basic plane characters -- Repository URL: https://hg.python.org/jython From jython-checkins at python.org Tue Nov 21 17:39:12 2017 From: jython-checkins at python.org (jeff.allen) Date: Tue, 21 Nov 
2017 22:39:12 +0000 Subject: [Jython-checkins] =?utf-8?q?jython=3A_Accept_unicode_arguments_a?= =?utf-8?q?t_a_csv=2Ewriter_=28fixes_=232632=29=2E?= Message-ID: <20171121223910.73736.142DE4EBB7A31077@mg.python.org> https://hg.python.org/jython/rev/08978c4d1ab0 changeset: 8140:08978c4d1ab0 user: Jeff Allen date: Tue Nov 21 19:37:02 2017 +0000 summary: Accept unicode arguments at a csv.writer (fixes #2632). The CPython csv.writer accepts unicode strings and encodes them using the current default encoding. This is not documented, but we can easily reproduce the behaviour, which is relied on by some users. A simple test_csv_jy is added for UTF-8 default. We hide sys.setdefaultencoding again after use since this otherwise causes test_site to fail. The same fault is corrected, where it was latent in test_unicode_jy. files: Lib/test/test_csv_jy.py | 96 ++++++++++ Lib/test/test_unicode_jy.py | 8 +- src/org/python/modules/_csv/PyDialect.java | 33 +- src/org/python/modules/_csv/PyWriter.java | 48 ++-- 4 files changed, 145 insertions(+), 40 deletions(-) diff --git a/Lib/test/test_csv_jy.py b/Lib/test/test_csv_jy.py new file mode 100644 --- /dev/null +++ b/Lib/test/test_csv_jy.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2017 Jython Developers + +# Additional csv module unit tests for Jython + +import csv +import io +import sys +from tempfile import TemporaryFile +from test import test_support +import unittest + +# This test has been adapted from Python 3 test_csv.TestUnicode. In Python 3, +# the csv module supports Unicode directly. In Python 2, it does not, except +# that it is transparent to byte data. Many tools, however, accept UTF-8 +# encoded text in a CSV file. +# +class EncodingContext(object): + """Context manager to save and restore the encoding. 
+ + Use like this: + + with EncodingContext("utf-8"): + self.assertEqual("'caf\xc3\xa9'", u"'caf\xe9'") + """ + + def __init__(self, encoding): + if not hasattr(sys, "setdefaultencoding"): + reload(sys) + self.original_encoding = sys.getdefaultencoding() + sys.setdefaultencoding(encoding) + + def __enter__(self): + return self + + def __exit__(self, *ignore_exc): + sys.setdefaultencoding(self.original_encoding) + +class TestUnicode(unittest.TestCase): + + names = [u"Martin von Löwis", + u"Marc André Lemburg", + u"Guido van Rossum", + u"François Pinard", + u"????"] + + def test_decode_read(self): + # The user code receives byte data and takes care of the decoding + with TemporaryFile("w+b") as fileobj: + line = u",".join(self.names) + u"\r\n" + fileobj.write(line.encode('utf-8')) + fileobj.seek(0) + reader = csv.reader(fileobj) + # The reader yields rows of byte strings that decode to the data + table = [[e.decode('utf-8') for e in row] for row in reader] + self.assertEqual(table, [self.names]) + + def test_encode_write(self): + # The user encodes unicode objects to byte data that csv writes + with TemporaryFile("w+b") as fileobj: + writer = csv.writer(fileobj) + # We present a row of encoded strings to the writer + writer.writerow([n.encode('utf-8') for n in self.names]) + # We expect the file contents to be the UTF-8 of the csv data + expected = u",".join(self.names) + u"\r\n" + fileobj.seek(0) + self.assertEqual(fileobj.read().decode('utf-8'), expected) + + def test_unicode_write(self): + # The user supplies unicode data that csv.writer default-encodes + # (undocumented feature relied upon by client code).
+ # See Issue #2632 https://github.com/jythontools/jython/issues/90 + with TemporaryFile("w+b") as fileobj: + with EncodingContext('utf-8'): + writer = csv.writer(fileobj) + # We present a row of unicode strings to the writer + writer.writerow(self.names) + # We expect the file contents to be the UTF-8 of the csv data + expected = u",".join(self.names) + u"\r\n" + fileobj.seek(0) + self.assertEqual(fileobj.read().decode(), expected) + + +def test_main(): + # We'll be enabling sys.setdefaultencoding so remember to disable + had_set = hasattr(sys, "setdefaultencoding") + try: + test_support.run_unittest( + TestUnicode, + ) + finally: + if not had_set: + delattr(sys, "setdefaultencoding") + +if __name__ == "__main__": + test_main() diff --git a/Lib/test/test_unicode_jy.py b/Lib/test/test_unicode_jy.py --- a/Lib/test/test_unicode_jy.py +++ b/Lib/test/test_unicode_jy.py @@ -1341,7 +1341,10 @@ def test_main(): - test_support.run_unittest( + # We'll be enabling sys.setdefaultencoding so remember to disable + had_set = hasattr(sys, "setdefaultencoding") + try: + test_support.run_unittest( UnicodeTestCase, UnicodeIndexMixTest, UnicodeFormatTestCase, @@ -1353,6 +1356,9 @@ DefaultDecodingUTF8, DefaultDecodingCp850, ) + finally: + if not had_set: + delattr(sys, "setdefaultencoding") if __name__ == "__main__": diff --git a/src/org/python/modules/_csv/PyDialect.java b/src/org/python/modules/_csv/PyDialect.java --- a/src/org/python/modules/_csv/PyDialect.java +++ b/src/org/python/modules/_csv/PyDialect.java @@ -1,4 +1,4 @@ -/* Copyright (c) Jython Developers */ +/* Copyright (c)2017 Jython Developers */ package org.python.modules._csv; import org.python.core.ArgParser; @@ -9,6 +9,7 @@ import org.python.core.PyObject; import org.python.core.PyString; import org.python.core.PyType; +import org.python.core.PyUnicode; import org.python.core.Untraversable; import org.python.expose.ExposedDelete; import org.python.expose.ExposedGet; @@ -153,17 +154,21 @@ private static char 
toChar(String name, PyObject src, char dflt) { if (src == null) { return dflt; - } - boolean isStr = Py.isInstance(src, PyString.TYPE); - if (src == Py.None || isStr && src.__len__() == 0) { + } else if (src == Py.None) { return '\0'; - } else if (!isStr || src.__len__() != 1) { - throw Py.TypeError(String.format("\"%s\" must be an 1-character string", name)); + } else if (src instanceof PyString) { + String s = (src instanceof PyUnicode) ? ((PyUnicode) src).encode() : src.toString(); + if (s.length() == 0) { + return '\0'; + } else if (s.length() == 1) { + return s.charAt(0); + } } - return src.toString().charAt(0); + // This is only going to work for BMP strings because of the char return type + throw Py.TypeError(String.format("\"%s\" must be a 1-character string", name)); } - private static int toInt(String name, PyObject src, int dflt) { + private static int toInt(String name, PyObject src, int dflt) { if (src == null) { return dflt; } @@ -176,14 +181,14 @@ private static String toStr(String name, PyObject src, String dflt) { if (src == null) { return dflt; - } - if (src == Py.None) { + } else if (src == Py.None) { return null; + } else if (src instanceof PyUnicode) { + return ((PyUnicode) src).encode().toString(); + } else if (src instanceof PyString) { + return src.toString(); } - if (!(src instanceof PyBaseString)) { - throw Py.TypeError(String.format("\"%s\" must be an string", name)); - } - return src.toString(); + throw Py.TypeError(String.format("\"%s\" must be a string", name)); } @ExposedGet(name = "escapechar") diff --git a/src/org/python/modules/_csv/PyWriter.java b/src/org/python/modules/_csv/PyWriter.java --- a/src/org/python/modules/_csv/PyWriter.java +++ b/src/org/python/modules/_csv/PyWriter.java @@ -1,4 +1,4 @@ -/* Copyright (c) Jython Developers */ +/* Copyright (c)2017 Jython Developers */ package org.python.modules._csv; import org.python.core.Py; @@ -7,6 +7,7 @@ import org.python.core.PyObject; import org.python.core.PyString; import 
org.python.core.PyType; +import org.python.core.PyUnicode; import org.python.core.Traverseproc; import org.python.core.Visitproc; import org.python.expose.ExposedType; @@ -21,11 +22,9 @@ @ExposedType(name = "_csv.writer", doc = PyWriter.writer_doc) public class PyWriter extends PyObject implements Traverseproc { - public static final String writer_doc = - "CSV writer\n" + - "\n" + - "Writer objects are responsible for generating tabular data\n" + - "in CSV format from sequence input.\n"; + public static final String writer_doc = "CSV writer\n\n"// + + "Writer objects are responsible for generating tabular data\n" + + "in CSV format from sequence input.\n"; public static final PyType TYPE = PyType.fromClass(PyWriter.class); @@ -53,11 +52,10 @@ this.dialect = dialect; } - public static PyString __doc__writerows = Py.newString( - "writerows(sequence of sequences)\n" + - "\n" + - "Construct and write a series of sequences to a csv file. Non-string\n" + - "elements will be converted to string."); + public static PyString __doc__writerows = Py.newString(// + "writerows(sequence of sequences)\n\n" + + "Construct and write a series of sequences to a csv file. Non-string\n" + + "elements will be converted to string."); public void writerows(PyObject seqseq) { writer_writerows(seqseq); @@ -82,12 +80,10 @@ } } - public static PyString __doc__writerow = Py.newString( - "writerow(sequence)\n" + - "\n" + - "Construct and write a CSV record from a sequence of fields. Non-string\n" + - "elements will be converted to string." - ); + public static PyString __doc__writerow = Py.newString(// + "writerow(sequence)\n\n" + + "Construct and write a CSV record from a sequence of fields. Non-string\n" + + "elements will be converted to string."); public boolean writerow(PyObject seq) { return writer_writerow(seq); @@ -134,14 +130,17 @@ quoted = false; } - if (field instanceof PyString) { + if (field instanceof PyUnicode) { + // Unicode fields get the default encoding (must yield U16 bytes). 
+ append_ok = join_append(((PyString) field).encode(), len == 1); + } else if (field instanceof PyString) { + // Not unicode, so must be U16 bytes. append_ok = join_append(field.toString(), len == 1); } else if (field == Py.None) { append_ok = join_append("", len == 1); } else { PyObject str; - //XXX: in 3.x this check can go away and we can just always use - // __str__ + // XXX: in 3.x this check can go away and we can just always use __str__ if (field.getClass() == PyFloat.class) { str = field.__repr__(); } else { @@ -195,9 +194,9 @@ } /** - * This method behaves differently depending on the value of copy_phase: if copy_phase - * is false, then the method determines the new record length. If copy_phase is true - * then the new field is appended to the record. + * This method behaves differently depending on the value of copy_phase: if copy_phase is false, + * then the method determines the new record length. If copy_phase is true then the new field is + * appended to the record. */ private int join_append_data(String field, boolean quote_empty, boolean copy_phase) { int i; @@ -225,7 +224,7 @@ break; } if (c == dialect.delimiter || c == dialect.escapechar || c == dialect.quotechar - || dialect.lineterminator.indexOf(c) > -1) { + || dialect.lineterminator.indexOf(c) > -1) { if (dialect.quoting == QuoteStyle.QUOTE_NONE) { want_escape = true; } else { @@ -282,7 +281,6 @@ rec_len++; } - /* Traverseproc implementation */ @Override public int traverse(Visitproc visit, Object arg) { -- Repository URL: https://hg.python.org/jython From jython-checkins at python.org Thu Nov 23 13:12:38 2017 From: jython-checkins at python.org (jeff.allen) Date: Thu, 23 Nov 2017 18:12:38 +0000 Subject: [Jython-checkins] =?utf-8?q?jython=3A_Added_tag_v2=2E7=2E2a1_for?= =?utf-8?q?_changeset_dfc49bafbe79?= Message-ID: <20171123181156.66881.68987DC9FA21C87F@mg.python.org> https://hg.python.org/jython/rev/fb0952d97b20 changeset: 8142:fb0952d97b20 user: Jeff Allen date: Thu Nov 23 18:10:51 2017 
+0000 summary: Added tag v2.7.2a1 for changeset dfc49bafbe79 files: .hgtags | 1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/.hgtags b/.hgtags --- a/.hgtags +++ b/.hgtags @@ -109,3 +109,4 @@ a5a06c9efdb6dd361d5f5c5c1ef07c2ac802e2e0 v2.7.1rc3 b6e989b788d563b8ecb0c0458ab486fca8d128d6 v2.7.1rc3 dd7e191d4c90d9f5d5fe8f0840f186697ecf272a v2.7.1 +dfc49bafbe79566bd54c8d417829e001ff2316ea v2.7.2a1 -- Repository URL: https://hg.python.org/jython From jython-checkins at python.org Thu Nov 23 13:12:50 2017 From: jython-checkins at python.org (jeff.allen) Date: Thu, 23 Nov 2017 18:12:50 +0000 Subject: [Jython-checkins] =?utf-8?q?jython=3A_Update_version_reported_to?= =?utf-8?q?_2=2E7=2E2a1?= Message-ID: <20171123181156.66542.11914A562FC65202@mg.python.org> https://hg.python.org/jython/rev/dfc49bafbe79 changeset: 8141:dfc49bafbe79 tag: v2.7.2a1 user: Jeff Allen date: Thu Nov 23 18:08:23 2017 +0000 summary: Update version reported to 2.7.2a1 files: NEWS | 5 ++++- README.txt | 19 +++++++------------ build.xml | 10 +++++----- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/NEWS b/NEWS --- a/NEWS +++ b/NEWS @@ -2,8 +2,11 @@ For more details, please see https://hg.python.org/jython -Development tip +Jython 2.7.2a1 Bugs fixed + - [ 2632 ] Handle unicode data appropriately in csv module + - [ 2638 ] str not default-decoded in str-unicode operations + - [ 2622 ] json dumps error (use of AbstractDict) - [ 2607, 2620 ] Error loading Python DLL (error code 14001) - [ 2612 ] NPE while trying to load class - [ 2609 ] PyType.fromClass publication race (discovered in strptime and re) diff --git a/README.txt b/README.txt --- a/README.txt +++ b/README.txt @@ -1,8 +1,8 @@ Jython: Python for the Java Platform -Welcome to Jython 2.7.1! +Welcome to Jython 2.7.2a1. -This is the final release of the 2.7.1 version of Jython. Along with +This is an alpha release of the 2.7.2 version of Jython. 
Along with language and runtime compatibility with CPython 2.7, Jython 2.7 provides substantial support of the Python ecosystem. This includes built-in support of pip/setuptools (you can use with bin/pip) and a @@ -31,13 +31,8 @@ See ACKNOWLEDGMENTS for details about Jython's copyright, license, contributors, and mailing lists; and NEWS for detailed release notes, -including bugs fixed, backwards breaking changes, and new -features. Thanks go to Google for sponsoring Stefan Richthofer for the -Google Summer of Code; there are so many others to thank, but Stefan's -work proved instrumental for getting 2.7.1 out, all in preparation for -his actual work on JyNI for the summer of 2017 -(http://jyni.org/). Motivation helps! We also deeply thank all who -contribute to Jython, including - but not limited to - bug reports, -patches, pull requests, documentation changes, support emails, and -fantastic conversation on Freenode at #jython. Join us there for your -questions and answers! +including bugs fixed, backwards breaking changes, and new features. We +sincerely thank all who contribute to Jython, including - but not +limited to - bug reports, patches, pull requests, documentation +changes, support emails, and fantastic conversation on Freenode at +#jython. Join us there for your questions and answers! 
diff --git a/build.xml b/build.xml --- a/build.xml +++ b/build.xml @@ -84,15 +84,15 @@ - - + + - - + + - + -- Repository URL: https://hg.python.org/jython From jython-checkins at python.org Thu Nov 23 13:32:22 2017 From: jython-checkins at python.org (jeff.allen) Date: Thu, 23 Nov 2017 18:32:22 +0000 Subject: [Jython-checkins] =?utf-8?q?jython=3A_Identify_as_2=2E7=2E2a1+?= Message-ID: <20171123183217.45101.14C71771D63386D7@mg.python.org> https://hg.python.org/jython/rev/dd42e3dc8b05 changeset: 8143:dd42e3dc8b05 user: Jeff Allen date: Thu Nov 23 18:31:14 2017 +0000 summary: Identify as 2.7.2a1+ I *think* this is what we do to identify that the current (dev) version is 2.7.2a1 plus some changes, not the one released as 2.7.2a1. files: build.xml | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/build.xml b/build.xml --- a/build.xml +++ b/build.xml @@ -84,7 +84,7 @@ - + -- Repository URL: https://hg.python.org/jython From jython-checkins at python.org Sun Nov 26 03:07:59 2017 From: jython-checkins at python.org (jeff.allen) Date: Sun, 26 Nov 2017 08:07:59 +0000 Subject: [Jython-checkins] =?utf-8?q?jython=3A_Add_a_test_for_PyShadowStr?= =?utf-8?q?ing_matching?= Message-ID: <20171126080659.74384.148FBACAF27D897E@mg.python.org> https://hg.python.org/jython/rev/30a70b0ac355 changeset: 8144:30a70b0ac355 user: Jeff Allen date: Sun Nov 26 07:16:26 2017 +0000 summary: Add a test for PyShadowString matching files: Lib/test/test_shadowstr_jy.py | 120 ++++++++++++++++++++++ 1 files changed, 120 insertions(+), 0 deletions(-) diff --git a/Lib/test/test_shadowstr_jy.py b/Lib/test/test_shadowstr_jy.py new file mode 100644 --- /dev/null +++ b/Lib/test/test_shadowstr_jy.py @@ -0,0 +1,120 @@ +# Made for Jython + +# Tests of built-in type shadowstr + +import os +import sys +from test import string_tests +from test.test_support import run_unittest, is_jython +from test.test_str import StrTest +import unittest + +from org.python.core import PyShadowString + +# Ideally we 
would test shadowstr is a str but the tests need to sub-class it +# +# class StrTestCase( # Should pass all tests for str +# string_tests.CommonTest, +# string_tests.MixinStrUnicodeUserStringTest, +# string_tests.MixinStrUserStringTest, +# string_tests.MixinStrUnicodeTest, +# ): +# +# type2test = PyShadowString + + +class ShadowStrTestCase(unittest.TestCase): + + def setUp(self): + self.ss = PyShadowString("hello", "bonjour") + + def check_first_eq(self): + self.assertTrue(self.ss == "hello") + self.assertFalse(self.ss == "bonjour") + + def check_both_eq(self): + self.assertTrue(self.ss == "hello") + self.assertTrue(self.ss == "bonjour") + + def test_eq(self): + # Test recognition unconditionally + self.check_first_eq() + self.ss.addtarget(None) # match any + self.check_both_eq() + + def test_eq_class(self): + # Test recognition of class context only + self.check_first_eq() + # The Java class of a python module may be $py + self.ss.addtarget(r"test\.test_shadowstr_jy\$py") # class only + # Or it may be org.python.pycode._pyx + self.ss.addtarget(r"org\.python\.pycode\._pyx\d+") # class only + self.check_both_eq() + + def test_eq_method(self): + # Test recognition of method context only + self.check_first_eq() + # The Java method name of a python function is name$ + self.ss.addtarget(None, r"test_eq_method\$\d+") # method only + self.check_both_eq() + + def test_eq_class_method(self): + # Test recognition of class and method context + self.check_first_eq() + # Match this method in this module + self.ss.addtarget(r"test\.test_shadowstr_jy\$py", # class + r"test_eq_class_method\$\d+") # method + self.ss.addtarget(r"org\.python\.pycode\._pyx\d+", # class + r"test_eq_class_method\$\d+") # method + self.check_both_eq() + + def check_first_startswith(self): + self.assertTrue(self.ss.startswith("hel")) + self.assertFalse(self.ss.startswith("bon")) + + def check_both_startswith(self): + self.assertTrue(self.ss.startswith("hel")) + self.assertTrue(self.ss.startswith("bon")) + 
+ def test_startswith(self): + # Test recognition unconditionally + self.check_first_startswith() + self.ss.addtarget(None) # match any + self.check_both_startswith() + + def test_startswith_class(self): + # Test recognition of class context only + self.check_first_startswith() + # The Java class of a python module may be $py + self.ss.addtarget(r"test\.test_shadowstr_jy\$py") # class only + # Or it may be org.python.pycode._pyx + self.ss.addtarget(r"org\.python\.pycode\._pyx\d+") # class only + self.check_both_startswith() + + def test_startswith_method(self): + # Test recognition of method context only + self.check_first_startswith() + # The Java method name of a python function is name$ + self.ss.addtarget(None, r"test_startswith_method\$\d+") # method only + self.check_both_startswith() + + def test_startswith_class_method(self): + # Test recognition of class and method context + self.check_first_startswith() + # Match this method in this module + self.ss.addtarget(r"test\.test_shadowstr_jy\$py", # class + r"test_startswith_class_method\$\d+") # method + self.ss.addtarget(r"org\.python\.pycode\._pyx\d+", # class + r"test_startswith_class_method\$\d+") # method + self.check_both_startswith() + + +def test_main(): + run_unittest( + #StrTestCase, + ShadowStrTestCase, + ) + + +if __name__ == "__main__": + test_main() -- Repository URL: https://hg.python.org/jython From jython-checkins at python.org Sun Nov 26 13:10:07 2017 From: jython-checkins at python.org (jeff.allen) Date: Sun, 26 Nov 2017 18:10:07 +0000 Subject: [Jython-checkins] =?utf-8?q?jython=3A_Correct_use_of_encoding_in?= =?utf-8?q?_test=5Fos=5Fjy_test_of_getcwd=2E_Fixes_=232646=2E?= Message-ID: <20171126181006.66292.F4BA18626E486DC2@mg.python.org> https://hg.python.org/jython/rev/320911f1aeba changeset: 8145:320911f1aeba user: Jeff Allen date: Sun Nov 26 17:19:57 2017 +0000 summary: Correct use of encoding in test_os_jy test of getcwd. Fixes #2646. 
All information is in the FS encoding, so decode() was spurious, and only passed because of a false conception of == between str and unicode, corrected in #2630. We re-enable the test for Windows, as it keeps catching us out, add a similar test for getcwdu, and beef up comments. files: Lib/test/regrtest.py | 2 +- Lib/test/test_os_jy.py | 26 ++++++++++++++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/Lib/test/regrtest.py b/Lib/test/regrtest.py --- a/Lib/test/regrtest.py +++ b/Lib/test/regrtest.py @@ -1370,7 +1370,7 @@ 'java.nt': # Expected to fail on Windows """ test_mailbox # fails miserably and ruins other tests - test_os_jy # Locale tests fail on Cygwin (but not Windows) + # test_os_jy # Locale tests fail on Cygwin (but not Windows) # test_popen # Passes, but see http://bugs.python.org/issue1559298 test_select_new # Hangs (Windows), though ok run singly test_urllib2 # file not on local host (likely Windows only) diff --git a/Lib/test/test_os_jy.py b/Lib/test/test_os_jy.py --- a/Lib/test/test_os_jy.py +++ b/Lib/test/test_os_jy.py @@ -231,16 +231,30 @@ 'sys.stdout.write(os.getenv("TEST_HOME"))'], stdout=subprocess.PIPE, env=newenv) - # Decode with default encoding utf-8 (because ... ?) + # Decode with FS encoding used by subprocess communication self.assertEqual(p.stdout.read().decode('utf-8'), expected) def test_getcwd(self): with test_support.temp_cwd(name=u"tempcwd-中文") as temp_cwd: - p = subprocess.Popen([sys.executable, "-c", - 'import sys,os;' \ - 'sys.stdout.write(os.getcwd().encode("utf-8"))'], - stdout=subprocess.PIPE) - self.assertEqual(p.stdout.read().decode("utf-8"), temp_cwd) + # os.getcwd reports the working directory as an FS-encoded str, + # which is also the encoding used in subprocess communication. 
+ p = subprocess.Popen([ + sys.executable, "-c", + 'import sys,os;' \ + 'sys.stdout.write(os.getcwd())'], + stdout=subprocess.PIPE) + self.assertEqual(p.stdout.read(), temp_cwd) + + def test_getcwdu(self): + with test_support.temp_cwd(name=u"tempcwd-中文") as temp_cwd: + # os.getcwdu reports the working directory as unicode, + # which must be encoded for subprocess communication. + p = subprocess.Popen([ + sys.executable, "-c", + 'import sys,os;' \ + 'sys.stdout.write(os.getcwdu().encode(sys.getfilesystemencoding()))'], + stdout=subprocess.PIPE) + self.assertEqual(p.stdout.read(), temp_cwd) def test_listdir(self): # It is hard to avoid Unicode paths on systems like OS X. Use relative -- Repository URL: https://hg.python.org/jython