[Jython-checkins] jython: Comment the %-formatting mechanism in PyString.java.
jeff.allen
jython-checkins at python.org
Mon May 19 00:49:34 CEST 2014
http://hg.python.org/jython/rev/cfecb7862f21
changeset: 7258:cfecb7862f21
user: Jeff Allen <ja.py at farowl.co.uk>
date: Sun May 04 11:37:55 2014 +0100
summary:
Comment the %-formatting mechanism in PyString.java.
No significant change to code. This is to aid comprehension of intended changes.
files:
src/org/python/core/PyString.java | 256 ++++++++++++++++-
1 files changed, 233 insertions(+), 23 deletions(-)
diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java
--- a/src/org/python/core/PyString.java
+++ b/src/org/python/core/PyString.java
@@ -1296,8 +1296,8 @@
* @param stripChars characters to strip from the left end of this str/bytes, or null
* @return a new String, stripped of the specified characters/bytes
*/
- public String lstrip(String sep) {
- return _lstrip(sep);
+ public String lstrip(String stripChars) {
+ return _lstrip(stripChars);
}
/**
@@ -1310,8 +1310,8 @@
* @return a new <code>PyString</code> (or {@link PyUnicode}), stripped of the specified
* characters/bytes
*/
- public PyObject lstrip(PyObject sep) {
- return str_lstrip(sep);
+ public PyObject lstrip(PyObject stripChars) {
+ return str_lstrip(stripChars);
}
@ExposedMethod(defaults = "null", doc = BuiltinDocs.str_lstrip_doc)
@@ -1385,8 +1385,8 @@
* @param stripChars characters to strip from either end of this str/bytes, or null
* @return a new String, stripped of the specified characters/bytes
*/
- public String rstrip(String sep) {
- return _rstrip(sep);
+ public String rstrip(String stripChars) {
+ return _rstrip(stripChars);
}
/**
@@ -1399,8 +1399,8 @@
* @return a new <code>PyString</code> (or {@link PyUnicode}), stripped of the specified
* characters/bytes
*/
- public PyObject rstrip(PyObject sep) {
- return str_rstrip(sep);
+ public PyObject rstrip(PyObject stripChars) {
+ return str_rstrip(stripChars);
}
@ExposedMethod(defaults = "null", doc = BuiltinDocs.str_rstrip_doc)
@@ -1546,7 +1546,7 @@
* the last element of the list contains the what is left over after the last split.
* <p>
* Implementation note: although a str contains only bytes, this method is also called by
- * {@link PyUnicode#unicode_split(PyObject)}.
+ * {@link PyUnicode#unicode_split(PyObject, int)}.
*
* @param sep string to use as separator (or <code>null</code> if to split on whitespace)
* @param maxsplit maximum number of splits to make (there may be <code>maxsplit+1</code>
@@ -1798,7 +1798,7 @@
* left over after the last split.
* <p>
* Implementation note: although a str contains only bytes, this method is also called by
- * {@link PyUnicode#unicode_rsplit(PyObject)} .
+ * {@link PyUnicode#unicode_rsplit(PyObject, int)} .
*
* @param sep string to use as separator (or <code>null</code> if to split on whitespace)
* @param maxsplit maximum number of splits to make (there may be <code>maxsplit+1</code>
@@ -2931,11 +2931,10 @@
*
* @param oldPiece to replace where found.
* @param newPiece replacement text.
- * @param count maximum number of replacements to make, or -1 meaning all of them.
* @return PyString (or PyUnicode if any string is one), this string after replacements.
*/
- public PyString replace(PyObject oldPieceObj, PyObject newPieceObj) {
- return str_replace(oldPieceObj, newPieceObj, -1);
+ public PyString replace(PyObject oldPiece, PyObject newPiece) {
+ return str_replace(oldPiece, newPiece, -1);
}
/**
@@ -2949,8 +2948,8 @@
* @param count maximum number of replacements to make, or -1 meaning all of them.
* @return PyString (or PyUnicode if any string is one), this string after replacements.
*/
- public PyString replace(PyObject oldPieceObj, PyObject newPieceObj, int count) {
- return str_replace(oldPieceObj, newPieceObj, count);
+ public PyString replace(PyObject oldPiece, PyObject newPiece, int count) {
+ return str_replace(oldPiece, newPiece, count);
}
@ExposedMethod(defaults = "-1", doc = BuiltinDocs.str_replace_doc)
@@ -3161,8 +3160,8 @@
* @return <code>true</code> if this string slice starts with a specified prefix, otherwise
* <code>false</code>.
*/
- public boolean startswith(PyObject prefix, PyObject offset) {
- return str_startswith(prefix, offset, null);
+ public boolean startswith(PyObject prefix, PyObject start) {
+ return str_startswith(prefix, start, null);
}
/**
@@ -4002,13 +4001,24 @@
*/
final class StringFormatter {
+ /** Index into {@link #format} being interpreted. */
int index;
+ /** Format being interpreted. */
String format;
+ /** Where the output is built. */
StringBuilder buffer;
+ /** Remembers that the value currently converted is negative */
boolean negative;
+ /** Precision from format specification. */
int precision;
+ /**
+ * Index into args of argument currently being worked, or special values indicating -1: a single
+ * item that has not yet been used, -2: a single item that has already been used, -3: a mapping.
+ */
int argIndex;
+ /** Arguments supplied to {@link #format(PyObject)} method. */
PyObject args;
+ /** Indicate a <code>PyUnicode</code> result is expected. */
boolean unicodeCoercion;
final char pop() {
@@ -4027,6 +4037,11 @@
index--;
}
+ /**
+ * Initialise the interpreter with the given format string, ready for {@link #format(PyObject)}.
+ *
+ * @param format string to interpret
+ */
public StringFormatter(String format) {
this(format, false);
}
@@ -4044,6 +4059,10 @@
buffer = new StringBuilder(format.length() + 100);
}
+ /**
+ * Read the next object from the argument list, taking special values of <code>argIndex</code>
+ * into account.
+ */
PyObject getarg() {
PyObject ret = null;
switch (argIndex) {
@@ -4064,6 +4083,10 @@
return ret;
}
+ /**
+ * Parse a number from the format, except if the next thing is "*", read it from the argument
+ * list.
+ */
int getNumber() {
char c = pop();
if (c == '*') {
@@ -4093,7 +4116,24 @@
}
+ /**
+ * Format the argument interpreted as a long, using the argument's <code>__str__</code>,
+ * <code>__oct__</code>, or <code>__hex__</code> method according to <code>type</code>. If v is
+ * being treated as signed, the sign of v is transferred to {@link #negative} and the absolute
+ * value is converted. The <code>altFlag</code> argument controls the appearance of a "0x" or
+ * "0X" prefix in the hex case, or a "0" prefix in the octal case. The hexadecimal case, the
+ * case of characters and digits will match the type ('x' meaning lowercase, 'X' meaning
+ * uppercase).
+ *
+ * @param arg to convert
+ * @param type one of 'o' for octal, 'x' or 'X' for hex, anything else calls
+ * <code>arg.__str__</code>.
+ * @param altFlag if true there will be a prefix
+ * @return converted value as <code>String</code>
+ */
private String formatLong(PyObject arg, char type, boolean altFlag) {
+ // Convert using the appropriate type
+ // XXX Results in behaviour divergent from CPython when any of the methods is overridden.
PyString argAsString;
switch (type) {
case 'o':
@@ -4107,29 +4147,37 @@
argAsString = arg.__str__();
break;
}
+
checkPrecision("long");
String s = argAsString.toString();
int end = s.length();
int ptr = 0;
+ // In the hex case, the __hex__ return starts 0x
+ // XXX (we assume, perhaps falsely)
int numnondigits = 0;
if (type == 'x' || type == 'X') {
numnondigits = 2;
}
+ // Strip a "long" indicator
if (s.endsWith("L")) {
end--;
}
+ // Strip a possible sign to member negative
negative = s.charAt(0) == '-';
if (negative) {
ptr++;
}
+ // The formatted number is s[ptr:end] and starts with numnondigits non-digits.
int numdigits = end - numnondigits - ptr;
if (!altFlag) {
+ // We should have no "base tag" '0' or "0x" on the front.
switch (type) {
case 'o':
+ // Strip the '0'
if (numdigits > 1) {
++ptr;
--numdigits;
@@ -4137,27 +4185,36 @@
break;
case 'x':
case 'X':
+ // Strip the "0x"
ptr += 2;
numnondigits -= 2;
break;
}
}
+
+ // If necessary, add leading zeros to the numerical digits part.
if (precision > numdigits) {
+ // Recompose the formatted number in this buffer
StringBuilder buf = new StringBuilder();
+ // The base indicator prefix
for (int i = 0; i < numnondigits; ++i) {
buf.append(s.charAt(ptr++));
}
+ // The extra zeros
for (int i = 0; i < precision - numdigits; i++) {
buf.append('0');
}
+ // The previously known digits
for (int i = 0; i < numdigits; i++) {
buf.append(s.charAt(ptr++));
}
s = buf.toString();
} else if (end < s.length() || ptr > 0) {
+ // It's only necessary to extract the formatted number from s
s = s.substring(ptr, end);
}
+ // And finally, deal with the case, so it matches x or X.
switch (type) {
case 'X':
s = s.toUpperCase();
@@ -4167,10 +4224,16 @@
}
/**
- * Formats arg as an integer, with the specified radix
+ * Formats arg as an integer, with the specified radix. The integer value is obtained from the
+ * result of <code>arg.__int__()</code>. <code>type</code> and <code>altFlag</code> are passed
+ * to {@link #formatLong(PyObject, char, boolean)} in case the result is a PyLong.
*
- * type and altFlag are needed to be passed to {@link #formatLong(PyObject, char, boolean)} in
- * case the result of <code>arg.__int__()</code> is a PyLong.
+ * @param arg to convert
+ * @param radix in which to express <code>arg</code>
+ * @param unsigned true if required to interpret a 32-bit integer as unsigned ('u' legacy?).
+ * @param type of conversion ('d', 'o', 'x', or 'X')
+ * @param altFlag '#' present in format (causes "0x" prefix in hex, and '0' prefix in octal)
+ * @return string form of the value
*/
private String formatInteger(PyObject arg, int radix, boolean unsigned, char type,
boolean altFlag) {
@@ -4202,25 +4265,44 @@
}
}
if (argAsInt instanceof PyInteger) {
+ // This call does not provide the prefix and will be lowercase.
return formatInteger(((PyInteger)argAsInt).getValue(), radix, unsigned);
} else { // must be a PyLong (as per __int__ contract)
+ // This call provides the base prefix and case-matches with 'x' or 'X'.
return formatLong(argAsInt, type, altFlag);
}
}
+ /**
+ * Convert a 32-bit integer (as from a {@link PyInteger}) to characters, signed or unsigned. The
+ * values is presented in a <code>long</code>. The string result is left-padded with zeros to
+ * the stated {@link #precision}. If v is being treated as signed, the sign of v is transferred
+ * to {@link #negative} and the absolute value is converted. Otherwise (unsigned case)
+ * <code>0x100000000L + v</code> is converted. This method does not provide the '0' or "0x"
+ * prefix, just the padded digit string.
+ *
+ * @param v value to convert
+ * @param radix of conversion
+ * @param unsigned if should be treated as unsigned
+ * @return string form
+ */
private String formatInteger(long v, int radix, boolean unsigned) {
checkPrecision("integer");
if (unsigned) {
+ // If the high bit was set, this will have been sign-extended: correct that.
if (v < 0) {
v = 0x100000000l + v;
}
} else {
+ // If the high bit was set, the sign extension was correct, but we need sign + abs(v).
if (v < 0) {
negative = true;
v = -v;
}
}
+ // Use the method in java.lang.Long (lowercase, no prefix)
String s = Long.toString(v, radix);
+ // But zero pad to the requested precision
while (s.length() < precision) {
s = "0" + s;
}
@@ -4308,15 +4390,23 @@
return buf.toString();
}
+ /**
+ * Main service of this class: format one or more arguments with the format string supplied at
+ * construction.
+ *
+ * @param args tuple or map containing objects, or a single object, to convert
+ * @return result of formatting
+ */
@SuppressWarnings("fallthrough")
public PyString format(PyObject args) {
PyObject dict = null;
this.args = args;
boolean needUnicode = unicodeCoercion;
if (args instanceof PyTuple) {
+ // We will simply work through the tuple elements
argIndex = 0;
} else {
- // special index indicating a single item rather than a tuple
+ // Not a tuple, but possibly still some kind of container: use special argIndex values.
argIndex = -1;
if (args instanceof PyDictionary || args instanceof PyStringMap
|| (!(args instanceof PySequence) && args.__findattr__("__getitem__") != null)) {
@@ -4326,6 +4416,8 @@
}
while (index < format.length()) {
+
+ // Attributes to be parsed from the next format specifier
boolean ljustFlag = false;
boolean signFlag = false;
boolean blankFlag = false;
@@ -4335,16 +4427,31 @@
int width = -1;
precision = -1;
+ // Read one character from the format string
char c = pop();
if (c != '%') {
buffer.append(c);
continue;
}
+
+ // It's a %, so the beginning of a conversion specifier. Parse it.
+
+ // A conversion specifier contains the following components, in this order:
+ // + The '%' character, which marks the start of the specifier.
+ // + Mapping key (optional), consisting of a parenthesised sequence of characters.
+ // + Conversion flags (optional), which affect the result of some conversion types.
+ // + Minimum field width (optional), or an '*' (asterisk).
+ // + Precision (optional), given as a '.' (dot) followed by the precision or '*'.
+ // + Length modifier (optional).
+ // + Conversion type.
+
c = pop();
if (c == '(') {
+ // Mapping key, consisting of a parenthesised sequence of characters.
if (dict == null) {
throw Py.TypeError("format requires a mapping");
}
+ // Scan along until a matching close parenthesis is found
int parens = 1;
int keyStart = index;
while (parens > 0) {
@@ -4355,11 +4462,16 @@
parens++;
}
}
+ // Last c=pop() is the closing ')' while indexKey is just after the opening '('
String tmp = format.substring(keyStart, index - 1);
+ // Look it up using this extent as the (right type of) key.
this.args = dict.__getitem__(needUnicode ? new PyUnicode(tmp) : new PyString(tmp));
} else {
+ // Not a mapping key: next clause will re-read c.
push();
}
+
+ // Conversion flags (optional) that affect the result of some conversion types.
while (true) {
switch (c = pop()) {
case '-':
@@ -4380,43 +4492,77 @@
}
break;
}
+ // Push back c as next clause will re-read c.
push();
+
+ /*
+ * Minimum field width (optional). If specified as an '*' (asterisk), the actual width
+ * is read from the next element of the tuple in values, and the object to convert comes
+ * after the minimum field width and optional precision. A custom getNumber() takes care
+ * of the '*' case.
+ */
width = getNumber();
if (width < 0) {
width = -width;
ljustFlag = true;
}
+
+ /*
+ * Precision (optional), given as a '.' (dot) followed by the precision. If specified as
+ * '*' (an asterisk), the actual precision is read from the next element of the tuple in
+ * values, and the value to convert comes after the precision. A custom getNumber()
+ * takes care of the '*' case.
+ */
c = pop();
if (c == '.') {
precision = getNumber();
if (precision < -1) {
precision = 0;
}
-
c = pop();
}
+
+ // Length modifier (optional). (Compatibility feature?) It has no effect.
if (c == 'h' || c == 'l' || c == 'L') {
c = pop();
}
+
+ // c is now the conversion type.
if (c == '%') {
+ // It was just a percent sign after all
buffer.append(c);
continue;
}
+
+ /*
+ * Process argument according to format specification decoded from the string. It is
+ * important we don't read the argumnent from the list until this point because of the
+ * possibility that width and precision were specified via the argument list.
+ */
PyObject arg = getarg();
- char fill = ' ';
String string = null;
negative = false;
+
+ // Independent of type, decide the padding character based on decoded flags.
+ char fill = ' ';
if (zeroFlag) {
fill = '0';
} else {
fill = ' ';
}
+
+ // Perform the type-specific formatting
switch (c) {
+
case 's':
+ // String (converts any Python object using str()).
if (arg instanceof PyUnicode) {
needUnicode = true;
}
+ // fall through ...
+
case 'r':
+ // String (converts any Python object using repr()).
fill = ' ';
if (c == 's') {
if (needUnicode) {
@@ -4432,15 +4578,19 @@
}
break;
+
case 'i':
case 'd':
+ // Signed integer decimal. Note floats accepted.
if (arg instanceof PyLong) {
string = formatLong(arg, c, altFlag);
} else {
string = formatInteger(arg, 10, false, c, altFlag);
}
break;
+
case 'u':
+ // Obsolete type – it is identical to 'd'. (Why not identical here?)
if (arg instanceof PyLong) {
string = formatLong(arg, c, altFlag);
} else if (arg instanceof PyInteger || arg instanceof PyFloat) {
@@ -4449,10 +4599,15 @@
throw Py.TypeError("int argument required");
}
break;
+
case 'o':
+ // Signed octal value. Note floats accepted.
if (arg instanceof PyLong) {
+ // This call provides the base prefix '0' if altFlag.
string = formatLong(arg, c, altFlag);
} else if (arg instanceof PyInteger || arg instanceof PyFloat) {
+ // This call does not provide the '0' prefix and will be lowercase ...
+ // ... except where arg.__int__ returns PyLong, then it's like formatLong.
string = formatInteger(arg, 8, false, c, altFlag);
if (altFlag && string.charAt(0) != '0') {
string = "0" + string;
@@ -4461,10 +4616,15 @@
throw Py.TypeError("int argument required");
}
break;
+
case 'x':
+ // Signed hexadecimal (lowercase). Note floats accepted.
if (arg instanceof PyLong) {
+ // This call provides the base prefix "0x" if altFlag and case-matches c.
string = formatLong(arg, c, altFlag);
} else if (arg instanceof PyInteger || arg instanceof PyFloat) {
+ // This call does not provide the "0x" prefix and will be lowercase.
+ // ... except where arg.__int__ returns PyLong, then it's like formatLong.
string = formatInteger(arg, 16, false, c, altFlag);
string = string.toLowerCase();
if (altFlag) {
@@ -4474,10 +4634,15 @@
throw Py.TypeError("int argument required");
}
break;
+
case 'X':
+ // Signed hexadecimal (uppercase). Note floats accepted.
if (arg instanceof PyLong) {
+ // This call provides the base prefix "0x" if altFlag and case-matches c.
string = formatLong(arg, c, altFlag);
} else if (arg instanceof PyInteger || arg instanceof PyFloat) {
+ // This call does not provide the "0x" prefix and will be lowercase.
+ // ... except where arg.__int__ returns PyLong, then it's like formatLong.
string = formatInteger(arg, 16, false, c, altFlag);
string = string.toUpperCase();
if (altFlag) {
@@ -4487,22 +4652,28 @@
throw Py.TypeError("int argument required");
}
break;
+
case 'e':
case 'E':
+ // Floating point exponential format (+case).
string = formatFloatExponential(arg, c, false);
if (c == 'E') {
string = string.toUpperCase();
}
break;
+
case 'f':
case 'F':
+ // Floating point decimal format (+case). Note ints accepted.
string = formatFloatDecimal(asDouble(arg), false);
if (c == 'F') {
string = string.toUpperCase();
}
break;
+
case 'g':
case 'G':
+ // Value-adaptive floating point format (+case). Note ints accepted.
int origPrecision = precision;
if (precision == -1) {
precision = 6;
@@ -4540,7 +4711,9 @@
string = string.toUpperCase();
}
break;
+
case 'c':
+ // Single character (accepts integer or single character string).
fill = ' ';
if (arg instanceof PyString) {
string = ((PyString)arg).toString();
@@ -4552,6 +4725,8 @@
}
break;
}
+
+ // arg is not a str (or unicode)
int val;
try {
// Explicitly __int__ so we can look for an AttributeError (which is
@@ -4563,6 +4738,7 @@
}
throw e;
}
+ // Range check, according to ultimate type of result as presentl;y known.
if (!needUnicode) {
if (val < 0) {
throw Py.OverflowError("unsigned byte integer is less than minimum");
@@ -4580,8 +4756,15 @@
+ codecs.encode(Py.newString(c), null, "replace") + "' (0x"
+ Integer.toHexString(c) + ") at index " + (index - 1));
}
+
+ /*
+ * We have now dealt with the translation of the (absolute value of the) argument, in
+ * variable string[]. In the next sections we deal with sign, padding and base prefix.
+ */
int length = string.length();
int skip = 0;
+
+ // Decide how to represent the sign according to format and actual sign of argument.
String signString = null;
if (negative) {
signString = "-";
@@ -4593,34 +4776,47 @@
}
}
+ // The width (from here on) will be the remaining width on the line.
if (width < length) {
width = length;
}
+
+ // Insert the sign in the buffer and adjust the width.
if (signString != null) {
if (fill != ' ') {
+ // When the fill is not space, the sign comes before the fill.
buffer.append(signString);
}
+ // Adjust width for sign.
if (width > length) {
width--;
}
}
+
+ // Insert base prefix used with alternate mode for hexadecimal.
if (altFlag && (c == 'x' || c == 'X')) {
if (fill != ' ') {
+ // When the fill is not space, this base prefix comes before the fill.
buffer.append('0');
buffer.append(c);
skip += 2;
}
+ // Adjust width for base prefix.
width -= 2;
if (width < 0) {
width = 0;
}
length -= 2;
}
+
+ // Fill on the left of the item.
if (width > length && !ljustFlag) {
do {
buffer.append(fill);
} while (--width > length);
}
+
+ // If the fill is spaces, we will have deferred the sign and hex base prefix
if (fill == ' ') {
if (signString != null) {
buffer.append(signString);
@@ -4631,19 +4827,33 @@
skip += 2;
}
}
+
+ // Now append the converted argument.
if (skip > 0) {
+ // The string contains a hex-prefix, but we have already inserted one.
buffer.append(string.substring(skip));
} else {
buffer.append(string);
}
+ // If this hasn't filled the space required, add right-padding.
while (--width >= length) {
buffer.append(' ');
}
}
+
+ /*
+ * All fields in the format string have been used to convert arguments (or used the argument
+ * as a width, etc.). This had better not leave any arguments unused. Note argIndex is an
+ * index into args or has a special value. If args is a 'proper' index, It should now be out
+ * of range; if a special value, it would be wrong if it were -1, indicating a single item
+ * that has not yet been used.
+ */
if (argIndex == -1 || (argIndex >= 0 && args.__finditem__(argIndex) != null)) {
throw Py.TypeError("not all arguments converted during string formatting");
}
+
+ // Return the final buffer contents as a str or unicode as appropriate.
if (needUnicode) {
return new PyUnicode(buffer);
}
--
Repository URL: http://hg.python.org/jython
More information about the Jython-checkins
mailing list