[Jython-checkins] jython: Use UTF-8 for file paths expressed in bytes.
jeff.allen
jython-checkins at python.org
Sun May 21 05:06:52 EDT 2017
https://hg.python.org/jython/rev/1888a0b15f81
changeset: 8084:1888a0b15f81
user: Jeff Allen <ja.py at farowl.co.uk>
date: Thu Apr 20 23:20:46 2017 +0100
summary:
Use UTF-8 for file paths expressed in bytes.
This fairly extensive change regularises the approach to file and path names in
the interests of handling non-ascii paths correctly. See notes to issue #2356.
We are not finished with the consequential changes, but committing the work so
far helps keep it manageable. regrtest currently runs with 24 failed tests.
files:
CPythonLib.includes | 1 +
Lib/ntpath.py | 560 ----------
Lib/subprocess.py | 38 +-
src/org/python/core/Py.java | 134 ++-
src/org/python/core/PyBytecode.java | 9 +-
src/org/python/core/PyFile.java | 4 -
src/org/python/core/PyNullImporter.java | 13 +-
src/org/python/core/PySystemState.java | 53 +-
src/org/python/core/PyTableCode.java | 6 +-
src/org/python/core/StdoutWrapper.java | 3 +-
src/org/python/core/imp.java | 13 +-
src/org/python/core/io/FileIO.java | 10 +-
src/org/python/modules/_imp.java | 30 +-
src/org/python/modules/posix/PosixModule.java | 18 +-
14 files changed, 224 insertions(+), 668 deletions(-)
diff --git a/CPythonLib.includes b/CPythonLib.includes
--- a/CPythonLib.includes
+++ b/CPythonLib.includes
@@ -110,6 +110,7 @@
netrc.py
nntplib.py
numbers.py
+ntpath.py
nturl2path.py
opcode.py
optparse.py
diff --git a/Lib/ntpath.py b/Lib/ntpath.py
deleted file mode 100644
--- a/Lib/ntpath.py
+++ /dev/null
@@ -1,560 +0,0 @@
-# Module 'ntpath' -- common operations on WinNT/Win95 pathnames
-"""Common pathname manipulations, WindowsNT/95 version.
-
-Instead of importing this module directly, import os and refer to this
-module as os.path.
-"""
-
-import os
-import sys
-import stat
-import genericpath
-import warnings
-
-from genericpath import *
-
-__all__ = ["normcase","isabs","join","splitdrive","split","splitext",
- "basename","dirname","commonprefix","getsize","getmtime",
- "getatime","getctime", "islink","exists","lexists","isdir","isfile",
- "ismount","walk","expanduser","expandvars","normpath","abspath",
- "splitunc","curdir","pardir","sep","pathsep","defpath","altsep",
- "extsep","devnull","realpath","supports_unicode_filenames","relpath"]
-
-# strings representing various path-related bits and pieces
-curdir = '.'
-pardir = '..'
-extsep = '.'
-sep = '\\'
-pathsep = ';'
-altsep = '/'
-defpath = '.;C:\\bin'
-if 'ce' in sys.builtin_module_names:
- defpath = '\\Windows'
-elif 'os2' in sys.builtin_module_names:
- # OS/2 w/ VACPP
- altsep = '/'
-devnull = 'nul'
-
-# Normalize the case of a pathname and map slashes to backslashes.
-# Other normalizations (such as optimizing '../' away) are not done
-# (this is done by normpath).
-
-def normcase(s):
- """Normalize case of pathname.
-
- Makes all characters lowercase and all slashes into backslashes."""
- return s.replace("/", "\\").lower()
-
-
-# Return whether a path is absolute.
-# Trivial in Posix, harder on the Mac or MS-DOS.
-# For DOS it is absolute if it starts with a slash or backslash (current
-# volume), or if a pathname after the volume letter and colon / UNC resource
-# starts with a slash or backslash.
-
-def isabs(s):
- """Test whether a path is absolute"""
- s = splitdrive(s)[1]
- return s != '' and s[:1] in '/\\'
-
-
-# Join two (or more) paths.
-
-def join(a, *p):
- """Join two or more pathname components, inserting "\\" as needed.
- If any component is an absolute path, all previous path components
- will be discarded."""
- path = a
- for b in p:
- b_wins = 0 # set to 1 iff b makes path irrelevant
- if path == "":
- b_wins = 1
-
- elif isabs(b):
- # This probably wipes out path so far. However, it's more
- # complicated if path begins with a drive letter:
- # 1. join('c:', '/a') == 'c:/a'
- # 2. join('c:/', '/a') == 'c:/a'
- # But
- # 3. join('c:/a', '/b') == '/b'
- # 4. join('c:', 'd:/') = 'd:/'
- # 5. join('c:/', 'd:/') = 'd:/'
- if path[1:2] != ":" or b[1:2] == ":":
- # Path doesn't start with a drive letter, or cases 4 and 5.
- b_wins = 1
-
- # Else path has a drive letter, and b doesn't but is absolute.
- elif len(path) > 3 or (len(path) == 3 and
- path[-1] not in "/\\"):
- # case 3
- b_wins = 1
-
- if b_wins:
- path = b
- else:
- # Join, and ensure there's a separator.
- assert len(path) > 0
- if path[-1] in "/\\":
- if b and b[0] in "/\\":
- path += b[1:]
- else:
- path += b
- elif path[-1] == ":":
- path += b
- elif b:
- if b[0] in "/\\":
- path += b
- else:
- path += "\\" + b
- else:
- # path is not empty and does not end with a backslash,
- # but b is empty; since, e.g., split('a/') produces
- # ('a', ''), it's best if join() adds a backslash in
- # this case.
- path += '\\'
-
- return path
-
-
-# Split a path in a drive specification (a drive letter followed by a
-# colon) and the path specification.
-# It is always true that drivespec + pathspec == p
-def splitdrive(p):
- """Split a pathname into drive and path specifiers. Returns a 2-tuple
-"(drive,path)"; either part may be empty"""
- if p[1:2] == ':':
- return p[0:2], p[2:]
- return '', p
-
-
-# Parse UNC paths
-def splitunc(p):
- """Split a pathname into UNC mount point and relative path specifiers.
-
- Return a 2-tuple (unc, rest); either part may be empty.
- If unc is not empty, it has the form '//host/mount' (or similar
- using backslashes). unc+rest is always the input path.
- Paths containing drive letters never have an UNC part.
- """
- if p[1:2] == ':':
- return '', p # Drive letter present
- firstTwo = p[0:2]
- if firstTwo == '//' or firstTwo == '\\\\':
- # is a UNC path:
- # vvvvvvvvvvvvvvvvvvvv equivalent to drive letter
- # \\machine\mountpoint\directories...
- # directory ^^^^^^^^^^^^^^^
- normp = normcase(p)
- index = normp.find('\\', 2)
- if index == -1:
- ##raise RuntimeError, 'illegal UNC path: "' + p + '"'
- return ("", p)
- index = normp.find('\\', index + 1)
- if index == -1:
- index = len(p)
- return p[:index], p[index:]
- return '', p
-
-
-# Split a path in head (everything up to the last '/') and tail (the
-# rest). After the trailing '/' is stripped, the invariant
-# join(head, tail) == p holds.
-# The resulting head won't end in '/' unless it is the root.
-
-def split(p):
- """Split a pathname.
-
- Return tuple (head, tail) where tail is everything after the final slash.
- Either part may be empty."""
-
- d, p = splitdrive(p)
- # set i to index beyond p's last slash
- i = len(p)
- while i and p[i-1] not in '/\\':
- i = i - 1
- head, tail = p[:i], p[i:] # now tail has no slashes
- # remove trailing slashes from head, unless it's all slashes
- head2 = head
- while head2 and head2[-1] in '/\\':
- head2 = head2[:-1]
- head = head2 or head
- return d + head, tail
-
-
-# Split a path in root and extension.
-# The extension is everything starting at the last dot in the last
-# pathname component; the root is everything before that.
-# It is always true that root + ext == p.
-
-def splitext(p):
- return genericpath._splitext(p, sep, altsep, extsep)
-splitext.__doc__ = genericpath._splitext.__doc__
-
-
-# Return the tail (basename) part of a path.
-
-def basename(p):
- """Returns the final component of a pathname"""
- return split(p)[1]
-
-
-# Return the head (dirname) part of a path.
-
-def dirname(p):
- """Returns the directory component of a pathname"""
- return split(p)[0]
-
-# Is a path a symbolic link?
-# This will always return false on systems where posix.lstat doesn't exist.
-
-def islink(path):
- """Test for symbolic link.
- On WindowsNT/95 and OS/2 always returns false
- """
- return False
-
-# alias exists to lexists
-lexists = exists
-
-# Is a path a mount point? Either a root (with or without drive letter)
-# or an UNC path with at most a / or \ after the mount point.
-
-def ismount(path):
- """Test whether a path is a mount point (defined as root of drive)"""
- unc, rest = splitunc(path)
- if unc:
- return rest in ("", "/", "\\")
- p = splitdrive(path)[1]
- return len(p) == 1 and p[0] in '/\\'
-
-
-# Directory tree walk.
-# For each directory under top (including top itself, but excluding
-# '.' and '..'), func(arg, dirname, filenames) is called, where
-# dirname is the name of the directory and filenames is the list
-# of files (and subdirectories etc.) in the directory.
-# The func may modify the filenames list, to implement a filter,
-# or to impose a different order of visiting.
-
-def walk(top, func, arg):
- """Directory tree walk with callback function.
-
- For each directory in the directory tree rooted at top (including top
- itself, but excluding '.' and '..'), call func(arg, dirname, fnames).
- dirname is the name of the directory, and fnames a list of the names of
- the files and subdirectories in dirname (excluding '.' and '..'). func
- may modify the fnames list in-place (e.g. via del or slice assignment),
- and walk will only recurse into the subdirectories whose names remain in
- fnames; this can be used to implement a filter, or to impose a specific
- order of visiting. No semantics are defined for, or required of, arg,
- beyond that arg is always passed to func. It can be used, e.g., to pass
- a filename pattern, or a mutable object designed to accumulate
- statistics. Passing None for arg is common."""
- warnings.warnpy3k("In 3.x, os.path.walk is removed in favor of os.walk.",
- stacklevel=2)
- try:
- names = os.listdir(top)
- except os.error:
- return
- func(arg, top, names)
- for name in names:
- name = join(top, name)
- if isdir(name):
- walk(name, func, arg)
-
-
-# Expand paths beginning with '~' or '~user'.
-# '~' means $HOME; '~user' means that user's home directory.
-# If the path doesn't begin with '~', or if the user or $HOME is unknown,
-# the path is returned unchanged (leaving error reporting to whatever
-# function is called with the expanded path as argument).
-# See also module 'glob' for expansion of *, ? and [...] in pathnames.
-# (A function should also be defined to do full *sh-style environment
-# variable expansion.)
-
-def expanduser(path):
- """Expand ~ and ~user constructs.
-
- If user or $HOME is unknown, do nothing."""
- if path[:1] != '~':
- return path
- i, n = 1, len(path)
- while i < n and path[i] not in '/\\':
- i = i + 1
-
- if 'HOME' in os.environ:
- userhome = os.environ['HOME']
- elif 'USERPROFILE' in os.environ:
- userhome = os.environ['USERPROFILE']
- elif not 'HOMEPATH' in os.environ:
- return path
- else:
- try:
- drive = os.environ['HOMEDRIVE']
- except KeyError:
- drive = ''
- userhome = join(drive, os.environ['HOMEPATH'])
-
- if i != 1: #~user
- userhome = join(dirname(userhome), path[1:i])
-
- return userhome + path[i:]
-
-
-# Expand paths containing shell variable substitutions.
-# The following rules apply:
-# - no expansion within single quotes
-# - '$$' is translated into '$'
-# - '%%' is translated into '%' if '%%' are not seen in %var1%%var2%
-# - ${varname} is accepted.
-# - $varname is accepted.
-# - %varname% is accepted.
-# - varnames can be made out of letters, digits and the characters '_-'
-# (though is not verifed in the ${varname} and %varname% cases)
-# XXX With COMMAND.COM you can use any characters in a variable name,
-# XXX except '^|<>='.
-
-def expandvars(path):
- """Expand shell variables of the forms $var, ${var} and %var%.
-
- Unknown variables are left unchanged."""
- if '$' not in path and '%' not in path:
- return path
- import string
- varchars = string.ascii_letters + string.digits + '_-'
- res = ''
- index = 0
- pathlen = len(path)
- while index < pathlen:
- c = path[index]
- if c == '\'': # no expansion within single quotes
- path = path[index + 1:]
- pathlen = len(path)
- try:
- index = path.index('\'')
- res = res + '\'' + path[:index + 1]
- except ValueError:
- res = res + path
- index = pathlen - 1
- elif c == '%': # variable or '%'
- if path[index + 1:index + 2] == '%':
- res = res + c
- index = index + 1
- else:
- path = path[index+1:]
- pathlen = len(path)
- try:
- index = path.index('%')
- except ValueError:
- res = res + '%' + path
- index = pathlen - 1
- else:
- var = path[:index]
- if var in os.environ:
- res = res + os.environ[var]
- else:
- res = res + '%' + var + '%'
- elif c == '$': # variable or '$$'
- if path[index + 1:index + 2] == '$':
- res = res + c
- index = index + 1
- elif path[index + 1:index + 2] == '{':
- path = path[index+2:]
- pathlen = len(path)
- try:
- index = path.index('}')
- var = path[:index]
- if var in os.environ:
- res = res + os.environ[var]
- else:
- res = res + '${' + var + '}'
- except ValueError:
- res = res + '${' + path
- index = pathlen - 1
- else:
- var = ''
- index = index + 1
- c = path[index:index + 1]
- while c != '' and c in varchars:
- var = var + c
- index = index + 1
- c = path[index:index + 1]
- if var in os.environ:
- res = res + os.environ[var]
- else:
- res = res + '$' + var
- if c != '':
- index = index - 1
- else:
- res = res + c
- index = index + 1
- return res
-
-
-# Normalize a path, e.g. A//B, A/./B and A/foo/../B all become A\B.
-# Previously, this function also truncated pathnames to 8+3 format,
-# but as this module is called "ntpath", that's obviously wrong!
-
-def normpath(path):
- """Normalize path, eliminating double slashes, etc."""
- # Preserve unicode (if path is unicode)
- backslash, dot = (u'\\', u'.') if isinstance(path, unicode) else ('\\', '.')
- if path.startswith(('\\\\.\\', '\\\\?\\')):
- # in the case of paths with these prefixes:
- # \\.\ -> device names
- # \\?\ -> literal paths
- # do not do any normalization, but return the path unchanged
- return path
- path = path.replace("/", "\\")
- prefix, path = splitdrive(path)
- # We need to be careful here. If the prefix is empty, and the path starts
- # with a backslash, it could either be an absolute path on the current
- # drive (\dir1\dir2\file) or a UNC filename (\\server\mount\dir1\file). It
- # is therefore imperative NOT to collapse multiple backslashes blindly in
- # that case.
- # The code below preserves multiple backslashes when there is no drive
- # letter. This means that the invalid filename \\\a\b is preserved
- # unchanged, where a\\\b is normalised to a\b. It's not clear that there
- # is any better behaviour for such edge cases.
- if prefix == '':
- # No drive letter - preserve initial backslashes
- while path[:1] == "\\":
- prefix = prefix + backslash
- path = path[1:]
- else:
- # We have a drive letter - collapse initial backslashes
- if path.startswith("\\"):
- prefix = prefix + backslash
- path = path.lstrip("\\")
- comps = path.split("\\")
- i = 0
- while i < len(comps):
- if comps[i] in ('.', ''):
- del comps[i]
- elif comps[i] == '..':
- if i > 0 and comps[i-1] != '..':
- del comps[i-1:i+1]
- i -= 1
- elif i == 0 and prefix.endswith("\\"):
- del comps[i]
- else:
- i += 1
- else:
- i += 1
- # If the path is now empty, substitute '.'
- if not prefix and not comps:
- comps.append(dot)
- return prefix + backslash.join(comps)
-
-
-# Return an absolute path.
-try:
- from nt import _getfullpathname
-
-except ImportError: # no built-in nt module - maybe it's Jython ;)
-
- if os._name == 'nt' :
- # on Windows so Java version of sys deals in NT paths
- def abspath(path):
- """Return the absolute version of a path."""
- try:
- if isinstance(path, unicode):
- # Result must be unicode
- if path:
- path = sys.getPath(path)
- else:
- # Empty path must return current working directory
- path = os.getcwdu()
- else:
- # Result must be bytes
- if path:
- path = sys.getPath(path).encode('latin-1')
- else:
- # Empty path must return current working directory
- path = os.getcwd()
- except EnvironmentError:
- pass # Bad path - return unchanged.
- return normpath(path)
-
- else:
- # not running on Windows - mock up something sensible
- def abspath(path):
- """Return the absolute version of a path."""
- try:
- if isinstance(path, unicode):
- # Result must be unicode
- if path:
- path = join(os.getcwdu(), path)
- else:
- # Empty path must return current working directory
- path = os.getcwdu()
- else:
- # Result must be bytes
- if path:
- path = join(os.getcwd(), path)
- else:
- # Empty path must return current working directory
- path = os.getcwd()
- except EnvironmentError:
- pass # Bad path - return unchanged.
- return normpath(path)
-
-else: # use native Windows method on Windows
- def abspath(path):
- """Return the absolute version of a path."""
-
- if path: # Empty path must return current working directory.
- try:
- path = _getfullpathname(path)
- except WindowsError:
- pass # Bad path - return unchanged.
- elif isinstance(path, unicode):
- path = os.getcwdu()
- else:
- path = os.getcwd()
- return normpath(path)
-
-# realpath is a no-op on systems without islink support
-realpath = abspath
-# Win9x family and earlier have no Unicode filename support.
-supports_unicode_filenames = (hasattr(sys, "getwindowsversion") and
- sys.getwindowsversion()[3] >= 2)
-
-def _abspath_split(path):
- abs = abspath(normpath(path))
- prefix, rest = splitunc(abs)
- is_unc = bool(prefix)
- if not is_unc:
- prefix, rest = splitdrive(abs)
- return is_unc, prefix, [x for x in rest.split(sep) if x]
-
-def relpath(path, start=curdir):
- """Return a relative version of a path"""
-
- if not path:
- raise ValueError("no path specified")
-
- start_is_unc, start_prefix, start_list = _abspath_split(start)
- path_is_unc, path_prefix, path_list = _abspath_split(path)
-
- if path_is_unc ^ start_is_unc:
- raise ValueError("Cannot mix UNC and non-UNC paths (%s and %s)"
- % (path, start))
- if path_prefix.lower() != start_prefix.lower():
- if path_is_unc:
- raise ValueError("path is on UNC root %s, start on UNC root %s"
- % (path_prefix, start_prefix))
- else:
- raise ValueError("path is on drive %s, start on drive %s"
- % (path_prefix, start_prefix))
- # Work out how much of the filepath is shared by start and path.
- i = 0
- for e1, e2 in zip(start_list, path_list):
- if e1.lower() != e2.lower():
- break
- i += 1
-
- rel_list = [pardir] * (len(start_list)-i) + path_list[i:]
- if not rel_list:
- return curdir
- return join(*rel_list)
diff --git a/Lib/subprocess.py b/Lib/subprocess.py
--- a/Lib/subprocess.py
+++ b/Lib/subprocess.py
@@ -438,6 +438,7 @@
import java.nio.ByteBuffer
import org.python.core.io.RawIOBase
import org.python.core.io.StreamIO
+ from org.python.core.Py import fileSystemDecode
else:
import select
_has_poll = hasattr(select, 'poll')
@@ -779,7 +780,7 @@
maintain those byte values (which may be butchered as
Strings) for the subprocess if they haven't been modified.
"""
- # Determine what's safe to merge
+ # Determine what's necessary to merge (new or different)
merge_env = dict((key, value) for key, value in env.iteritems()
if key not in builder_env or
builder_env.get(key) != value)
@@ -789,8 +790,10 @@
for entry in entries:
if entry.getKey() not in env:
entries.remove()
-
- builder_env.putAll(merge_env)
+ # add anything new or different in env
+ for key, value in merge_env.iteritems():
+ # If the new value is bytes, assume it to be FS-encoded
+ builder_env.put(key, fileSystemDecode(value))
class Popen(object):
@@ -1308,9 +1311,6 @@
args = _cmdline2listimpl(args)
else:
args = list(args)
- # NOTE: CPython posix (execv) will str() any unicode
- # args first, maybe we should do the same on
- # posix. Windows passes unicode through, however
if any(not isinstance(arg, (str, unicode)) for arg in args):
raise TypeError('args must contain only strings')
args = _escape_args(args)
@@ -1321,6 +1321,11 @@
if executable is not None:
args[0] = executable
+ # NOTE: CPython posix (execv) will FS-encode any unicode args, but
+ # pass on bytes unchanged, because that's what the system expects.
+ # Java expects unicode, so we do the converse: leave unicode
+ # unchanged but FS-decode any supplied as bytes.
+ args = [fileSystemDecode(arg) for arg in args]
builder = java.lang.ProcessBuilder(args)
if stdin is None:
@@ -1330,16 +1335,20 @@
if stderr is None:
builder.redirectError(java.lang.ProcessBuilder.Redirect.INHERIT)
- # os.environ may be inherited for compatibility with CPython
+ # os.environ may be inherited for compatibility with CPython.
+ # Elements taken from os.environ are FS-decoded to unicode.
_setup_env(dict(os.environ if env is None else env),
builder.environment())
+ # The current working directory must also be unicode.
if cwd is None:
- cwd = os.getcwd()
- elif not os.path.exists(cwd):
- raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), cwd)
- elif not os.path.isdir(cwd):
- raise OSError(errno.ENOTDIR, os.strerror(errno.ENOTDIR), cwd)
+ cwd = os.getcwdu()
+ else:
+ cwd = fileSystemDecode(cwd)
+ if not os.path.exists(cwd):
+ raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), cwd)
+ elif not os.path.isdir(cwd):
+ raise OSError(errno.ENOTDIR, os.strerror(errno.ENOTDIR), cwd)
builder.directory(java.io.File(cwd))
# Let Java manage redirection of stderr to stdout (it's more
@@ -1890,9 +1899,10 @@
args = _cmdline2listimpl(command)
args = _escape_args(args)
args = _shell_command + args
- cwd = os.getcwd()
+ cwd = os.getcwdu()
-
+ # Python supplies FS-encoded arguments while Java expects String
+ args = [fileSystemDecode(arg) for arg in args]
builder = java.lang.ProcessBuilder(args)
builder.directory(java.io.File(cwd))
diff --git a/src/org/python/core/Py.java b/src/org/python/core/Py.java
--- a/src/org/python/core/Py.java
+++ b/src/org/python/core/Py.java
@@ -84,6 +84,7 @@
throw new StreamCorruptedException("unknown singleton: " + which);
}
}
+
/* Holds the singleton None and Ellipsis objects */
/** The singleton None Python object **/
public final static PyObject None = new PyNone();
@@ -222,6 +223,10 @@
return new PyException(Py.IOError, args);
}
+ public static PyException IOError(Constant errno, String filename) {
+ return new PyException(Py.IOError, Py.fileSystemEncode(filename)); // XXX newStringOrUnicode?
+ }
+
public static PyException IOError(Constant errno, PyObject filename) {
int value = errno.intValue();
PyObject args = new PyTuple(Py.newInteger(value), PosixModule.strerror(value), filename);
@@ -683,6 +688,103 @@
}
}
+ /**
+ * Return a file name or path as Unicode (Java UTF-16 <code>String</code>), decoded if necessary
+ * from a Python <code>bytes</code> object, using the file system encoding. In Jython, this
+ * encoding is UTF-8, irrespective of the OS platform. This method is comparable with Python 3
+ * <code>os.fsdecode</code>, but for Java use, in places such as the <code>os</code> module. If
+ * the argument is not a <code>PyUnicode</code>, it will be decoded using the nominal Jython
+ * file system encoding. If the argument <i>is</i> a <code>PyUnicode</code>, its
+ * <code>String</code> is returned.
+ *
+ * @param filename as <code>bytes</code> to decode, or already as <code>unicode</code>
+ * @return unicode version of path
+ */
+ public static String fileSystemDecode(PyString filename) {
+ String s = filename.getString();
+ if (filename instanceof PyUnicode || CharMatcher.ascii().matchesAllOf(s)) {
+ // Already decoded (unicode) or usable as ASCII
+ return s;
+ } else {
+ // It's bytes, so must decode properly
+ assert "utf-8".equals(PySystemState.FILE_SYSTEM_ENCODING.toString());
+ return codecs.PyUnicode_DecodeUTF8(s, null);
+ }
+ }
+
+ /**
+ * As {@link #fileSystemDecode(PyString)} but raising <code>TypeError</code> if not a
+ * <code>str</code> or <code>unicode</code>.
+ *
+ * @param filename as <code>bytes</code> to decode, or already as <code>unicode</code>
+ * @return unicode version of the file name
+ */
+ public static String fileSystemDecode(PyObject filename) {
+ if (filename instanceof PyString) {
+ return fileSystemDecode((PyString)filename);
+ } else
+ throw Py.TypeError(String.format("coercing to Unicode: need string, %s type found",
+ filename.getType().fastGetName()));
+ }
+
+ /**
+ * Return a PyString object we can use as a file name or file path in places where Python
+ * expects a <code>bytes</code> (that is a <code>str</code>) object in the file system encoding.
+ * In Jython, this encoding is UTF-8, irrespective of the OS platform.
+ * <p>
+ * This is subtly different from CPython's use of "file system encoding", which tracks the
+ * platform's choice so that OS services may be called that have a bytes interface. Jython's
+ * interaction with the OS occurs via Java using String arguments representing Unicode values,
+ * so we have no need to match the encoding actually chosen by the platform (e.g. 'mbcs' on
+ * Windows). Rather we need a nominal Jython file system encoding, for use where the standard
+ * library forces byte paths on us (in Python 2). There is no reason for this choice to vary
+ * with OS platform. Methods receiving paths as <code>bytes</code> will
+ * {@link #fileSystemDecode(PyString)} them again for Java.
+ *
+ * @param filename as <code>unicode</code> to encode, or already as <code>bytes</code>
+ * @return encoded bytes version of path
+ */
+ public static PyString fileSystemEncode(String filename) {
+ if (CharMatcher.ascii().matchesAllOf(filename)) {
+ // Just wrap it as US-ASCII is a subset of the file system encoding
+ return Py.newString(filename);
+ } else {
+ // It's not just US-ASCII, so must encode properly
+ assert "utf-8".equals(PySystemState.FILE_SYSTEM_ENCODING.toString());
+ return Py.newString(codecs.PyUnicode_EncodeUTF8(filename, null));
+ }
+ }
+
+ /**
+ * Return a PyString object we can use as a file name or file path in places where Python
+ * expects a <code>bytes</code> (that is, <code>str</code>) object in the file system encoding.
+ * In Jython, this encoding is UTF-8, irrespective of the OS platform. This method is comparable
+ * with Python 3 <code>os.fsencode</code>. If the argument is a PyString, it is returned
+ * unchanged. If the argument is a PyUnicode, it is converted to a <code>bytes</code> using the
+ * nominal Jython file system encoding.
+ *
+ * @param filename as <code>unicode</code> to encode, or already as <code>bytes</code>
+ * @return encoded bytes version of path
+ */
+ public static PyString fileSystemEncode(PyString filename) {
+ return (filename instanceof PyUnicode) ? fileSystemEncode(filename.getString()) : filename;
+ }
+
+ /**
+ * Convert a <code>PyList</code> path to a list of Java <code>String</code> objects decoded from
+ * the path elements to strings guaranteed usable in the Java API.
+ *
+ * @param path a Python search path
+ * @return equivalent Java list
+ */
+ private static List<String> fileSystemDecode(PyList path) {
+ List<String> list = new ArrayList<>(path.__len__());
+ for (PyObject filename : path.getList()) {
+ list.add(fileSystemDecode(filename));
+ }
+ return list;
+ }
+
public static PyStringMap newStringMap() {
// enable lazy bootstrapping (see issue #1671)
if (!PyType.hasBuilder(PyStringMap.class)) {
@@ -1282,7 +1384,7 @@
if (moduleName == null) {
buf.append("<unknown>");
} else {
- String moduleStr = moduleName.toString();
+ String moduleStr = Py.fileSystemDecode(moduleName);
if (!moduleStr.equals("exceptions")) {
buf.append(moduleStr);
buf.append(".");
@@ -1294,7 +1396,7 @@
}
if (value != null && value != Py.None) {
// only print colon if the str() of the object is not the empty string
- PyObject s = useRepr ? value.__repr__() : value.__str__();
+ PyObject s = useRepr ? value.__repr__() : value;
if (!(s instanceof PyString) || s.__len__() != 0) {
buf.append(": ");
}
@@ -1565,6 +1667,16 @@
}
}
+ private static final String IMPORT_SITE_ERROR = ""
+ + "Cannot import site module and its dependencies: %s\n"
+ + "Determine if the following attributes are correct:\n" //
+ + " * sys.path: %s\n"
+ + " This attribute might be including the wrong directories, such as from CPython\n"
+ + " * sys.prefix: %s\n"
+ + " This attribute is set by the system property python.home, although it can\n"
+ + " be often automatically determined by the location of the Jython jar file\n\n"
+ + "You can use the -S option or python.import.site=false to not import the site module";
+
public static boolean importSiteIfSelected() {
if (Options.importSite) {
try {
@@ -1574,18 +1686,10 @@
} catch (PyException pye) {
if (pye.match(Py.ImportError)) {
PySystemState sys = Py.getSystemState();
- throw Py.ImportError(String.format(""
- + "Cannot import site module and its dependencies: %s\n"
- + "Determine if the following attributes are correct:\n"
- + " * sys.path: %s\n"
- + " This attribute might be including the wrong directories, such as from CPython\n"
- + " * sys.prefix: %s\n"
- + " This attribute is set by the system property python.home, although it can\n"
- + " be often automatically determined by the location of the Jython jar file\n\n"
- + "You can use the -S option or python.import.site=false to not import the site module",
- pye.value.__getattr__("args").__getitem__(0),
- sys.path,
- sys.prefix));
+ String value = pye.value.__getattr__("args").__getitem__(0).toString();
+ List<String> path = fileSystemDecode(sys.path);
+ throw Py.ImportError(
+ String.format(IMPORT_SITE_ERROR, value, path, PySystemState.prefix));
} else {
throw pye;
}
@@ -2266,7 +2370,7 @@
}
/* Here we would actually like to call cls.__findattr__("__metaclass__")
* rather than cls.getType(). However there are circumstances where the
- * metaclass doesn't show up as __metaclass__. On the other hand we need
+ * metaclass doesn't show up as __metaclass__. On the other hand we need
* to avoid that checker refers to builtin type___subclasscheck__ or
* type___instancecheck__. Filtering out checker-instances of
* PyBuiltinMethodNarrow does the trick. We also filter out PyMethodDescr
diff --git a/src/org/python/core/PyBytecode.java b/src/org/python/core/PyBytecode.java
--- a/src/org/python/core/PyBytecode.java
+++ b/src/org/python/core/PyBytecode.java
@@ -116,11 +116,13 @@
throw Py.AttributeError(name);
}
+ @Override
public void __setattr__(String name, PyObject value) {
// no writable attributes
throwReadonly(name);
}
+ @Override
public void __delattr__(String name) {
throwReadonly(name);
}
@@ -137,6 +139,7 @@
return new PyTuple(pystr);
}
+ @Override
public PyObject __findattr_ex__(String name) {
// have to craft co_varnames specially
if (name == "co_varnames") {
@@ -149,7 +152,7 @@
return toPyStringTuple(co_freevars);
}
if (name == "co_filename") {
- return new PyString(co_filename);
+ return Py.fileSystemEncode(co_filename); // bytes object expected by clients
}
if (name == "co_name") {
return new PyString(co_name);
@@ -1156,7 +1159,7 @@
"zap" this information, to prevent END_FINALLY from
re-raising the exception. (But non-local gotos
should still be resumed.)
- */
+ */
PyObject exit;
PyObject u = stack.pop(), v, w;
if (u == Py.None) {
@@ -1350,7 +1353,7 @@
if (why != Why.RETURN) {
retval = Py.None;
}
- } else {
+ } else {
// store the stack in the frame for reentry from the yield;
f.f_savedlocals = stack.popN(stack.size());
}
diff --git a/src/org/python/core/PyFile.java b/src/org/python/core/PyFile.java
--- a/src/org/python/core/PyFile.java
+++ b/src/org/python/core/PyFile.java
@@ -168,10 +168,6 @@
ArgParser ap = new ArgParser("file", args, kwds, new String[] {"name", "mode", "buffering"},
1);
PyObject name = ap.getPyObject(0);
- if (!(name instanceof PyString)) {
- throw Py.TypeError("coercing to Unicode: need string, '" + name.getType().fastGetName()
- + "' type found");
- }
String mode = ap.getString(1, "r");
int bufsize = ap.getInt(2, -1);
file___init__(new FileIO((PyString) name, parseMode(mode)), name, mode, bufsize);
diff --git a/src/org/python/core/PyNullImporter.java b/src/org/python/core/PyNullImporter.java
--- a/src/org/python/core/PyNullImporter.java
+++ b/src/org/python/core/PyNullImporter.java
@@ -20,7 +20,7 @@
public PyNullImporter(PyObject pathObj) {
super();
- String pathStr = asPath(pathObj);
+ String pathStr = Py.fileSystemDecode(pathObj);
if (pathStr.equals("")) {
throw Py.ImportError("empty pathname");
}
@@ -42,17 +42,6 @@
return Py.None;
}
- // FIXME Refactoring move helper function to a central util library
- // FIXME Also can take in account working in zip file systems
-
- private static String asPath(PyObject pathObj) {
- if (!(pathObj instanceof PyString)) {
- throw Py.TypeError(String.format("coercing to Unicode: need string, %s type found",
- pathObj.getType().fastGetName()));
- }
- return pathObj.toString();
- }
-
private static boolean isDir(String pathStr) {
if (pathStr.equals("")) {
return false;
diff --git a/src/org/python/core/PySystemState.java b/src/org/python/core/PySystemState.java
--- a/src/org/python/core/PySystemState.java
+++ b/src/org/python/core/PySystemState.java
@@ -82,6 +82,9 @@
public final static PyString float_repr_style = Py.newString("short");
+ /** Nominal Jython file system encoding (as <code>sys.getfilesystemencoding()</code>) */
+ static final PyString FILE_SYSTEM_ENCODING = Py.newString("utf-8");
+
public static boolean py3kwarning = false;
public final static Class flags = Options.class;
@@ -109,13 +112,13 @@
public static PackageManager packageManager;
private static File cachedir;
- private static PyList defaultPath;
- private static PyList defaultArgv;
- private static PyObject defaultExecutable;
+ private static PyList defaultPath; // list of bytes or unicode
+ private static PyList defaultArgv; // list of bytes or unicode
+ private static PyObject defaultExecutable; // bytes or unicode or None
public static Properties registry; // = init_registry();
- public static PyObject prefix;
- public static PyObject exec_prefix = Py.EmptyString;
+ public static PyObject prefix; // bytes or unicode
+ public static PyObject exec_prefix = Py.EmptyString; // bytes or unicode
public static final PyString byteorder = new PyString("big");
public static final int maxint = Integer.MAX_VALUE;
@@ -504,7 +507,7 @@
}
public PyObject getfilesystemencoding() {
- return Py.None;
+ return FILE_SYSTEM_ENCODING;
}
@@ -840,10 +843,10 @@
}
}
if (prefix != null) {
- PySystemState.prefix = Py.newString(prefix);
+ PySystemState.prefix = Py.newStringOrUnicode(prefix);
}
if (exec_prefix != null) {
- PySystemState.exec_prefix = Py.newString(exec_prefix);
+ PySystemState.exec_prefix = Py.newStringOrUnicode(exec_prefix);
}
try {
String jythonpath = System.getenv("JYTHONPATH");
@@ -1174,16 +1177,16 @@
PyList argv = new PyList();
if (args != null) {
for (String arg : args) {
- argv.append(Py.newStringOrUnicode(arg));
+ argv.append(Py.newStringOrUnicode(arg)); // XXX or always newUnicode?
}
}
return argv;
}
/**
- * Determine the default sys.executable value from the registry.
- * If registry is not set (as in standalone jython jar), will use sys.prefix + /bin/jython(.exe) and the file may
- * not exist. Users can create a wrapper in it's place to make it work in embedded environments.
+ * Determine the default sys.executable value from the registry. If registry is not set (as in
+ * standalone jython jar), we will use sys.prefix + /bin/jython(.exe) and the file may not
+ * exist. Users can create a wrapper in its place to make it work in embedded environments.
* Only if sys.prefix is null, returns Py.None
*
* @param props a Properties registry
@@ -1191,26 +1194,26 @@
*/
private static PyObject initExecutable(Properties props) {
String executable = props.getProperty("python.executable");
- if (executable == null) {
+ File executableFile;
+ if (executable != null) {
+ // The executable from the registry is a Unicode String path
+ executableFile = new File(executable);
+ } else {
if (prefix == null) {
return Py.None;
} else {
- executable = prefix.asString() + File.separator + "bin" + File.separator;
- if (Platform.IS_WINDOWS) {
- executable += "jython.exe";
- } else {
- executable += "jython";
- }
+ // The prefix is a unicode or encoded bytes object
+ executableFile = new File(Py.fileSystemDecode(prefix),
+ Platform.IS_WINDOWS ? "bin\\jython.exe" : "bin/jython");
}
}
- File executableFile = new File(executable);
try {
executableFile = executableFile.getCanonicalFile();
} catch (IOException ioe) {
executableFile = executableFile.getAbsoluteFile();
}
- return new PyString(executableFile.getPath());
+ return Py.newStringOrUnicode(executableFile.getPath()); // XXX always bytes in CPython
}
/**
@@ -1353,8 +1356,8 @@
PyList path = new PyList();
addPaths(path, props.getProperty("python.path", ""));
if (prefix != null) {
- String libpath = new File(prefix.toString(), "Lib").toString();
- path.append(new PyString(libpath));
+ String libpath = new File(Py.fileSystemDecode(prefix), "Lib").toString();
+ path.append(Py.fileSystemEncode(libpath)); // XXX or newStringOrUnicode or newUnicode?
}
if (standalone) {
// standalone jython: add the /Lib directory inside JYTHON_JAR to the path
@@ -1397,7 +1400,8 @@
private static void addPaths(PyList path, String pypath) {
StringTokenizer tok = new StringTokenizer(pypath, java.io.File.pathSeparator);
while (tok.hasMoreTokens()) {
- path.append(new PyString(tok.nextToken().trim()));
+ // Use unicode object if necessary to represent the element
+ path.append(Py.newStringOrUnicode(tok.nextToken().trim()));
}
}
@@ -1540,6 +1544,7 @@
closer.cleanup();
}
+ @Override
public void close() { cleanup(); }
public static class PySystemStateCloser {
diff --git a/src/org/python/core/PyTableCode.java b/src/org/python/core/PyTableCode.java
--- a/src/org/python/core/PyTableCode.java
+++ b/src/org/python/core/PyTableCode.java
@@ -66,6 +66,7 @@
// co_lnotab, co_stacksize
};
+ @Override
public PyObject __dir__() {
PyString members[] = new PyString[__members__.length];
for (int i = 0; i < __members__.length; i++)
@@ -80,11 +81,13 @@
throw Py.AttributeError(name);
}
+ @Override
public void __setattr__(String name, PyObject value) {
// no writable attributes
throwReadonly(name);
}
+ @Override
public void __delattr__(String name) {
throwReadonly(name);
}
@@ -99,6 +102,7 @@
return new PyTuple(pystr);
}
+ @Override
public PyObject __findattr_ex__(String name) {
// have to craft co_varnames specially
if (name == "co_varnames") {
@@ -111,7 +115,7 @@
return toPyStringTuple(co_freevars);
}
if (name == "co_filename") {
- return new PyString(co_filename);
+ return Py.fileSystemEncode(co_filename); // bytes object expected by clients
}
if (name == "co_name") {
return new PyString(co_name);
diff --git a/src/org/python/core/StdoutWrapper.java b/src/org/python/core/StdoutWrapper.java
--- a/src/org/python/core/StdoutWrapper.java
+++ b/src/org/python/core/StdoutWrapper.java
@@ -105,7 +105,8 @@
String s;
if (o instanceof PyUnicode) {
// Use the encoding and policy defined for the stream. (Each may be null.)
- s = ((PyUnicode)o).encode(file.encoding, file.errors);
+ s = ((PyUnicode)o).encode(file.encoding, "replace"); //FIXME: back to ...
+ // s = ((PyUnicode)o).encode(file.encoding, file.errors);
} else {
s = o.__str__().toString();
}
diff --git a/src/org/python/core/imp.java b/src/org/python/core/imp.java
--- a/src/org/python/core/imp.java
+++ b/src/org/python/core/imp.java
@@ -418,7 +418,8 @@
}
if (moduleLocation != null) {
- module.__setattr__("__file__", new PyString(moduleLocation));
+ // Standard library expects __file__ to be encoded bytes
+ module.__setattr__("__file__", Py.fileSystemEncode(moduleLocation));
} else if (module.__findattr__("__file__") == null) {
// Should probably never happen (but maybe with an odd custom builtins, or
// Java Integration)
@@ -543,10 +544,8 @@
return loadFromLoader(loader, moduleName);
}
}
- if (!(p instanceof PyUnicode)) {
- p = p.__str__();
- }
- ret = loadFromSource(sys, name, moduleName, p.toString());
+ // p could be unicode or bytes (in the file system encoding)
+ ret = loadFromSource(sys, name, moduleName, Py.fileSystemDecode(p));
if (ret != null) {
return ret;
}
@@ -606,7 +605,7 @@
// display names are for identification purposes (e.g. __file__): when entry is
// null it forces java.io.File to be a relative path (e.g. foo/bar.py instead of
// /tmp/foo/bar.py)
- String displayDirName = entry.equals("") ? null : entry.toString();
+ String displayDirName = entry.equals("") ? null : entry;
String displaySourceName = new File(new File(displayDirName, name), sourceName).getPath();
String displayCompiledName =
new File(new File(displayDirName, name), compiledName).getPath();
@@ -640,7 +639,7 @@
compiledFile = new File(dirName, compiledName);
} else {
PyModule m = addModule(modName);
- PyObject filename = new PyString(new File(displayDirName, name).getPath());
+ PyObject filename = Py.newStringOrUnicode(new File(displayDirName, name).getPath()); // XXX fileSystemEncode?
m.__dict__.__setitem__("__path__", new PyList(new PyObject[] {filename}));
}
diff --git a/src/org/python/core/io/FileIO.java b/src/org/python/core/io/FileIO.java
--- a/src/org/python/core/io/FileIO.java
+++ b/src/org/python/core/io/FileIO.java
@@ -64,10 +64,10 @@
private boolean emulateAppend;
/**
- * @see #FileIO(PyString name, String mode)
+ * @see #FileIO(String name, String mode)
*/
- public FileIO(String name, String mode) {
- this(Py.newString(name), mode);
+ public FileIO(PyString name, String mode) {
+ this(Py.fileSystemDecode(name), mode);
}
/**
@@ -80,9 +80,9 @@
* @param name the name of the file
* @param mode a raw io file mode String
*/
- public FileIO(PyString name, String mode) {
+ public FileIO(String name, String mode) {
parseMode(mode);
- File absPath = new RelativeFile(name.toString());
+ File absPath = new RelativeFile(name);
try {
if ((appending && !(reading || plus)) || (writing && !reading && !plus)) {
diff --git a/src/org/python/modules/_imp.java b/src/org/python/modules/_imp.java
--- a/src/org/python/modules/_imp.java
+++ b/src/org/python/modules/_imp.java
@@ -68,7 +68,7 @@
* This needs to be consolidated with the code in (@see org.python.core.imp).
*
* @param name module name
- * @param entry a path String
+ * @param entry a path String (Unicode file or directory name)
* @param findingPackage if looking for a package only try to locate __init__
* @return null if no module found otherwise module information
*/
@@ -190,8 +190,10 @@
public static PyObject find_module(String name, PyObject path) {
if (path == Py.None && PySystemState.getBuiltin(name) != null) {
- return new PyTuple(Py.None, Py.newString(name),
- new PyTuple(Py.EmptyString, Py.EmptyString,
+ return new PyTuple(Py.None,
+ Py.newString(name),
+ new PyTuple(Py.EmptyString,
+ Py.EmptyString,
Py.newInteger(C_BUILTIN)));
}
@@ -199,14 +201,14 @@
path = Py.getSystemState().path;
}
for (PyObject p : path.asIterable()) {
- ModuleInfo mi = findFromSource(name, p.toString(), false, true);
+ ModuleInfo mi = findFromSource(name, Py.fileSystemDecode(p), false, true);
if(mi == null) {
continue;
}
return new PyTuple(mi.file,
- new PyString(mi.filename),
- new PyTuple(new PyString(mi.suffix),
- new PyString(mi.mode),
+ Py.newStringOrUnicode(mi.filename),
+ new PyTuple(Py.newString(mi.suffix),
+ Py.newString(mi.mode),
Py.newInteger(mi.type)));
}
throw Py.ImportError("No module named " + name);
@@ -216,7 +218,8 @@
PyObject mod = Py.None;
PySystemState sys = Py.getSystemState();
int type = data.__getitem__(2).asInt();
- while(mod == Py.None) {
+ String filenameString = Py.fileSystemDecode(filename);
+ while (mod == Py.None) {
String compiledName;
switch (type) {
case PY_SOURCE:
@@ -226,7 +229,7 @@
}
// XXX: This should load the accompanying byte code file instead, if it exists
- String resolvedFilename = sys.getPath(filename.toString());
+ String resolvedFilename = sys.getPath(filenameString);
compiledName = makeCompiledFilename(resolvedFilename);
if (name.endsWith(".__init__")) {
name = name.substring(0, name.length() - ".__init__".length());
@@ -241,19 +244,20 @@
}
mod = imp.createFromSource(name.intern(), (InputStream)o,
- filename.toString(), compiledName, mtime);
+ filenameString, compiledName, mtime);
break;
case PY_COMPILED:
- mod = load_compiled(name, filename.toString(), file);
+ mod = load_compiled(name, filenameString, file);
break;
case PKG_DIRECTORY:
PyModule m = imp.addModule(name);
m.__dict__.__setitem__("__path__", new PyList(new PyObject[] {filename}));
m.__dict__.__setitem__("__file__", filename);
- ModuleInfo mi = findFromSource(name, filename.toString(), true, true);
+ ModuleInfo mi = findFromSource(name, filenameString, true, true);
type = mi.type;
file = mi.file;
- filename = new PyString(mi.filename);
+ filenameString = mi.filename;
+ filename = Py.newStringOrUnicode(filenameString);
break;
default:
throw Py.ImportError("No module named " + name);
diff --git a/src/org/python/modules/posix/PosixModule.java b/src/org/python/modules/posix/PosixModule.java
--- a/src/org/python/modules/posix/PosixModule.java
+++ b/src/org/python/modules/posix/PosixModule.java
@@ -486,7 +486,8 @@
"getcwd() -> path\n\n" +
"Return a string representing the current working directory.");
public static PyObject getcwd() {
- return Py.newStringOrUnicode(Py.getSystemState().getCurrentWorkingDir());
+ // The return value is bytes in the file system encoding
+ return Py.fileSystemEncode(Py.getSystemState().getCurrentWorkingDir());
}
public static PyString __doc__getcwdu = new PyString(
@@ -1343,25 +1344,24 @@
return environ;
}
for (Map.Entry<String, String> entry : env.entrySet()) {
+ // The shell restricts names to a subset of ASCII and values are encoded byte strings.
environ.__setitem__(
- Py.newStringOrUnicode(entry.getKey()),
- Py.newStringOrUnicode(entry.getValue()));
+ Py.newString(entry.getKey()),
+ Py.fileSystemEncode(entry.getValue()));
}
return environ;
}
/**
- * Return a path as a String from a PyObject
+ * Return a path as a String from a PyObject, which must be <code>str</code> or
+ * <code>unicode</code>. If the path is a <code>str</code> (that is, <code>bytes</code>), it is
+ * interpreted into Unicode using the file system encoding.
*
* @param path a PyObject, raising a TypeError if an invalid path type
* @return a String path
*/
private static String asPath(PyObject path) {
- if (path instanceof PyString) {
- return path.toString();
- }
- throw Py.TypeError(String.format("coercing to Unicode: need string, %s type found",
- path.getType().fastGetName()));
+ return Py.fileSystemDecode(path);
}
/**
--
Repository URL: https://hg.python.org/jython
More information about the Jython-checkins
mailing list