unicode filenames
Andrew Dalke
adalke at mindspring.com
Tue Feb 4 01:55:32 EST 2003
Me:
>> if os.path.supports_unicode_filenames:
>> cwd = os.getcwdu()
>> else:
>> encoding = .. get default filesystem encoding ... or 'latin-1'
>> cwd = unicode(os.getcwd(), encoding)
>>
>>Ugly .. quite ugly. And suggestions on the proper way to
>>handle this are not documented as far as I can find.
Neil Hodgson wrote:
> Yes, it is ugly but I don't know how to handle this well on Unix. In my
> above example there is one partition mounted in UTF-8 mode but other
> partitions could be using other encodings. I imagine there is some way to
> reach the mount options for a given directory...
Okay, so it seems like no one knows how to handle unicode filenames
under Unix. Perhaps the following is the proper behaviour?
1) there is a default filesystem encoding, which is initialized
to None if os.path.supports_unicode_filenames is True, otherwise
it's set to sys.getdefaultencoding()
2) there is a registration system which is used to define encodings
used for different mount locations. If a filename/dirname is
not covered, use the default filesystem encoding
3) a) when the input dirname or filename is a string, use the
current behaviour
b) when unicode, use the encoding from 2 (may have to get
the absolute path name ... don't like this part of it.
Perhaps the call to #2 should only be done for full paths?)
Here's an example implementation for listdir and getcwdu
# Explicit override for the default filesystem encoding. None means
# "no override": the default is then derived from the platform.
_filesystem_encoding = None


def get_default_filesystem_encoding():
    """Return the encoding used for paths with no registered encoding.

    An override installed with set_default_filesystem_encoding() wins.
    Without one, the result is None when os.path.supports_unicode_filenames
    is true, and sys.getdefaultencoding() otherwise.
    """
    if _filesystem_encoding is not None:
        return _filesystem_encoding
    if os.path.supports_unicode_filenames:
        return None
    return sys.getdefaultencoding()
def set_default_filesystem_encoding(encoding):
    """Install ``encoding`` as the module-wide default filesystem encoding.

    The value is stored in the module global ``_filesystem_encoding``.
    Passing None clears any previous override.
    """
    global _filesystem_encoding
    _filesystem_encoding = encoding
# This is used if different mount points have different
# encodings. See below for how to use it
class FilesystemEncoding:
    """Registry mapping directory names to the encoding of paths below them.

    Register an encoding with item assignment
    (``registry[dirname] = encoding``) and query it with lookup(),
    which answers with the entry of the nearest registered ancestor.
    """

    def __init__(self):
        # Maps a directory name to its encoding. Keys carry no trailing
        # separator, except for a root directory such as "/".
        self.data = {}

    def __setitem__(self, dirname, encoding):
        """Register ``encoding`` for ``dirname`` and everything beneath it."""
        # Drop one trailing separator so "/home/dalke/" and "/home/dalke"
        # share a key. A root directory is its own dirname and must keep
        # its separator: lookup() walks up to "/" and never to "", so
        # stripping it would make the root registration unreachable.
        if dirname.endswith(os.sep) and os.path.dirname(dirname) != dirname:
            dirname = dirname[:-len(os.sep)]
        self.data[dirname] = encoding

    def lookup(self, name):
        """Return the encoding registered for ``name`` or a parent of it.

        Walks up the path with os.path.dirname() until a registered
        directory is found. If none is found, returns
        get_default_filesystem_encoding().
        """
        while name:
            if name in self.data:
                return self.data[name]
            parent = os.path.dirname(name)
            if parent == name:
                # Reached a root directory that has no entry of its own.
                break
            name = parent
        return get_default_filesystem_encoding()


# Module-wide registry consulted by the wrappers in this module.
filesystem_encodings = FilesystemEncoding()
....
>>> filesystem_encodings["/home/dalke"] = "utf8"
>>> filesystem_encodings["/"] = "latin-1"
>>> filesystem_encodings.lookup("/home/dalke/test.txt")
'utf8'
>>> filesystem_encodings.lookup("/home/spam")
'ascii'
>>>
....
def listdir(dirname):
    """List ``dirname``, returning unicode names for a unicode argument.

    A non-unicode ``dirname`` is passed straight to os.listdir(). For a
    unicode ``dirname`` the encoding is looked up in filesystem_encodings
    using the absolute path. If that encoding is None the call is again
    passed straight to os.listdir(); otherwise the directory name is
    encoded, listed, and every entry is decoded with the same encoding.
    """
    if isinstance(dirname, unicode):
        codec = filesystem_encodings.lookup(os.path.abspath(dirname))
        if codec is not None:
            entries = os.listdir(dirname.encode(codec))
            return [unicode(entry, codec) for entry in entries]
    return os.listdir(dirname)
def getcwdu():
    """Return the current working directory as a unicode string.

    When os.path.supports_unicode_filenames is true the result comes
    from os.getcwdu(). Otherwise the byte string from os.getcwd() is
    decoded with the encoding that filesystem_encodings.lookup() reports
    for that directory.

    Raises UnicodeDecodeError if the directory name is not valid in the
    selected encoding.
    """
    if os.path.supports_unicode_filenames:
        # getcwdu lives in os, not os.path.
        return os.getcwdu()
    raw_cwd = os.getcwd()
    encoding = filesystem_encodings.lookup(raw_cwd)
    if encoding is None:
        # A mount may have been registered with encoding None.
        # unicode(s, None) raises TypeError, so decode with the
        # interpreter's default codec instead.
        return unicode(raw_cwd)
    return unicode(raw_cwd, encoding)
...
>>> os.path.abspath(".")
'/home/dalke/tmp'
>>> os.path.abspath(u".")
u'/home/dalke/tmp'
>>>
>>> s = u"1 to \N{INFINITY}"
>>> s.encode("utf8")
'1 to \xe2\x88\x9e'
>>> t = s.encode("utf8")
>>> os.mkdir(t)
>>> os.listdir(".")
['1 to \xe2\x88\x9e']
>>> os.listdir(u".")
['1 to \xe2\x88\x9e']
>>>
>>> listdir(".")
['1 to \xe2\x88\x9e']
>>> listdir(u".")
['1 to \u221e']
>>>
>>> os.chdir(t)
>>> os.getcwdu()
Traceback (most recent call last):
File "<stdin>", line 1, in ?
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 39:
ordinal not in range(128)
>>> getcwdu()
u'/home/dalke/tmp/1 to \u221e'
>>>
If this makes sense, should it be added to Python's core?
Andrew
dalke at dalkescientific.com
More information about the Python-list
mailing list