unicode filenames

Tue Feb 4 01:55:32 EST 2003

Me:
>>   if os.path.supports_unicode_filenames:
>>     cwd = os.getcwdu()
>>   else:
>>     encoding = .. get default filesystem encoding ... or 'latin-1'
>>     cwd = unicode(os.getcwd(), encoding)
>>
>>Ugly .. quite ugly.  And suggestions on the proper way to
>>handle this is not documented as far as I can find.

Neil Hodgson wrote:
>    Yes, it is ugly but I don't know how to handle this well on Unix. In my
> above example there is one partition mounted in UTF-8 mode but other
> partitions could be using other encodings. I imagine there is some way to
> reach the mount options for a given directory...

Okay, so it seems like no one knows how to handle unicode filenames
under Unix.  Perhaps the following is the proper behaviour?

   1) there is a default filesystem encoding, which is initialized
       to None if os.path.supports_unicode_filenames is True, otherwise
       it's set to sys.getdefaultencoding()

   2) there is a registration system which is used to define encodings
       used for different mount locations.  If a filename/dirname is
       not covered, use the default filesystem encoding

   3) a) when the input dirname or filename is a string, use the
        current behaviour
      b) when unicode, use the encoding from 2 (may have to get
        the absolute path name  ... don't like this part of it.
        Perhaps the call to #2 should only be done for full paths?)

Here's an example implementation for listdir and getcwdu

# Explicit override for the default filesystem encoding; None means
# "no override installed".
_filesystem_encoding = None

def get_default_filesystem_encoding():
   """Return the encoding to assume when no specific one applies.

   A non-None module-level override takes precedence.  Without one,
   the answer is None when os.path.supports_unicode_filenames is true,
   and sys.getdefaultencoding() otherwise.
   """
   if _filesystem_encoding is not None:
     return _filesystem_encoding
   if os.path.supports_unicode_filenames:
     return None
   return sys.getdefaultencoding()

def set_default_filesystem_encoding(encoding):
   """Store *encoding* as the module-wide default filesystem encoding.

   The value is written to the module global _filesystem_encoding
   without any validation.
   """
   global _filesystem_encoding
   _filesystem_encoding = encoding

# This is used if different mount points have different
# encodings.  See below for how to use it
class FilesystemEncoding:
   """Registry of filename encodings keyed by directory.

   An encoding registered for a directory applies to that directory
   and to every path beneath it, unless a deeper directory has its own
   registration.  Paths covered by no registration get the result of
   get_default_filesystem_encoding().
   """
   def __init__(self):
     # Maps a directory name (one trailing separator removed) to the
     # encoding registered for it.
     self.data = {}

   def __setitem__(self, dirname, encoding):
     """Register *encoding* for *dirname* and everything below it."""
     # Drop one trailing separator so "/mnt/x/" and "/mnt/x" share a key.
     # A root directory is kept as-is: os.path.dirname() maps it to
     # itself, and that unchanged string is the last key lookup() tries.
     # Stripping "/" down to "" would make the entry unreachable.
     if dirname.endswith(os.sep) and os.path.dirname(dirname) != dirname:
       dirname = dirname[:-len(os.sep)]
     self.data[dirname] = encoding

   def lookup(self, name):
     """Return the encoding registered for *name* or its nearest ancestor.

     Walks upward with os.path.dirname() until a registered directory
     is found.  When the name becomes empty or stops changing (the top
     of the tree), returns get_default_filesystem_encoding() instead.
     """
     while name:
       if name in self.data:
         return self.data[name]
       parent = os.path.dirname(name)
       if parent == name:
         # dirname() reached its fixed point: nothing left to try.
         break
       name = parent
     return get_default_filesystem_encoding()

filesystem_encodings = FilesystemEncoding()
   ....

 >>> filesystem_encodings["/home/dalke"] = "utf8"
 >>> filesystem_encodings["/"] = "latin-1"
 >>> filesystem_encodings.lookup("/home/dalke/test.txt")
'utf8'
 >>> filesystem_encodings.lookup("/home/spam")
'ascii'
 >>>

   ....

def listdir(dirname):
   """List a directory, returning unicode names for a unicode argument.

   A non-unicode *dirname* is passed straight to os.listdir().  For a
   unicode *dirname* the encoding is looked up in filesystem_encodings
   using the absolute path.  If that encoding is None, os.listdir() is
   called with the argument unchanged.  Otherwise the directory name is
   encoded for the call and each returned entry is decoded with the
   same encoding.
   """
   encoding = None
   if isinstance(dirname, unicode):
     encoding = filesystem_encodings.lookup(os.path.abspath(dirname))
   if encoding is None:
     # Either a plain string argument or no encoding to apply.
     return os.listdir(dirname)
   entries = os.listdir(dirname.encode(encoding))
   return [unicode(entry, encoding) for entry in entries]

def getcwdu():
   """Return the current working directory as a unicode string.

   When os.path.supports_unicode_filenames is true, this is simply
   os.getcwdu().  Otherwise the byte-string result of os.getcwd() is
   decoded with the encoding that filesystem_encodings reports for
   that directory.  If that encoding is None, os.getcwdu() is used,
   just as listdir() falls back to os.listdir() for a None encoding.
   """
   if not os.path.supports_unicode_filenames:
     cwd = os.getcwd()
     encoding = filesystem_encodings.lookup(cwd)
     if encoding is not None:
       return unicode(cwd, encoding)
   # getcwdu lives in the os module itself, not in os.path.
   return os.getcwdu()

   ...

 >>> os.path.abspath(".")
'/home/dalke/tmp'
 >>> os.path.abspath(u".")
u'/home/dalke/tmp'
 >>>
 >>> s = u"1 to \N{INFINITY}"
 >>> s.encode("utf8")
'1 to \xe2\x88\x9e'
 >>> t = s.encode("utf8")
 >>> os.mkdir(t)
 >>> os.listdir(".")
['1 to \xe2\x88\x9e']
 >>> os.listdir(u".")
['1 to \xe2\x88\x9e']
 >>>
 >>> listdir(".")
['1 to \xe2\x88\x9e']
 >>> listdir(u".")
['1 to \u221e']
 >>>

 >>> os.chdir(t)
 >>> os.getcwdu()
Traceback (most recent call last):
   File "<stdin>", line 1, in ?
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 39: 
ordinal not in range(128)
 >>> getcwdu()
u'/home/dalke/tmp/1 to \u221e'
 >>>

If this makes sense, should it be added to Python's core?

					Andrew
					dalke at dalkescientific.com