Wanted: safe codec for filenames

Wed Sep 5 18:20:45 EDT 2007

Hallöchen!

Torsten Bronger writes:

> I'd like to map general Unicode strings to safe filenames.  I tried
> punycode, but it is case-sensitive, which Windows is not.  Thus,
> "Hallo" and "hallo" are mapped to "Hallo-" and "hallo-"; however,
> I need uppercase Latin letters to be encoded, too, and the
> encoding must contain only lowercase Latin letters, numbers,
> underscores, and maybe a little bit more.  The result should be
> more legible than base64, though.
>
> Has anybody created such a codec already?

Okay, the following works fine for me:

--8<---------------cut here---------------start------------->8---
import codecs

class Codec(codecs.Codec):
    """Codec class for safe filenames.  Safe filenames work on all important
    filesystems, i.e., they don't contain special or dangerous characters, and
    they don't assume that filenames are treated case-sensitively.

        >>> u"hallo".encode("safefilename")
        'hallo'
        >>> u"Hallo".encode("safefilename")
        '(h)allo'
        >>> u"MIT Thesis".encode("safefilename")
        '(mit)_(t)hesis'
        >>> u"Gesch\\u00e4ftsbrief".encode("safefilename")
        '(g)esch{e4}ftsbrief'

    Of course, the mapping works in both directions as expected:

        >>> "(g)esch{e4}ftsbrief".decode("safefilename")
        u'Gesch\\xe4ftsbrief'
        >>> "(mit)_(t)hesis".decode("safefilename")
        u'MIT Thesis'

    The encoding rules are:

    * characters listed in ``safe_characters`` are copied unchanged;
    * a space becomes an underscore;
    * a run of uppercase ASCII letters is lowercased and wrapped in
      parentheses;
    * every other character is written as its code point in lowercase
      hexadecimal, wrapped in curly brackets.

    """
    lowercase_letters = "abcdefghijklmnopqrstuvwxyz"
    safe_characters = lowercase_letters + "0123456789-+!$%&`'@~#.,^"
    uppercase_letters = lowercase_letters.upper()

    def encode(self, input, errors='strict'):
        """Convert Unicode strings to safe filenames.

        `errors` is accepted for interface compatibility only; every
        character has an encoding, so it is never consulted.

        Returns a tuple ``(encoded_string, number_of_characters_consumed)``.
        """
        pieces = []
        i = 0
        input_length = len(input)
        while i < input_length:
            c = input[i]
            if c in self.safe_characters:
                pieces.append(str(c))
            elif c == " ":
                pieces.append("_")
            elif c in self.uppercase_letters:
                # Collect the whole run of uppercase letters so that it shares
                # a single pair of parentheses.
                run_start = i
                while i < input_length and input[i] in self.uppercase_letters:
                    i += 1
                pieces.append("(" + str(input[run_start:i]).lower() + ")")
                # `i` already points past the run; skip the increment below.
                continue
            else:
                # hex() yields "0x..."; drop the prefix.
                pieces.append("{" + hex(ord(c))[2:] + "}")
            i += 1
        return "".join(pieces), input_length

    def handle_problematic_characters(self, errors, input, start, end, message):
        """Apply the error policy `errors` to a malformed part of `input`.

        Returns the replacement text to insert into the decoded output: an
        empty string for ``'ignore'`` and ``u"?"`` for ``'replace'``.  For any
        other policy a `UnicodeDecodeError` covering ``input[start:end]`` is
        raised with `message` as its reason.
        """
        if errors == 'ignore':
            return u""
        elif errors == 'replace':
            return u"?"
        else:
            raise UnicodeDecodeError("safefilename", input, start, end, message)

    def decode(self, input, errors='strict'):
        """Convert safe filenames to Unicode strings.

        Malformed input is passed to `handle_problematic_characters`, so
        `errors` may be ``'strict'`` (raise), ``'ignore'`` (drop) or
        ``'replace'`` (insert ``?``).

        Returns a tuple ``(decoded_string, number_of_characters_consumed)``.
        """
        input = str(input)
        input_length = len(input)
        hex_digits = "0123456789abcdef"
        try:
            to_character = unichr
        except NameError:
            # No unichr builtin available; chr covers the full code point
            # range on such interpreters.
            to_character = chr
        pieces = []
        i = 0
        while i < input_length:
            c = input[i]
            if c in self.safe_characters:
                pieces.append(c)
            elif c == "_":
                pieces.append(" ")
            elif c == "(":
                i += 1
                while i < input_length and input[i] in self.lowercase_letters:
                    pieces.append(input[i].upper())
                    i += 1
                if i == input_length:
                    # The replacement text must end up in the output, exactly
                    # as for every other kind of malformed input.
                    pieces.append(self.handle_problematic_characters(
                        errors, input, i-1, i, "open parenthesis was never closed"))
                    continue
                if input[i] != ')':
                    pieces.append(self.handle_problematic_characters(
                        errors, input, i, i+1,
                        "invalid character '%s' in parentheses sequence" % input[i]))
                    # `i` is not advanced: the offending character is examined
                    # again as ordinary input on the next iteration.
                    continue
            elif c == "{":
                end_position = input.find("}", i)
                if end_position == -1:
                    # No closing bracket anywhere: treat the bracket plus up
                    # to eight following hex digits as the malformed span and
                    # resume decoding right after it.
                    end_position = i+1
                    while end_position < input_length and input[end_position] in hex_digits and \
                            end_position - i <= 8:
                        end_position += 1
                    pieces.append(self.handle_problematic_characters(
                        errors, input, i, end_position, "open bracket was never closed"))
                    i = end_position
                    continue
                try:
                    pieces.append(to_character(int(input[i+1:end_position], 16)))
                except (ValueError, OverflowError):
                    # Not a hexadecimal number, or not a valid code point.
                    pieces.append(self.handle_problematic_characters(
                        errors, input, i, end_position+1, "invalid data between brackets"))
                i = end_position
            else:
                pieces.append(self.handle_problematic_characters(
                    errors, input, i, i+1, "invalid character '%s'" % c))
            i += 1
        return u"".join(pieces), input_length

class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer whose encoding step is supplied by `Codec`."""

class StreamReader(Codec, codecs.StreamReader):
    """Stream reader whose decoding step is supplied by `Codec`."""

def _registry(encoding):
    if encoding == "safefilename":
        return (Codec().encode, Codec().decode, StreamReader, StreamWriter)
    else:
        return None

# Install the search function so that the "safefilename" encoding can be
# looked up through the codecs machinery.
codecs.register(_registry)

# When executed as a script, run the doctests embedded in this module's
# docstrings.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
--8<---------------cut here---------------end--------------->8---

-- 
Torsten Bronger, aquisgrana, europa vetus
                                      Jabber ID: bronger at jabber.org
                      (See http://ime.webhop.org for ICQ, MSN, etc.)