encoding problems (é and è)

Fri Mar 24 14:37:03 EST 2006

John Machin wrote:

> Some of the transformations are a little unfortunate :-(

here's a slightly silly way to map a unicode string to its "unaccented"
version:

###

import unicodedata, sys

# Replacement strings for Latin-1 letters that have no Unicode
# decomposition and therefore cannot be "unaccented" by stripping
# combining marks.  Keys are code points, values are the ASCII stand-ins.
CHAR_REPLACEMENT = {
    0xc6: u"AE", # LATIN CAPITAL LETTER AE
    0xd0: u"D",  # LATIN CAPITAL LETTER ETH
    0xd8: u"OE", # LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th", # LATIN CAPITAL LETTER THORN
    0xdf: u"ss", # LATIN SMALL LETTER SHARP S
    0xe6: u"ae", # LATIN SMALL LETTER AE
    0xf0: u"d",  # LATIN SMALL LETTER ETH
    0xf8: u"oe", # LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th", # LATIN SMALL LETTER THORN
    }

# unichr() only exists on Python 2; on Python 3 chr() already returns a
# unicode character, so alias whichever one is available.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

class unaccented_map(dict):
    """Lazy translation table mapping code points to unaccented text.

    Intended to be passed to the ``translate`` method of a unicode
    string.  Entries are computed the first time a code point is looked
    up and then cached in the dict itself.
    """

    def mapchar(self, key):
        """Return the unaccented replacement for code point ``key``.

        The canonical decomposition is followed down to the base
        character (so multiply-accented characters are fully stripped),
        then CHAR_REPLACEMENT is consulted for base letters that have no
        decomposition of their own.  Characters with neither are
        returned unchanged.  The result is cached under ``key``.
        """
        cached = self.get(key)
        if cached is not None:
            return cached
        base = key
        ch = _unichr(key)
        while True:
            fields = unicodedata.decomposition(ch).split()
            if not fields:
                break  # no (further) decomposition: ch is the base
            try:
                code = int(fields[0], 16)
                decomposed = _unichr(code)
            except ValueError:
                # either a compatibility tag such as "<compat>" instead
                # of a hex code point, or a code point unichr() cannot
                # represent on this build; keep what we have so far
                break
            base, ch = code, decomposed
        ch = CHAR_REPLACEMENT.get(base, ch)
        # uncomment the following line if you want to remove remaining
        # non-ascii characters
        # if ch >= u"\x80": return None
        self[key] = ch
        return ch

    # dict.__missing__ is only honoured from Python 2.5 on; older
    # versions need __getitem__ overridden instead.  Compare the
    # version_info tuple: comparing sys.version as a string is
    # lexicographic and therefore unreliable.
    if sys.version_info >= (2, 5):
        __missing__ = mapchar
    else:
        __getitem__ = mapchar

# NOTE: `mystring` is not defined in this snippet; it stands for the
# caller's text, which must already be a unicode object (not a byte
# string) for the translate() call below to accept the mapping.
assert isinstance(mystring, unicode)

# translate() looks up each character's code point in the map, so every
# character goes through unaccented_map before the result is printed.
print mystring.translate(unaccented_map())

###

if the source string is not unicode, you can use something like

    # decode the byte string to unicode, treating the bytes as ISO-8859-1
    s = mystring.decode("iso-8859-1")
    # replace accented characters via the lazy mapping defined above
    s = s.translate(unaccented_map())
    # encode back to bytes; "ignore" drops anything still outside ASCII
    s = s.encode("ascii", "ignore")

(this works well for characters in the latin-1 range, at least.  no
guarantees for other character ranges)

</F>