codec for html/xml entities!?

Fri Apr 18 03:06:18 EDT 2008

Hi friends, I've been away from Python for quite a while now and am glad
to be back, at least as far as work permits.

Q:
What's a good way to encode and decode entities like &#8364; or
&euro;?

I need standalone functions to process lines. Looking at the xml and
sgmllib stuff, I didn't really get a clue as to what's the most Pythonic
way. Are there library functions I didn't see?

FYI, here is what I hacked together and what will probably (hopefully...)
do the job.

Feel free to comment.

# -*- coding: iso-8859-1 -*-
"""\
entity_stuff.py, mb, 2008-03-14, 2008-03-18

"""

import htmlentitydefs
import re

# Matches one entity token: '&', then one or more characters that are not
# '&', ';' or whitespace, then ';'.  Excluding those characters keeps a stray
# ampersand in running text (e.g. "a & b &euro;") from swallowing the real
# entity that follows it.  The single capture group is required so that
# RE_OBJ_entity.split() keeps the matched entities in its result list.
RE_OBJ_entity = re.compile(r'(&[^&;\s]+;)')

def entity2uc(entity):
    """Convert a single HTML/XML entity to its unicode character.

    Three forms are understood:
      * decimal character references, e.g. '&#8364;'
      * hexadecimal character references with 'x' or 'X', e.g. '&#x20ac;'
      * named entities listed in htmlentitydefs.name2codepoint, e.g. '&euro;'

    Return (result, True) on success or (input string, False)
    otherwise. Example:
        entity2uc('&#8364;')  -> (u'\u20ac',True)
        entity2uc('&#x20ac;') -> (u'\u20ac',True)
        entity2uc('&euro;')   -> (u'\u20ac',True)
        entity2uc('&foobar;') -> ('&foobar;',False)
    """
    # A well-formed entity is '&', at least one body character, then ';'.
    # Rejecting anything else up front also covers '' and '&#', which
    # would otherwise be indexed out of range.
    if len(entity) < 3:
        return entity, False
    if not (entity.startswith('&') and entity.endswith(';')):
        return entity, False

    body = entity[1:-1]
    if body.startswith('#'):
        # Numeric reference; both '&#x..;' and '&#X..;' mean hexadecimal.
        if body[1:2] in ('x', 'X'):
            digits, base = body[2:], 16
        else:
            digits, base = body[1:], 10
        try:
            codepoint = int(digits, base)
        except ValueError:
            return entity, False
    else:
        codepoint = htmlentitydefs.name2codepoint.get(body)
        if codepoint is None:
            return entity, False

    try:
        return unichr(codepoint), True
    except (ValueError, OverflowError):
        # Code point is negative or outside the range unichr() supports.
        return entity, False

def line_entities_to_uc(line):
    """Replace the entities found in *line* by their unicode characters.

    The line is split with RE_OBJ_entity; every resulting piece that
    starts with '&' is handed to entity2uc().  Pieces that entity2uc()
    cannot convert are kept unchanged and counted.

    Return (converted line as unicode, number of unconverted pieces).
    """
    pieces = RE_OBJ_entity.split(line)
    failures = 0
    for idx, piece in enumerate(pieces):
        if not piece.startswith('&'):
            continue
        converted, ok = entity2uc(piece)
        pieces[idx] = converted
        if not ok:
            failures += 1
    return u''.join(pieces), failures

def uc2entity(uc):
    """Return the entity representation of the single character *uc*.

    Code points up to 127 are returned as the plain character.  Higher
    code points become '&name;' when htmlentitydefs.codepoint2name has a
    name for them, and a hexadecimal reference '&#x...;' otherwise.
    """
    codepoint = ord(uc)
    if codepoint <= 127:
        return chr(codepoint)
    entity_name = htmlentitydefs.codepoint2name.get(codepoint)
    if not entity_name:
        return '&#x%x;' % codepoint
    return '&%s;' % entity_name

def encode_line(line):
    """Return *line* with every character passed through uc2entity().

    The per-character results are concatenated into one string.
    """
    return ''.join(map(uc2entity, line))

if 1 and __name__=="__main__":
    import codecs
    infile = 'temp.ascii.xml'
    outfile = 'temp.utf8.xml'
    of = codecs.open(outfile,'wb','utf-8')
    totalProblems = 0
    totalLines = 0
    for line in file(infile,'rb'):
        line2, cntProblems = line_entities_to_uc(line)
        of.write(line2)
        totalLines += 1
        totalProblems += cntProblems
    of.close()
    print
    print "Summary:"
    print "  Infile : %s" % (infile,)
    print "  Outfile: %s" % (outfile,)
    print '  %8d %s %s' % (totalLines,
['lines','line'][totalLines==1], 'written.')
    print '  %8d %s %s' % (totalProblems,
['entities','entity'][totalProblems==1], 'left unconverted.')
    print '%s' % ('Done.',)

Have a nice day and
ru, Martin
(read you, ;-)