Htmlizing text

Mon Nov 29 16:49:24 EST 1999

> Is there a function in the standard Python library to HTML-ize text,
> i.e. to replace 'a > b & c < d' with 'a &gt; b &amp; c &lt; d'?

This is the one I use (look for text2html below).  It does a little more
than what you asked for, such as turning URLs into clickable links,
rendering *word* in italics, etc.

(and yes, it's extracted almost verbatim from the faqwizard :-)

-- bjorn

import string
import re

# Matches http/ftp/https URLs and bare e-mail addresses.  Compiled once at
# import time instead of on every call to translate().
translate_prog = re.compile(
    r'\b(http|ftp|https)://\S+(\b|/)|\b[-.\w]+@[-.\w]+')


def translate(text, pre=0):
    """Escape *text* for HTML, turning URLs and e-mail addresses into links.

    text -- a piece of plain text (typically a single line).
    pre  -- true if the text sits inside a <PRE> block; URLs and addresses
            are then escaped like the rest of the text but not linked.

    Returns the resulting HTML fragment as a string.  Everything that is
    not part of a link is passed through escape().
    """
    parts = []
    pos = 0
    while True:
        # Resume searching right after the previous (trimmed) URL rather
        # than after the raw match, so trimmed punctuation is re-scanned.
        m = translate_prog.search(text, pos)
        if m is None:
            break
        start = m.start()
        parts.append(escape(text[pos:start]))
        # Trailing punctuation almost always belongs to the surrounding
        # sentence, not to the URL or address itself.
        url = m.group(0).rstrip('();:,.?\'"<>')
        pos = start + len(url)
        label = escape(url)
        if pre:
            parts.append(label)
        else:
            # The target is placed inside a double-quoted attribute, so a
            # '"' embedded in the URL must not be able to terminate it.
            href = label.replace('"', '&quot;')
            if ':' in url:
                parts.append('<A HREF="%s">%s</A>' % (href, label))
            else:
                # No scheme, so this is an e-mail address; display it in
                # (escaped) angle brackets.
                parts.append(
                    '<A HREF="mailto:%s">&lt;%s&gt;</A>' % (href, label))
    parts.append(escape(text[pos:]))
    return ''.join(parts)

def escape(s):
    """Return *s* with '&', '<' and '>' replaced by HTML entities.

    The result is safe to use as HTML element content.  Quotes are left
    alone; use escapeq() for text that goes inside an attribute value.
    """
    # '&' has to be handled first, otherwise the ampersands introduced by
    # the other two replacements would be escaped a second time.
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s

def escapeq(s):
    """Return *s* escaped for use inside a double-quoted HTML attribute.

    Applies escape() and additionally replaces '"' with '&quot;'.
    """
    s = escape(s)
    s = s.replace('"', '&quot;')
    return s

def emphasize(line):
    """Return *line* with each *word* rewritten as <I>word</I>.

    Only runs of ASCII letters enclosed in a pair of asterisks are
    affected; the asterisks themselves are dropped.
    """
    starred_word = re.compile(r'\*([a-zA-Z]+)\*')
    italic_markup = r'<I>\1</I>'
    return starred_word.sub(italic_markup, line)

def text2html(body):
    """Convert the plain-text string *body* to an HTML fragment.

    The text is processed line by line:

    - Lines between a line reading <html> and a line reading </html>
      (case-insensitive, trailing whitespace ignored) are copied through
      untouched; the two marker lines themselves are dropped.
    - A blank line closes an open <PRE> block, or else emits <P>.
    - Lines that start with whitespace are wrapped in <PRE>...</PRE>.
    - Lines containing '/' or '@' go through translate(); other lines
      containing '<' or '&' go through escape().
    - Outside <PRE>, lines containing '*' go through emphasize().

    Returns the output lines joined with newlines.
    """
    res = []
    pre = 0  # true while a <PRE> block is open
    raw = 0  # true while between <html> and </html> marker lines
    for line in body.split('\n'):
        tag = line.rstrip().lower()
        if tag == '<html>':
            raw = 1
            continue
        if tag == '</html>':
            raw = 0
            continue
        if raw:
            res.append(line)
            continue
        if not line.strip():
            if pre:
                res.append('</PRE>')
                pre = 0
            else:
                res.append('<P>')
            continue
        # The line is non-blank here, so line[0] is safe.
        if line[0] not in string.whitespace:
            if pre:
                res.append('</PRE>')
                pre = 0
        elif not pre:
            res.append('<PRE>')
            pre = 1
        if '/' in line or '@' in line:
            # Might contain a URL or an e-mail address; translate() also
            # escapes everything that is not part of a link.
            line = translate(line, pre)
        elif '<' in line or '&' in line:
            line = escape(line)
        if not pre and '*' in line:
            line = emphasize(line)
        res.append(line)
    if pre:
        res.append('</PRE>')
    # Join with newlines, not spaces: line breaks are significant inside
    # <PRE> blocks and may be inside raw HTML as well.
    return '\n'.join(res)