MultiReplace (performance and unordered dicts)

Joseph Reagle reagle at mit.edu
Thu Jun 4 17:30:58 EDT 2009


I have programs that do lots of string-to-string replacements, so I'm trying
to create a speedy implementation (tons of .replace statements have become
unwieldy). My MultiReplace object performs as well as the function-based
regexp version, and both do better than the for-loop function. Any other
suggestions?

def multi_repl(text, subs):
    """Apply each (original, replacement) pair in *subs* to *text* in turn.

    The pairs are applied sequentially, so the output of one replacement is
    the input of the next; a later pair can therefore rewrite text produced
    by an earlier one.  Returns the resulting string.
    """
    result = text
    for pair in subs:
        original, replacement = pair
        result = result.replace(original, replacement)
    return result


import string
# Lookup table used by the substitution callback: original -> replacement.
# NOTE(review): latex_esc is defined outside this excerpt; its use here
# (dict(...) and zip(*...)) implies a sequence of (original, replacement)
# pairs -- confirm against its definition.
latex_esc_dic = dict(latex_esc)
latex_esc_ori, latex_esc_rep = zip(*latex_esc)


def symbol_replace(match, get=latex_esc_dic.get):
    """Return the replacement for the text captured in group 1 of *match*.

    *get* is bound to ``latex_esc_dic.get`` as a default argument so the
    lookup function is resolved once, at definition time.  Text with no
    entry in the table is replaced by the empty string.
    """
    return get(match.group(1), "")


# A single capturing alternation of every original, each escaped so it is
# matched literally.  str.join replaces the deprecated string.join function,
# which does not exist in Python 3; the resulting pattern is identical.
symbol_pattern = re.compile(
    "(" + "|".join(map(re.escape, latex_esc_ori)) + ")"
)

class MultiReplace(object):
    """
    Replace multiple strings in one regex pass, from a list of ori/rep pairs.

    An object is used for performance: the compiled regex persists between
    calls.  The table is a list of pairs rather than a dict so that the
    caller controls the order of the alternatives; when several originals
    could match at the same position, the one listed first is used.  The
    pairs are also converted to a dict for the replacement lookup, so if an
    original appears more than once the last pair supplies its replacement.
    """

    def __init__(self, table):
        """Compile the pattern and build the lookup dict from *table*.

        *table* is an iterable of (original, replacement) string pairs.
        """
        # Parenthesised so the line parses under both Python 2 and Python 3;
        # the text written to stdout is unchanged.
        print("initing object")
        # Materialise once so a one-shot iterable can feed both zip() and
        # dict() below.
        table = list(table)
        if table:
            self.originals, self.replacements = zip(*table)
        else:
            # zip(*[]) yields nothing to unpack; an empty table simply
            # means there is nothing to replace.
            self.originals, self.replacements = (), ()
        # One capturing alternation of all originals, each escaped so it is
        # matched literally.  str.join replaces the deprecated string.join.
        self.pattern = re.compile(
            "(" + "|".join(map(re.escape, self.originals)) + ")"
        )
        self.table_dic = dict(table)

    def _get_replacement(self, match):
        """Return the replacement for the text captured in group 1 of *match*."""
        return self.table_dic.get(match.group(1), "")

    def replace(self, line):
        """Return *line* with every occurrence of an original replaced."""
        # The bound method is the callback that sub() calls for each match.
        return self.pattern.sub(self._get_replacement, line)
# Build the replacer once so its compiled pattern is reused for every line.
mr = MultiReplace(latex_esc)

...

# The three alternatives that were timed; only the last one is active.  The
# trailing numbers are the author's measurements (units are not stated here).
#line = multi_repl(line, latex_esc) # 0.406
#line = symbol_pattern.sub(symbol_replace, line) #0.385
line = mr.replace(line) #0.385





More information about the Python-list mailing list