RE Engine error with sub()
André Søreng
andreis at stud.cs.uit.no
Fri Apr 15 04:15:47 EDT 2005
Instead of using regular expressions, you could perhaps
use a multiple keyword matcher, and then for each match,
replace it with the correct string.
http://hkn.eecs.berkeley.edu/~dyoo/python/ahocorasick/
contains the Aho-Corasick algorithm written in C with
a Python extension.
Maurice LING wrote:
> Hi,
>
> I have the following codes:
>
> from __future__ import nested_scopes
> import re
> from UserDict import UserDict
>
>
> class Replacer(UserDict):
> """
> An all-in-one multiple string substitution class. This class was
> contributed by Xavier
> Defrang to the ASPN Python Cookbook
> (http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/81330)
> and alane at sourceforge.net.
>
> Copyright: The methods _make_regex(), __call__() and substitute()
> were the work of Xavier Defrang,
> __init__() was the work of alane at sourceforge.net, all others were
> the work of Maurice Ling"""
>
> def __init__(self, dict = None, file = None):
> """Constructor. It calls for the compilation of regular
> expressions from either
> a dictionary object or a replacement rule file.
>
> @param dict: dictionary object containing replacement rules with
> the string to be
> replaced as keys.
> @param file: file name of replacement rule file
> """
> self.re = None
> self.regex = None
> if file == None:
> UserDict.__init__(self, dict)
> self._make_regex()
> else:
> UserDict.__init__(self, self.readDictionaryFile(file))
> self._make_regex()
>
> def cleanDictionaryFile(self, file):
> """
> Method to clean up the replacement rule dictionary file and
> write the cleaned
> file as the same name as the original file."""
> import os
> dict = self.readDictionaryFile(file)
> f = open(file, 'w')
> for key in dict.keys(): f.write(str(key) + '=' + str(dict[key])
> + os.linesep)
> f.close()
>
> def readDictionaryFile(self, file):
> """
> Method to parse a replacement rule file (file) into a dictionary
> for regular
> expression processing. Each rule in the rule file is in the form:
> <string to be replaced>=<string to replace with>
> """
> import string
> import os
> f = open(file, 'r')
> data = f.readlines()
> f.close()
> dict = {}
> for rule in data:
> rule = rule.split('=')
> if rule[1][-1] == os.linesep: rule[1] = rule[1][:-1]
> dict[str(rule[0])] = str(rule[1])
> print '%s replacement rule(s) read from %s' %
> (str(len(dict.keys())), str(file))
> return dict
>
> def _make_regex(self):
> """ Build a regular expression object based on the keys of the
> current dictionary """
> self.re = "(%s)" % "|".join(map(re.escape, self.keys()))
> self.regex = re.compile(self.re)
>
> def __call__(self, mo):
> """ This handler will be invoked for each regex match """
> # Count substitutions
> self.count += 1 # Look-up string
> return self[mo.string[mo.start():mo.end()]]
>
> def substitute(self, text):
> """ Translate text, returns the modified text. """
> # Reset substitution counter
> self.count = 0
> # Process text
> #return self._make_regex().sub(self, text)
> return self.regex.sub(self, text)
>
> def rmBracketDuplicate(self, text):
> """Removes the bracketed text in occurrences of '<text-x>
> (<text-x>)'"""
> regex = re.compile(r'(\w+)\s*(\(\1\))')
> return regex.sub(r'\1', text)
>
> def substituteMultiple(self, text):
> """Similar to substitute() method except that this method loops
> round the same text
> multiple times until no more substitutions can be made or when
> it had looped
> 10 times. This is to pre-ampt for cases of recursive
> abbreviations."""
> count = 1 # to get into the loop
> run = 0 # counter for number of runs thru the text
> while count > 0 and run < 10:
> count = 0
> text = self.rmBracketDuplicate(self.substitute(text))
> count = count + self.count
> run = run + 1
> print "Pass %d: Changed %d things(s)" % (run, count)
> return text
>
>
>
>
> Normally I will use the following to instantiate my module:
>
> replace = Replacer('', 'rule.mdf')
>
> rule.mdf is in the format of "<string to be replaced>=<string to replace
> with>\n"
>
> Then using replace.substituteMultiple('<my text>') to carry out multiple
> replacements.
>
> It all works well for rule count up to 800+ but when my replacement
> rules swells up to 1800+, it gives me a runtime error that says
> "Internal error in regular expression engine"... traceable to "return
> self.regex.sub(self, text)" in substitute() method.
>
>
> Any ideas or workarounds?
>
> Thanks in advance.
>
> Cheers,
> Maurice
More information about the Python-list
mailing list