RE Engine error with sub()

Maurice LING mauriceling at acm.org
Fri Apr 15 01:01:09 EDT 2005


Hi,

I have the following codes:

from __future__ import nested_scopes
import re
from UserDict import UserDict


class Replacer(UserDict):
     """
     An all-in-one multiple string substitution class. This class was 
contributed by Xavier
     Defrang to the ASPN Python Cookbook 
(http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/81330)
     and alane at sourceforge.net.

     Copyright: The methods _make_regex(), __call__() and substitute() 
were the work of Xavier Defrang,
     __init__() was the work of alane at sourceforge.net, all others were 
the work of Maurice Ling"""

     def __init__(self, dict = None, file = None):
         """Constructor. It calls for the compilation of regular 
expressions from either
         a dictionary object or a replacement rule file.

         @param dict: dictionary object containing replacement rules 
with the string to be
                         replaced as keys.
         @param file: file name of replacement rule file
         """
         self.re = None
         self.regex = None
         if file == None:
             UserDict.__init__(self, dict)
             self._make_regex()
         else:
             UserDict.__init__(self, self.readDictionaryFile(file))
             self._make_regex()

     def cleanDictionaryFile(self, file):
         """
         Method to clean up the replacement rule dictionary file and 
write the cleaned
         file as the same name as the original file."""
         import os
         dict = self.readDictionaryFile(file)
         f = open(file, 'w')
         for key in dict.keys(): f.write(str(key) + '=' + str(dict[key]) 
+ os.linesep)
         f.close()

     def readDictionaryFile(self, file):
         """
         Method to parse a replacement rule file (file) into a 
dictionary for regular
         expression processing. Each rule in the rule file is in the form:
             <string to be replaced>=<string to replace with>
             """
         import string
         import os
         f = open(file, 'r')
         data = f.readlines()
         f.close()
         dict = {}
         for rule in data:
             rule = rule.split('=')
             if rule[1][-1] == os.linesep: rule[1] = rule[1][:-1]
             dict[str(rule[0])] = str(rule[1])
         print '%s replacement rule(s) read from %s' % 
(str(len(dict.keys())), str(file))
         return dict

     def _make_regex(self):
         """ Build a regular expression object based on the keys of the 
current dictionary """
         self.re = "(%s)" % "|".join(map(re.escape, self.keys()))
         self.regex = re.compile(self.re)

     def __call__(self, mo):
         """ This handler will be invoked for each regex match """
	    # Count substitutions
         self.count += 1 # Look-up string
         return self[mo.string[mo.start():mo.end()]]

     def substitute(self, text):
         """ Translate text, returns the modified text. """
         # Reset substitution counter
         self.count = 0
         # Process text
         #return self._make_regex().sub(self, text)
         return self.regex.sub(self, text)

     def rmBracketDuplicate(self, text):
         """Removes the bracketed text in occurrences of '<text-x> 
(<text-x>)'"""
         regex = re.compile(r'(\w+)\s*(\(\1\))')
         return regex.sub(r'\1', text)

     def substituteMultiple(self, text):
         """Similar to substitute() method except that this method loops 
round the same text
         multiple times until no more substitutions can be made or when 
it had looped
         10 times. This is to pre-ampt for cases of recursive 
abbreviations."""
         count = 1 # to get into the loop
         run = 0 # counter for number of runs thru the text
         while count > 0 and run < 10:
             count = 0
             text = self.rmBracketDuplicate(self.substitute(text))
             count = count + self.count
             run = run + 1
             print "Pass %d: Changed %d things(s)" % (run, count)
         return text




Normally I will use the following to instantiate my module:

replace = Replacer('', 'rule.mdf')

rule.mdf is in the format of "<string to be replaced>=<string to replace 
with>\n"

Then using replace.substituteMultiple('<my text>') to carry out multiple 
replacements.

It all works well for rule count up to 800+ but when my replacement 
rules swells up to 1800+, it gives me a runtime error that says 
"Internal error in regular expression engine"... traceable to "return 
self.regex.sub(self, text)" in substitute() method.


Any ideas or workarounds?

Thanks in advance.

Cheers,
Maurice



More information about the Python-list mailing list