[Python-checkins] r83875 - in python/branches/py3k: Lib/re.py Lib/test/test_re.py Misc/NEWS

raymond.hettinger python-checkins at python.org
Mon Aug 9 07:56:50 CEST 2010


Author: raymond.hettinger
Date: Mon Aug  9 07:56:50 2010
New Revision: 83875

Log:
Revert 83784 adding functools.lru_cache() to the re module.

The problem is that the re module is imported by sysconfig
and re needs functools which uses collections.OrderedDict()
but the _collectionsmodule.c code is not yet constructed
at this point in the build.

The likely best solution will be to include _collections
as part of the static build before the rest of the
boot-strapping.




Modified:
   python/branches/py3k/Lib/re.py
   python/branches/py3k/Lib/test/test_re.py
   python/branches/py3k/Misc/NEWS

Modified: python/branches/py3k/Lib/re.py
==============================================================================
--- python/branches/py3k/Lib/re.py	(original)
+++ python/branches/py3k/Lib/re.py	Mon Aug  9 07:56:50 2010
@@ -118,7 +118,6 @@
 import sys
 import sre_compile
 import sre_parse
-import functools
 
 # public symbols
 __all__ = [ "match", "search", "sub", "subn", "split", "findall",
@@ -206,9 +205,9 @@
     return _compile(pattern, flags)
 
 def purge():
-    "Clear the regular expression caches"
-    _compile_typed.clear()
-    _compile_repl.clear()
+    "Clear the regular expression cache"
+    _cache.clear()
+    _cache_repl.clear()
 
 def template(pattern, flags=0):
     "Compile a template pattern, returning a pattern object"
@@ -290,12 +289,12 @@
             # Ignore problems if the cache changed from another thread.
             pass
 
-def _compile(*args):
-    return _compile_typed(type(args[0]), *args)
-
- at functools.lru_cache(maxsize=_MAXCACHE)
-def _compile_typed(type, *key):
+def _compile(*key):
     # internal: compile pattern
+    cachekey = (type(key[0]),) + key
+    p = _cache.get(cachekey)
+    if p is not None:
+        return p
     pattern, flags = key
     if isinstance(pattern, _pattern_type):
         if flags:
@@ -304,14 +303,23 @@
         return pattern
     if not sre_compile.isstring(pattern):
         raise TypeError("first argument must be string or compiled pattern")
-    return sre_compile.compile(pattern, flags)
+    p = sre_compile.compile(pattern, flags)
+    if len(_cache) >= _MAXCACHE:
+        _shrink_cache(_cache, _MAXCACHE)
+    _cache[cachekey] = p
     return p
 
- at functools.lru_cache(maxsize=_MAXCACHE)
 def _compile_repl(*key):
     # internal: compile replacement pattern
+    p = _cache_repl.get(key)
+    if p is not None:
+        return p
     repl, pattern = key
-    return sre_parse.parse_template(repl, pattern)
+    p = sre_parse.parse_template(repl, pattern)
+    if len(_cache_repl) >= _MAXCACHE:
+        _shrink_cache(_cache_repl, _MAXCACHE)
+    _cache_repl[key] = p
+    return p
 
 def _expand(pattern, match, template):
     # internal: match.expand implementation hook

Modified: python/branches/py3k/Lib/test/test_re.py
==============================================================================
--- python/branches/py3k/Lib/test/test_re.py	(original)
+++ python/branches/py3k/Lib/test/test_re.py	Mon Aug  9 07:56:50 2010
@@ -875,8 +875,70 @@
                     print('=== Fails on unicode-sensitive match', t)
 
 
+class ReCacheTests(unittest.TestCase):
+    """These tests are specific to the re._shrink_cache implementation."""
+
+    def setUp(self):
+        self._orig_maxcache = re._MAXCACHE
+
+    def tearDown(self):
+        re._MAXCACHE = self._orig_maxcache
+
+    def test_compile_cache_overflow(self):
+        # NOTE: If a profiler or debugger is tracing code and compiling
+        # regular expressions while tracing through this test... expect
+        # the test to fail.  This test is not concurrency safe.
+
+        # Explicitly fill the caches.
+        re._MAXCACHE = 20
+        max_cache = re._MAXCACHE
+        unique_chars = tuple(chr(char_num) for char_num in
+                             range(b'a'[0], b'a'[0]+max_cache))
+        re._cache.clear()
+        for char in unique_chars:
+            re._compile(char, 0)
+        self.assertEqual(max_cache, len(re._cache))
+        re._cache_repl.clear()
+        for char in unique_chars:
+            re._compile_repl(char*2, char)
+        self.assertEqual(max_cache, len(re._cache_repl))
+
+        # Overflow both caches and make sure they have extra room left
+        # afterwards as well as having more than a single entry.
+        re._compile('A', 0)
+        self.assertLess(len(re._cache), max_cache)
+        self.assertGreater(len(re._cache), 1)
+        re._compile_repl('A', 'A')
+        self.assertLess(len(re._cache_repl), max_cache)
+        self.assertGreater(len(re._cache_repl), 1)
+
+    def test_shrink_cache_at_limit(self):
+        cache = dict(zip(range(6), range(6)))
+        re._shrink_cache(cache, 6, divisor=3)
+        self.assertEqual(4, len(cache))
+
+    def test_shrink_cache_empty(self):
+        cache = {}
+        re._shrink_cache(cache, 6, divisor=3)
+        # Cache was empty, make sure we didn't raise an exception.
+        self.assertEqual(0, len(cache))
+
+    def test_shrink_cache_overflowing(self):
+        cache = dict(zip(range(6), range(6)))
+        re._shrink_cache(cache, 4, divisor=2)
+        # Cache was larger than the maximum, be sure we shrunk to smaller.
+        self.assertEqual(2, len(cache))
+
+    def test_shrink_cache_underflow(self):
+        cache = dict(zip(range(6), range(6)))
+        # No shrinking to do.
+        re._shrink_cache(cache, 9, divisor=3)
+        self.assertEqual(6, len(cache))
+
+
 def test_main():
     run_unittest(ReTests)
+    run_unittest(ReCacheTests)
     run_re_tests()
 
 if __name__ == "__main__":

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS	(original)
+++ python/branches/py3k/Misc/NEWS	Mon Aug  9 07:56:50 2010
@@ -697,8 +697,8 @@
 
 - The default size of the re module's compiled regular expression cache has been
   increased from 100 to 500 and the cache replacement policy has changed from
-  simply clearing the entire cache on overflow to forgetting the least recently
-  used cached compiled regular expressions.  This is a performance win for
+  simply clearing the entire cache on overflow to randomly forgetting 20% of the
+  existing cached compiled regular expressions.  This is a performance win for
   applications that use a lot of regular expressions and limits the impact of
   the performance hit anytime the cache is exceeded.
 


More information about the Python-checkins mailing list