[Python-checkins] CVS: python/dist/src/Lib sre_compile.py,1.37,1.38 sre_constants.py,1.28,1.29
Fredrik Lundh
effbot@users.sourceforge.net
Mon, 02 Jul 2001 09:58:40 -0700
Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv555/Lib
Modified Files:
sre_compile.py sre_constants.py
Log Message:
added martin's BIGCHARSET patch to SRE 2.1.1. martin reports 2x
speedups for certain unicode character ranges.
Index: sre_compile.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_compile.py,v
retrieving revision 1.37
retrieving revision 1.38
diff -C2 -r1.37 -r1.38
*** sre_compile.py 2001/03/22 15:50:10 1.37
--- sre_compile.py 2001/07/02 16:58:38 1.38
***************
*** 157,160 ****
--- 157,162 ----
elif op is CHARSET:
code.extend(av)
+ elif op is BIGCHARSET:
+ code.extend(av)
elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE:
***************
*** 186,190 ****
except IndexError:
# character set contains unicode characters
! return charset
# compress character map
i = p = n = 0
--- 188,192 ----
except IndexError:
# character set contains unicode characters
! return _optimize_unicode(charset, fixup)
# compress character map
i = p = n = 0
***************
*** 212,227 ****
else:
# use bitmap
! data = []
! m = 1; v = 0
! for c in charmap:
! if c:
! v = v + m
! m = m << 1
! if m > MAXCODE:
! data.append(v)
! m = 1; v = 0
out.append((CHARSET, data))
return out
return charset
def _simple(av):
--- 214,288 ----
else:
# use bitmap
! data = _mk_bitmap(charmap)
out.append((CHARSET, data))
return out
return charset
+
+ def _mk_bitmap(bits):
+ data = []
+ m = 1; v = 0
+ for c in bits:
+ if c:
+ v = v + m
+ m = m << 1
+ if m > MAXCODE:
+ data.append(v)
+ m = 1; v = 0
+ return data
+
+ # To represent a big charset, first a bitmap of all characters in the
+ # set is constructed. Then, this bitmap is sliced into chunks of 256
+ # characters, duplicate chunks are eliminitated, and each chunk is
+ # given a number. In the compiled expression, the charset is
+ # represented by a 16-bit word sequence, consisting of one word for
+ # the number of different chunks, a sequence of 256 bytes (128 words)
+ # of chunk numbers indexed by their original chunk position, and a
+ # sequence of chunks (16 words each).
+
+ # Compression is normally good: in a typical charset, large ranges of
+ # Unicode will be either completely excluded (e.g. if only cyrillic
+ # letters are to be matched), or completely included (e.g. if large
+ # subranges of Kanji match). These ranges will be represented by
+ # chunks of all one-bits or all zero-bits.
+
+ # Matching can be also done efficiently: the more significant byte of
+ # the Unicode character is an index into the chunk number, and the
+ # less significant byte is a bit index in the chunk (just like the
+ # CHARSET matching).
+
+ def _optimize_unicode(charset, fixup):
+ charmap = [0]*65536
+ negate = 0
+ for op, av in charset:
+ if op is NEGATE:
+ negate = 1
+ elif op is LITERAL:
+ charmap[fixup(av)] = 1
+ elif op is RANGE:
+ for i in range(fixup(av[0]), fixup(av[1])+1):
+ charmap[i] = 1
+ elif op is CATEGORY:
+ # XXX: could expand category
+ return charset # cannot compress
+ if negate:
+ for i in range(65536):
+ charmap[i] = not charmap[i]
+ comps = {}
+ mapping = [0]*256
+ block = 0
+ data = []
+ for i in range(256):
+ chunk = tuple(charmap[i*256:(i+1)*256])
+ new = comps.setdefault(chunk, block)
+ mapping[i] = new
+ if new == block:
+ block += 1
+ data += _mk_bitmap(chunk)
+ header = [block]
+ assert MAXCODE == 65535
+ for i in range(128):
+ header.append(mapping[2*i]+256*mapping[2*i+1])
+ data[0:0] = header
+ return [(BIGCHARSET, data)]
def _simple(av):
Index: sre_constants.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_constants.py,v
retrieving revision 1.28
retrieving revision 1.29
diff -C2 -r1.28 -r1.29
*** sre_constants.py 2001/03/22 15:50:10 1.28
--- sre_constants.py 2001/07/02 16:58:38 1.29
***************
*** 12,16 ****
# update when constants are added or removed
! MAGIC = 20010320
# max code word in this release
--- 12,16 ----
# update when constants are added or removed
! MAGIC = 20010701
# max code word in this release
***************
*** 34,37 ****
--- 34,38 ----
ASSERT_NOT = "assert_not"
AT = "at"
+ BIGCHARSET = "bigcharset"
BRANCH = "branch"
CALL = "call"
***************
*** 104,108 ****
CALL,
CATEGORY,
! CHARSET,
GROUPREF, GROUPREF_IGNORE,
IN, IN_IGNORE,
--- 105,109 ----
CALL,
CATEGORY,
! CHARSET, BIGCHARSET,
GROUPREF, GROUPREF_IGNORE,
IN, IN_IGNORE,