[Python-checkins] CVS: python/dist/src/Lib sre_parse.py,1.3,1.4
A.M. Kuchling
python-dev@python.org
Fri, 9 Jun 2000 07:08:11 -0700
Update of /cvsroot/python/python/dist/src/Lib
In directory slayer.i.sourceforge.net:/tmp/cvs-serv19644
Modified Files:
sre_parse.py
Log Message:
Patch from /F: this patch brings the CVS version of SRE in sync with the
latest public snapshot.
Index: sre_parse.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_parse.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** sre_parse.py 2000/04/10 17:10:48 1.3
--- sre_parse.py 2000/06/09 14:08:07 1.4
***************
*** 3,9 ****
# $Id$
#
! # convert re-style regular expression to SRE template. the current
! # implementation is somewhat incomplete, and not very fast. should
! # definitely be rewritten before Python 1.6 goes beta.
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
--- 3,7 ----
# $Id$
#
! # convert re-style regular expression to sre pattern
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
***************
*** 17,27 ****
#
- # FIXME: comments marked with the FIXME tag are open issues. all such
- # issues should be closed before the final beta.
-
import string, sys
from sre_constants import *
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"
--- 15,28 ----
#
import string, sys
+ import _sre
+
from sre_constants import *
+ # FIXME: should be 65535, but the array module currently chokes on
+ # unsigned integers larger than 32767...
+ MAXREPEAT = int(2L**(_sre.getcodesize()*8-1))-1
+
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"
***************
*** 33,36 ****
--- 34,39 ----
HEXDIGITS = tuple("0123456789abcdefABCDEF")
+ WHITESPACE = tuple(string.whitespace)
+
ESCAPES = {
"\\a": (LITERAL, chr(7)),
***************
*** 56,63 ****
}
! class Pattern:
! # FIXME: <fl> rename class, and store flags in here too!
def __init__(self):
! self.flags = []
self.groups = 1
self.groupdict = {}
--- 59,74 ----
}
! FLAGS = {
! "i": SRE_FLAG_IGNORECASE,
! "L": SRE_FLAG_LOCALE,
! "m": SRE_FLAG_MULTILINE,
! "s": SRE_FLAG_DOTALL,
! "t": SRE_FLAG_TEMPLATE,
! "x": SRE_FLAG_VERBOSE,
! }
!
! class State:
def __init__(self):
! self.flags = 0
self.groups = 1
self.groupdict = {}
***************
*** 68,74 ****
self.groupdict[name] = gid
return gid
- def setflag(self, flag):
- if flag in self.flags:
- self.flags.append(flag)
class SubPattern:
--- 79,82 ----
***************
*** 79,83 ****
data = []
self.data = data
- self.flags = []
self.width = None
def __repr__(self):
--- 87,90 ----
***************
*** 122,127 ****
elif op in (MIN_REPEAT, MAX_REPEAT):
i, j = av[2].getwidth()
! lo = lo + i * av[0]
! hi = hi + j * av[1]
elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
lo = lo + 1
--- 129,134 ----
elif op in (MIN_REPEAT, MAX_REPEAT):
i, j = av[2].getwidth()
! lo = lo + long(i) * av[0]
! hi = hi + long(j) * av[1]
elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
lo = lo + 1
***************
*** 131,175 ****
self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
return self.width
- def set(self, flag):
- if not flag in self.flags:
- self.flags.append(flag)
- def reset(self, flag):
- if flag in self.flags:
- self.flags.remove(flag)
class Tokenizer:
def __init__(self, string):
! self.string = list(string)
self.next = self.__next()
def __next(self):
! if not self.string:
return None
! char = self.string[0]
if char[0] == "\\":
try:
! c = self.string[1]
except IndexError:
raise SyntaxError, "bogus escape"
char = char + c
! try:
! if c == "x":
! # hexadecimal constant
! for i in xrange(2, sys.maxint):
! c = self.string[i]
! if str(c) not in HEXDIGITS:
! break
! char = char + c
! elif str(c) in DIGITS:
! # decimal (or octal) number
! for i in xrange(2, sys.maxint):
! c = self.string[i]
! # FIXME: if larger than current number of
! # groups, interpret as an octal number
! if str(c) not in DIGITS:
! break
! char = char + c
! except IndexError:
! pass # use what we've got this far
! del self.string[0:len(char)]
return char
def match(self, char):
--- 138,158 ----
self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
return self.width
class Tokenizer:
def __init__(self, string):
! self.index = 0
! self.string = string
self.next = self.__next()
def __next(self):
! if self.index >= len(self.string):
return None
! char = self.string[self.index]
if char[0] == "\\":
try:
! c = self.string[self.index + 1]
except IndexError:
raise SyntaxError, "bogus escape"
char = char + c
! self.index = self.index + len(char)
return char
def match(self, char):
***************
*** 188,221 ****
return this
! def _fixescape(escape, character_class=0):
! # convert escape to (type, value)
! if character_class:
! # inside a character class, we'll look in the character
! # escapes dictionary first
! code = ESCAPES.get(escape)
! if code:
! return code
! code = CATEGORIES.get(escape)
! else:
! code = CATEGORIES.get(escape)
! if code:
! return code
! code = ESCAPES.get(escape)
if code:
return code
- if not character_class:
- try:
- group = int(escape[1:])
- # FIXME: only valid if group <= current number of groups
- return GROUP, group
- except ValueError:
- pass
try:
if escape[1:2] == "x":
escape = escape[2:]
! return LITERAL, chr(int(escape[-2:], 16) & 0xff)
elif str(escape[1:2]) in DIGITS:
! return LITERAL, chr(int(escape[1:], 8) & 0xff)
! elif len(escape) == 2:
return LITERAL, escape[1]
except ValueError:
--- 171,242 ----
return this
! def _group(escape, state):
! # check if the escape string represents a valid group
! try:
! group = int(escape[1:])
! if group and group < state.groups:
! return group
! except ValueError:
! pass
! return None # not a valid group
!
! def _class_escape(source, escape):
! # handle escape code inside character class
! code = ESCAPES.get(escape)
! if code:
! return code
! code = CATEGORIES.get(escape)
! if code:
! return code
! try:
! if escape[1:2] == "x":
! while source.next in HEXDIGITS:
! escape = escape + source.get()
! escape = escape[2:]
! # FIXME: support unicode characters!
! return LITERAL, chr(int(escape[-4:], 16) & 0xff)
! elif str(escape[1:2]) in OCTDIGITS:
! while source.next in OCTDIGITS:
! escape = escape + source.get()
! escape = escape[1:]
! # FIXME: support unicode characters!
! return LITERAL, chr(int(escape[-6:], 8) & 0xff)
! if len(escape) == 2:
! return LITERAL, escape[1]
! except ValueError:
! pass
! raise SyntaxError, "bogus escape: %s" % repr(escape)
!
! def _escape(source, escape, state):
! # handle escape code in expression
! code = CATEGORIES.get(escape)
! if code:
! return code
! code = ESCAPES.get(escape)
if code:
return code
try:
if escape[1:2] == "x":
+ while source.next in HEXDIGITS:
+ escape = escape + source.get()
escape = escape[2:]
! # FIXME: support unicode characters!
! return LITERAL, chr(int(escape[-4:], 16) & 0xff)
elif str(escape[1:2]) in DIGITS:
! while 1:
! group = _group(escape, state)
! if group:
! if (not source.next or
! not _group(escape + source.next, state)):
! return GROUP, group
! escape = escape + source.get()
! elif source.next in OCTDIGITS:
! escape = escape + source.get()
! else:
! break
! escape = escape[1:]
! # FIXME: support unicode characters!
! return LITERAL, chr(int(escape[-6:], 8) & 0xff)
! if len(escape) == 2:
return LITERAL, escape[1]
except ValueError:
***************
*** 223,231 ****
raise SyntaxError, "bogus escape: %s" % repr(escape)
- def _branch(subpattern, items):
# form a branch operator from a set of items (FIXME: move this
# optimization to the compiler module!)
# check if all items share a common prefix
while 1:
--- 244,255 ----
raise SyntaxError, "bogus escape: %s" % repr(escape)
+ def _branch(pattern, items):
+
# form a branch operator from a set of items (FIXME: move this
# optimization to the compiler module!)
+ subpattern = SubPattern(pattern)
+
# check if all items share a common prefix
while 1:
***************
*** 258,272 ****
set.append(item[0])
subpattern.append((IN, set))
! return
subpattern.append((BRANCH, (None, items)))
! def _parse(source, pattern, flags=()):
# parse regular expression pattern into an operator list.
-
- subpattern = SubPattern(pattern)
! this = None
while 1:
--- 282,295 ----
set.append(item[0])
subpattern.append((IN, set))
! return subpattern
subpattern.append((BRANCH, (None, items)))
+ return subpattern
! def _parse(source, state, flags=0):
# parse regular expression pattern into an operator list.
! subpattern = SubPattern(state)
while 1:
***************
*** 278,281 ****
--- 301,315 ----
break # end of pattern
+ if state.flags & SRE_FLAG_VERBOSE:
+ # skip whitespace and comments
+ if this in WHITESPACE:
+ continue
+ if this == "#":
+ while 1:
+ this = source.get()
+ if this in (None, "\n"):
+ break
+ continue
+
if this and this[0] not in SPECIAL_CHARS:
subpattern.append((LITERAL, this))
***************
*** 295,299 ****
break
elif this and this[0] == "\\":
! code1 = _fixescape(this, 1)
elif this:
code1 = LITERAL, this
--- 329,333 ----
break
elif this and this[0] == "\\":
! code1 = _class_escape(source, this)
elif this:
code1 = LITERAL, this
***************
*** 309,313 ****
else:
if this[0] == "\\":
! code2 = _fixescape(this, 1)
else:
code2 = LITERAL, this
--- 343,347 ----
else:
if this[0] == "\\":
! code2 = _class_escape(source, this)
else:
code2 = LITERAL, this
***************
*** 322,326 ****
set.append(code1)
! # FIXME: <fl> move set optimization to support function
if len(set)==1 and set[0][0] is LITERAL:
subpattern.append(set[0]) # optimization
--- 356,360 ----
set.append(code1)
! # FIXME: <fl> move set optimization to compiler!
if len(set)==1 and set[0][0] is LITERAL:
subpattern.append(set[0]) # optimization
***************
*** 336,344 ****
min, max = 0, 1
elif this == "*":
! min, max = 0, sys.maxint
elif this == "+":
! min, max = 1, sys.maxint
elif this == "{":
! min, max = 0, sys.maxint
lo = hi = ""
while str(source.next) in DIGITS:
--- 370,378 ----
min, max = 0, 1
elif this == "*":
! min, max = 0, MAXREPEAT
elif this == "+":
! min, max = 1, MAXREPEAT
elif this == "{":
! min, max = 0, MAXREPEAT
lo = hi = ""
while str(source.next) in DIGITS:
***************
*** 359,376 ****
raise SyntaxError, "not supported"
# figure out which item to repeat
- # FIXME: should back up to the right mark, right?
if subpattern:
! index = len(subpattern)-1
! while subpattern[index][0] is MARK:
! index = index - 1
! item = subpattern[index:index+1]
else:
raise SyntaxError, "nothing to repeat"
if source.match("?"):
! subpattern[index] = (MIN_REPEAT, (min, max, item))
else:
! subpattern[index] = (MAX_REPEAT, (min, max, item))
elif this == ".":
subpattern.append((ANY, None))
elif this == "(":
group = 1
--- 393,408 ----
raise SyntaxError, "not supported"
# figure out which item to repeat
if subpattern:
! item = subpattern[-1:]
else:
raise SyntaxError, "nothing to repeat"
if source.match("?"):
! subpattern[-1] = (MIN_REPEAT, (min, max, item))
else:
! subpattern[-1] = (MAX_REPEAT, (min, max, item))
!
elif this == ".":
subpattern.append((ANY, None))
+
elif this == "(":
group = 1
***************
*** 380,405 ****
# options
if source.match("P"):
! # named group: skip forward to end of name
if source.match("<"):
name = ""
while 1:
char = source.get()
! if char is None or char == ">":
break
name = name + char
group = 1
elif source.match(":"):
# non-capturing group
group = 2
! elif source.match_set("iI"):
! pattern.setflag("i")
! elif source.match_set("lL"):
! pattern.setflag("l")
! elif source.match_set("mM"):
! pattern.setflag("m")
! elif source.match_set("sS"):
! pattern.setflag("s")
! elif source.match_set("xX"):
! pattern.setflag("x")
if group:
# parse group contents
--- 412,450 ----
# options
if source.match("P"):
! # python extensions
if source.match("<"):
+ # named group: skip forward to end of name
name = ""
while 1:
char = source.get()
! if char is None:
! raise SyntaxError, "unterminated name"
! if char == ">":
break
+ # FIXME: check for valid character
name = name + char
group = 1
+ elif source.match("="):
+ # named backreference
+ raise SyntaxError, "not yet implemented"
+
+ else:
+ char = source.get()
+ if char is None:
+ raise SyntaxError, "unexpected end of pattern"
+ raise SyntaxError, "unknown specifier: ?P%s" % char
elif source.match(":"):
# non-capturing group
group = 2
! elif source.match("#"):
! # comment
! while 1:
! char = source.get()
! if char is None or char == ")":
! break
! else:
! # flags
! while FLAGS.has_key(source.next):
! state.flags = state.flags | FLAGS[source.get()]
if group:
# parse group contents
***************
*** 409,423 ****
group = None
else:
! group = pattern.getgroup(name)
! if group:
! subpattern.append((MARK, (group-1)*2))
while 1:
! p = _parse(source, pattern, flags)
if source.match(")"):
if b:
b.append(p)
! _branch(subpattern, b)
! else:
! subpattern.append((SUBPATTERN, (group, p)))
break
elif source.match("|"):
--- 454,465 ----
group = None
else:
! group = state.getgroup(name)
while 1:
! p = _parse(source, state, flags)
if source.match(")"):
if b:
b.append(p)
! p = _branch(state, b)
! subpattern.append((SUBPATTERN, (group, p)))
break
elif source.match("|"):
***************
*** 425,436 ****
else:
raise SyntaxError, "group not properly closed"
- if group:
- subpattern.append((MARK, (group-1)*2+1))
else:
- # FIXME: should this really be a while loop?
while 1:
char = source.get()
if char is None or char == ")":
break
elif this == "^":
--- 467,476 ----
else:
raise SyntaxError, "group not properly closed"
else:
while 1:
char = source.get()
if char is None or char == ")":
break
+ # FIXME: skip characters?
elif this == "^":
***************
*** 441,445 ****
elif this and this[0] == "\\":
! code =_fixescape(this)
subpattern.append(code)
--- 481,485 ----
elif this and this[0] == "\\":
! code = _escape(source, this, state)
subpattern.append(code)
***************
*** 449,459 ****
return subpattern
! def parse(source, flags=()):
! s = Tokenizer(source)
! g = Pattern()
b = []
while 1:
! p = _parse(s, g, flags)
! tail = s.get()
if tail == "|":
b.append(p)
--- 489,500 ----
return subpattern
! def parse(pattern, flags=0):
! # parse 're' pattern into list of (opcode, argument) tuples
! source = Tokenizer(pattern)
! state = State()
b = []
while 1:
! p = _parse(source, state, flags)
! tail = source.get()
if tail == "|":
b.append(p)
***************
*** 463,471 ****
if b:
b.append(p)
! p = SubPattern(g)
! _branch(p, b)
break
else:
raise SyntaxError, "bogus characters at end of regular expression"
return p
--- 504,531 ----
if b:
b.append(p)
! p = _branch(state, b)
break
else:
raise SyntaxError, "bogus characters at end of regular expression"
+ return p
+
+ def parse_replacement(source, pattern):
+ # parse 're' replacement string into list of literals and
+ # group references
+ s = Tokenizer(source)
+ p = []
+ a = p.append
+ while 1:
+ this = s.get()
+ if this is None:
+ break # end of replacement string
+ if this and this[0] == "\\":
+ try:
+ a(LITERAL, ESCAPES[this])
+ except KeyError:
+ for char in this:
+ a(LITERAL, char)
+ else:
+ a(LITERAL, this)
return p