[Python-checkins] CVS: python/dist/src/Lib sre.py,1.2,1.3 sre_compile.py,1.2,1.3 sre_constants.py,1.2,1.3 sre_parse.py,1.2,1.3

Guido van Rossum python-dev@python.org
Mon, 10 Apr 2000 13:10:52 -0400 (EDT)


Update of /projects/cvsroot/python/dist/src/Lib
In directory eric:/projects/python/develop/guido/src/Lib

Modified Files:
	sre.py sre_compile.py sre_constants.py sre_parse.py 
Log Message:
Fredrik Lundh: new snapshot.  Mostly reindented.
This one should work with unicode expressions, and compile
a bit more silently.

Index: sre.py
===================================================================
RCS file: /projects/cvsroot/python/dist/src/Lib/sre.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** sre.py	2000/04/02 05:22:29	1.2
--- sre.py	2000/04/10 17:10:48	1.3
***************
*** 2,6 ****
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre.py,v 1.2 2000/04/02 05:22:29 akuchlin Exp $
  #
  # re-compatible interface for the sre matching engine
--- 2,6 ----
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre.py,v 1.3 2000/04/10 17:10:48 guido Exp $
  #
  # re-compatible interface for the sre matching engine

Index: sre_compile.py
===================================================================
RCS file: /projects/cvsroot/python/dist/src/Lib/sre_compile.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** sre_compile.py	2000/04/02 05:22:30	1.2
--- sre_compile.py	2000/04/10 17:10:48	1.3
***************
*** 1,5 ****
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre_compile.py,v 1.2 2000/04/02 05:22:30 akuchlin Exp $
  #
  # convert template to internal format
--- 1,5 ----
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre_compile.py,v 1.3 2000/04/10 17:10:48 guido Exp $
  #
  # convert template to internal format
***************
*** 27,31 ****
  for WORDSIZE in "BHil":
      if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
!         break
  else:
      raise RuntimeError, "cannot find a useable array type"
--- 27,31 ----
  for WORDSIZE in "BHil":
      if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
! 	break
  else:
      raise RuntimeError, "cannot find a useable array type"
***************
*** 35,50 ****
  class Code:
      def __init__(self):
!         self.data = []
      def __len__(self):
!         return len(self.data)
      def __getitem__(self, index):
!         return self.data[index]
      def __setitem__(self, index, code):
!         self.data[index] = code
      def append(self, code):
!         self.data.append(code)
      def todata(self):
!         # print self.data
!         return array.array(WORDSIZE, self.data).tostring()
  
  def _lower(literal):
--- 35,50 ----
  class Code:
      def __init__(self):
! 	self.data = []
      def __len__(self):
! 	return len(self.data)
      def __getitem__(self, index):
! 	return self.data[index]
      def __setitem__(self, index, code):
! 	self.data[index] = code
      def append(self, code):
! 	self.data.append(code)
      def todata(self):
! 	# print self.data
! 	return array.array(WORDSIZE, self.data).tostring()
  
  def _lower(literal):
***************
*** 55,174 ****
      append = code.append
      for op, av in pattern:
!         if op is ANY:
!             if "s" in flags:
!                 append(CODES[op]) # any character at all!
!             else:
!                 append(CODES[NOT_LITERAL])
!                 append(10)
!         elif op in (SUCCESS, FAILURE):
!             append(CODES[op])
!         elif op is AT:
!             append(CODES[op])
!             append(POSITIONS[av])
!         elif op is BRANCH:
!             append(CODES[op])
!             tail = []
!             for av in av[1]:
!                 skip = len(code); append(0)
!                 _compile(code, av, flags)
!                 append(CODES[JUMP])
!                 tail.append(len(code)); append(0)
!                 code[skip] = len(code) - skip
!             append(0) # end of branch
!             for tail in tail:
!                 code[tail] = len(code) - tail
!         elif op is CALL:
!             append(CODES[op])
!             skip = len(code); append(0)
!             _compile(code, av, flags)
!             append(CODES[SUCCESS])
!             code[skip] = len(code) - skip
!         elif op is CATEGORY: # not used by current parser
!             append(CODES[op])
!             append(CATEGORIES[av])
!         elif op is GROUP:
!             if "i" in flags:
!                 append(CODES[MAP_IGNORE[op]])
!             else:
!                 append(CODES[op])
!             append(av)
!         elif op is IN:
!             if "i" in flags:
!                 append(CODES[MAP_IGNORE[op]])
!                 def fixup(literal):
!                     return ord(_lower(literal))
!             else:
!                 append(CODES[op])
!                 fixup = ord
!             skip = len(code); append(0)
!             for op, av in av:
!                 append(CODES[op])
!                 if op is NEGATE:
!                     pass
!                 elif op is LITERAL:
!                     append(fixup(av))
!                 elif op is RANGE:
!                     append(fixup(av[0]))
!                     append(fixup(av[1]))
!                 elif op is CATEGORY:
!                     append(CATEGORIES[av])
!                 else:
!                     raise ValueError, "unsupported set operator"
!             append(CODES[FAILURE])
!             code[skip] = len(code) - skip
!         elif op in (LITERAL, NOT_LITERAL):
!             if "i" in flags:
!                 append(CODES[MAP_IGNORE[op]])
!                 append(ord(_lower(av)))
!             else:
!                 append(CODES[op])
!                 append(ord(av))
!         elif op is MARK:
!             append(CODES[op])
!             append(av)
!         elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
!             lo, hi = av[2].getwidth()
!             if lo == 0:
!                 raise SyntaxError, "cannot repeat zero-width items"
!             if lo == hi == 1 and op is MAX_REPEAT:
!                 append(CODES[MAX_REPEAT_ONE])
!                 skip = len(code); append(0)
!                 append(av[0])
!                 append(av[1])
!                 _compile(code, av[2], flags)
!                 append(CODES[SUCCESS])
!                 code[skip] = len(code) - skip
!             else:
!                 append(CODES[op])
!                 skip = len(code); append(0)
!                 append(av[0])
!                 append(av[1])
!                 _compile(code, av[2], flags)
!                 if op is MIN_REPEAT:
!                     append(CODES[MIN_UNTIL])
!                 else:
!                     # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
!                     append(CODES[MAX_UNTIL])
!                 code[skip] = len(code) - skip
!         elif op is SUBPATTERN:
! ##          group = av[0]
! ##          if group:
! ##              append(CODES[MARK])
! ##              append((group-1)*2)
!             _compile(code, av[1], flags)
! ##          if group:
! ##              append(CODES[MARK])
! ##              append((group-1)*2+1)
!         else:
!             raise ValueError, ("unsupported operand type", op)
  
  def compile(p, flags=()):
      # convert pattern list to internal format
!     if type(p) is type(""):
!         import sre_parse
!         pattern = p
!         p = sre_parse.parse(p)
      else:
!         pattern = None
      # print p.getwidth()
      # print p
--- 55,174 ----
      append = code.append
      for op, av in pattern:
! 	if op is ANY:
! 	    if "s" in flags:
! 		append(CODES[op]) # any character at all!
! 	    else:
! 		append(CODES[NOT_LITERAL])
! 		append(10)
! 	elif op in (SUCCESS, FAILURE):
! 	    append(CODES[op])
! 	elif op is AT:
! 	    append(CODES[op])
! 	    append(POSITIONS[av])
! 	elif op is BRANCH:
! 	    append(CODES[op])
! 	    tail = []
! 	    for av in av[1]:
! 		skip = len(code); append(0)
! 		_compile(code, av, flags)
! 		append(CODES[JUMP])
! 		tail.append(len(code)); append(0)
! 		code[skip] = len(code) - skip
! 	    append(0) # end of branch
! 	    for tail in tail:
! 		code[tail] = len(code) - tail
! 	elif op is CALL:
! 	    append(CODES[op])
! 	    skip = len(code); append(0)
! 	    _compile(code, av, flags)
! 	    append(CODES[SUCCESS])
! 	    code[skip] = len(code) - skip
! 	elif op is CATEGORY: # not used by current parser
! 	    append(CODES[op])
! 	    append(CATEGORIES[av])
! 	elif op is GROUP:
! 	    if "i" in flags:
! 		append(CODES[MAP_IGNORE[op]])
! 	    else:
! 		append(CODES[op])
! 	    append(av)
! 	elif op is IN:
! 	    if "i" in flags:
! 		append(CODES[MAP_IGNORE[op]])
! 		def fixup(literal):
! 		    return ord(_lower(literal))
! 	    else:
! 		append(CODES[op])
! 		fixup = ord
! 	    skip = len(code); append(0)
! 	    for op, av in av:
! 		append(CODES[op])
! 		if op is NEGATE:
! 		    pass
! 		elif op is LITERAL:
! 		    append(fixup(av))
! 		elif op is RANGE:
! 		    append(fixup(av[0]))
! 		    append(fixup(av[1]))
! 		elif op is CATEGORY:
! 		    append(CATEGORIES[av])
! 		else:
! 		    raise ValueError, "unsupported set operator"
! 	    append(CODES[FAILURE])
! 	    code[skip] = len(code) - skip
! 	elif op in (LITERAL, NOT_LITERAL):
! 	    if "i" in flags:
! 		append(CODES[MAP_IGNORE[op]])
! 		append(ord(_lower(av)))
! 	    else:
! 		append(CODES[op])
! 		append(ord(av))
! 	elif op is MARK:
! 	    append(CODES[op])
! 	    append(av)
!  	elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
! 	    lo, hi = av[2].getwidth()
!  	    if lo == 0:
!  		raise SyntaxError, "cannot repeat zero-width items"
! 	    if lo == hi == 1 and op is MAX_REPEAT:
! 		append(CODES[MAX_REPEAT_ONE])
! 		skip = len(code); append(0)
! 		append(av[0])
! 		append(av[1])
! 		_compile(code, av[2], flags)
! 		append(CODES[SUCCESS])
! 		code[skip] = len(code) - skip
! 	    else:
! 		append(CODES[op])
! 		skip = len(code); append(0)
! 		append(av[0])
! 		append(av[1])
! 		_compile(code, av[2], flags)
! 		if op is MIN_REPEAT:
! 		    append(CODES[MIN_UNTIL])
! 		else:
! 		    # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
! 		    append(CODES[MAX_UNTIL])
! 		code[skip] = len(code) - skip
! 	elif op is SUBPATTERN:
! ## 	    group = av[0]
! ## 	    if group:
! ## 		append(CODES[MARK])
! ## 		append((group-1)*2)
! 	    _compile(code, av[1], flags)
! ## 	    if group:
! ## 		append(CODES[MARK])
! ## 		append((group-1)*2+1)
! 	else:
! 	    raise ValueError, ("unsupported operand type", op)
  
  def compile(p, flags=()):
      # convert pattern list to internal format
!     if type(p) in (type(""), type(u"")):
! 	import sre_parse
! 	pattern = p
! 	p = sre_parse.parse(p)
      else:
! 	pattern = None
      # print p.getwidth()
      # print p
***************
*** 179,187 ****
      data = code.todata()
      if 0: # debugging
!         print
!         print "-" * 68
!         import sre_disasm
!         sre_disasm.disasm(data)
!         print "-" * 68
      # print len(data), p.pattern.groups, len(p.pattern.groupdict)
      return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict)
--- 179,187 ----
      data = code.todata()
      if 0: # debugging
! 	print
! 	print "-" * 68
! 	import sre_disasm
! 	sre_disasm.disasm(data)
! 	print "-" * 68
      # print len(data), p.pattern.groups, len(p.pattern.groupdict)
      return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict)

Index: sre_constants.py
===================================================================
RCS file: /projects/cvsroot/python/dist/src/Lib/sre_constants.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** sre_constants.py	2000/04/02 05:22:30	1.2
--- sre_constants.py	2000/04/10 17:10:48	1.3
***************
*** 1,5 ****
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre_constants.py,v 1.2 2000/04/02 05:22:30 akuchlin Exp $
  #
  # various symbols used by the regular expression engine.
--- 1,5 ----
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre_constants.py,v 1.3 2000/04/10 17:10:48 guido Exp $
  #
  # various symbols used by the regular expression engine.
***************
*** 127,131 ****
      f.write("/* generated by sre_constants.py */\n")
      for k, v in items:
!         f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n")
      f.close()
      print "done"
--- 127,131 ----
      f.write("/* generated by sre_constants.py */\n")
      for k, v in items:
! 	f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n")
      f.close()
      print "done"

Index: sre_parse.py
===================================================================
RCS file: /projects/cvsroot/python/dist/src/Lib/sre_parse.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** sre_parse.py	2000/04/02 05:22:30	1.2
--- sre_parse.py	2000/04/10 17:10:48	1.3
***************
*** 1,5 ****
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre_parse.py,v 1.2 2000/04/02 05:22:30 akuchlin Exp $
  #
  # convert re-style regular expression to SRE template.  the current
--- 1,5 ----
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre_parse.py,v 1.3 2000/04/10 17:10:48 guido Exp $
  #
  # convert re-style regular expression to SRE template.  the current
***************
*** 27,33 ****
  REPEAT_CHARS  = "*+?{"
  
! OCTDIGITS = "01234567"
! HEXDIGITS = "0123456789abcdefABCDEF"
  
  ESCAPES = {
      "\\a": (LITERAL, chr(7)),
--- 27,36 ----
  REPEAT_CHARS  = "*+?{"
  
! # FIXME: string in tuple tests may explode if char is unicode :-(
! DIGITS = tuple(string.digits)
  
+ OCTDIGITS = tuple("01234567")
+ HEXDIGITS = tuple("0123456789abcdefABCDEF")
+ 
  ESCAPES = {
      "\\a": (LITERAL, chr(7)),
***************
*** 56,221 ****
      # FIXME: <fl> rename class, and store flags in here too!
      def __init__(self):
!         self.flags = []
!         self.groups = 1
!         self.groupdict = {}
      def getgroup(self, name=None):
!         gid = self.groups
!         self.groups = gid + 1
!         if name:
!             self.groupdict[name] = gid
!         return gid
      def setflag(self, flag):
!         if flag not in self.flags:
!             self.flags.append(flag)
  
  class SubPattern:
      # a subpattern, in intermediate form
      def __init__(self, pattern, data=None):
!         self.pattern = pattern
!         if not data:
!             data = []
!         self.data = data
!         self.flags = []
!         self.width = None
      def __repr__(self):
!         return repr(self.data)
      def __len__(self):
!         return len(self.data)
      def __delitem__(self, index):
!         del self.data[index]
      def __getitem__(self, index):
!         return self.data[index]
      def __setitem__(self, index, code):
!         self.data[index] = code
      def __getslice__(self, start, stop):
!         return SubPattern(self.pattern, self.data[start:stop])
      def insert(self, index, code):
!         self.data.insert(index, code)
      def append(self, code):
!         self.data.append(code)
      def getwidth(self):
!         # determine the width (min, max) for this subpattern
!         if self.width:
!             return self.width
!         lo = hi = 0L
!         for op, av in self.data:
!             if op is BRANCH:
!                 l = sys.maxint
!                 h = 0
!                 for av in av[1]:
!                     i, j = av.getwidth()
!                     l = min(l, i)
!                     h = min(h, j)
!                 lo = lo + i
!                 hi = hi + j
!             elif op is CALL:
!                 i, j = av.getwidth()
!                 lo = lo + i
!                 hi = hi + j
!             elif op is SUBPATTERN:
!                 i, j = av[1].getwidth()
!                 lo = lo + i
!                 hi = hi + j
!             elif op in (MIN_REPEAT, MAX_REPEAT):
!                 i, j = av[2].getwidth()
!                 lo = lo + i * av[0]
!                 hi = hi + j * av[1]
!             elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
!                 lo = lo + 1
!                 hi = hi + 1
!             elif op == SUCCESS:
!                 break
!         self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
!         return self.width
      def set(self, flag):
!         if not flag in self.flags:
!             self.flags.append(flag)
      def reset(self, flag):
!         if flag in self.flags:
!             self.flags.remove(flag)
  
  class Tokenizer:
      def __init__(self, string):
!         self.string = list(string)
!         self.next = self.__next()
      def __next(self):
!         if not self.string:
!             return None
!         char = self.string[0]
!         if char[0] == "\\":
!             try:
!                 c = self.string[1]
!             except IndexError:
!                 raise SyntaxError, "bogus escape"
!             char = char + c
!             try:
!                 if c == "x":
!                     # hexadecimal constant
!                     for i in xrange(2, sys.maxint):
!                         c = self.string[i]
!                         if c not in HEXDIGITS:
!                             break
!                         char = char + c
!                 elif c in string.digits:
!                     # decimal (or octal) number
!                     for i in xrange(2, sys.maxint):
!                         c = self.string[i]
!                         # FIXME: if larger than current number of
!                         # groups, interpret as an octal number 
!                         if c not in string.digits:
!                             break
!                         char = char + c
!             except IndexError:
!                 pass # use what we've got this far
!         del self.string[0:len(char)]
!         return char
      def match(self, char):
!         if char == self.next:
!             self.next = self.__next()
!             return 1
!         return 0
      def match_set(self, set):
!         if self.next in set:
!             self.next = self.__next()
!             return 1
!         return 0
      def get(self):
!         this = self.next
!         self.next = self.__next()
!         return this
  
  def _fixescape(escape, character_class=0):
      # convert escape to (type, value)
      if character_class:
!         # inside a character class, we'll look in the character
!         # escapes dictionary first
!         code = ESCAPES.get(escape)
!         if code:
!             return code
!         code = CATEGORIES.get(escape)
      else:
!         code = CATEGORIES.get(escape)
!         if code:
!             return code
!         code = ESCAPES.get(escape)
      if code:
!         return code
      if not character_class:
!         try:
!             group = int(escape[1:])
!             # FIXME: only valid if group <= current number of groups
!             return GROUP, group
!         except ValueError:
!             pass
      try:
!         if escape[1:2] == "x":
!             escape = escape[2:]
!             return LITERAL, chr(string.atoi(escape[-2:], 16) & 0xff)
!         elif escape[1:2] in string.digits:
!             return LITERAL, chr(string.atoi(escape[1:], 8) & 0xff)
!         elif len(escape) == 2:
!             return LITERAL, escape[1]
      except ValueError:
!         pass
      raise SyntaxError, "bogus escape: %s" % repr(escape)
  
--- 59,224 ----
      # FIXME: <fl> rename class, and store flags in here too!
      def __init__(self):
! 	self.flags = []
! 	self.groups = 1
! 	self.groupdict = {}
      def getgroup(self, name=None):
! 	gid = self.groups
! 	self.groups = gid + 1
! 	if name:
! 	    self.groupdict[name] = gid
! 	return gid
      def setflag(self, flag):
! 	if flag in self.flags:
! 	    self.flags.append(flag)
  
  class SubPattern:
      # a subpattern, in intermediate form
      def __init__(self, pattern, data=None):
! 	self.pattern = pattern
! 	if not data:
! 	    data = []
! 	self.data = data
! 	self.flags = []
! 	self.width = None
      def __repr__(self):
! 	return repr(self.data)
      def __len__(self):
! 	return len(self.data)
      def __delitem__(self, index):
! 	del self.data[index]
      def __getitem__(self, index):
! 	return self.data[index]
      def __setitem__(self, index, code):
! 	self.data[index] = code
      def __getslice__(self, start, stop):
! 	return SubPattern(self.pattern, self.data[start:stop])
      def insert(self, index, code):
! 	self.data.insert(index, code)
      def append(self, code):
! 	self.data.append(code)
      def getwidth(self):
! 	# determine the width (min, max) for this subpattern
! 	if self.width:
! 	    return self.width
! 	lo = hi = 0L
! 	for op, av in self.data:
! 	    if op is BRANCH:
! 		l = sys.maxint
! 		h = 0
! 		for av in av[1]:
! 		    i, j = av.getwidth()
! 		    l = min(l, i)
! 		    h = min(h, j)
! 		lo = lo + i
! 		hi = hi + j
! 	    elif op is CALL:
! 		i, j = av.getwidth()
! 		lo = lo + i
! 		hi = hi + j
! 	    elif op is SUBPATTERN:
! 		i, j = av[1].getwidth()
! 		lo = lo + i
! 		hi = hi + j
! 	    elif op in (MIN_REPEAT, MAX_REPEAT):
! 		i, j = av[2].getwidth()
! 		lo = lo + i * av[0]
! 		hi = hi + j * av[1]
! 	    elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
! 		lo = lo + 1
! 		hi = hi + 1
! 	    elif op == SUCCESS:
! 		break
! 	self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
! 	return self.width
      def set(self, flag):
! 	if not flag in self.flags:
! 	    self.flags.append(flag)
      def reset(self, flag):
! 	if flag in self.flags:
! 	    self.flags.remove(flag)
  
  class Tokenizer:
      def __init__(self, string):
! 	self.string = list(string)
! 	self.next = self.__next()
      def __next(self):
! 	if not self.string:
! 	    return None
! 	char = self.string[0]
! 	if char[0] == "\\":
! 	    try:
! 		c = self.string[1]
! 	    except IndexError:
! 		raise SyntaxError, "bogus escape"
! 	    char = char + c
! 	    try:
! 		if c == "x":
! 		    # hexadecimal constant
! 		    for i in xrange(2, sys.maxint):
! 			c = self.string[i]
! 			if str(c) not in HEXDIGITS:
! 			    break
! 			char = char + c
! 		elif str(c) in DIGITS:
! 		    # decimal (or octal) number
! 		    for i in xrange(2, sys.maxint):
! 			c = self.string[i]
! 			# FIXME: if larger than current number of
! 			# groups, interpret as an octal number 
! 			if str(c) not in DIGITS:
! 			    break
! 			char = char + c
! 	    except IndexError:
! 		pass # use what we've got this far
! 	del self.string[0:len(char)]
! 	return char
      def match(self, char):
! 	if char == self.next:
! 	    self.next = self.__next()
! 	    return 1
! 	return 0
      def match_set(self, set):
! 	if self.next and self.next in set:
! 	    self.next = self.__next()
! 	    return 1
! 	return 0
      def get(self):
! 	this = self.next
! 	self.next = self.__next()
! 	return this
  
  def _fixescape(escape, character_class=0):
      # convert escape to (type, value)
      if character_class:
! 	# inside a character class, we'll look in the character
! 	# escapes dictionary first
! 	code = ESCAPES.get(escape)
! 	if code:
! 	    return code
! 	code = CATEGORIES.get(escape)
      else:
! 	code = CATEGORIES.get(escape)
! 	if code:
! 	    return code
! 	code = ESCAPES.get(escape)
      if code:
! 	return code
      if not character_class:
! 	try:
! 	    group = int(escape[1:])
! 	    # FIXME: only valid if group <= current number of groups
! 	    return GROUP, group
! 	except ValueError:
! 	    pass
      try:
! 	if escape[1:2] == "x":
! 	    escape = escape[2:]
! 	    return LITERAL, chr(int(escape[-2:], 16) & 0xff)
! 	elif str(escape[1:2]) in DIGITS:
! 	    return LITERAL, chr(int(escape[1:], 8) & 0xff)
! 	elif len(escape) == 2:
! 	    return LITERAL, escape[1]
      except ValueError:
! 	pass
      raise SyntaxError, "bogus escape: %s" % repr(escape)
  
***************
*** 227,259 ****
      # check if all items share a common prefix
      while 1:
!         prefix = None
!         for item in items:
!             if not item:
!                 break
!             if prefix is None:
!                 prefix = item[0]
!             elif item[0] != prefix:
!                 break
!         else:
!             # all subitems start with a common "prefix".
!             # move it out of the branch
!             for item in items:
!                 del item[0]
!             subpattern.append(prefix)
!             continue # check next one
!         break
  
      # check if the branch can be replaced by a character set
      for item in items:
!         if len(item) != 1 or item[0][0] != LITERAL:
!             break
      else:
!         # we can store this as a character set instead of a
!         # branch (FIXME: use a range if possible)
!         set = []
!         for item in items:
!             set.append(item[0])
!         subpattern.append((IN, set))
!         return
  
      subpattern.append((BRANCH, (None, items)))
--- 230,262 ----
      # check if all items share a common prefix
      while 1:
! 	prefix = None
! 	for item in items:
! 	    if not item:
! 		break
! 	    if prefix is None:
! 		prefix = item[0]
! 	    elif item[0] != prefix:
! 		break
! 	else:
! 	    # all subitems start with a common "prefix".
! 	    # move it out of the branch
! 	    for item in items:
! 		del item[0]
! 	    subpattern.append(prefix)
! 	    continue # check next one
! 	break
  
      # check if the branch can be replaced by a character set
      for item in items:
! 	if len(item) != 1 or item[0][0] != LITERAL:
! 	    break
      else:
! 	# we can store this as a character set instead of a
! 	# branch (FIXME: use a range if possible)
! 	set = []
! 	for item in items:
! 	    set.append(item[0])
! 	subpattern.append((IN, set))
! 	return
  
      subpattern.append((BRANCH, (None, items)))
***************
*** 269,444 ****
      while 1:
  
!         if source.next in ("|", ")"):
!             break # end of subpattern
!         this = source.get()
!         if this is None:
!             break # end of pattern
! 
!         if this and this[0] not in SPECIAL_CHARS:
!             subpattern.append((LITERAL, this))
! 
!         elif this == "[":
!             # character set
!             set = []
! ##          if source.match(":"):
! ##              pass # handle character classes
!             if source.match("^"):
!                 set.append((NEGATE, None))
!             # check remaining characters
!             start = set[:]
!             while 1:
!                 this = source.get()
!                 if this == "]" and set != start:
!                     break
!                 elif this and this[0] == "\\":
!                     code1 = _fixescape(this, 1)
!                 elif this:
!                     code1 = LITERAL, this
!                 else:
!                     raise SyntaxError, "unexpected end of regular expression"
!                 if source.match("-"):
!                     # potential range
!                     this = source.get()
!                     if this == "]":
!                         set.append(code1)
!                         set.append((LITERAL, "-"))
!                         break
!                     else:
!                         if this[0] == "\\":
!                             code2 = _fixescape(this, 1)
!                         else:
!                             code2 = LITERAL, this
!                         if code1[0] != LITERAL or code2[0] != LITERAL:
!                             raise SyntaxError, "illegal range"
!                         if len(code1[1]) != 1 or len(code2[1]) != 1:
!                             raise SyntaxError, "illegal range"
!                         set.append((RANGE, (code1[1], code2[1])))
!                 else:
!                     if code1[0] is IN:
!                         code1 = code1[1][0]
!                     set.append(code1)
! 
!             # FIXME: <fl> move set optimization to support function
!             if len(set)==1 and set[0][0] is LITERAL:
!                 subpattern.append(set[0]) # optimization
!             elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
!                 subpattern.append((NOT_LITERAL, set[1][1])) # optimization
!             else:
!                 # FIXME: <fl> add charmap optimization
!                 subpattern.append((IN, set))
! 
!         elif this and this[0] in REPEAT_CHARS:
!             # repeat previous item
!             if this == "?":
!                 min, max = 0, 1
!             elif this == "*":
!                 min, max = 0, sys.maxint
!             elif this == "+":
!                 min, max = 1, sys.maxint
!             elif this == "{":
!                 min, max = 0, sys.maxint
!                 lo = hi = ""
!                 while source.next in string.digits:
!                     lo = lo + source.get()
!                 if source.match(","):
!                     while source.next in string.digits:
!                         hi = hi + source.get()
!                 else:
!                     hi = lo
!                 if not source.match("}"):
!                     raise SyntaxError, "bogus range"
!                 if lo:
!                     min = int(lo)
!                 if hi:
!                     max = int(hi)
!                 # FIXME: <fl> check that hi >= lo!
!             else:
!                 raise SyntaxError, "not supported"
!             # figure out which item to repeat
!             # FIXME: should back up to the right mark, right?
!             if subpattern:
!                 index = len(subpattern)-1
!                 while subpattern[index][0] is MARK:
!                     index = index - 1
!                 item = subpattern[index:index+1]
!             else:
!                 raise SyntaxError, "nothing to repeat"
!             if source.match("?"):
!                 subpattern[index] = (MIN_REPEAT, (min, max, item))
!             else:
!                 subpattern[index] = (MAX_REPEAT, (min, max, item))
!         elif this == ".":
!             subpattern.append((ANY, None))
!         elif this == "(":
!             group = 1
!             name = None
!             if source.match("?"):
!                 group = 0
!                 # options
!                 if source.match("P"):
!                     # named group: skip forward to end of name
!                     if source.match("<"):
!                         name = ""
!                         while 1:
!                             char = source.get()
!                             if char in (">", None):
!                                 break
!                             name = name + char
!                         group = 1
!                 elif source.match(":"):
!                     # non-capturing group
!                     group = 2
!                 elif source.match_set("iI"):
!                     pattern.setflag("i")
!                 elif source.match_set("lL"):
!                     pattern.setflag("l")
!                 elif source.match_set("mM"):
!                     pattern.setflag("m")
!                 elif source.match_set("sS"):
!                     pattern.setflag("s")
!                 elif source.match_set("xX"):
!                     pattern.setflag("x")
!             if group:
!                 # parse group contents
!                 b = []
!                 if group == 2:
!                     # anonymous group
!                     group = None
!                 else:
!                     group = pattern.getgroup(name)
!                 if group:
!                     subpattern.append((MARK, (group-1)*2))
!                 while 1:
!                     p = _parse(source, pattern, flags)
!                     if source.match(")"):
!                         if b:
!                             b.append(p)
!                             _branch(subpattern, b)
!                         else:
!                             subpattern.append((SUBPATTERN, (group, p)))
!                         break
!                     elif source.match("|"):
!                         b.append(p)
!                     else:
!                         raise SyntaxError, "group not properly closed"
!                 if group:
!                     subpattern.append((MARK, (group-1)*2+1))
!             else:
!                 # FIXME: should this really be a while loop?
!                 while source.get() not in (")", None):
!                     pass
! 
!         elif this == "^":
!             subpattern.append((AT, AT_BEGINNING))
! 
!         elif this == "$":
!             subpattern.append((AT, AT_END))
! 
!         elif this and this[0] == "\\":
!             code =_fixescape(this)
!             subpattern.append(code)
  
!         else:
!             raise SyntaxError, "parser error"
  
      return subpattern
--- 272,449 ----
      while 1:
  
! 	if str(source.next) in ("|", ")"):
! 	    break # end of subpattern
! 	this = source.get()
! 	if this is None:
! 	    break # end of pattern
! 
! 	if this and this[0] not in SPECIAL_CHARS:
! 	    subpattern.append((LITERAL, this))
! 
! 	elif this == "[":
! 	    # character set
! 	    set = []
! ## 	    if source.match(":"):
! ## 		pass # handle character classes
! 	    if source.match("^"):
! 		set.append((NEGATE, None))
! 	    # check remaining characters
! 	    start = set[:]
! 	    while 1:
! 		this = source.get()
! 		if this == "]" and set != start:
! 		    break
! 		elif this and this[0] == "\\":
! 		    code1 = _fixescape(this, 1)
! 		elif this:
! 		    code1 = LITERAL, this
! 		else:
! 		    raise SyntaxError, "unexpected end of regular expression"
! 		if source.match("-"):
! 		    # potential range
! 		    this = source.get()
! 		    if this == "]":
! 			set.append(code1)
! 			set.append((LITERAL, "-"))
! 			break
! 		    else:
! 			if this[0] == "\\":
! 			    code2 = _fixescape(this, 1)
! 			else:
! 			    code2 = LITERAL, this
! 			if code1[0] != LITERAL or code2[0] != LITERAL:
! 			    raise SyntaxError, "illegal range"
! 			if len(code1[1]) != 1 or len(code2[1]) != 1:
! 			    raise SyntaxError, "illegal range"
! 			set.append((RANGE, (code1[1], code2[1])))
! 		else:
! 		    if code1[0] is IN:
! 			code1 = code1[1][0]
! 		    set.append(code1)
! 
! 	    # FIXME: <fl> move set optimization to support function
! 	    if len(set)==1 and set[0][0] is LITERAL:
! 		subpattern.append(set[0]) # optimization
! 	    elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
! 		subpattern.append((NOT_LITERAL, set[1][1])) # optimization
! 	    else:
! 		# FIXME: <fl> add charmap optimization
! 		subpattern.append((IN, set))
! 
! 	elif this and this[0] in REPEAT_CHARS:
! 	    # repeat previous item
! 	    if this == "?":
! 		min, max = 0, 1
! 	    elif this == "*":
! 		min, max = 0, sys.maxint
! 	    elif this == "+":
! 		min, max = 1, sys.maxint
! 	    elif this == "{":
! 		min, max = 0, sys.maxint
! 		lo = hi = ""
! 		while str(source.next) in DIGITS:
! 		    lo = lo + source.get()
! 		if source.match(","):
! 		    while str(source.next) in DIGITS:
! 			hi = hi + source.get()
! 		else:
! 		    hi = lo
! 		if not source.match("}"):
! 		    raise SyntaxError, "bogus range"
! 		if lo:
! 		    min = int(lo)
! 		if hi:
! 		    max = int(hi)
! 		# FIXME: <fl> check that hi >= lo!
! 	    else:
! 		raise SyntaxError, "not supported"
! 	    # figure out which item to repeat
! 	    # FIXME: should back up to the right mark, right?
! 	    if subpattern:
! 		index = len(subpattern)-1
! 		while subpattern[index][0] is MARK:
! 		    index = index - 1
! 		item = subpattern[index:index+1]
! 	    else:
! 		raise SyntaxError, "nothing to repeat"
! 	    if source.match("?"):
! 		subpattern[index] = (MIN_REPEAT, (min, max, item))
! 	    else:
! 		subpattern[index] = (MAX_REPEAT, (min, max, item))
! 	elif this == ".":
! 	    subpattern.append((ANY, None))
! 	elif this == "(":
! 	    group = 1
! 	    name = None
! 	    if source.match("?"):
! 		group = 0
! 		# options
! 		if source.match("P"):
! 		    # named group: skip forward to end of name
! 		    if source.match("<"):
! 			name = ""
! 			while 1:
! 			    char = source.get()
! 			    if char is None or char == ">":
! 				break
! 			    name = name + char
! 			group = 1
! 		elif source.match(":"):
! 		    # non-capturing group
! 		    group = 2
! 		elif source.match_set("iI"):
! 		    pattern.setflag("i")
! 		elif source.match_set("lL"):
! 		    pattern.setflag("l")
! 		elif source.match_set("mM"):
! 		    pattern.setflag("m")
! 		elif source.match_set("sS"):
! 		    pattern.setflag("s")
! 		elif source.match_set("xX"):
! 		    pattern.setflag("x")
! 	    if group:
! 		# parse group contents
! 		b = []
! 		if group == 2:
! 		    # anonymous group
! 		    group = None
! 		else:
! 		    group = pattern.getgroup(name)
!  		if group:
!  		    subpattern.append((MARK, (group-1)*2))
! 		while 1:
! 		    p = _parse(source, pattern, flags)
! 		    if source.match(")"):
! 			if b:
! 			    b.append(p)
! 			    _branch(subpattern, b)
! 			else:
! 			    subpattern.append((SUBPATTERN, (group, p)))
! 			break
! 		    elif source.match("|"):
! 			b.append(p)
! 		    else:
! 			raise SyntaxError, "group not properly closed"
!  		if group:
!  		    subpattern.append((MARK, (group-1)*2+1))
! 	    else:
! 		# FIXME: should this really be a while loop?
! 		while 1:
! 		    char = source.get()
! 		    if char is None or char == ")":
! 			break
! 
! 	elif this == "^":
! 	    subpattern.append((AT, AT_BEGINNING))
! 
! 	elif this == "$":
! 	    subpattern.append((AT, AT_END))
! 
! 	elif this and this[0] == "\\":
! 	    code =_fixescape(this)
! 	    subpattern.append(code)
  
! 	else:
! 	    raise SyntaxError, "parser error"
  
      return subpattern
***************
*** 449,466 ****
      b = []
      while 1:
!         p = _parse(s, g, flags)
!         tail = s.get()
!         if tail == "|":
!             b.append(p)
!         elif tail == ")":
!             raise SyntaxError, "unbalanced parenthesis"
!         elif tail is None:
!             if b:
!                 b.append(p)
!                 p = SubPattern(g)
!                 _branch(p, b)
!             break
!         else:
!             raise SyntaxError, "bogus characters at end of regular expression"
      return p
  
--- 454,471 ----
      b = []
      while 1:
! 	p = _parse(s, g, flags)
! 	tail = s.get()
! 	if tail == "|":
! 	    b.append(p)
! 	elif tail == ")":
! 	    raise SyntaxError, "unbalanced parenthesis"
! 	elif tail is None:
! 	    if b:
! 		b.append(p)
! 		p = SubPattern(g)
! 		_branch(p, b)
! 	    break
! 	else:
! 	    raise SyntaxError, "bogus characters at end of regular expression"
      return p
  
***************
*** 470,490 ****
      a = b = c = 0
      for pattern, flags in PATTERNS:
!         if flags:
!             continue
!         print "-"*68
!         try:
!             p = parse(pattern)
!             print repr(pattern), "->"
!             pprint(p.data)
!             import sre_compile
!             try:
!                 code = sre_compile.compile(p)
!                 c = c + 1
!             except:
!                 pass
!             a = a + 1
!         except SyntaxError, v:
!             print "**", repr(pattern), v
!         b = b + 1
      print "-"*68
      print a, "of", b, "patterns successfully parsed"
--- 475,495 ----
      a = b = c = 0
      for pattern, flags in PATTERNS:
! 	if flags:
! 	    continue
! 	print "-"*68
! 	try:
! 	    p = parse(pattern)
! 	    print repr(pattern), "->"
! 	    pprint(p.data)
! 	    import sre_compile
! 	    try:
! 		code = sre_compile.compile(p)
! 		c = c + 1
! 	    except:
! 		pass
! 	    a = a + 1
! 	except SyntaxError, v:
! 	    print "**", repr(pattern), v
! 	b = b + 1
      print "-"*68
      print a, "of", b, "patterns successfully parsed"