[Python-checkins] CVS: python/dist/src/Lib sre.py,1.20,1.21 sre_compile.py,1.25,1.26 sre_constants.py,1.17,1.18 sre_parse.py,1.25,1.26

Fredrik Lundh python-dev@python.org
Sun, 23 Jul 2000 14:46:21 -0700


Update of /cvsroot/python/python/dist/src/Lib
In directory slayer.i.sourceforge.net:/tmp/cvs-serv6140/Lib

Modified Files:
	sre.py sre_compile.py sre_constants.py sre_parse.py 
Log Message:


-- SRE 0.9.6 sync.  this includes:

 + added "regs" attribute
 + fixed "pos" and "endpos" attributes
 + reset "lastindex" and "lastgroup" in scanner methods
 + removed (?P#id) syntax; the "lastindex" and "lastgroup"
   attributes are now always set
 + removed string module dependencies in sre_parse
 + better debugging support in sre_parse
 + various tweaks to build under 1.5.2


Index: sre.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre.py,v
retrieving revision 1.20
retrieving revision 1.21
diff -C2 -r1.20 -r1.21
*** sre.py	2000/07/02 22:59:57	1.20
--- sre.py	2000/07/23 21:46:17	1.21
***************
*** 11,17 ****
--- 11,21 ----
  #
  
+ # FIXME: change all FIXME's to XXX ;-)
+ 
  import sre_compile
  import sre_parse
  
+ import string
+ 
  # flags
  I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
***************
*** 54,57 ****
--- 58,64 ----
      return _compile(pattern, flags)
  
+ def purge():
+     _cache.clear()
+ 
  def template(pattern, flags=0):
      return _compile(pattern, flags|T)
***************
*** 66,70 ****
              else:
                  s[i] = "\\" + c
!     return pattern[:0].join(s)
  
  # --------------------------------------------------------------------
--- 73,77 ----
              else:
                  s[i] = "\\" + c
!     return _join(s, pattern)
  
  # --------------------------------------------------------------------
***************
*** 74,81 ****
  _MAXCACHE = 100
  
  def _compile(pattern, flags=0):
      # internal: compile pattern
      tp = type(pattern)
!     if tp not in (type(""), type(u"")):
          return pattern
      key = (tp, pattern, flags)
--- 81,92 ----
  _MAXCACHE = 100
  
+ def _join(seq, sep):
+     # internal: join into string having the same type as sep
+     return string.join(seq, sep[:0])
+ 
  def _compile(pattern, flags=0):
      # internal: compile pattern
      tp = type(pattern)
!     if tp not in sre_compile.STRING_TYPES:
          return pattern
      key = (tp, pattern, flags)
***************
*** 90,97 ****
      return p
  
- def purge():
-     # clear pattern cache
-     _cache.clear()
- 
  def _sub(pattern, template, string, count=0):
      # internal: pattern.sub implementation hook
--- 101,104 ----
***************
*** 121,125 ****
          n = n + 1
      append(string[i:])
!     return string[:0].join(s), n
  
  def _split(pattern, string, maxsplit=0):
--- 128,132 ----
          n = n + 1
      append(string[i:])
!     return _join(s, string[:0]), n
  
  def _split(pattern, string, maxsplit=0):
***************
*** 162,170 ****
  class Scanner:
      def __init__(self, lexicon):
          self.lexicon = lexicon
          p = []
          for phrase, action in lexicon:
!             p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
!         self.scanner = _compile("|".join(p))
      def scan(self, string):
          result = []
--- 169,185 ----
  class Scanner:
      def __init__(self, lexicon):
+         from sre_constants import BRANCH, SUBPATTERN, INDEX
          self.lexicon = lexicon
+         # combine phrases into a compound pattern
          p = []
+         s = sre_parse.Pattern()
          for phrase, action in lexicon:
!             p.append(sre_parse.SubPattern(s, [
!                 (SUBPATTERN, (None, sre_parse.parse(phrase))),
!                 (INDEX, len(p))
!                 ]))
!         p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
!         s.groups = len(p)
!         self.scanner = sre_compile.compile(p)
      def scan(self, string):
          result = []

Index: sre_compile.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_compile.py,v
retrieving revision 1.25
retrieving revision 1.26
diff -C2 -r1.25 -r1.26
*** sre_compile.py	2000/07/05 21:14:15	1.25
--- sre_compile.py	2000/07/23 21:46:17	1.26
***************
*** 198,205 ****
                  emit(ATCODES[av])
          elif op is BRANCH:
-             emit(OPCODES[op])
              tail = []
              for av in av[1]:
                  skip = len(code); emit(0)
                  _compile(code, av, flags)
                  emit(OPCODES[JUMP])
--- 198,206 ----
                  emit(ATCODES[av])
          elif op is BRANCH:
              tail = []
              for av in av[1]:
+                 emit(OPCODES[op])
                  skip = len(code); emit(0)
+                 emit(MAXCODE) # save mark
                  _compile(code, av, flags)
                  emit(OPCODES[JUMP])
***************
*** 287,295 ****
      code[skip] = len(code) - skip
  
  def compile(p, flags=0):
      # internal: convert pattern list to internal format
  
      # compile, as necessary
!     if type(p) in (type(""), type(u"")):
          import sre_parse
          pattern = p
--- 288,303 ----
      code[skip] = len(code) - skip
  
+ STRING_TYPES = [type("")]
+ 
+ try:
+     STRING_TYPES.append(type(unicode("")))
+ except NameError:
+     pass
+ 
  def compile(p, flags=0):
      # internal: convert pattern list to internal format
  
      # compile, as necessary
!     if type(p) in STRING_TYPES:
          import sre_parse
          pattern = p
***************
*** 308,311 ****
--- 316,321 ----
  
      code.append(OPCODES[SUCCESS])
+ 
+     # print code
  
      # FIXME: <fl> get rid of this limitation!

Index: sre_constants.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_constants.py,v
retrieving revision 1.17
retrieving revision 1.18
diff -C2 -r1.17 -r1.18
*** sre_constants.py	2000/07/16 12:04:30	1.17
--- sre_constants.py	2000/07/23 21:46:17	1.18
***************
*** 173,177 ****
  SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
  SRE_FLAG_IGNORECASE = 2 # case insensitive
! SRE_FLAG_LOCALE = 4 # honor system locale
  SRE_FLAG_MULTILINE = 8 # treat target as multiline string
  SRE_FLAG_DOTALL = 16 # treat target as a single string
--- 173,177 ----
  SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
  SRE_FLAG_IGNORECASE = 2 # case insensitive
! SRE_FLAG_LOCALE = 4 # honour system locale
  SRE_FLAG_MULTILINE = 8 # treat target as multiline string
  SRE_FLAG_DOTALL = 16 # treat target as a single string

Index: sre_parse.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_parse.py,v
retrieving revision 1.25
retrieving revision 1.26
diff -C2 -r1.25 -r1.26
*** sre_parse.py	2000/07/03 21:31:48	1.25
--- sre_parse.py	2000/07/23 21:46:17	1.26
***************
*** 26,35 ****
  REPEAT_CHARS  = "*+?{"
  
! DIGITS = tuple(string.digits)
  
  OCTDIGITS = tuple("01234567")
  HEXDIGITS = tuple("0123456789abcdefABCDEF")
  
! WHITESPACE = tuple(string.whitespace)
  
  ESCAPES = {
--- 26,35 ----
  REPEAT_CHARS  = "*+?{"
  
! DIGITS = tuple("012345689")
  
  OCTDIGITS = tuple("01234567")
  HEXDIGITS = tuple("0123456789abcdefABCDEF")
  
! WHITESPACE = tuple(" \t\n\r\v\f")
  
  ESCAPES = {
***************
*** 69,73 ****
  }
  
! class State:
      def __init__(self):
          self.flags = 0
--- 69,74 ----
  }
  
! class Pattern:
!     # master pattern object.  keeps track of global attributes
      def __init__(self):
          self.flags = 0
***************
*** 89,92 ****
--- 90,120 ----
          self.data = data
          self.width = None
+     def dump(self, level=0):
+         nl = 1
+         for op, av in self.data:
+             print level*"  " + op,; nl = 0
+             if op == "in":
+                 # member sublanguage
+                 print; nl = 1
+                 for op, a in av:
+                     print (level+1)*"  " + op, a
+             elif op == "branch":
+                 print; nl = 1
+                 i = 0
+                 for a in av[1]:
+                     if i > 0:
+                         print level*"  " + "or"
+                     a.dump(level+1); nl = 1
+                     i = i + 1
+             elif type(av) in (type(()), type([])):
+                 for a in av:
+                     if isinstance(a, SubPattern):
+                         if not nl: print
+                         a.dump(level+1); nl = 1
+                     else:
+                         print a, ; nl = 0
+             else:
+                 print av, ; nl = 0
+             if not nl: print
      def __repr__(self):
          return repr(self.data)
***************
*** 256,264 ****
      raise error, "bogus escape: %s" % repr(escape)
  
! def _branch(pattern, items):
!     # form a branch operator from a set of items
  
!     subpattern = SubPattern(pattern)
  
      # check if all items share a common prefix
      while 1:
--- 284,307 ----
      raise error, "bogus escape: %s" % repr(escape)
  
! def _parse_sub(source, state, nested=1):
!     # parse an alternation: a|b|c
  
!     items = []
!     while 1:
!         items.append(_parse(source, state))
!         if source.match("|"):
!             continue
!         if not nested:
!             break
!         if not source.next or source.match(")"):
!             break
!         else:
!             raise error, "pattern not properly closed"
  
+     if len(items) == 1:
+         return items[0]
+ 
+     subpattern = SubPattern(state)
+ 
      # check if all items share a common prefix
      while 1:
***************
*** 286,290 ****
      else:
          # we can store this as a character set instead of a
!         # branch (FIXME: use a range if possible)
          set = []
          for item in items:
--- 329,333 ----
      else:
          # we can store this as a character set instead of a
!         # branch (the compiler may optimize this even more)
          set = []
          for item in items:
***************
*** 297,302 ****
  
  def _parse(source, state):
! 
!     # parse regular expression pattern into an operator list.
  
      subpattern = SubPattern(state)
--- 340,344 ----
  
  def _parse(source, state):
!     # parse a simple pattern
  
      subpattern = SubPattern(state)
***************
*** 452,471 ****
                              raise error, "unknown group name"
                          subpattern.append((GROUPREF, gid))
-                     elif source.match("#"):
-                         index = ""
-                         while 1:
-                             char = source.get()
-                             if char is None:
-                                 raise error, "unterminated index"
-                             if char == ")":
-                                 break
-                             index = index + char
-                         try:
-                             index = int(index)
-                             if index < 0 or index > MAXREPEAT:
-                                 raise ValueError
-                         except ValueError:
-                             raise error, "illegal index"
-                         subpattern.append((INDEX, index))
                          continue
                      else:
--- 494,497 ----
***************
*** 492,511 ****
                          dir = -1 # lookbehind
                          char = source.get()
!                     b = []
!                     while 1:
!                         p = _parse(source, state)
!                         if source.next == ")":
!                             if b:
!                                 b.append(p)
!                                 p = _branch(state, b)
!                             if char == "=":
!                                 subpattern.append((ASSERT, (dir, p)))
!                             else:
!                                 subpattern.append((ASSERT_NOT, (dir, p)))
!                             break
!                         elif source.match("|"):
!                             b.append(p)
!                         else:
!                             raise error, "pattern not properly closed"
                  else:
                      # flags
--- 518,527 ----
                          dir = -1 # lookbehind
                          char = source.get()
!                     p = _parse_sub(source, state)
!                     if char == "=":
!                         subpattern.append((ASSERT, (dir, p)))
!                     else:
!                         subpattern.append((ASSERT_NOT, (dir, p)))
!                     continue
                  else:
                      # flags
***************
*** 514,518 ****
              if group:
                  # parse group contents
-                 b = []
                  if group == 2:
                      # anonymous group
--- 530,533 ----
***************
*** 520,537 ****
                  else:
                      group = state.getgroup(name)
!                 while 1:
!                     p = _parse(source, state)
!                     if group is not None:
!                         p.append((INDEX, group))
!                     if source.match(")"):
!                         if b:
!                             b.append(p)
!                             p = _branch(state, b)
!                         subpattern.append((SUBPATTERN, (group, p)))
!                         break
!                     elif source.match("|"):
!                         b.append(p)
!                     else:
!                         raise error, "group not properly closed"
              else:
                  while 1:
--- 535,542 ----
                  else:
                      group = state.getgroup(name)
!                 p = _parse_sub(source, state)
!                 subpattern.append((SUBPATTERN, (group, p)))
!                 if group is not None:
!                     p.append((INDEX, group))
              else:
                  while 1:
***************
*** 556,579 ****
      return subpattern
  
! def parse(pattern, flags=0):
      # parse 're' pattern into list of (opcode, argument) tuples
!     source = Tokenizer(pattern)
!     state = State()
!     state.flags = flags
!     b = []
!     while 1:
!         p = _parse(source, state)
!         tail = source.get()
!         if tail == "|":
!             b.append(p)
!         elif tail == ")":
!             raise error, "unbalanced parenthesis"
!         elif tail is None:
!             if b:
!                 b.append(p)
!                 p = _branch(state, b)
!             break
!         else:
!             raise error, "bogus characters at end of regular expression"
      return p
  
--- 561,582 ----
      return subpattern
  
! def parse(str, flags=0):
      # parse 're' pattern into list of (opcode, argument) tuples
! 
!     source = Tokenizer(str)
! 
!     pattern = Pattern()
!     pattern.flags = flags
! 
!     p = _parse_sub(source, pattern, 0)
! 
!     tail = source.get()
!     if tail == ")":
!         raise error, "unbalanced parenthesis"
!     elif tail:
!         raise error, "bogus characters at end of regular expression"
! 
!     # p.dump()
! 
      return p
  
***************
*** 657,659 ****
                  raise error, "empty group"
              a(s)
!     return sep.join(p)
--- 660,662 ----
                  raise error, "empty group"
              a(s)
!     return string.join(p, sep)