[Python-checkins] CVS: python/dist/src/Modules _sre.c,2.29,2.30 sre.h,2.16,2.17

Fredrik Lundh python-dev@python.org
Tue, 1 Aug 2000 14:05:44 -0700


Update of /cvsroot/python/python/dist/src/Modules
In directory slayer.i.sourceforge.net:/tmp/cvs-serv12762/Modules

Modified Files:
	_sre.c sre.h 
Log Message:


-- fixed width calculations for alternations
-- fixed literal check in branch operator
   (this broke test_tokenize, as reported by Mark Favas)
-- added REPEAT_ONE operator (still not enabled, though)
-- added some debugging stuff (maxlevel)

Index: _sre.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Modules/_sre.c,v
retrieving revision 2.29
retrieving revision 2.30
diff -C2 -r2.29 -r2.30
*** _sre.c	2000/08/01 18:20:07	2.29
--- _sre.c	2000/08/01 21:05:41	2.30
***************
*** 220,223 ****
--- 220,231 ----
  mark_fini(SRE_STATE* state)
  {
+ #if 0
+     /* FIXME: debugging */
+     if (state->maxlevel > 0)
+         printf("max %d\n", state->maxlevel);
+     if (state->mark_stack_base > 0)
+         printf("mark stack %d\n", state->mark_stack_base);
+ #endif
+ 
      if (state->mark_stack)
          free(state->mark_stack);
***************
*** 431,435 ****
  
  LOCAL(int)
! SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
  {
      /* check if string matches the given pattern.  returns -1 for
--- 439,443 ----
  
  LOCAL(int)
! SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
  {
      /* check if string matches the given pattern.  returns -1 for
***************
*** 444,448 ****
      SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
  
!     TRACE(("%8d: enter\n", PTR(ptr)));
  
      if (pattern[0] == SRE_OP_INFO) {
--- 452,456 ----
      SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
  
!     TRACE(("%8d: enter %d\n", PTR(ptr), level));
  
      if (pattern[0] == SRE_OP_INFO) {
***************
*** 457,460 ****
--- 465,472 ----
      }
  
+     /* FIXME: debugging */
+     if (level > state->maxlevel)
+         state->maxlevel = level;
+ 
      for (;;) {
  
***************
*** 624,628 ****
              if (state->ptr < state->beginning)
                  return 0;
!             i = SRE_MATCH(state, pattern + 2);
              if (i <= 0)
                  return i;
--- 636,640 ----
              if (state->ptr < state->beginning)
                  return 0;
!             i = SRE_MATCH(state, pattern + 2, level + 1);
              if (i <= 0)
                  return i;
***************
*** 639,643 ****
              if (state->ptr < state->beginning)
                  return 0;
!             i = SRE_MATCH(state, pattern + 2);
              if (i < 0)
                  return i;
--- 651,655 ----
              if (state->ptr < state->beginning)
                  return 0;
!             i = SRE_MATCH(state, pattern + 2, level + 1);
              if (i < 0)
                  return i;
***************
*** 657,664 ****
                  while (pattern[0]) {
                      TRACE(("%8d: try branch\n", PTR(ptr)));
!                     if (pattern[2] != SRE_OP_LITERAL ||
!                         (ptr < end && (SRE_CODE) ptr[0] == pattern[3])) {
                          state->ptr = ptr;
!                         i = SRE_MATCH(state, pattern + 1);
                          if (i)
                              return i;
--- 669,676 ----
                  while (pattern[0]) {
                      TRACE(("%8d: try branch\n", PTR(ptr)));
!                     if (pattern[1] != SRE_OP_LITERAL ||
!                         (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
                          state->ptr = ptr;
!                         i = SRE_MATCH(state, pattern + 1, level + 1);
                          if (i)
                              return i;
***************
*** 671,674 ****
--- 683,835 ----
              return 0;
  
+         case SRE_OP_REPEAT_ONE:
+             /* match repeated sequence (maximizing regexp) */
+ 
+             /* this operator only works if the repeated item is
+                exactly one character wide, and we're not already
+                collecting backtracking points.  for other cases,
+                use the MAX_REPEAT operator instead */
+ 
+             /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
+ 
+             TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
+                    pattern[1], pattern[2]));
+ 
+             count = 0;
+ 
+             if (pattern[3] == SRE_OP_ANY) {
+                 /* repeated wildcard.  skip to the end of the target
+                    string, and backtrack from there */
+                 /* FIXME: must look for line endings */
+                 if (ptr + pattern[1] > end)
+                     return 0; /* cannot match */
+                 count = pattern[2];
+                 if (count > end - ptr)
+                     count = end - ptr;
+                 ptr += count;
+ 
+             } else if (pattern[3] == SRE_OP_LITERAL) {
+                 /* repeated literal */
+                 SRE_CODE chr = pattern[4];
+                 while (count < (int) pattern[2]) {
+                     if (ptr >= end || (SRE_CODE) ptr[0] != chr)
+                         break;
+                     ptr++;
+                     count++;
+                 }
+ 
+             } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
+                 /* repeated literal */
+                 SRE_CODE chr = pattern[4];
+                 while (count < (int) pattern[2]) {
+                     if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
+                         break;
+                     ptr++;
+                     count++;
+                 }
+ 
+             } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
+                 /* repeated non-literal */
+                 SRE_CODE chr = pattern[4];
+                 while (count < (int) pattern[2]) {
+                     if (ptr >= end || (SRE_CODE) ptr[0] == chr)
+                         break;
+                     ptr++;
+                     count++;
+                 }
+ 
+             } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
+                 /* repeated non-literal */
+                 SRE_CODE chr = pattern[4];
+                 while (count < (int) pattern[2]) {
+                     if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
+                         break;
+                     ptr++;
+                     count++;
+                 }
+ 
+             } else if (pattern[3] == SRE_OP_IN) {
+                 /* repeated set */
+                 while (count < (int) pattern[2]) {
+                     if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
+                         break;
+                     ptr++;
+                     count++;
+                 }
+ 
+             } else {
+                 /* repeated single character pattern */
+                 state->ptr = ptr;
+                 while (count < (int) pattern[2]) {
+                     i = SRE_MATCH(state, pattern + 3, level + 1);
+                     if (i < 0)
+                         return i;
+                     if (!i)
+                         break;
+                     count++;
+                 }
+                 state->ptr = ptr;
+                 ptr += count;
+             }
+ 
+             /* when we arrive here, count contains the number of
+                matches, and ptr points to the tail of the target
+                string.  check if the rest of the pattern matches,
+                and backtrack if not. */
+ 
+             TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
+ 
+             if (count < (int) pattern[1])
+                 return 0;
+ 
+             if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
+                 /* tail is empty.  we're finished */
+                 TRACE(("%8d: tail is empty\n", PTR(ptr)));
+                 state->ptr = ptr;
+                 return 1;
+ 
+             } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
+                 /* tail starts with a literal. skip positions where
+                    the rest of the pattern cannot possibly match */
+                 SRE_CODE chr = pattern[pattern[0]+1];
+                 TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
+                 for (;;) {
+                     TRACE(("%8d: scan for tail match\n", PTR(ptr)));
+                     while (count >= (int) pattern[1] &&
+                            (ptr >= end || *ptr != chr)) {
+                         ptr--;
+                         count--;
+                     }
+                     TRACE(("%8d: check tail\n", PTR(ptr)));
+                     if (count < (int) pattern[1])
+                         break;
+                     state->ptr = ptr;
+                     i = SRE_MATCH(state, pattern + pattern[0], level + 1);
+                     if (i > 0) {
+                         TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+                         return 1;
+                     }
+                     ptr--;
+                     count--;
+                 }
+ 
+             } else {
+                 /* general case */
+                 TRACE(("%8d: tail is pattern\n", PTR(ptr)));
+                 while (count >= (int) pattern[1]) {
+                     state->ptr = ptr;
+                     i = SRE_MATCH(state, pattern + pattern[0], level + 1);
+                     if (i < 0)
+                         return i;
+                     if (i) {
+                         TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+                         return 1;
+                     }
+                     ptr--;
+                     count--;
+                 }
+             }
+             return 0;
+ 
          case SRE_OP_REPEAT:
              /* create repeat context.  all the hard work is done
***************
*** 678,683 ****
                     pattern[1], pattern[2]));
  
-             state->ptr = ptr;
- 
              rep.count = -1;
              rep.pattern = pattern;
--- 839,842 ----
***************
*** 687,694 ****
              state->repeat = &rep;
  
!             i = SRE_MATCH(state, pattern + pattern[0]);
  
              state->repeat = rep.prev;
-             /* free(rp); */
  
              return i;
--- 846,853 ----
              state->repeat = &rep;
  
!             state->ptr = ptr;
!             i = SRE_MATCH(state, pattern + pattern[0], level + 1);
  
              state->repeat = rep.prev;
  
              return i;
***************
*** 715,719 ****
                  TRACE(("%8d: match item (required)\n", PTR(ptr)));
                  rp->count = count;
!                 i = SRE_MATCH(state, rp->pattern + 3);
                  if (i)
                      return i;
--- 874,878 ----
                  TRACE(("%8d: match item (required)\n", PTR(ptr)));
                  rp->count = count;
!                 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
                  if (i)
                      return i;
***************
*** 730,734 ****
                  lastmark = state->lastmark;
                  mark_save(state, 0, lastmark);
!                 i = SRE_MATCH(state, rp->pattern + 3);
                  if (i)
                      return i;
--- 889,893 ----
                  lastmark = state->lastmark;
                  mark_save(state, 0, lastmark);
!                 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
                  if (i)
                      return i;
***************
*** 742,750 ****
              TRACE(("%8d: match tail\n", PTR(ptr)));
              state->repeat = rp->prev;
!             i = SRE_MATCH(state, pattern);
!             if (i) {
!                 /* free(rp); */
                  return i;
-             }
              state->repeat = rp;
              return 0;
--- 901,907 ----
              TRACE(("%8d: match tail\n", PTR(ptr)));
              state->repeat = rp->prev;
!             i = SRE_MATCH(state, pattern, level + 1);
!             if (i)
                  return i;
              state->repeat = rp;
              return 0;
***************
*** 768,772 ****
                  TRACE(("%8d: match item (required)\n", PTR(ptr)));
                  rp->count = count;
!                 i = SRE_MATCH(state, rp->pattern + 3);
                  if (i)
                      return i;
--- 925,929 ----
                  TRACE(("%8d: match item (required)\n", PTR(ptr)));
                  rp->count = count;
!                 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
                  if (i)
                      return i;
***************
*** 779,783 ****
              TRACE(("%8d: match tail\n", PTR(ptr)));
              state->repeat = rp->prev;
!             i = SRE_MATCH(state, pattern);
              if (i) {
                  /* free(rp); */
--- 936,940 ----
              TRACE(("%8d: match tail\n", PTR(ptr)));
              state->repeat = rp->prev;
!             i = SRE_MATCH(state, pattern, level + 1);
              if (i) {
                  /* free(rp); */
***************
*** 791,795 ****
              TRACE(("%8d: match item (optional)\n", PTR(ptr)));
              rp->count = count;
!             i = SRE_MATCH(state, rp->pattern + 3);
              if (i)
                  return i;
--- 948,952 ----
              TRACE(("%8d: match item (optional)\n", PTR(ptr)));
              rp->count = count;
!             i = SRE_MATCH(state, rp->pattern + 3, level + 1);
              if (i)
                  return i;
***************
*** 866,870 ****
                          if (flags & SRE_INFO_LITERAL)
                              return 1; /* we got all of it */
!                         status = SRE_MATCH(state, pattern + 2*prefix_len);
                          if (status != 0)
                              return status;
--- 1023,1027 ----
                          if (flags & SRE_INFO_LITERAL)
                              return 1; /* we got all of it */
!                         status = SRE_MATCH(state, pattern + 2*prefix_len, 1);
                          if (status != 0)
                              return status;
***************
*** 894,898 ****
              state->start = ptr;
              state->ptr = ++ptr;
!             status = SRE_MATCH(state, pattern + 2);
              if (status != 0)
                  break;
--- 1051,1055 ----
              state->start = ptr;
              state->ptr = ++ptr;
!             status = SRE_MATCH(state, pattern + 2, 1);
              if (status != 0)
                  break;
***************
*** 908,912 ****
              state->start = ptr;
              state->ptr = ptr;
!             status = SRE_MATCH(state, pattern);
              if (status != 0)
                  break;
--- 1065,1069 ----
              state->start = ptr;
              state->ptr = ptr;
!             status = SRE_MATCH(state, pattern, 1);
              if (status != 0)
                  break;
***************
*** 917,921 ****
              TRACE(("%8d: === SEARCH ===\n", PTR(ptr))); 
              state->start = state->ptr = ptr++;
!             status = SRE_MATCH(state, pattern);
              if (status != 0)
                  break;
--- 1074,1078 ----
              TRACE(("%8d: === SEARCH ===\n", PTR(ptr))); 
              state->start = state->ptr = ptr++;
!             status = SRE_MATCH(state, pattern, 1);
              if (status != 0)
                  break;
***************
*** 1033,1036 ****
--- 1190,1196 ----
      state->repeat = NULL;
  
+     /* FIXME: debugging */
+     state->maxlevel = 0;
+ 
      mark_fini(state);
  }
***************
*** 1111,1114 ****
--- 1271,1275 ----
  
      state->mark_stack = NULL;
+     state->mark_stack_base = 0;
  
      state_reset(state);
***************
*** 1263,1270 ****
  
      if (state.charsize == 1) {
!         status = sre_match(&state, PatternObject_GetCode(self));
      } else {
  #if defined(HAVE_UNICODE)
!         status = sre_umatch(&state, PatternObject_GetCode(self));
  #endif
      }
--- 1424,1431 ----
  
      if (state.charsize == 1) {
!         status = sre_match(&state, PatternObject_GetCode(self), 1);
      } else {
  #if defined(HAVE_UNICODE)
!         status = sre_umatch(&state, PatternObject_GetCode(self), 1);
  #endif
      }
***************
*** 1942,1949 ****
  
      if (state->charsize == 1) {
!         status = sre_match(state, PatternObject_GetCode(self->pattern));
      } else {
  #if defined(HAVE_UNICODE)
!         status = sre_umatch(state, PatternObject_GetCode(self->pattern));
  #endif
      }
--- 2103,2110 ----
  
      if (state->charsize == 1) {
!         status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
      } else {
  #if defined(HAVE_UNICODE)
!         status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
  #endif
      }

Index: sre.h
===================================================================
RCS file: /cvsroot/python/python/dist/src/Modules/sre.h,v
retrieving revision 2.16
retrieving revision 2.17
diff -C2 -r2.16 -r2.17
*** sre.h	2000/08/01 18:20:07	2.16
--- sre.h	2000/08/01 21:05:41	2.17
***************
*** 75,78 ****
--- 75,80 ----
      /* hooks */
      SRE_TOLOWER_HOOK lower;
+     /* debugging */
+     int maxlevel;
  } SRE_STATE;