[Python-checkins] cpython: Issue #433028: Added support of modifier spans in regular expressions.

serhiy.storchaka python-checkins at python.org
Fri Sep 9 18:15:11 EDT 2016


https://hg.python.org/cpython/rev/ce5a834978ac
changeset:   103479:ce5a834978ac
parent:      103471:66afc449efa9
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Sat Sep 10 00:57:55 2016 +0300
summary:
  Issue #433028: Added support of modifier spans in regular expressions.

files:
  Doc/library/re.rst   |   10 ++
  Doc/whatsnew/3.6.rst |    9 ++
  Lib/re.py            |    2 +-
  Lib/sre_compile.py   |   69 ++++++++++--------
  Lib/sre_parse.py     |  114 ++++++++++++++++++++++--------
  Lib/test/test_re.py  |   40 +++++++++-
  Misc/NEWS            |    2 +
  7 files changed, 180 insertions(+), 66 deletions(-)


diff --git a/Doc/library/re.rst b/Doc/library/re.rst
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -237,6 +237,16 @@
    *cannot* be retrieved after performing a match or referenced later in the
    pattern.
 
+``(?imsx-imsx:...)``
+   (Zero or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``,
+   optionally followed by ``'-'`` followed by one or more letters from the
+   same set.)  The letters set or remove the corresponding flags:
+   :const:`re.I` (ignore case), :const:`re.M` (multi-line), :const:`re.S`
+   (dot matches all), and :const:`re.X` (verbose), for the part of the
+   expression.  (The flags are described in :ref:`contents-of-module-re`.)
+
+   .. versionadded:: 3.6
+
 ``(?P<name>...)``
    Similar to regular parentheses, but the substring matched by the group is
    accessible via the symbolic group name *name*.  Group names must be valid
diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@@ -645,6 +645,15 @@
 Storchaka in :issue:`24164`.)
 
 
+re
+--
+
+Added support of modifier spans in regular expressions.  Examples:
+``'(?i:p)ython'`` matches ``'python'`` and ``'Python'``, but not ``'PYTHON'``;
+``'(?i)g(?-i:v)r'`` matches ``'GvR'`` and ``'gvr'``, but not ``'GVR'``.
+(Contributed by Serhiy Storchaka in :issue:`433028`.)
+
+
 readline
 --------
 
diff --git a/Lib/re.py b/Lib/re.py
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -352,7 +352,7 @@
         for phrase, action in lexicon:
             gid = s.opengroup()
             p.append(sre_parse.SubPattern(s, [
-                (SUBPATTERN, (gid, sre_parse.parse(phrase, flags))),
+                (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
                 ]))
             s.closegroup(gid, p[-1])
         p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -71,7 +71,8 @@
     ASSERT_CODES = _ASSERT_CODES
     if (flags & SRE_FLAG_IGNORECASE and
             not (flags & SRE_FLAG_LOCALE) and
-            flags & SRE_FLAG_UNICODE):
+            flags & SRE_FLAG_UNICODE and
+            not (flags & SRE_FLAG_ASCII)):
         fixes = _ignorecase_fixes
     else:
         fixes = None
@@ -137,14 +138,15 @@
                 else:
                     emit(MIN_UNTIL)
         elif op is SUBPATTERN:
-            if av[0]:
+            group, add_flags, del_flags, p = av
+            if group:
                 emit(MARK)
-                emit((av[0]-1)*2)
-            # _compile_info(code, av[1], flags)
-            _compile(code, av[1], flags)
-            if av[0]:
+                emit((group-1)*2)
+            # _compile_info(code, p, (flags | add_flags) & ~del_flags)
+            _compile(code, p, (flags | add_flags) & ~del_flags)
+            if group:
                 emit(MARK)
-                emit((av[0]-1)*2+1)
+                emit((group-1)*2+1)
         elif op in SUCCESS_CODES:
             emit(op)
         elif op in ASSERT_CODES:
@@ -172,7 +174,7 @@
                 av = AT_MULTILINE.get(av, av)
             if flags & SRE_FLAG_LOCALE:
                 av = AT_LOCALE.get(av, av)
-            elif flags & SRE_FLAG_UNICODE:
+            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                 av = AT_UNICODE.get(av, av)
             emit(av)
         elif op is BRANCH:
@@ -193,7 +195,7 @@
             emit(op)
             if flags & SRE_FLAG_LOCALE:
                 av = CH_LOCALE[av]
-            elif flags & SRE_FLAG_UNICODE:
+            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                 av = CH_UNICODE[av]
             emit(av)
         elif op is GROUPREF:
@@ -237,7 +239,7 @@
         elif op is CATEGORY:
             if flags & SRE_FLAG_LOCALE:
                 emit(CH_LOCALE[av])
-            elif flags & SRE_FLAG_UNICODE:
+            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                 emit(CH_UNICODE[av])
             else:
                 emit(av)
@@ -414,14 +416,16 @@
     prefix = []
     prefixappend = prefix.append
     prefix_skip = None
-    got_all = True
     for op, av in pattern.data:
         if op is LITERAL:
             prefixappend(av)
         elif op is SUBPATTERN:
-            prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1])
+            group, add_flags, del_flags, p = av
+            if add_flags & SRE_FLAG_IGNORECASE:
+                break
+            prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
             if prefix_skip is None:
-                if av[0] is not None:
+                if group is not None:
                     prefix_skip = len(prefix)
                 elif prefix_skip1 is not None:
                     prefix_skip = len(prefix) + prefix_skip1
@@ -429,32 +433,35 @@
             if not got_all:
                 break
         else:
-            got_all = False
             break
-    return prefix, prefix_skip, got_all
+    else:
+        return prefix, prefix_skip, True
+    return prefix, prefix_skip, False
 
 def _get_charset_prefix(pattern):
     charset = [] # not used
     charsetappend = charset.append
     if pattern.data:
         op, av = pattern.data[0]
-        if op is SUBPATTERN and av[1]:
-            op, av = av[1][0]
-            if op is LITERAL:
-                charsetappend((op, av))
-            elif op is BRANCH:
-                c = []
-                cappend = c.append
-                for p in av[1]:
-                    if not p:
-                        break
-                    op, av = p[0]
-                    if op is LITERAL:
-                        cappend((op, av))
+        if op is SUBPATTERN:
+            group, add_flags, del_flags, p = av
+            if p and not (add_flags & SRE_FLAG_IGNORECASE):
+                op, av = p[0]
+                if op is LITERAL:
+                    charsetappend((op, av))
+                elif op is BRANCH:
+                    c = []
+                    cappend = c.append
+                    for p in av[1]:
+                        if not p:
+                            break
+                        op, av = p[0]
+                        if op is LITERAL:
+                            cappend((op, av))
+                        else:
+                            break
                     else:
-                        break
-                else:
-                    charset = c
+                        charset = c
         elif op is BRANCH:
             c = []
             cappend = c.append
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -65,6 +65,12 @@
     "u": SRE_FLAG_UNICODE,
 }
 
+GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
+                SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)
+
+class Verbose(Exception):
+    pass
+
 class Pattern:
     # master pattern object.  keeps track of global attributes
     def __init__(self):
@@ -184,7 +190,7 @@
                 lo = lo + i
                 hi = hi + j
             elif op is SUBPATTERN:
-                i, j = av[1].getwidth()
+                i, j = av[-1].getwidth()
                 lo = lo + i
                 hi = hi + j
             elif op in _REPEATCODES:
@@ -395,7 +401,7 @@
         pass
     raise source.error("bad escape %s" % escape, len(escape))
 
-def _parse_sub(source, state, nested=True):
+def _parse_sub(source, state, verbose, nested=True):
     # parse an alternation: a|b|c
 
     items = []
@@ -403,7 +409,7 @@
     sourcematch = source.match
     start = source.tell()
     while True:
-        itemsappend(_parse(source, state))
+        itemsappend(_parse(source, state, verbose))
         if not sourcematch("|"):
             break
 
@@ -445,10 +451,10 @@
     subpattern.append((BRANCH, (None, items)))
     return subpattern
 
-def _parse_sub_cond(source, state, condgroup):
-    item_yes = _parse(source, state)
+def _parse_sub_cond(source, state, condgroup, verbose):
+    item_yes = _parse(source, state, verbose)
     if source.match("|"):
-        item_no = _parse(source, state)
+        item_no = _parse(source, state, verbose)
         if source.next == "|":
             raise source.error("conditional backref with more than two branches")
     else:
@@ -457,7 +463,7 @@
     subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
     return subpattern
 
-def _parse(source, state):
+def _parse(source, state, verbose):
     # parse a simple pattern
     subpattern = SubPattern(state)
 
@@ -467,7 +473,6 @@
     sourcematch = source.match
     _len = len
     _ord = ord
-    verbose = state.flags & SRE_FLAG_VERBOSE
 
     while True:
 
@@ -621,6 +626,8 @@
             group = True
             name = None
             condgroup = None
+            add_flags = 0
+            del_flags = 0
             if sourcematch("?"):
                 # options
                 char = sourceget()
@@ -682,7 +689,7 @@
                         lookbehindgroups = state.lookbehindgroups
                         if lookbehindgroups is None:
                             state.lookbehindgroups = state.groups
-                    p = _parse_sub(source, state)
+                    p = _parse_sub(source, state, verbose)
                     if dir < 0:
                         if lookbehindgroups is None:
                             state.lookbehindgroups = None
@@ -718,19 +725,13 @@
                             raise source.error("invalid group reference",
                                                len(condname) + 1)
                     state.checklookbehindgroup(condgroup, source)
-                elif char in FLAGS:
+                elif char in FLAGS or char == "-":
                     # flags
-                    while True:
-                        state.flags |= FLAGS[char]
-                        char = sourceget()
-                        if char is None:
-                            raise source.error("missing )")
-                        if char == ")":
-                            break
-                        if char not in FLAGS:
-                            raise source.error("unknown flag", len(char))
-                    verbose = state.flags & SRE_FLAG_VERBOSE
-                    continue
+                    flags = _parse_flags(source, state, char)
+                    if flags is None:  # global flags
+                        continue
+                    add_flags, del_flags = flags
+                    group = None
                 else:
                     raise source.error("unknown extension ?" + char,
                                        len(char) + 1)
@@ -742,15 +743,17 @@
                 except error as err:
                     raise source.error(err.msg, len(name) + 1) from None
             if condgroup:
-                p = _parse_sub_cond(source, state, condgroup)
+                p = _parse_sub_cond(source, state, condgroup, verbose)
             else:
-                p = _parse_sub(source, state)
+                sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
+                               not (del_flags & SRE_FLAG_VERBOSE))
+                p = _parse_sub(source, state, sub_verbose)
             if not source.match(")"):
                 raise source.error("missing ), unterminated subpattern",
                                    source.tell() - start)
             if group is not None:
                 state.closegroup(group, p)
-            subpatternappend((SUBPATTERN, (group, p)))
+            subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
 
         elif this == "^":
             subpatternappend((AT, AT_BEGINNING))
@@ -763,6 +766,53 @@
 
     return subpattern
 
+def _parse_flags(source, state, char):
+    sourceget = source.get
+    add_flags = 0
+    del_flags = 0
+    if char != "-":
+        while True:
+            add_flags |= FLAGS[char]
+            char = sourceget()
+            if char is None:
+                raise source.error("missing -, : or )")
+            if char in ")-:":
+                break
+            if char not in FLAGS:
+                msg = "unknown flag" if char.isalpha() else "missing -, : or )"
+                raise source.error(msg, len(char))
+    if char == ")":
+        if ((add_flags & SRE_FLAG_VERBOSE) and
+            not (state.flags & SRE_FLAG_VERBOSE)):
+            raise Verbose
+        state.flags |= add_flags
+        return None
+    if add_flags & GLOBAL_FLAGS:
+        raise source.error("bad inline flags: cannot turn on global flag", 1)
+    if char == "-":
+        char = sourceget()
+        if char is None:
+            raise source.error("missing flag")
+        if char not in FLAGS:
+            msg = "unknown flag" if char.isalpha() else "missing flag"
+            raise source.error(msg, len(char))
+        while True:
+            del_flags |= FLAGS[char]
+            char = sourceget()
+            if char is None:
+                raise source.error("missing :")
+            if char == ":":
+                break
+            if char not in FLAGS:
+                msg = "unknown flag" if char.isalpha() else "missing :"
+                raise source.error(msg, len(char))
+    assert char == ":"
+    if del_flags & GLOBAL_FLAGS:
+        raise source.error("bad inline flags: cannot turn off global flag", 1)
+    if add_flags & del_flags:
+        raise source.error("bad inline flags: flag turned on and off", 1)
+    return add_flags, del_flags
+
 def fix_flags(src, flags):
     # Check and fix flags according to the type of pattern (str or bytes)
     if isinstance(src, str):
@@ -789,18 +839,22 @@
     pattern.flags = flags
     pattern.str = str
 
-    p = _parse_sub(source, pattern, 0)
+    try:
+        p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, False)
+    except Verbose:
+        # the VERBOSE flag was switched on inside the pattern.  to be
+        # on the safe side, we'll parse the whole thing again...
+        pattern = Pattern()
+        pattern.flags = flags | SRE_FLAG_VERBOSE
+        pattern.str = str
+        p = _parse_sub(source, pattern, True, False)
+
     p.pattern.flags = fix_flags(str, p.pattern.flags)
 
     if source.next is not None:
         assert source.next == ")"
         raise source.error("unbalanced parenthesis")
 
-    if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
-        # the VERBOSE flag was switched on inside the pattern.  to be
-        # on the safe side, we'll parse the whole thing again...
-        return parse(str, p.pattern.flags)
-
     if flags & SRE_FLAG_DEBUG:
         p.dump()
 
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1376,6 +1376,38 @@
         self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
         self.assertRaises(ValueError, re.compile, b'(?aL)')
 
+    def test_scoped_flags(self):
+        self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
+        self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
+        self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
+        self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
+        self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
+        self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))
+
+        self.assertTrue(re.match(r'(?x: a) b', 'a b'))
+        self.assertIsNone(re.match(r'(?x: a) b', ' a b'))
+        self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
+        self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))
+
+        self.checkPatternError(r'(?a:\w)',
+                               'bad inline flags: cannot turn on global flag', 3)
+        self.checkPatternError(r'(?a)(?-a:\w)',
+                               'bad inline flags: cannot turn off global flag', 8)
+        self.checkPatternError(r'(?i-i:a)',
+                               'bad inline flags: flag turned on and off', 5)
+
+        self.checkPatternError(r'(?-', 'missing flag', 3)
+        self.checkPatternError(r'(?-+', 'missing flag', 3)
+        self.checkPatternError(r'(?-z', 'unknown flag', 3)
+        self.checkPatternError(r'(?-i', 'missing :', 4)
+        self.checkPatternError(r'(?-i)', 'missing :', 4)
+        self.checkPatternError(r'(?-i+', 'missing :', 4)
+        self.checkPatternError(r'(?-iz', 'unknown flag', 4)
+        self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0)
+        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
+        self.checkPatternError(r'(?i+', 'missing -, : or )', 3)
+        self.checkPatternError(r'(?iz', 'unknown flag', 3)
+
     def test_bug_6509(self):
         # Replacement strings of both types must parse properly.
         # all strings
@@ -1538,9 +1570,9 @@
         with captured_stdout() as out:
             re.compile(pat, re.DEBUG)
         dump = '''\
-SUBPATTERN 1
+SUBPATTERN 1 0 0
   LITERAL 46
-SUBPATTERN None
+SUBPATTERN None 0 0
   BRANCH
     IN
       LITERAL 99
@@ -1548,7 +1580,7 @@
   OR
     LITERAL 112
     LITERAL 121
-SUBPATTERN None
+SUBPATTERN None 0 0
   GROUPREF_EXISTS 1
     AT AT_END
   ELSE
@@ -1664,7 +1696,7 @@
         self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
         self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
         self.checkPatternError(r'(?iz)', 'unknown flag', 3)
-        self.checkPatternError(r'(?i', 'missing )', 3)
+        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
         self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
         self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
         self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -120,6 +120,8 @@
 Library
 -------
 
+- Issue #433028: Added support of modifier spans in regular expressions.
+
 - Issue #24594: Validates persist parameter when opening MSI database
 
 - Issue #28047: Fixed calculation of line length used for the base64 CTE

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list