[Python-checkins] cpython (2.7): #10713: Improve documentation for \b and \B and add a few tests. Initial patch and tests by Martin Pool.
ezio.melotti
python-checkins at python.org
Wed Feb 29 10:50:08 CET 2012
http://hg.python.org/cpython/rev/fc89e09ca2fc
changeset: 75339:fc89e09ca2fc
branch: 2.7
parent: 75336:eb88cc90cc56
user: Ezio Melotti <ezio.melotti at gmail.com>
date: Wed Feb 29 11:40:00 2012 +0200
summary:
#10713: Improve documentation for \b and \B and add a few tests. Initial patch and tests by Martin Pool.
files:
Doc/library/re.rst | 15 ++++++++++-----
Lib/test/test_re.py | 26 ++++++++++++++++++++++++++
2 files changed, 36 insertions(+), 5 deletions(-)
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -325,14 +325,19 @@
Matches the empty string, but only at the beginning or end of a word. A word is
defined as a sequence of alphanumeric or underscore characters, so the end of a
word is indicated by whitespace or a non-alphanumeric, non-underscore character.
- Note that ``\b`` is defined as the boundary between ``\w`` and ``\W``, so the
- precise set of characters deemed to be alphanumeric depends on the values of the
- ``UNICODE`` and ``LOCALE`` flags. Inside a character range, ``\b`` represents
- the backspace character, for compatibility with Python's string literals.
+ Note that formally, ``\b`` is defined as the boundary between a ``\w`` and
+ a ``\W`` character (or vice versa), or between ``\w`` and the beginning/end
+ of the string, so the precise set of characters deemed to be alphanumeric
+ depends on the values of the ``UNICODE`` and ``LOCALE`` flags.
+ For example, ``r'\bfoo\b'`` matches ``'foo'``, ``'foo.'``, ``'(foo)'``,
+ ``'bar foo baz'`` but not ``'foobar'`` or ``'foo3'``.
+ Inside a character range, ``\b`` represents the backspace character, for compatibility with Python's string literals.
``\B``
Matches the empty string, but only when it is *not* at the beginning or end of a
- word. This is just the opposite of ``\b``, so is also subject to the settings
+ word. This means that ``r'py\B'`` matches ``'python'``, ``'py3'``, ``'py2'``,
+ but not ``'py'``, ``'py.'``, or ``'py!'``.
+ ``\B`` is just the opposite of ``\b``, so is also subject to the settings
of ``LOCALE`` and ``UNICODE``.
``\d``
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -373,6 +373,32 @@
self.assertEqual(re.search(r"\d\D\w\W\s\S",
"1aa! a", re.UNICODE).group(0), "1aa! a")
+ def test_string_boundaries(self):
+ # See http://bugs.python.org/issue10713
+ self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
+ "abc")
+ # There's a word boundary at the start of a string.
+ self.assertTrue(re.match(r"\b", "abc"))
+ # A non-empty string includes a non-boundary zero-length match.
+ self.assertTrue(re.search(r"\B", "abc"))
+ # There is no non-boundary match at the start of a string.
+ self.assertFalse(re.match(r"\B", "abc"))
+ # However, an empty string contains no word boundaries, and also no
+ # non-boundaries.
+ self.assertEqual(re.search(r"\B", ""), None)
+ # This one is questionable and different from the perlre behaviour,
+ # but describes current behavior.
+ self.assertEqual(re.search(r"\b", ""), None)
+ # A single word-character string has two boundaries, but no
+ # non-boundary gaps.
+ self.assertEqual(len(re.findall(r"\b", "a")), 2)
+ self.assertEqual(len(re.findall(r"\B", "a")), 0)
+ # If there are no words, there are no boundaries
+ self.assertEqual(len(re.findall(r"\b", " ")), 0)
+ self.assertEqual(len(re.findall(r"\b", "   ")), 0)
+ # Can match around the whitespace.
+ self.assertEqual(len(re.findall(r"\B", " ")), 2)
+
def test_bigcharset(self):
self.assertEqual(re.match(u"([\u2222\u2223])",
u"\u2222").group(1), u"\u2222")
--
Repository URL: http://hg.python.org/cpython
More information about the Python-checkins
mailing list