[Python-checkins] cpython (merge 3.2 -> default): #13273: merge with 3.2.
ezio.melotti
python-checkins at python.org
Fri Oct 28 12:24:08 CEST 2011
http://hg.python.org/cpython/rev/b194117f176c
changeset: 73160:b194117f176c
parent: 73158:64b2efa5009f
parent: 73159:41d41776aa6d
user: Ezio Melotti <ezio.melotti at gmail.com>
date: Fri Oct 28 13:23:57 2011 +0300
summary:
#13273: merge with 3.2.
files:
Lib/html/parser.py | 5 +--
Lib/test/test_htmlparser.py | 33 +++++++++++++++++++++++++
Misc/NEWS | 3 ++
3 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -30,7 +30,7 @@
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
- r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
+ r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
@@ -277,12 +277,11 @@
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
-
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
- m = attrfind_tolerant.search(rawdata, k)
+ m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -373,6 +373,39 @@
[('action', 'bogus|&#()value')])],
collector = self.collector)
+ def test_issue13273(self):
+ html = ('<div style="" ><b>The <a href="some_url">rain</a> '
+ '<br /> in <span>Spain</span></b></div>')
+ expected = [
+ ('starttag', 'div', [('style', '')]),
+ ('starttag', 'b', []),
+ ('data', 'The '),
+ ('starttag', 'a', [('href', 'some_url')]),
+ ('data', 'rain'),
+ ('endtag', 'a'),
+ ('data', ' '),
+ ('startendtag', 'br', []),
+ ('data', ' in '),
+ ('starttag', 'span', []),
+ ('data', 'Spain'),
+ ('endtag', 'span'),
+ ('endtag', 'b'),
+ ('endtag', 'div')
+ ]
+ self._run_check(html, expected, collector=self.collector)
+
+ def test_issue13273_2(self):
+ html = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>'
+ expected = [
+ ('starttag', 'div', [('style', ''), ('foo', 'bar')]),
+ ('starttag', 'b', []),
+ ('data', 'The '),
+ ('starttag', 'a', [('href', 'some_url')]),
+ ('data', 'rain'),
+ ('endtag', 'a'),
+ ]
+ self._run_check(html, expected, collector=self.collector)
+
def test_unescape_function(self):
p = html.parser.HTMLParser()
self.assertEqual(p.unescape('&#bad;'),'&#bad;')
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -341,6 +341,9 @@
Library
-------
+- Issue #13273: fix a bug that prevented HTMLParser from properly detecting
+ some tags when strict=False.
+
- Issue #11183: Add finer-grained exceptions to the ssl module, so that
you don't have to inspect the exception's attributes in the common case.
--
Repository URL: http://hg.python.org/cpython
More information about the Python-checkins
mailing list