[Python-checkins] cpython (2.7): #13987: HTMLParser is now able to handle malformed start tags.
ezio.melotti
python-checkins at python.org
Wed Feb 15 12:19:25 CET 2012
http://hg.python.org/cpython/rev/3d7904e3f4b9
changeset: 74946:3d7904e3f4b9
branch: 2.7
user: Ezio Melotti <ezio.melotti at gmail.com>
date: Wed Feb 15 13:19:10 2012 +0200
summary:
#13987: HTMLParser is now able to handle malformed start tags.
files:
Lib/HTMLParser.py | 10 ++++++----
Lib/test/test_htmlparser.py | 3 ++-
Misc/NEWS | 2 +-
3 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -315,8 +315,8 @@
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
- self.error("junk characters in start tag: %r"
- % (rawdata[k:endpos][:20],))
+ self.handle_data(rawdata[i:endpos])
+ return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
@@ -353,8 +353,10 @@
# end of input in or before attribute value, or we have the
# '/' from a '/>' ending
return -1
- self.updatepos(i, j)
- self.error("malformed start tag")
+ if j > i:
+ return j
+ else:
+ return i + 1
raise AssertionError("we should not get here!")
# Internal -- parse endtag, return end or -1 if incomplete
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -206,7 +206,8 @@
self._run_check("</$>", [('comment', '$')])
self._run_check("</", [('data', '</')])
self._run_check("</a", [('data', '</a')])
- self._parse_error("<a<a>")
+ # XXX this might be wrong
+ self._run_check("<a<a>", [('data', '<a'), ('starttag', 'a', [])])
self._run_check("</a<a>", [('endtag', 'a<a')])
self._run_check("<!", [('data', '<!')])
self._run_check("<a", [('data', '<a')])
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -94,7 +94,7 @@
-------
- Issue #13987: HTMLParser is now able to handle EOFs in the middle of a
- construct.
+ construct and malformed start tags.
- Issue #13015: Fix a possible reference leak in defaultdict.__repr__.
Patch by Suman Saha.
--
Repository URL: http://hg.python.org/cpython
More information about the Python-checkins mailing list