[Python-checkins] cpython (2.7): #13987: HTMLParser is now able to handle malformed start tags.

Wed Feb 15 12:19:25 CET 2012

http://hg.python.org/cpython/rev/3d7904e3f4b9
changeset:   74946:3d7904e3f4b9
branch:      2.7
user:        Ezio Melotti <ezio.melotti at gmail.com>
date:        Wed Feb 15 13:19:10 2012 +0200
summary:
  #13987: HTMLParser is now able to handle malformed start tags.

files:
  Lib/HTMLParser.py           |  10 ++++++----
  Lib/test/test_htmlparser.py |   3 ++-
  Misc/NEWS                   |   2 +-
  3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -315,8 +315,8 @@
                          - self.__starttag_text.rfind("\n")
             else:
                 offset = offset + len(self.__starttag_text)
-            self.error("junk characters in start tag: %r"
-                       % (rawdata[k:endpos][:20],))
+            self.handle_data(rawdata[i:endpos])
+            return endpos
         if end.endswith('/>'):
             # XHTML-style empty tag: <span attr="value" />
             self.handle_startendtag(tag, attrs)
@@ -353,8 +353,10 @@
                 # end of input in or before attribute value, or we have the
                 # '/' from a '/>' ending
                 return -1
-            self.updatepos(i, j)
-            self.error("malformed start tag")
+            if j > i:
+                return j
+            else:
+                return i + 1
         raise AssertionError("we should not get here!")
 
     # Internal -- parse endtag, return end or -1 if incomplete
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -206,7 +206,8 @@
         self._run_check("</$>", [('comment', '$')])
         self._run_check("</", [('data', '</')])
         self._run_check("</a", [('data', '</a')])
-        self._parse_error("<a<a>")
+        # XXX this might be wrong
+        self._run_check("<a<a>", [('data', '<a'), ('starttag', 'a', [])])
         self._run_check("</a<a>", [('endtag', 'a<a')])
         self._run_check("<!", [('data', '<!')])
         self._run_check("<a", [('data', '<a')])
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -94,7 +94,7 @@
 -------
 
 - Issue #13987: HTMLParser is now able to handle EOFs in the middle of a
-  construct.
+  construct and malformed start tags.
 
 - Issue #13015: Fix a possible reference leak in defaultdict.__repr__.
   Patch by Suman Saha.

-- 
Repository URL: http://hg.python.org/cpython