[Python-checkins] r51853 - in python/branches/release24-maint: Lib/sgmllib.py Lib/test/sgml_input.html Lib/test/test_sgmllib.py Misc/NEWS

Mon Sep 11 06:18:08 CEST 2006

Author: neal.norwitz
Date: Mon Sep 11 06:18:06 2006
New Revision: 51853

Added:
   python/branches/release24-maint/Lib/test/sgml_input.html
      - copied unchanged from r51850, python/branches/release25-maint/Lib/test/sgml_input.html
Modified:
   python/branches/release24-maint/Lib/sgmllib.py
   python/branches/release24-maint/Lib/test/test_sgmllib.py
   python/branches/release24-maint/Misc/NEWS
Log:
Backport 51850 from release25-maint branch.

As mentioned on python-dev, this reverts patch #1504333 because it introduced
an infinite loop in rev 47154.

This patch also adds a test to prevent the regression.



Modified: python/branches/release24-maint/Lib/sgmllib.py
==============================================================================

--- python/branches/release24-maint/Lib/sgmllib.py	(original)
+++ python/branches/release24-maint/Lib/sgmllib.py	Mon Sep 11 06:18:06 2006
@@ -29,12 +29,7 @@
 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
 piclose = re.compile('>')
-starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*('
-        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
-        r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]'
-        r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?'
-    r')*\s*/?\s*(?=[<>])')
-endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])')
+endbracket = re.compile('[<>]')
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@@ -250,10 +245,14 @@
             self.finish_shorttag(tag, data)
             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
             return k
-        match = starttag.match(rawdata, i)
+        # XXX The following should skip matching quotes (' or ")
+        # As a shortcut way to exit, this isn't so bad, but shouldn't
+        # be used to locate the actual end of the start tag since the
+        # < or > characters may be embedded in an attribute value.
+        match = endbracket.search(rawdata, i+1)
         if not match:
             return -1
-        j = match.end(0)
+        j = match.start(0)
         # Now parse the data between i+1 and j into a tag and attrs
         attrs = []
         if rawdata[i:i+2] == '<>':
@@ -287,10 +286,10 @@
     # Internal -- parse endtag
     def parse_endtag(self, i):
         rawdata = self.rawdata
-        match = endtag.match(rawdata, i)
+        match = endbracket.search(rawdata, i+1)
         if not match:
             return -1
-        j = match.end(0)
+        j = match.start(0)
         tag = rawdata[i+2:j].strip().lower()
         if rawdata[j] == '>':
             j = j+1

Modified: python/branches/release24-maint/Lib/test/test_sgmllib.py
==============================================================================
--- python/branches/release24-maint/Lib/test/test_sgmllib.py	(original)
+++ python/branches/release24-maint/Lib/test/test_sgmllib.py	Mon Sep 11 06:18:06 2006
@@ -214,21 +214,6 @@
             ("starttag", "e", [("a", "rgb(1,2,3)")]),
             ])
 
-    def test_attr_values_quoted_markup(self):
-        """Multi-line and markup in attribute values"""
-        self.check_events("""<a title='foo\n<br>bar'>text</a>""",
-            [("starttag", "a", [("title", "foo\n<br>bar")]),
-             ("data", "text"),
-             ("endtag", "a")])
-        self.check_events("""<a title='less < than'>text</a>""",
-            [("starttag", "a", [("title", "less < than")]),
-             ("data", "text"),
-             ("endtag", "a")])
-        self.check_events("""<a title='greater > than'>text</a>""",
-            [("starttag", "a", [("title", "greater > than")]),
-             ("data", "text"),
-             ("endtag", "a")])
-
     def test_attr_funky_names(self):
         self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
@@ -304,6 +289,19 @@
             ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
             ])
 
+    def test_read_chunks(self):
+        # SF bug #1541697, this caused sgml parser to hang
+        # Just verify this code doesn't cause a hang.
+        CHUNK = 1024  # increasing this to 8212 makes the problem go away
+
+        f = open(test_support.findfile('sgml_input.html'))
+        fp = sgmllib.SGMLParser()
+        while 1:
+            data = f.read(CHUNK)
+            fp.feed(data)
+            if len(data) != CHUNK:
+                break
+
     # XXX These tests have been disabled by prefixing their names with
     # an underscore.  The first two exercise outstanding bugs in the
     # sgmllib module, and the third exhibits questionable behavior

Modified: python/branches/release24-maint/Misc/NEWS
==============================================================================
--- python/branches/release24-maint/Misc/NEWS	(original)
+++ python/branches/release24-maint/Misc/NEWS	Mon Sep 11 06:18:06 2006
@@ -92,6 +92,8 @@
 Library
 -------
 
+- Reverted patch #1504333 because it introduced an infinite loop.
+
 - Fix missing import of the types module in logging.config.
 
 - Bug #1112549, DoS attack on cgi.FieldStorage.