[Python-checkins] cpython (merge 3.5 -> default): #23144: merge with 3.5.
ezio.melotti
python-checkins at python.org
Sun Sep 6 20:55:33 CEST 2015
https://hg.python.org/cpython/rev/48ae9d66c720
changeset: 97713:48ae9d66c720
parent: 97710:d51a82f68a70
parent: 97712:1f6155ffcaf6
user: Ezio Melotti <ezio.melotti at gmail.com>
date: Sun Sep 06 21:49:48 2015 +0300
summary:
#23144: merge with 3.5.
files:
Lib/html/parser.py | 10 +++++++++-
Lib/test/test_htmlparser.py | 15 ++++++++++++---
Misc/NEWS | 6 +++++-
3 files changed, 26 insertions(+), 5 deletions(-)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -139,7 +139,15 @@
if self.convert_charrefs and not self.cdata_elem:
j = rawdata.find('<', i)
if j < 0:
- if not end:
+ # if we can't find the next <, either we are at the end
+ # or there's more text incoming. If the latter is True,
+ # we can't pass the text to handle_data in case we have
+ # a charref cut in half at end. Try to determine if
+ # this is the case before proceeding by looking for an
+ # & near the end and see if it's followed by a space or ;.
+ amppos = rawdata.rfind('&', max(i, n-34))
+ if (amppos >= 0 and
+ not re.compile(r'[\s;]').search(rawdata, amppos)):
break # wait till we get all the text
j = n
else:
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -72,9 +72,6 @@
class EventCollectorCharrefs(EventCollector):
- def get_events(self):
- return self.events
-
def handle_charref(self, data):
self.fail('This should never be called with convert_charrefs=True')
@@ -633,6 +630,18 @@
]
self._run_check(html, expected)
+ def test_convert_charrefs_dropped_text(self):
+ # #23144: make sure that all the events are triggered when
+ # convert_charrefs is True, even if we don't call .close()
+ parser = EventCollector(convert_charrefs=True)
+ # before the fix, bar & baz was missing
+ parser.feed("foo <a>link</a> bar & baz")
+ self.assertEqual(
+ parser.get_events(),
+ [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
+ ('endtag', 'a'), ('data', ' bar & baz')]
+ )
+
class AttributesTestCase(TestCaseBase):
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -1,4 +1,4 @@
-+++++++++++
++++++++++++
Python News
+++++++++++
@@ -181,9 +181,13 @@
Library
-------
+- Issue #23144: Make sure that HTMLParser.feed() returns all the data, even
+ when convert_charrefs is True.
+
- Issue #24635: Fixed a bug in typing.py where isinstance([], typing.Iterable)
would return True once, then False on subsequent calls.
+
- Issue #24989: Fixed buffer overread in BytesIO.readline() if a position is
set beyond size. Based on patch by John Leitch.
--
Repository URL: https://hg.python.org/cpython
More information about the Python-checkins
mailing list