[Jython-checkins] jython: Update robotparser and its test to CPython 2.7 latest.
jim.baker
jython-checkins at python.org
Wed Feb 24 00:55:31 EST 2016
https://hg.python.org/jython/rev/56bd2ed235d9
changeset: 7911:56bd2ed235d9
user: Jim Baker <jim.baker at rackspace.com>
date: Tue Feb 23 22:53:40 2016 -0700
summary:
Update robotparser and its test to CPython 2.7 latest.
test_robotparser occasionally times out in CI; this upgrade may help
resolve this problem.
files:
Lib/test/test_support.py | 13 +++++++
lib-python/2.7/robotparser.py | 15 +++++++-
lib-python/2.7/test/test_robotparser.py | 24 +++++++++++-
3 files changed, 48 insertions(+), 4 deletions(-)
diff --git a/Lib/test/test_support.py b/Lib/test/test_support.py
--- a/Lib/test/test_support.py
+++ b/Lib/test/test_support.py
@@ -433,6 +433,19 @@
IPV6_ENABLED = False #_is_ipv6_enabled()
+def system_must_validate_cert(f):
+ """Skip the test on TLS certificate validation failures."""
+ @functools.wraps(f)
+ def dec(*args, **kwargs):
+ try:
+ f(*args, **kwargs)
+ except IOError as e:
+ if "CERTIFICATE_VERIFY_FAILED" in str(e):
+ raise unittest.SkipTest("system does not contain "
+ "necessary certificates")
+ raise
+ return dec
+
FUZZ = 1e-6
def fcmp(x, y): # fuzzy comparison function
diff --git a/lib-python/2.7/robotparser.py b/lib-python/2.7/robotparser.py
--- a/lib-python/2.7/robotparser.py
+++ b/lib-python/2.7/robotparser.py
@@ -7,7 +7,8 @@
2) PSF license for Python 2.2
The robots.txt Exclusion Protocol is implemented as specified in
- http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
+ http://www.robotstxt.org/norobots-rfc.txt
+
"""
import urlparse
import urllib
@@ -60,7 +61,7 @@
self.errcode = opener.errcode
if self.errcode in (401, 403):
self.disallow_all = True
- elif self.errcode >= 400:
+ elif self.errcode >= 400 and self.errcode < 500:
self.allow_all = True
elif self.errcode == 200 and lines:
self.parse(lines)
@@ -86,6 +87,7 @@
linenumber = 0
entry = Entry()
+ self.modified()
for line in lines:
linenumber += 1
if not line:
@@ -131,6 +133,14 @@
return False
if self.allow_all:
return True
+
+ # Until the robots.txt file has been read or found not
+ # to exist, we must assume that no url is allowable.
+ # This prevents false positives when a user erroneously
+ # calls can_fetch() before calling read().
+ if not self.last_checked:
+ return False
+
# search for given user agent matches
# the first match counts
parsed_url = urlparse.urlparse(urllib.unquote(url))
@@ -160,6 +170,7 @@
if path == '' and not allowance:
# an empty value means allow all
allowance = True
+ path = urlparse.urlunparse(urlparse.urlparse(path))
self.path = urllib.quote(path)
self.allowance = allowance
diff --git a/lib-python/2.7/test/test_robotparser.py b/lib-python/2.7/test/test_robotparser.py
--- a/lib-python/2.7/test/test_robotparser.py
+++ b/lib-python/2.7/test/test_robotparser.py
@@ -2,6 +2,12 @@
from test import test_support
from urllib2 import urlopen, HTTPError
+HAVE_HTTPS = True
+try:
+ from urllib2 import HTTPSHandler
+except ImportError:
+ HAVE_HTTPS = False
+
class RobotTestCase(unittest.TestCase):
def __init__(self, index, parser, url, good, agent):
unittest.TestCase.__init__(self)
@@ -228,6 +234,18 @@
RobotTest(15, doc, good, bad)
+# 16. Empty query (issue #17403). Normalizing the url first.
+doc = """
+User-agent: *
+Allow: /some/path?
+Disallow: /another/path?
+"""
+
+good = ['/some/path?']
+bad = ['/another/path?']
+
+RobotTest(16, doc, good, bad)
+
class NetworkTestCase(unittest.TestCase):
@@ -257,14 +275,16 @@
self.skipTest('%s is unavailable' % url)
self.assertEqual(parser.can_fetch("*", robots_url), False)
+ @unittest.skipUnless(HAVE_HTTPS, 'need SSL support to download license')
+ @test_support.system_must_validate_cert
def testPythonOrg(self):
test_support.requires('network')
with test_support.transient_internet('www.python.org'):
parser = robotparser.RobotFileParser(
- "http://www.python.org/robots.txt")
+ "https://www.python.org/robots.txt")
parser.read()
self.assertTrue(
- parser.can_fetch("*", "http://www.python.org/robots.txt"))
+ parser.can_fetch("*", "https://www.python.org/robots.txt"))
def test_main():
--
Repository URL: https://hg.python.org/jython
More information about the Jython-checkins
mailing list