[Jython-checkins] jython: Update robotparser and its test to CPython 2.7 latest.

jim.baker jython-checkins at python.org
Wed Feb 24 00:55:31 EST 2016


https://hg.python.org/jython/rev/56bd2ed235d9
changeset:   7911:56bd2ed235d9
user:        Jim Baker <jim.baker at rackspace.com>
date:        Tue Feb 23 22:53:40 2016 -0700
summary:
  Update robotparser and its test to CPython 2.7 latest.

test_robotparser occasionally times out in CI; this upgrade may help
resolve that flakiness.

files:
  Lib/test/test_support.py                |  13 +++++++
  lib-python/2.7/robotparser.py           |  15 +++++++-
  lib-python/2.7/test/test_robotparser.py |  24 +++++++++++-
  3 files changed, 48 insertions(+), 4 deletions(-)


diff --git a/Lib/test/test_support.py b/Lib/test/test_support.py
--- a/Lib/test/test_support.py
+++ b/Lib/test/test_support.py
@@ -433,6 +433,19 @@
 
 IPV6_ENABLED = False  #_is_ipv6_enabled()
 
+def system_must_validate_cert(f):
+    """Skip the test on TLS certificate validation failures."""
+    @functools.wraps(f)
+    def dec(*args, **kwargs):
+        try:
+            f(*args, **kwargs)
+        except IOError as e:
+            if "CERTIFICATE_VERIFY_FAILED" in str(e):
+                raise unittest.SkipTest("system does not contain "
+                                        "necessary certificates")
+            raise
+    return dec
+
 FUZZ = 1e-6
 
 def fcmp(x, y): # fuzzy comparison function
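
For reference, the new decorator is applied like any other unittest
decorator (it is used further down in test_robotparser.py). The sketch
below is a hypothetical test, not part of this changeset, showing how a
networked HTTPS test would opt in:

    import unittest
    from urllib2 import urlopen
    from test import test_support

    class HTTPSExampleTest(unittest.TestCase):

        @test_support.system_must_validate_cert
        def test_fetch_over_https(self):
            # On machines whose certificate store cannot validate the
            # server, the SSL layer raises an IOError mentioning
            # CERTIFICATE_VERIFY_FAILED; the decorator turns that into
            # a SkipTest instead of a test error.
            f = urlopen("https://www.python.org/robots.txt")
            try:
                self.assertEqual(f.getcode(), 200)
            finally:
                f.close()

    if __name__ == '__main__':
        unittest.main()
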
diff --git a/lib-python/2.7/robotparser.py b/lib-python/2.7/robotparser.py
--- a/lib-python/2.7/robotparser.py
+++ b/lib-python/2.7/robotparser.py
@@ -7,7 +7,8 @@
     2) PSF license for Python 2.2
 
     The robots.txt Exclusion Protocol is implemented as specified in
-    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
+    http://www.robotstxt.org/norobots-rfc.txt
+
 """
 import urlparse
 import urllib
@@ -60,7 +61,7 @@
         self.errcode = opener.errcode
         if self.errcode in (401, 403):
             self.disallow_all = True
-        elif self.errcode >= 400:
+        elif self.errcode >= 400 and self.errcode < 500:
             self.allow_all = True
         elif self.errcode == 200 and lines:
             self.parse(lines)
@@ -86,6 +87,7 @@
         linenumber = 0
         entry = Entry()
 
+        self.modified()
         for line in lines:
             linenumber += 1
             if not line:
@@ -131,6 +133,14 @@
             return False
         if self.allow_all:
             return True
+
+        # Until the robots.txt file has been read or found not
+        # to exist, we must assume that no url is allowable.
+        # This prevents false positives when a user erroneously
+        # calls can_fetch() before calling read().
+        if not self.last_checked:
+            return False
+
         # search for given user agent matches
         # the first match counts
         parsed_url = urlparse.urlparse(urllib.unquote(url))
@@ -160,6 +170,7 @@
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
+        path = urlparse.urlunparse(urlparse.urlparse(path))
         self.path = urllib.quote(path)
         self.allowance = allowance
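
Two behavioral changes above are easy to miss: allow_all is now set
only for 4xx statuses (so a transient 5xx no longer grants blanket
permission), and the new last_checked guard makes can_fetch() refuse
everything until read() has actually run. A minimal sketch of the
latter, assuming network access and the updated module:

    import robotparser

    rp = robotparser.RobotFileParser("https://www.python.org/robots.txt")

    # Nothing has been fetched yet, so last_checked is still 0 and the
    # new guard in can_fetch() conservatively refuses every URL:
    print rp.can_fetch("*", "https://www.python.org/")   # False

    rp.read()

    # After read(), the answer reflects the parsed rules (or
    # disallow_all/allow_all, depending on the HTTP status returned).
    print rp.can_fetch("*", "https://www.python.org/")
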
 
diff --git a/lib-python/2.7/test/test_robotparser.py b/lib-python/2.7/test/test_robotparser.py
--- a/lib-python/2.7/test/test_robotparser.py
+++ b/lib-python/2.7/test/test_robotparser.py
@@ -2,6 +2,12 @@
 from test import test_support
 from urllib2 import urlopen, HTTPError
 
+HAVE_HTTPS = True
+try:
+    from urllib2 import HTTPSHandler
+except ImportError:
+    HAVE_HTTPS = False
+
 class RobotTestCase(unittest.TestCase):
     def __init__(self, index, parser, url, good, agent):
         unittest.TestCase.__init__(self)
@@ -228,6 +234,18 @@
 
 RobotTest(15, doc, good, bad)
 
+# 16. Empty query (issue #17403). Normalizing the url first.
+doc = """
+User-agent: *
+Allow: /some/path?
+Disallow: /another/path?
+"""
+
+good = ['/some/path?']
+bad = ['/another/path?']
+
+RobotTest(16, doc, good, bad)
+
 
 class NetworkTestCase(unittest.TestCase):
 
@@ -257,14 +275,16 @@
                 self.skipTest('%s is unavailable' % url)
             self.assertEqual(parser.can_fetch("*", robots_url), False)
 
+    @unittest.skipUnless(HAVE_HTTPS, 'need SSL support to download license')
+    @test_support.system_must_validate_cert
     def testPythonOrg(self):
         test_support.requires('network')
         with test_support.transient_internet('www.python.org'):
             parser = robotparser.RobotFileParser(
-                "http://www.python.org/robots.txt")
+                "https://www.python.org/robots.txt")
             parser.read()
             self.assertTrue(
-                parser.can_fetch("*", "http://www.python.org/robots.txt"))
+                parser.can_fetch("*", "https://www.python.org/robots.txt"))
 
 
 def test_main():
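
On the normalization behind test 16: a urlparse/urlunparse round-trip
drops a bare trailing '?', and can_fetch() already rebuilds the checked
URL with the same round-trip, so the rule path and the request path
reduce to identical strings before comparison. A quick illustration,
assuming stock Python 2 urlparse:

    import urlparse

    # An empty query is parsed out and never re-serialized, so a bare
    # trailing '?' disappears on the parse/unparse round-trip:
    print urlparse.urlunparse(urlparse.urlparse('/some/path?'))   # /some/path
    print urlparse.urlunparse(urlparse.urlparse('/some/path'))    # /some/path

    # Hence 'Allow: /some/path?' now matches a fetch of '/some/path?'.
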

-- 
Repository URL: https://hg.python.org/jython

