[Python-checkins] cpython (merge 3.6 -> default): Issue #25400: Merge from 3.6

berker.peksag python-checkins at python.org
Sun Sep 18 13:17:30 EDT 2016


https://hg.python.org/cpython/rev/911070065e38
changeset:   103926:911070065e38
parent:      103923:35820a4a6967
parent:      103925:d5d910cfd288
user:        Berker Peksag <berker.peksag at gmail.com>
date:        Sun Sep 18 20:18:35 2016 +0300
summary:
  Issue #25400: Merge from 3.6

files:
  Lib/test/test_robotparser.py |  56 +++++++++++++++--------
  Lib/urllib/robotparser.py    |   8 ++-
  Misc/NEWS                    |   3 +
  3 files changed, 46 insertions(+), 21 deletions(-)


diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -79,7 +79,28 @@
     bad = ['/cyberworld/map/index.html', '/', '/tmp/']
 
 
-class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
+class BaseRequestRateTest(BaseRobotTest):
+
+    def test_request_rate(self):
+        for url in self.good + self.bad:
+            agent, url = self.get_agent_and_url(url)
+            with self.subTest(url=url, agent=agent):
+                if self.crawl_delay:
+                    self.assertEqual(
+                        self.parser.crawl_delay(agent), self.crawl_delay
+                    )
+                if self.request_rate:
+                    self.assertEqual(
+                        self.parser.request_rate(agent).requests,
+                        self.request_rate.requests
+                    )
+                    self.assertEqual(
+                        self.parser.request_rate(agent).seconds,
+                        self.request_rate.seconds
+                    )
+
+
+class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
 Crawl-delay: 3
@@ -96,24 +117,6 @@
     bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
            '/a%2fb.html', '/~joe/index.html']
 
-    def test_request_rate(self):
-        for url in self.good:
-            agent, url = self.get_agent_and_url(url)
-            with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate and self.parser.request_rate(agent):
-                    self.assertEqual(
-                        self.parser.request_rate(agent).requests,
-                        self.request_rate.requests
-                    )
-                    self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
-                        self.request_rate.seconds
-                    )
-
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
@@ -230,6 +233,19 @@
     bad = ['/another/path?']
 
 
+class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Crawl-delay: 1
+Request-rate: 3/15
+Disallow: /cyberworld/map/
+    """
+    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
+    crawl_delay = 1
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+
+
 class RobotHandler(BaseHTTPRequestHandler):
 
     def do_GET(self):
@@ -309,6 +325,8 @@
         self.assertTrue(parser.allow_all)
         self.assertFalse(parser.disallow_all)
         self.assertEqual(parser.mtime(), 0)
+        self.assertIsNone(parser.crawl_delay('*'))
+        self.assertIsNone(parser.request_rate('*'))
 
 if __name__=='__main__':
     unittest.main()
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -175,16 +175,20 @@
         return True
 
     def crawl_delay(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return None
+        return self.default_entry.delay
 
     def request_rate(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return None
+        return self.default_entry.req_rate
 
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
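
The effect of the change above: before any robots.txt has been parsed, both methods now return None, and once a file is parsed, an agent that matches no named entry falls back to the default ('*') entry instead of getting None. A minimal sketch of that behavior, reusing the rules from the new DefaultEntryTest and a made-up agent name ('ExampleBot'):

    from urllib import robotparser

    rules = """\
    User-agent: *
    Crawl-delay: 1
    Request-rate: 3/15
    Disallow: /cyberworld/map/
    """

    parser = robotparser.RobotFileParser()
    # Nothing read yet: mtime() is 0, so both methods return None.
    print(parser.crawl_delay('ExampleBot'))    # None
    print(parser.request_rate('ExampleBot'))   # None

    parser.parse(rules.splitlines())
    # 'ExampleBot' matches no named entry, so the values come from the
    # default ('*') entry rather than None.
    print(parser.crawl_delay('ExampleBot'))    # 1
    rate = parser.request_rate('ExampleBot')
    print(rate.requests, rate.seconds)         # 3 15
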
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -35,6 +35,9 @@
 Library
 -------
 
+- Issue #25400: RobotFileParser now correctly returns default values for
+  crawl_delay and request_rate.  Initial patch by Peter Wirtz.
+
 - Issue #27932: Prevent memory leak in win32_ver().
 
 - Fix UnboundLocalError in socket._sendfile_use_sendfile.
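
For context, a sketch of the usual read()-based workflow that the NEWS entry refers to; the URL and agent name below are placeholders, not part of the change:

    from urllib import robotparser

    parser = robotparser.RobotFileParser()
    parser.set_url('https://www.example.org/robots.txt')   # placeholder URL
    parser.read()

    agent = 'ExampleBot'                                    # placeholder agent
    if parser.can_fetch(agent, 'https://www.example.org/some/page'):
        # With this fix, a site whose robots.txt only has a 'User-agent: *'
        # block still reports its Crawl-delay and Request-rate here; the
        # methods return None only when the values are absent or robots.txt
        # was never successfully read.
        delay = parser.crawl_delay(agent)
        rate = parser.request_rate(agent)
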

-- 
Repository URL: https://hg.python.org/cpython

