[Python-checkins] CVS: python/dist/src/Lib robotparser.py,1.10,1.11

Martin v. Löwis loewis@users.sourceforge.net
Thu, 28 Feb 2002 07:24:49 -0800


Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv31559/Lib

Modified Files:
	robotparser.py 
Log Message:
Correct various errors: 
- Use substring search, not re search for user-agent and paths. 
- Consider * entry last. Unquote, then requote URLs. 
- Treat empty Disallow as "allow everything". 
Add test cases. Fixes #523041


Index: robotparser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/robotparser.py,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** robotparser.py	13 Aug 2001 14:43:43 -0000	1.10
--- robotparser.py	28 Feb 2002 15:24:47 -0000	1.11
***************
*** 23,26 ****
--- 23,27 ----
      def __init__(self, url=''):
          self.entries = []
+         self.default_entry = None
          self.disallow_all = 0
          self.allow_all = 0
***************
*** 73,77 ****
                      state = 0
                  elif state==2:
!                     self.entries.append(entry)
                      entry = Entry()
                      state = 0
--- 74,82 ----
                      state = 0
                  elif state==2:
!                     if "*" in entry.useragents:
!                         # the default entry is considered last
!                         self.default_entry = entry
!                     else:
!                         self.entries.append(entry)
                      entry = Entry()
                      state = 0
***************
*** 86,90 ****
              if len(line) == 2:
                  line[0] = line[0].strip().lower()
!                 line[1] = line[1].strip()
                  if line[0] == "user-agent":
                      if state==2:
--- 91,95 ----
              if len(line) == 2:
                  line[0] = line[0].strip().lower()
!                 line[1] = urllib.unquote(line[1].strip())
                  if line[0] == "user-agent":
                      if state==2:
***************
*** 129,136 ****
          # search for given user agent matches
          # the first match counts
!         url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
          for entry in self.entries:
              if entry.applies_to(useragent):
                  return entry.allowance(url)
          # agent not found ==> access granted
          return 1
--- 134,144 ----
          # search for given user agent matches
          # the first match counts
!         url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
          for entry in self.entries:
              if entry.applies_to(useragent):
                  return entry.allowance(url)
+         # try the default entry last
+         if self.default_entry:
+             return self.default_entry.allowance(url)
          # agent not found ==> access granted
          return 1
***************
*** 148,156 ****
         (allowance==0) followed by a path."""
      def __init__(self, path, allowance):
          self.path = urllib.quote(path)
          self.allowance = allowance
  
      def applies_to(self, filename):
!         return self.path=="*" or re.match(self.path, filename)
  
      def __str__(self):
--- 156,167 ----
         (allowance==0) followed by a path."""
      def __init__(self, path, allowance):
+         if path == '' and not allowance:
+             # an empty value means allow all
+             allowance = 1
          self.path = urllib.quote(path)
          self.allowance = allowance
  
      def applies_to(self, filename):
!         return self.path=="*" or filename.startswith(self.path)
  
      def __str__(self):
***************
*** 181,186 ****
                  return 1
              agent = agent.lower()
!             # don't forget to re.escape
!             if re.search(re.escape(useragent), agent):
                  return 1
          return 0
--- 192,196 ----
                  return 1
              agent = agent.lower()
!             if useragent.find(agent) != -1:
                  return 1
          return 0