[Python-checkins] CVS: python/dist/src/Lib robotparser.py,1.3,1.4

Skip Montanaro montanaro@users.sourceforge.net
Sat, 20 Jan 2001 07:59:28 -0800


Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv23068

Modified Files:
	robotparser.py 
Log Message:
Rewrite of robotparser.py by Bastian Kleineidam.  Closes patch #102229.

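For anyone skimming the patch: the public entry points (set_url, read,
parse, can_fetch) are unchanged, but read() now follows HTTP redirects,
Allow: lines are honored, and records are matched per user agent.  A
minimal usage sketch against the new API, reusing the URL from the
module's own _test() function:

    import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()
    # The first entry whose User-agent matches decides; if no entry
    # matches, can_fetch() defaults to granting access.
    print rp.can_fetch('Musi-Cal-Robot/1.0',
                       'http://www.musi-cal.com/cgi-bin/event-search')
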

Index: robotparser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/robotparser.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** robotparser.py	2000/03/27 19:29:31	1.3
--- robotparser.py	2001/01/20 15:59:25	1.4
***************
*** 1,16 ****
! """
  
! Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as
! input, builds a set of rules from that list, then answers questions about
! fetchability of other URLs.
  
  """
  
! class RobotFileParser:
  
!     def __init__(self):
!         self.rules = {}
!         self.debug = 0
!         self.url = ''
          self.last_checked = 0
  
--- 1,27 ----
! """ robotparser.py
! 
!     Copyright (C) 2000  Bastian Kleineidam
  
!     You can choose between two licenses when using this package:
!     1) GNU GPLv2
!     2) PYTHON 2.0 OPEN SOURCE LICENSE
  
+     The robots.txt Exclusion Protocol is implemented as specified in
+     http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
  """
+ import re, string, urlparse, urllib
  
! debug = 0
  
! def _debug(msg):
!     if debug: print msg
! 
! 
! class RobotFileParser:
!     def __init__(self, url=''):
!         self.entries = []
!         self.disallow_all = 0
!         self.allow_all = 0
!         self.set_url(url)
          self.last_checked = 0
  
***************
*** 24,97 ****
      def set_url(self, url):
          self.url = url
  
      def read(self):
!         import urllib
!         self.parse(urllib.urlopen(self.url).readlines())
  
      def parse(self, lines):
!         """parse the input lines from a robot.txt file"""
!         import string, re
!         active = []
          for line in lines:
!             if self.debug: print '>', line,
!             # blank line terminates current record
!             if not line[:-1]:
!                 active = []
!                 continue
              # remove optional comment and strip line
!             line = string.strip(line[:string.find(line, '#')])
              if not line:
                  continue
!             line = re.split(' *: *', line)
              if len(line) == 2:
!                 line[0] = string.lower(line[0])
!                 if line[0] == 'user-agent':
!                     # this record applies to this user agent
!                     if self.debug: print '>> user-agent:', line[1]
!                     active.append(line[1])
!                     if not self.rules.has_key(line[1]):
!                         self.rules[line[1]] = []
!                 elif line[0] == 'disallow':
!                     if line[1]:
!                         if self.debug: print '>> disallow:', line[1]
!                         for agent in active:
!                             self.rules[agent].append(re.compile(line[1]))
                      else:
!                         pass
!                         for agent in active:
!                             if self.debug: print '>> allow', agent
!                             self.rules[agent] = []
                  else:
!                     if self.debug: print '>> unknown:', line
  
-         self.modified()
  
-     # returns true if agent is allowed to fetch url
      def can_fetch(self, useragent, url):
          """using the parsed robots.txt decide if useragent can fetch url"""
!         import urlparse
!         ag = useragent
!         if not self.rules.has_key(ag): ag = '*'
!         if not self.rules.has_key(ag):
!             if self.debug: print '>> allowing', url, 'fetch by', useragent
              return 1
!         path = urlparse.urlparse(url)[2]
!         for rule in self.rules[ag]:
!             if rule.match(path) is not None:
!                 if self.debug: print '>> disallowing', url, 'fetch by', useragent
!                 return 0
!         if self.debug: print '>> allowing', url, 'fetch by', useragent
          return 1
  
  def _test():
      rp = RobotFileParser()
!     rp.debug = 1
!     rp.set_url('http://www.musi-cal.com/robots.txt')
!     rp.read()
!     print rp.rules
!     print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
!     print rp.can_fetch('Musi-Cal-Robot',
!                        'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
  
! if __name__ == "__main__":
      _test()
--- 35,217 ----
      def set_url(self, url):
          self.url = url
+         self.host, self.path = urlparse.urlparse(url)[1:3]
  
      def read(self):
!         import httplib
!         tries = 0
!         while tries<5:
!             connection = httplib.HTTP(self.host)
!             connection.putrequest("GET", self.path)
!             connection.putheader("Host", self.host)
!             connection.endheaders()
!             status, text, mime = connection.getreply()
!             if status in [301,302] and mime:
!                 tries = tries + 1
!                 newurl = mime.get("Location", mime.get("Uri", ""))
!                 newurl = urlparse.urljoin(self.url, newurl)
!                 self.set_url(newurl)
!             else:
!                 break
!         if status==401 or status==403:
!             self.disallow_all = 1
!         elif status>=400:
!             self.allow_all = 1
!         else:
!             # status < 400
!             self.parse(connection.getfile().readlines())
  
      def parse(self, lines):
!         """Parse the input lines from a robots.txt file.
!            We allow a user-agent: line that is not preceded
!            by one or more blank lines."""
!         state = 0
!         linenumber = 0
!         entry = Entry()
!         
          for line in lines:
!             line = string.strip(line)
!             linenumber = linenumber + 1
!             if not line:
!                 if state==1:
!                     _debug("line %d: warning: you should insert"
!                            " allow: or disallow: directives below any"
!                            " user-agent: line" % linenumber)
!                     entry = Entry()
!                     state = 0
!                 elif state==2:
!                     self.entries.append(entry)
!                     entry = Entry()
!                     state = 0
              # remove optional comment and strip line
!             i = string.find(line, '#')
!             if i>=0:
!                 line = line[:i]
!             line = string.strip(line)
              if not line:
                  continue
!             line = string.split(line, ':', 1)
              if len(line) == 2:
!                 line[0] = string.lower(string.strip(line[0]))
!                 line[1] = string.strip(line[1])
!                 if line[0] == "user-agent":
!                     if state==2:
!                         _debug("line %d: warning: you should insert a blank"
!                                " line before any user-agent"
!                                " directive" % linenumber)
!                         self.entries.append(entry)
!                         entry = Entry()
!                     entry.useragents.append(line[1])
!                     state = 1
!                 elif line[0] == "disallow":
!                     if state==0:
!                         _debug("line %d: error: you must insert a user-agent:"
!                                " directive before this line" % linenumber)
                      else:
!                         entry.rulelines.append(RuleLine(line[1], 0))
!                         state = 2
!                 elif line[0] == "allow":
!                     if state==0:
!                         _debug("line %d: error: you must insert a user-agent:"
!                                " directive before this line" % linenumber)
!                     else:
!                         entry.rulelines.append(RuleLine(line[1], 1))
!                         state = 2
                  else:
!                     _debug("line %d: warning: unknown key %s" % (linenumber,
!                                line[0]))
!             else:
!                 _debug("line %d: error: malformed line %s" % (linenumber, line))
!         if state==2:
!             self.entries.append(entry)
!         _debug("Parsed rules:\n%s" % str(self))
  
  
      def can_fetch(self, useragent, url):
          """using the parsed robots.txt decide if useragent can fetch url"""
!         _debug("Checking robots.txt allowance for\n%s\n%s" % (useragent, url))
!         if self.disallow_all:
!             return 0
!         if self.allow_all:
              return 1
!         # search for given user agent matches
!         # the first match counts
!         useragent = string.lower(useragent)
!         url = urllib.quote(urlparse.urlparse(url)[2])
!         for entry in self.entries:
!             if entry.applies_to(useragent):
!                 return entry.allowance(url)
!         # agent not found ==> access granted
!         return 1
! 
! 
!     def __str__(self):
!         ret = ""
!         for entry in self.entries:
!             ret = ret + str(entry) + "\n"
!         return ret
! 
! 
! class RuleLine:
!     """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
!        (allowance==0) followed by a path."""
!     def __init__(self, path, allowance):
!         self.path = urllib.quote(path)
!         self.allowance = allowance
! 
!     def applies_to(self, filename):
!         return self.path=="*" or re.match(self.path, filename)
! 
!     def __str__(self):
!         return (self.allowance and "Allow" or "Disallow")+": "+self.path
! 
! 
! class Entry:
!     """An entry has one or more user-agents and zero or more rulelines"""
!     def __init__(self):
!         self.useragents = []
!         self.rulelines = []
! 
!     def __str__(self):
!         ret = ""
!         for agent in self.useragents:
!             ret = ret + "User-agent: "+agent+"\n"
!         for line in self.rulelines:
!             ret = ret + str(line) + "\n"
!         return ret
! 
!     def applies_to(self, useragent):
!         "check if this entry applies to the specified agent"
!         for agent in self.useragents:
!             if agent=="*":
!                 return 1
!             if re.match(agent, useragent):
!                 return 1
!         return 0
! 
!     def allowance(self, filename):
!         """Preconditions:
!         - our agent applies to this entry
!         - filename is URL quoted"""
!         for line in self.rulelines:
!             if line.applies_to(filename):
!                 return line.allowance
          return 1
  
+ 
  def _test():
+     global debug
+     import sys
      rp = RobotFileParser()
!     debug = 1
!     if len(sys.argv) <= 1:
!         rp.set_url('http://www.musi-cal.com/robots.txt')
!         rp.read()
!     else:
!         rp.parse(open(sys.argv[1]).readlines())
!     print rp.can_fetch('*', 'http://www.musi-cal.com/')
!     print rp.can_fetch('Musi-Cal-Robot/1.0',
!                        'http://www.musi-cal.com/cgi-bin/event-search'
!                        '?city=San+Francisco')
  
! if __name__ == '__main__':
      _test()
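
To exercise the new Allow: support without a network fetch, parse() can
be fed the lines directly, just as _test() does when given a file
argument.  A small sketch (hostname illustrative); within a record the
first matching rule line wins, so Allow: entries should precede any
broader Disallow: entries:

    from robotparser import RobotFileParser

    lines = [
        'User-agent: *\n',
        'Allow: /cgi-bin/event-search\n',
        'Disallow: /cgi-bin/\n',
    ]
    rp = RobotFileParser()
    rp.parse(lines)
    # prints 1: the Allow: line matches the path before Disallow: does
    print rp.can_fetch('*', 'http://example.com/cgi-bin/event-search')
    # prints 0: only the Disallow: /cgi-bin/ line matches this path
    print rp.can_fetch('*', 'http://example.com/cgi-bin/other')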