[Python-checkins] CVS: python/dist/src/Lib robotparser.py,1.10,1.11
Martin v. Löwis
loewis@users.sourceforge.net
Thu, 28 Feb 2002 07:24:49 -0800
Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv31559/Lib
Modified Files:
robotparser.py
Log Message:
Correct various errors:
- Use substring search, not re search for user-agent and paths.
- Consider * entry last. Unquote, then requote URLs.
- Treat empty Disallow as "allow everything".
Add test cases. Fixes #523041
Index: robotparser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/robotparser.py,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** robotparser.py 13 Aug 2001 14:43:43 -0000 1.10
--- robotparser.py 28 Feb 2002 15:24:47 -0000 1.11
***************
*** 23,26 ****
--- 23,27 ----
def __init__(self, url=''):
self.entries = []
+ self.default_entry = None
self.disallow_all = 0
self.allow_all = 0
***************
*** 73,77 ****
state = 0
elif state==2:
! self.entries.append(entry)
entry = Entry()
state = 0
--- 74,82 ----
state = 0
elif state==2:
! if "*" in entry.useragents:
! # the default entry is considered last
! self.default_entry = entry
! else:
! self.entries.append(entry)
entry = Entry()
state = 0
***************
*** 86,90 ****
if len(line) == 2:
line[0] = line[0].strip().lower()
! line[1] = line[1].strip()
if line[0] == "user-agent":
if state==2:
--- 91,95 ----
if len(line) == 2:
line[0] = line[0].strip().lower()
! line[1] = urllib.unquote(line[1].strip())
if line[0] == "user-agent":
if state==2:
***************
*** 129,136 ****
# search for given user agent matches
# the first match counts
! url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)
# agent not found ==> access granted
return 1
--- 134,144 ----
# search for given user agent matches
# the first match counts
! url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)
+ # try the default entry last
+ if self.default_entry:
+ return self.default_entry.allowance(url)
# agent not found ==> access granted
return 1
***************
*** 148,156 ****
(allowance==0) followed by a path."""
def __init__(self, path, allowance):
self.path = urllib.quote(path)
self.allowance = allowance
def applies_to(self, filename):
! return self.path=="*" or re.match(self.path, filename)
def __str__(self):
--- 156,167 ----
(allowance==0) followed by a path."""
def __init__(self, path, allowance):
+ if path == '' and not allowance:
+ # an empty value means allow all
+ allowance = 1
self.path = urllib.quote(path)
self.allowance = allowance
def applies_to(self, filename):
! return self.path=="*" or filename.startswith(self.path)
def __str__(self):
***************
*** 181,186 ****
return 1
agent = agent.lower()
! # don't forget to re.escape
! if re.search(re.escape(useragent), agent):
return 1
return 0
--- 192,196 ----
return 1
agent = agent.lower()
! if useragent.find(agent) != -1:
return 1
return 0