[Python-checkins] CVS: python/dist/src/Lib sgmllib.py,1.33,1.34
Fred L. Drake
fdrake@users.sourceforge.net
Mon, 16 Jul 2001 11:30:37 -0700
Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv30799
Modified Files:
sgmllib.py
Log Message:
In CDATA mode, make sure character- and entity-reference syntax is not
interpreted; such references are not allowed in that mode.
Do a better job of scanning <!DOCTYPE ...> declarations; the new scanning
logic is based on the code in HTMLParser.py.
Index: sgmllib.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sgmllib.py,v
retrieving revision 1.33
retrieving revision 1.34
diff -C2 -r1.33 -r1.34
*** sgmllib.py 2001/07/14 05:50:33 1.33
--- sgmllib.py 2001/07/16 18:30:35 1.34
***************
*** 6,10 ****
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
! # and CDATA (character data -- only end tags are special).
--- 6,11 ----
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
! # and CDATA (character data -- only end tags are special). RCDATA is
! # not supported at all.
***************
*** 35,38 ****
--- 36,42 ----
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
+ declopen = re.compile('<!')
+ declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
+ declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
***************
*** 161,164 ****
--- 165,172 ----
continue
elif rawdata[i] == '&':
+ if self.literal:
+ self.handle_data(rawdata[i])
+ i = i+1
+ continue
match = charref.match(rawdata, i)
if match:
***************
*** 211,219 ****
# Internal -- parse declaration.
def parse_declaration(self, i):
rawdata = self.rawdata
j = i + 2
n = len(rawdata)
while j < n:
! c = rawdata[j:j+1]
if c == ">":
# end of declaration syntax
--- 219,236 ----
# Internal -- parse declaration.
def parse_declaration(self, i):
+ # This is some sort of declaration; in "HTML as
+ # deployed," this should only be the document type
+ # declaration ("<!DOCTYPE html...>").
rawdata = self.rawdata
j = i + 2
+ assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
+ if rawdata[j:j+1] in ("-", ""):
+ # Start of comment followed by buffer boundary,
+ # or just a buffer boundary.
+ return -1
+ # in practice, this should look like: ((name|stringlit) S*)+ '>'
n = len(rawdata)
while j < n:
! c = rawdata[j]
if c == ">":
# end of declaration syntax
***************
*** 223,235 ****
m = declstringlit.match(rawdata, j)
if not m:
! # incomplete or an error?
! return -1
j = m.end()
! else:
! m = decldata.match(rawdata, j)
if not m:
! # incomplete or an error?
! return -1
j = m.end()
# end of buffer between tokens
return -1
--- 240,253 ----
m = declstringlit.match(rawdata, j)
if not m:
! return -1 # incomplete
j = m.end()
! elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
! m = declname.match(rawdata, j)
if not m:
! return -1 # incomplete
j = m.end()
+ else:
+ raise SGMLParseError(
+ "unexpected char in declaration: %s" % `rawdata[j]`)
# end of buffer between tokens
return -1