[Python-checkins] CVS: python/dist/src/Lib sgmllib.py,1.33,1.34

Fred L. Drake fdrake@users.sourceforge.net
Mon, 16 Jul 2001 11:30:37 -0700


Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv30799

Modified Files:
	sgmllib.py 
Log Message:

In CDATA mode, make sure entity-reference syntax is not interpreted;
entity references are not allowed in that mode.

Do a better job of scanning <!DOCTYPE ...> declarations; the new
scanning code is based on the code in HTMLParser.py.


Index: sgmllib.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sgmllib.py,v
retrieving revision 1.33
retrieving revision 1.34
diff -C2 -r1.33 -r1.34
*** sgmllib.py	2001/07/14 05:50:33	1.33
--- sgmllib.py	2001/07/16 18:30:35	1.34
***************
*** 6,10 ****
  # character data -- the normal case), RCDATA (replaceable character
  # data -- only char and entity references and end tags are special)
! # and CDATA (character data -- only end tags are special).
  
  
--- 6,11 ----
  # character data -- the normal case), RCDATA (replaceable character
  # data -- only char and entity references and end tags are special)
! # and CDATA (character data -- only end tags are special).  RCDATA is
! # not supported at all.
  
  
***************
*** 35,38 ****
--- 36,42 ----
  commentopen = re.compile('<!--')
  commentclose = re.compile(r'--\s*>')
+ declopen = re.compile('<!')
+ declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
+ declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
  tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
  attrfind = re.compile(
***************
*** 161,164 ****
--- 165,172 ----
                      continue
              elif rawdata[i] == '&':
+                 if self.literal:
+                     self.handle_data(rawdata[i])
+                     i = i+1
+                     continue
                  match = charref.match(rawdata, i)
                  if match:
***************
*** 211,219 ****
      # Internal -- parse declaration.
      def parse_declaration(self, i):
          rawdata = self.rawdata
          j = i + 2
          n = len(rawdata)
          while j < n:
!             c = rawdata[j:j+1]
              if c == ">":
                  # end of declaration syntax
--- 219,236 ----
      # Internal -- parse declaration.
      def parse_declaration(self, i):
+         # This is some sort of declaration; in "HTML as
+         # deployed," this should only be the document type
+         # declaration ("<!DOCTYPE html...>").
          rawdata = self.rawdata
          j = i + 2
+         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
+         if rawdata[j:j+1] in ("-", ""):
+             # Start of comment followed by buffer boundary,
+             # or just a buffer boundary.
+             return -1
+         # in practice, this should look like: ((name|stringlit) S*)+ '>'
          n = len(rawdata)
          while j < n:
!             c = rawdata[j]
              if c == ">":
                  # end of declaration syntax
***************
*** 223,235 ****
                  m = declstringlit.match(rawdata, j)
                  if not m:
!                     # incomplete or an error?
!                     return -1
                  j = m.end()
!             else:
!                 m = decldata.match(rawdata, j)
                  if not m:
!                     # incomplete or an error?
!                     return -1
                  j = m.end()
          # end of buffer between tokens
          return -1
--- 240,253 ----
                  m = declstringlit.match(rawdata, j)
                  if not m:
!                     return -1 # incomplete
                  j = m.end()
!             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
!                 m = declname.match(rawdata, j)
                  if not m:
!                     return -1 # incomplete
                  j = m.end()
+             else:
+                 raise SGMLParseError(
+                     "unexpected char in declaration: %s" % `rawdata[j]`)
          # end of buffer between tokens
          return -1