[Spambayes-checkins] spambayes/spambayes PyMeldLite.py,1.4,1.5
Richie Hindle
richiehindle at users.sourceforge.net
Thu Jan 23 10:28:19 EST 2003
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv29471
Modified Files:
PyMeldLite.py
Log Message:
Uses expat rather than xmllib when running under 2.3 (expat is now
included in the distribution, and xmllib is deprecated).
Improved the bad-XML-characters code to write high characters
as charrefs rather than replacing them with '?'.
Index: PyMeldLite.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/PyMeldLite.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** PyMeldLite.py 22 Jan 2003 18:29:11 -0000 1.4
--- PyMeldLite.py 23 Jan 2003 18:28:15 -0000 1.5
***************
*** 195,209 ****
# Entrian.Coverage: Pragma Stop
! try:
! # XXX Take this seriously before 2.4 comes out...
! import warnings
! warnings.filterwarnings(action='ignore',
! message='.*xmllib',
! category=DeprecationWarning)
! except ImportError:
! pass
!
! import re, xmllib
!
try:
True, False, bool
--- 195,199 ----
# Entrian.Coverage: Pragma Stop
! import sys, re, string
try:
True, False, bool
***************
*** 223,230 ****
nonSelfClose = {'textarea': None}
! # Map characters not allowed in XML content to '?'
! import string
! badxml_chars = ''.join([chr(c) for c in range(0, 32) + range(128, 160)
! if c not in [9, 10, 13]])
badxml_map = string.maketrans(badxml_chars, '?' * len(badxml_chars))
--- 213,222 ----
nonSelfClose = {'textarea': None}
! # Map high characters to charrefs.
! def replaceHighCharacters(match):
! return "&#%d;" % ord(match.group(1))
!
! # Map meaningless low characters to '?'
! badxml_chars = ''.join([chr(c) for c in range(0, 32) if c not in [9, 10, 13]])
badxml_map = string.maketrans(badxml_chars, '?' * len(badxml_chars))
***************
*** 359,454 ****
! class _TreeGenerator(xmllib.XMLParser):
! """An XML parser that generates a lightweight DOM tree. Call `feed()`
! with XML source, then `close()`, then `getTree()` will give you the
! tree's `_RootNode`:
! >>> g = _TreeGenerator()
! >>> g.feed("<xml>Stuff. ")
! >>> g.feed("More stuff.</xml>")
! >>> g.close()
! >>> tree = g.getTree()
! >>> print tree.toText()
! <xml>Stuff. More stuff.</xml>
! """
! def __init__(self):
! xmllib.XMLParser.__init__(self, translate_attribute_references=False)
! self.entitydefs = {} # entitydefs is an xmllib.XMLParser attribute.
! self._tree = _RootNode()
! self._currentNode = self._tree
! self._pendingText = []
! def getTree(self):
! """Returns the generated tree; call `feed()` then `close()` first."""
! return self._tree
! def _collapsePendingText(self):
! """Text (any content that isn't an open/close element) is built up
! in `self._pendingText` until an open/close element is seen, at which
! point it gets collapsed into a `_TextNode`."""
! data = ''.join(self._pendingText)
! self._currentNode.children.append(_TextNode(data))
! self._pendingText = []
! def handle_xml(self, encoding, standalone):
! xml = '<?xml version="1.0"'
! if encoding:
! xml += ' encoding="%s"' % encoding
! if standalone:
! xml += ' standalone="%s"' % standalone
! xml += '?>'
! self._pendingText.append(xml)
! def handle_doctype(self, tag, pubid, syslit, data):
! doctype = '<!DOCTYPE %s' % tag
! if pubid:
! doctype += ' PUBLIC "%s"' % pubid
! elif syslit:
! doctype += ' SYSTEM'
! if syslit:
! doctype += ' "%s"' % syslit
! if data:
! doctype += ' [%s]>' % data
! else:
! doctype += '>'
! self._pendingText.append(doctype)
! def handle_comment(self, data):
! self._pendingText.append('<!--%s-->' % data)
! def handle_proc(self, name, data):
! self._pendingText.append('<?%s %s ?>' % (name, data.strip()))
! def handle_data(self, data):
! self._pendingText.append(data)
! def handle_charref(self, ref):
! self._pendingText.append('&#%s;' % ref)
! unknown_charref = handle_charref
! def handle_entityref(self, ref):
! self._pendingText.append('&%s;' % ref)
! unknown_entityref = handle_entityref
! def handle_cdata(self, data):
! if self._pendingText:
! self._collapsePendingText()
! self._pendingText.append('<![CDATA[%s]]>' % data)
! def unknown_starttag(self, tag, attributes):
! if self._pendingText:
! self._collapsePendingText()
! newNode = _ElementNode(self._currentNode, tag, attributes)
! self._currentNode.children.append(newNode)
! self._currentNode = newNode
! def unknown_endtag(self, tag):
! if self._pendingText:
! self._collapsePendingText()
! self._currentNode = self._currentNode.parent
--- 351,540 ----
! # For XML parsing we use xmllib in versions prior to 2.3, because we can't
! # be sure that expat will be there, or that it will be a decent version.
! # We use expat in versions 2.3 and above, because we can be sure it will
! # be there and xmllib is deprecated from 2.3.
! # The slightly odd Entrian.Coverage pragmas in this section make sure that
! # whichever branch is taken, we get code coverage for that branch and no
! # coverage failures for the other.
! if sys.hexversion >> 16 < 0x203:
! # Entrian.Coverage: Pragma Stop
! import xmllib
! class _TreeGenerator(xmllib.XMLParser):
! # Entrian.Coverage: Pragma Start
! """An XML parser that generates a lightweight DOM tree. Call `feed()`
! with XML source, then `close()`, then `getTree()` will give you the
! tree's `_RootNode`:
! >>> g = _TreeGenerator()
! >>> g.feed("<xml>Stuff. ")
! >>> g.feed("More stuff.</xml>")
! >>> g.close()
! >>> tree = g.getTree()
! >>> print tree.toText()
! <xml>Stuff. More stuff.</xml>
! """
! def __init__(self):
! xmllib.XMLParser.__init__(self,
! translate_attribute_references=False)
! self.entitydefs = {} # This is an xmllib.XMLParser attribute.
! self._tree = _RootNode()
! self._currentNode = self._tree
! self._pendingText = []
! def getTree(self):
! """Returns the generated tree; call `feed` then `close` first."""
! return self._tree
! def _collapsePendingText(self):
! """Text (any content that isn't an open/close element) is built up
! in `self._pendingText` until an open/close element is seen, at
! which point it gets collapsed into a `_TextNode`."""
! data = ''.join(self._pendingText)
! self._currentNode.children.append(_TextNode(data))
! self._pendingText = []
! def handle_xml(self, encoding, standalone):
! xml = '<?xml version="1.0"'
! if encoding:
! xml += ' encoding="%s"' % encoding
! if standalone:
! xml += ' standalone="%s"' % standalone
! xml += '?>'
! self._pendingText.append(xml)
! def handle_doctype(self, tag, pubid, syslit, data):
! doctype = '<!DOCTYPE %s' % tag
! if pubid:
! doctype += ' PUBLIC "%s"' % pubid
! elif syslit:
! doctype += ' SYSTEM'
! if syslit:
! doctype += ' "%s"' % syslit
! if data:
! doctype += ' [%s]>' % data
! else:
! doctype += '>'
! self._pendingText.append(doctype)
! def handle_comment(self, data):
! self._pendingText.append('<!--%s-->' % data)
! def handle_proc(self, name, data):
! self._pendingText.append('<?%s %s ?>' % (name, data.strip()))
! def handle_data(self, data):
! self._pendingText.append(data)
! def handle_charref(self, ref):
! self._pendingText.append('&#%s;' % ref)
! unknown_charref = handle_charref
! def handle_entityref(self, ref):
! self._pendingText.append('&%s;' % ref)
! unknown_entityref = handle_entityref
! def handle_cdata(self, data):
! if self._pendingText:
! self._collapsePendingText()
! self._pendingText.append('<![CDATA[%s]]>' % data)
! def unknown_starttag(self, tag, attributes):
! if self._pendingText:
! self._collapsePendingText()
! newNode = _ElementNode(self._currentNode, tag, attributes)
! self._currentNode.children.append(newNode)
! self._currentNode = newNode
!
! def unknown_endtag(self, tag):
! if self._pendingText:
! self._collapsePendingText()
! self._currentNode = self._currentNode.parent
!
! else:
! # Entrian.Coverage: Pragma Stop
! import xml.parsers.expat
! class _TreeGenerator:
! # Entrian.Coverage: Pragma Start
! """An XML parser that generates a lightweight DOM tree. Call `feed()`
! with XML source, then `close()`, then `getTree()` will give you the
! tree's `_RootNode`:
!
! >>> g = _TreeGenerator()
! >>> g.feed("<xml>Stuff. ")
! >>> g.feed("More stuff.</xml>")
! >>> g.close()
! >>> tree = g.getTree()
! >>> print tree.toText()
! <xml>Stuff. More stuff.</xml>
! """
!
! def __init__(self):
! self._tree = _RootNode()
! self._currentNode = self._tree
! self._pendingText = []
! self._parser = xml.parsers.expat.ParserCreate()
! self._parser.buffer_text = True
! self._parser.DefaultHandler = self.DefaultHandler
! self._parser.StartElementHandler = self.StartElementHandler
! self._parser.EndElementHandler = self.EndElementHandler
!
! # All entities and charrefs, like &bull; and &#160;, are considered
! # valid - who are we to argue? Expat thinks it knows better, so we
! # fool it here.
! def _mungeEntities(self, data):
! return re.sub(r'&(\w+);', r':PyMeldEntity:\1:', data)
!
! def _unmungeEntities(self, data):
! return re.sub(r':PyMeldEntity:(\w+):', r'&\1;', data)
!
! def feed(self, data):
! """Call this with XML content to be parsed."""
! data = self._mungeEntities(data)
! self._parser.Parse(data)
!
! def close(self):
! """Call this when you've passed all your XML content to `feed`."""
! self._parser.Parse("", True)
!
! def getTree(self):
! """Returns the generated tree; call `feed` then `close` first."""
! return self._tree
!
! def _collapsePendingText(self):
! """Text (any content that isn't an open/close element) is built up
! in `self._pendingText` until an open/close element is seen, at
! which point it gets collapsed into a `_TextNode`."""
!
! data = ''.join(self._pendingText)
! data = self._unmungeEntities(data)
! self._currentNode.children.append(_TextNode(data))
! self._pendingText = []
!
! def DefaultHandler(self, data):
! """Expat handler."""
! self._pendingText.append(str(data))
!
! def StartElementHandler(self, tag, attributes):
! """Expat handler."""
! if self._pendingText:
! self._collapsePendingText()
! newAttributes = {}
! for name, value in attributes.iteritems():
! newAttributes[str(name)] = self._unmungeEntities(str(value))
! newNode = _ElementNode(self._currentNode, str(tag), newAttributes)
! self._currentNode.children.append(newNode)
! self._currentNode = newNode
!
! def EndElementHandler(self, tag):
! """Expat handler."""
! if self._pendingText:
! self._collapsePendingText()
! self._currentNode = self._currentNode.parent
***************
*** 480,485 ****
source[match.end(1):]
! # Map characters not allowed in XML content to '?'
source = source.translate(badxml_map)
# Parse the XML and generate the tree.
--- 566,572 ----
source[match.end(1):]
! # Map characters not allowed in XML content to sensible things.
source = source.translate(badxml_map)
+ source = re.sub('([\x80-\xff])', replaceHighCharacters, source)
# Parse the XML and generate the tree.
***************
*** 889,897 ****
'XML proc': """
! >>> print Meld('''<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
... <?codewarrior exportversion="1.0.1" ideversion="4.2" ?>
... <!DOCTYPE PROJECT [
... <!ELEMENT PROJECT (TARGETLIST, TARGETORDER, GROUPLIST, DESIGNLIST?)>
- ... (...etc...)
... ]>
... <PROJECT>Stuff</PROJECT>''')
--- 976,983 ----
'XML proc': """
! >>> print Meld('''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
... <?codewarrior exportversion="1.0.1" ideversion="4.2" ?>
... <!DOCTYPE PROJECT [
... <!ELEMENT PROJECT (TARGETLIST, TARGETORDER, GROUPLIST, DESIGNLIST?)>
... ]>
... <PROJECT>Stuff</PROJECT>''')
***************
*** 900,904 ****
<!DOCTYPE PROJECT [
<!ELEMENT PROJECT (TARGETLIST, TARGETORDER, GROUPLIST, DESIGNLIST?)>
- (...etc...)
]>
<PROJECT>Stuff</PROJECT>
--- 986,989 ----
***************
*** 913,923 ****
'entities and charrefs': """
>>> page = Meld('''<html><body>&bull; This "and that"...
! ... <span id="s" title="&quot;Quoted&quot; & Not">x</span></body></html>''')
>>> print page.s.title
"Quoted" & Not
- >>> page.s.title = page.s.title # Accept liberally, produce strictly.
- >>> print page
- <html><body>&bull; This "and that"...
- <span id="s" title="&quot;Quoted&quot; &amp; Not">x</span></body></html>
>>> page.s.title = page.s.title + " <>"
>>> print page.s.title
--- 998,1004 ----
'entities and charrefs': """
>>> page = Meld('''<html><body>&bull; This "and that"...
! ... <span id="s" title="&quot;Quoted&quot; &amp; Not">x</span></body></html>''')
>>> print page.s.title
"Quoted" & Not
>>> page.s.title = page.s.title + " <>"
>>> print page.s.title
***************
*** 1068,1076 ****
'bad XML characters': """
>>> page = Meld('''<x>
! ... Valentines Day Special \x96 2 bikinis for the price of one
... </x>''') # No exception.
>>> print page
<x>
! Valentines Day Special ? 2 bikinis for the price of one
</x>
"""
--- 1149,1157 ----
'bad XML characters': """
>>> page = Meld('''<x>
! ... Valentines Day Special \x96 2 bikinis for the price of one \x01
... </x>''') # No exception.
>>> print page
<x>
! Valentines Day Special &#150; 2 bikinis for the price of one ?
</x>
"""
More information about the Spambayes-checkins
mailing list