[Python-checkins] r46995 - in python/trunk: Doc/lib/libsgmllib.tex Lib/sgmllib.py Lib/test/test_sgmllib.py Misc/ACKS

fred.drake python-checkins at python.org
Sat Jun 17 01:45:07 CEST 2006


Author: fred.drake
Date: Sat Jun 17 01:45:06 2006
New Revision: 46995

Modified:
   python/trunk/Doc/lib/libsgmllib.tex
   python/trunk/Lib/sgmllib.py
   python/trunk/Lib/test/test_sgmllib.py
   python/trunk/Misc/ACKS
Log:
SF patch 1504676: Make sgmllib char and entity references pluggable
(implementation/tests contributed by Sam Ruby)


Modified: python/trunk/Doc/lib/libsgmllib.tex
==============================================================================
--- python/trunk/Doc/lib/libsgmllib.tex	(original)
+++ python/trunk/Doc/lib/libsgmllib.tex	Sat Jun 17 01:45:06 2006
@@ -132,27 +132,59 @@
 
 \begin{methoddesc}{handle_charref}{ref}
 This method is called to process a character reference of the form
-\samp{\&\#\var{ref};}.  In the base implementation, \var{ref} must
-be a decimal number in the
-range 0-255.  It translates the character to \ASCII{} and calls the
-method \method{handle_data()} with the character as argument.  If
-\var{ref} is invalid or out of range, the method
-\code{unknown_charref(\var{ref})} is called to handle the error.  A
-subclass must override this method to provide support for named
-character entities.
+\samp{\&\#\var{ref};}.  The base implementation uses
+\method{convert_charref()} to convert the reference to a string.  If
+that method returns a string, it is passed to \method{handle_data()},
+otherwise \method{unknown_charref(\var{ref})} is called to handle the
+error.
+\versionchanged[Use \method{convert_charref()} instead of hard-coding
+the conversion]{2.5}
+\end{methoddesc}
+
+\begin{methoddesc}{convert_charref}{ref}
+Convert a character reference to a string, or \code{None}.  \var{ref}
+is the reference passed in as a string.  In the base implementation,
+\var{ref} must be a decimal number in the range 0-255.  It converts
+the code point found using the \method{convert_codepoint()} method.
+If \var{ref} is invalid or out of range, this method returns
+\code{None}.  This method is called by the default
+\method{handle_charref()} implementation and by the attribute value
+parser.
+\versionadded{2.5}
+\end{methoddesc}
+
+\begin{methoddesc}{convert_codepoint}{codepoint}
+Convert a codepoint to a \class{str} value.  Encodings can be handled
+here if appropriate, though the rest of \module{sgmllib} is oblivious
+on this matter.
+\versionadded{2.5}
 \end{methoddesc}
 
 \begin{methoddesc}{handle_entityref}{ref}
 This method is called to process a general entity reference of the
 form \samp{\&\var{ref};} where \var{ref} is a general entity
-reference.  It looks for \var{ref} in the instance (or class)
-variable \member{entitydefs} which should be a mapping from entity
-names to corresponding translations.  If a translation is found, it
+reference.  It converts \var{ref} by passing it to
+\method{convert_entityref()}.  If a translation is returned, it
 calls the method \method{handle_data()} with the translation;
 otherwise, it calls the method \code{unknown_entityref(\var{ref})}.
 The default \member{entitydefs} defines translations for
 \code{\&amp;}, \code{\&apos}, \code{\&gt;}, \code{\&lt;}, and
 \code{\&quot;}.
+\versionchanged[Use \method{convert_entityref()} instead of hard-coding
+the conversion]{2.5}
+\end{methoddesc}
+
+\begin{methoddesc}{convert_entityref}{ref}
+Convert a named entity reference to a \class{str} value, or
+\code{None}.  The resulting value will not be parsed.  \var{ref} will
+be only the name of the entity.  The default implementation looks for
+\var{ref} in the instance (or class) variable \member{entitydefs}
+which should be a mapping from entity names to corresponding
+translations.  If no translation is available for \var{ref}, this
+method returns \code{None}.  This method is called by the default
+\method{handle_entityref()} implementation and by the attribute value
+parser.
+\versionadded{2.5}
 \end{methoddesc}
 
 \begin{methoddesc}{handle_comment}{comment}

Modified: python/trunk/Lib/sgmllib.py
==============================================================================
--- python/trunk/Lib/sgmllib.py	(original)
+++ python/trunk/Lib/sgmllib.py	Sat Jun 17 01:45:06 2006
@@ -53,6 +53,10 @@
 # self.handle_entityref() with the entity reference as argument.
 
 class SGMLParser(markupbase.ParserBase):
+    # Definition of entities -- derived classes may override
+    entity_or_charref = re.compile('&(?:'
+      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
+      ')(;?)')
 
     def __init__(self, verbose=0):
         """Initialize and reset this instance."""
@@ -277,32 +281,8 @@
                     attrvalue[:1] == '"' == attrvalue[-1:]):
                     # strip quotes
                     attrvalue = attrvalue[1:-1]
-                l = 0
-                new_attrvalue = ''
-                while l < len(attrvalue):
-                    av_match = entityref.match(attrvalue, l)
-                    if (av_match and av_match.group(1) in self.entitydefs and
-                        attrvalue[av_match.end(1)] == ';'):
-                        # only substitute entityrefs ending in ';' since
-                        # otherwise we may break <a href='?p=x&q=y'>
-                        # which is very common
-                        new_attrvalue += self.entitydefs[av_match.group(1)]
-                        l = av_match.end(0)
-                        continue
-                    ch_match = charref.match(attrvalue, l)
-                    if ch_match:
-                        try:
-                            char = chr(int(ch_match.group(1)))
-                            new_attrvalue += char
-                            l = ch_match.end(0)
-                            continue
-                        except ValueError:
-                            # invalid character reference, don't substitute
-                            pass
-                    # all other cases
-                    new_attrvalue += attrvalue[l]
-                    l += 1
-                attrvalue = new_attrvalue
+                attrvalue = self.entity_or_charref.sub(
+                    self._convert_ref, attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = match.end(0)
         if rawdata[j] == '>':
@@ -311,6 +291,17 @@
         self.finish_starttag(tag, attrs)
         return j
 
+    # Internal -- convert entity or character reference
+    def _convert_ref(self, match):
+        if match.group(2):
+            return self.convert_charref(match.group(2)) or \
+                '&#%s%s' % match.groups()[1:]
+        elif match.group(3):
+            return self.convert_entityref(match.group(1)) or \
+                '&%s;' % match.group(1)
+        else:
+            return '&%s' % match.group(1)
+
     # Internal -- parse endtag
     def parse_endtag(self, i):
         rawdata = self.rawdata
@@ -394,35 +385,51 @@
             print '*** Unbalanced </' + tag + '>'
             print '*** Stack:', self.stack
 
-    def handle_charref(self, name):
-        """Handle character reference, no need to override."""
+    def convert_charref(self, name):
+        """Convert character reference, may be overridden."""
         try:
             n = int(name)
         except ValueError:
-            self.unknown_charref(name)
             return
         if not 0 <= n <= 255:
-            self.unknown_charref(name)
             return
-        self.handle_data(chr(n))
+        return self.convert_codepoint(n)
+
+    def convert_codepoint(self, codepoint):
+        return chr(codepoint)
+
+    def handle_charref(self, name):
+        """Handle character reference, no need to override."""
+        replacement = convert_charref(name)
+        if replacement is None:
+            self.unknown_charref(name)
+        else:
+            self.handle_data(convert_charref(name))
 
     # Definition of entities -- derived classes may override
     entitydefs = \
             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
 
-    def handle_entityref(self, name):
-        """Handle entity references.
+    def convert_entityref(self, name):
+        """Convert entity references.
 
-        There should be no need to override this method; it can be
-        tailored by setting up the self.entitydefs mapping appropriately.
+        As an alternative to overriding this method; one can tailor the
+        results by setting up the self.entitydefs mapping appropriately.
         """
         table = self.entitydefs
         if name in table:
-            self.handle_data(table[name])
+            return table[name]
         else:
-            self.unknown_entityref(name)
             return
 
+    def handle_entityref(self, name):
+        """Handle entity references, no need to override."""
+        replacement = convert_entityref(name)
+        if replacement is None:
+            self.unknown_entityref(name)
+        else:
+            self.handle_data(convert_entityref(name))
+
     # Example -- handle data, should be overridden
     def handle_data(self, data):
         pass

Modified: python/trunk/Lib/test/test_sgmllib.py
==============================================================================
--- python/trunk/Lib/test/test_sgmllib.py	(original)
+++ python/trunk/Lib/test/test_sgmllib.py	Sat Jun 17 01:45:06 2006
@@ -64,6 +64,23 @@
         self.setliteral()
 
 
+class HTMLEntityCollector(EventCollector):
+    import re, htmlentitydefs
+    entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
+        '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')
+
+    def convert_charref(self, name):
+        self.append(("charref", "convert", name))
+        if name.startswith('x'):
+            return unichr(int(name[1:],16))
+        else:
+            return unichr(int(name))
+
+    def convert_entityref(self, name):
+        self.append(("entityref", "convert", name))
+        return unichr(self.htmlentitydefs.name2codepoint[name])
+
+
 class SGMLParserTestCase(unittest.TestCase):
 
     collector = EventCollector
@@ -233,6 +250,16 @@
                                 ("k", "&#42;"),
                                 ])])
 
+    def test_convert_overrides(self):
+        self.collector = HTMLEntityCollector
+        self.check_events('<a title="&ldquo;test&#x201d;">foo</a>', [
+            ('entityref', 'convert', 'ldquo'),
+            ('charref', 'convert', 'x201d'),
+            ('starttag', 'a', [('title', u'\u201ctest\u201d')]),
+            ('data', 'foo'),
+            ('endtag', 'a'),
+            ])
+
     def test_attr_funky_names(self):
         self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),

Modified: python/trunk/Misc/ACKS
==============================================================================
--- python/trunk/Misc/ACKS	(original)
+++ python/trunk/Misc/ACKS	Sat Jun 17 01:45:06 2006
@@ -528,6 +528,7 @@
 Saskia van Rossum
 Donald Wallace Rouse II
 Liam Routt
+Sam Ruby
 Paul Rubin
 Audun S. Runde
 Jeff Rush


More information about the Python-checkins mailing list