[XML-SIG] sgmlop and html parsing

Alexandre Fayolle Alexandre.Fayolle at logilab.fr
Wed Jan 14 05:49:06 EST 2004


Here's the patch I came up with. It fixes the bug that was reported on
Debian (http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=227219), but
I'd like to have some feedback before committing the change to CVS.

diff -u -r1.7 drv_sgmlop.py
--- xml/sax/drivers2/drv_sgmlop.py      21 Jan 2003 12:42:28 -0000      1.7
+++ xml/sax/drivers2/drv_sgmlop.py      14 Jan 2004 10:40:00 -0000
@@ -99,6 +99,29 @@
         if self._lexical_handler is not None:
             self._lexical_handler.comment(to_xml_string(data,self._encoding))
 
+    def handle_charref(self, name):
+        try:
+            if name[0] == 'x':
+                n = int(name[1:], 16)
+            else:
+                n = int(name)
+        except ValueError:
+            self.unknown_charref(name)
+            return
+        try:
+            unichar = unichr(n)
+        except NameError:
+            if not 0 <= n <= 255:
+                self.unknown_charref(name)
+                return
+            self.handle_data(chr(n))
+        else:
+            prev_encoding = self.getProperty(handler.property_encoding)
+            self.setProperty(handler.property_encoding, 'utf-8')
+            self.handle_data(unichar.encode('utf-8'))
+            self.setProperty(handler.property_encoding, prev_encoding)
+
+
     def setProperty(self,name,value):
         if name == handler.property_lexical_handler:
             self._lexical_handler = value
@@ -113,6 +136,7 @@
             return self._encoding
         raise SAXNotRecognizedException("Property '%s' not recognized" % name)
 
+
 ##    def getFeature(self, name):
 ##        if name == handler.feature_namespaces:
 ##            return self._namespaces
 

-- 
Alexandre Fayolle
LOGILAB, Paris (France).
http://www.logilab.com   http://www.logilab.fr  http://www.logilab.org
Développement logiciel avancé - Intelligence Artificielle - Formations



More information about the XML-SIG mailing list