[Python-checkins] r83851 - in python/branches/py3k: Doc/library/xml.etree.elementtree.rst Lib/test/test_xml_etree.py Lib/xml/etree/ElementTree.py Misc/NEWS

florent.xicluna python-checkins at python.org
Sun Aug 8 21:48:30 CEST 2010


Author: florent.xicluna
Date: Sun Aug  8 21:48:29 2010
New Revision: 83851

Log:
Issue #8047: Fix the xml.etree serializer to return bytes by default.
Use ``encoding="unicode"`` to generate a Unicode string.



Modified:
   python/branches/py3k/Doc/library/xml.etree.elementtree.rst
   python/branches/py3k/Lib/test/test_xml_etree.py
   python/branches/py3k/Lib/xml/etree/ElementTree.py
   python/branches/py3k/Misc/NEWS

Modified: python/branches/py3k/Doc/library/xml.etree.elementtree.rst
==============================================================================
--- python/branches/py3k/Doc/library/xml.etree.elementtree.rst	(original)
+++ python/branches/py3k/Doc/library/xml.etree.elementtree.rst	Sun Aug  8 21:48:29 2010
@@ -148,20 +148,22 @@
    arguments.  Returns an element instance.
 
 
-.. function:: tostring(element, encoding=None, method="xml")
+.. function:: tostring(element, encoding="us-ascii", method="xml")
 
    Generates a string representation of an XML element, including all
    subelements.  *element* is an :class:`Element` instance.  *encoding* [1]_ is
-   the output encoding (default is None).  *method* is either ``"xml"``,
+   the output encoding (default is US-ASCII).  Use ``encoding="unicode"`` to
+   generate a Unicode string.  *method* is either ``"xml"``,
    ``"html"`` or ``"text"`` (default is ``"xml"``).  Returns an (optionally)
    encoded string containing the XML data.
 
 
-.. function:: tostringlist(element, encoding=None, method="xml")
+.. function:: tostringlist(element, encoding="us-ascii", method="xml")
 
    Generates a string representation of an XML element, including all
    subelements.  *element* is an :class:`Element` instance.  *encoding* [1]_ is
-   the output encoding (default is None).   *method* is either ``"xml"``,
+   the output encoding (default is US-ASCII).  Use ``encoding="unicode"`` to
+   generate a Unicode string.  *method* is either ``"xml"``,
    ``"html"`` or ``"text"`` (default is ``"xml"``).  Returns a list of
    (optionally) encoded strings containing the XML data.  It does not guarantee
    any specific sequence, except that ``"".join(tostringlist(element)) ==
@@ -430,6 +432,7 @@
 
 
    .. method:: getroot()
+
       Returns the root element for this tree.
 
 
@@ -457,15 +460,16 @@
       root element.
 
 
-   .. method:: write(file, encoding=None, xml_declaration=None, method="xml")
+   .. method:: write(file, encoding="us-ascii", xml_declaration=None, method="xml")
 
       Writes the element tree to a file, as XML.  *file* is a file name, or a
       file object opened for writing.  *encoding* [1]_ is the output encoding
-      (default is None).  *xml_declaration* controls if an XML declaration
+      (default is US-ASCII).  Use ``encoding="unicode"`` to write a Unicode string.
+      *xml_declaration* controls if an XML declaration
       should be added to the file.  Use False for never, True for always, None
-      for only if not US-ASCII or UTF-8 (default is None).  *method* is either
-      ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``).  Returns an
-      (optionally) encoded string.
+      for only if not US-ASCII, UTF-8 or Unicode (default is None).  *method* is
+      either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``).
+      Returns an (optionally) encoded string.
 
 This is the XML file that is going to be manipulated::
 

Modified: python/branches/py3k/Lib/test/test_xml_etree.py
==============================================================================
--- python/branches/py3k/Lib/test/test_xml_etree.py	(original)
+++ python/branches/py3k/Lib/test/test_xml_etree.py	Sun Aug  8 21:48:29 2010
@@ -71,14 +71,14 @@
     if not hasattr(method, '__call__'):
         print(method, "not callable")
 
-def serialize(elem, to_string=True, **options):
+def serialize(elem, to_string=True, encoding='unicode', **options):
     import io
-    if options.get("encoding"):
+    if encoding != 'unicode':
         file = io.BytesIO()
     else:
         file = io.StringIO()
     tree = ET.ElementTree(elem)
-    tree.write(file, **options)
+    tree.write(file, encoding=encoding, **options)
     if to_string:
         return file.getvalue()
     else:
@@ -537,7 +537,7 @@
     >>> elem.set('testa', 'testval')
     >>> elem.set('testb', 'test2')
     >>> ET.tostring(elem)
-    '<test testa="testval" testb="test2">aa</test>'
+    b'<test testa="testval" testb="test2">aa</test>'
     >>> sorted(elem.keys())
     ['testa', 'testb']
     >>> sorted(elem.items())
@@ -547,7 +547,7 @@
     >>> elem.attrib['testb'] = 'test1'
     >>> elem.attrib['testc'] = 'test2'
     >>> ET.tostring(elem)
-    '<test testa="testval" testb="test1" testc="test2">aa</test>'
+    b'<test testa="testval" testb="test1" testc="test2">aa</test>'
     """
 
 def makeelement():
@@ -587,7 +587,7 @@
 
     >>> tree = ET.parse(SIMPLE_XMLFILE)
     >>> normalize_crlf(tree)
-    >>> tree.write(sys.stdout)
+    >>> tree.write(sys.stdout, encoding='unicode')
     <root>
        <element key="value">text</element>
        <element>text</element>tail
@@ -595,7 +595,7 @@
     </root>
     >>> tree = ET.parse(SIMPLE_NS_XMLFILE)
     >>> normalize_crlf(tree)
-    >>> tree.write(sys.stdout)
+    >>> tree.write(sys.stdout, encoding='unicode')
     <ns0:root xmlns:ns0="namespace">
        <ns0:element key="value">text</ns0:element>
        <ns0:element>text</ns0:element>tail
@@ -636,17 +636,17 @@
 def parseliteral():
     """
     >>> element = ET.XML("<html><body>text</body></html>")
-    >>> ET.ElementTree(element).write(sys.stdout)
+    >>> ET.ElementTree(element).write(sys.stdout, encoding='unicode')
     <html><body>text</body></html>
     >>> element = ET.fromstring("<html><body>text</body></html>")
-    >>> ET.ElementTree(element).write(sys.stdout)
+    >>> ET.ElementTree(element).write(sys.stdout, encoding='unicode')
     <html><body>text</body></html>
     >>> sequence = ["<html><body>", "text</bo", "dy></html>"]
     >>> element = ET.fromstringlist(sequence)
     >>> print(ET.tostring(element))
-    <html><body>text</body></html>
-    >>> print("".join(ET.tostringlist(element)))
-    <html><body>text</body></html>
+    b'<html><body>text</body></html>'
+    >>> print(b"".join(ET.tostringlist(element)))
+    b'<html><body>text</body></html>'
     >>> ET.tostring(element, "ascii")
     b"<?xml version='1.0' encoding='ascii'?>\\n<html><body>text</body></html>"
     >>> _, ids = ET.XMLID("<html><body>text</body></html>")
@@ -875,10 +875,10 @@
     """
     >>> elem = ET.XML("<html><body>text</body></html>")
     >>> ET.tostring(elem)
-    '<html><body>text</body></html>'
+    b'<html><body>text</body></html>'
     >>> elem = ET.fromstring("<html><body>text</body></html>")
     >>> ET.tostring(elem)
-    '<html><body>text</body></html>'
+    b'<html><body>text</body></html>'
     """
 
 def check_encoding(encoding):
@@ -1233,14 +1233,14 @@
     Test ProcessingInstruction directly
 
     >>> ET.tostring(ET.ProcessingInstruction('test', 'instruction'))
-    '<?test instruction?>'
+    b'<?test instruction?>'
     >>> ET.tostring(ET.PI('test', 'instruction'))
-    '<?test instruction?>'
+    b'<?test instruction?>'
 
     Issue #2746
 
     >>> ET.tostring(ET.PI('test', '<testing&>'))
-    '<?test <testing&>?>'
+    b'<?test <testing&>?>'
     >>> ET.tostring(ET.PI('test', '<testing&>\xe3'), 'latin1')
     b"<?xml version='1.0' encoding='latin1'?>\\n<?test <testing&>\\xe3?>"
     """
@@ -1643,11 +1643,11 @@
 
     >>> e = ET.Element('SomeTag', text="def _f():\n  return 3\n")
     >>> ET.tostring(e)
-    '<SomeTag text="def _f():&#10;  return 3&#10;" />'
+    b'<SomeTag text="def _f():&#10;  return 3&#10;" />'
     >>> ET.XML(ET.tostring(e)).get("text")
     'def _f():\n  return 3\n'
     >>> ET.tostring(ET.XML(ET.tostring(e)))
-    '<SomeTag text="def _f():&#10;  return 3&#10;" />'
+    b'<SomeTag text="def _f():&#10;  return 3&#10;" />'
 
     """
 
@@ -1698,15 +1698,15 @@
     """
 
     >>> ET.tostring(ET.Element("{http://namespace.invalid/does/not/exist/}title"))
-    '<ns0:title xmlns:ns0="http://namespace.invalid/does/not/exist/" />'
+    b'<ns0:title xmlns:ns0="http://namespace.invalid/does/not/exist/" />'
     >>> ET.register_namespace("foo", "http://namespace.invalid/does/not/exist/")
     >>> ET.tostring(ET.Element("{http://namespace.invalid/does/not/exist/}title"))
-    '<foo:title xmlns:foo="http://namespace.invalid/does/not/exist/" />'
+    b'<foo:title xmlns:foo="http://namespace.invalid/does/not/exist/" />'
 
     And the Dublin Core namespace is in the default list:
 
     >>> ET.tostring(ET.Element("{http://purl.org/dc/elements/1.1/}title"))
-    '<dc:title xmlns:dc="http://purl.org/dc/elements/1.1/" />'
+    b'<dc:title xmlns:dc="http://purl.org/dc/elements/1.1/" />'
 
     """
 
@@ -1792,7 +1792,7 @@
     '{${stuff}}localname'
     >>> t = ET.ElementTree(e)
     >>> ET.tostring(e)
-    '<ns0:localname xmlns:ns0="${stuff}" />'
+    b'<ns0:localname xmlns:ns0="${stuff}" />'
 
     """
 

Modified: python/branches/py3k/Lib/xml/etree/ElementTree.py
==============================================================================
--- python/branches/py3k/Lib/xml/etree/ElementTree.py	(original)
+++ python/branches/py3k/Lib/xml/etree/ElementTree.py	Sun Aug  8 21:48:29 2010
@@ -792,12 +792,13 @@
     # @def write(file, **options)
     # @param file A file name, or a file object opened for writing.
     # @param **options Options, given as keyword arguments.
-    # @keyparam encoding Optional output encoding (default is None).
+    # @keyparam encoding Optional output encoding (default is US-ASCII).
+    #     Use "unicode" to write a Unicode string.
     # @keyparam method Optional output method ("xml", "html", "text" or
     #     "c14n"; default is "xml").
     # @keyparam xml_declaration Controls if an XML declaration should
     #     be added to the file.  Use False for never, True for always,
-    #     None for only if not US-ASCII or UTF-8.  None is default.
+    #     None for only if not US-ASCII, UTF-8 or Unicode.  None is default.
 
     def write(self, file_or_filename,
               # keyword arguments
@@ -811,14 +812,23 @@
         elif method not in _serialize:
             # FIXME: raise an ImportError for c14n if ElementC14N is missing?
             raise ValueError("unknown method %r" % method)
+        if not encoding:
+            if method == "c14n":
+                encoding = "utf-8"
+            else:
+                encoding = "us-ascii"
+        elif encoding == str:  # lxml.etree compatibility.
+            encoding = "unicode"
+        else:
+            encoding = encoding.lower()
         if hasattr(file_or_filename, "write"):
             file = file_or_filename
         else:
-            if encoding:
+            if encoding != "unicode":
                 file = open(file_or_filename, "wb")
             else:
                 file = open(file_or_filename, "w")
-        if encoding:
+        if encoding != "unicode":
             def write(text):
                 try:
                     return file.write(text.encode(encoding,
@@ -827,20 +837,15 @@
                     _raise_serialization_error(text)
         else:
             write = file.write
-        if not encoding:
-            if method == "c14n":
-                encoding = "utf-8"
-            else:
-                encoding = None
-        elif xml_declaration or (xml_declaration is None and
-                                 encoding not in ("utf-8", "us-ascii")):
-            if method == "xml":
-                encoding_ = encoding
-                if not encoding:
-                    # Retrieve the default encoding for the xml declaration
-                    import locale
-                    encoding_ = locale.getpreferredencoding()
-                write("<?xml version='1.0' encoding='%s'?>\n" % encoding_)
+        if method == "xml" and (xml_declaration or
+                (xml_declaration is None and
+                 encoding not in ("utf-8", "us-ascii", "unicode"))):
+            declared_encoding = encoding
+            if encoding == "unicode":
+                # Retrieve the default encoding for the xml declaration
+                import locale
+                declared_encoding = locale.getpreferredencoding()
+            write("<?xml version='1.0' encoding='%s'?>\n" % declared_encoding)
         if method == "text":
             _serialize_text(write, self._root)
         else:
@@ -1127,11 +1132,12 @@
 
 ##
 # Generates a string representation of an XML element, including all
-# subelements.  If encoding is None, the return type is a string;
+# subelements.  If encoding is "unicode", the return type is a string;
 # otherwise it is a bytes array.
 #
 # @param element An Element instance.
-# @keyparam encoding Optional output encoding (default is None).
+# @keyparam encoding Optional output encoding (default is US-ASCII).
+#     Use "unicode" to return a Unicode string.
 # @keyparam method Optional output method ("xml", "html", "text" or
 #     "c14n"; default is "xml").
 # @return An (optionally) encoded string containing the XML data.
@@ -1144,17 +1150,20 @@
     file = dummy()
     file.write = data.append
     ElementTree(element).write(file, encoding, method=method)
-    if encoding:
-        return b"".join(data)
-    else:
+    if encoding in (str, "unicode"):
         return "".join(data)
+    else:
+        return b"".join(data)
 
 ##
 # Generates a string representation of an XML element, including all
-# subelements.  The string is returned as a sequence of string fragments.
+# subelements.  If encoding is "unicode", the string is returned as a
+# sequence of string fragments; otherwise it is a sequence of
+# bytestrings.
 #
 # @param element An Element instance.
 # @keyparam encoding Optional output encoding (default is US-ASCII).
+#     Use "unicode" to return a Unicode string.
 # @keyparam method Optional output method ("xml", "html", "text" or
 #     "c14n"; default is "xml").
 # @return A sequence object containing the XML data.
@@ -1184,7 +1193,7 @@
     # debugging
     if not isinstance(elem, ElementTree):
         elem = ElementTree(elem)
-    elem.write(sys.stdout)
+    elem.write(sys.stdout, encoding="unicode")
     tail = elem.getroot().tail
     if not tail or tail[-1] != "\n":
         sys.stdout.write("\n")

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS	(original)
+++ python/branches/py3k/Misc/NEWS	Sun Aug  8 21:48:29 2010
@@ -55,6 +55,9 @@
 Library
 -------
 
+- Issue #8047: Fix the xml.etree serializer to return bytes by default.  Use
+  ``encoding="unicode"`` to generate a Unicode string.
+
 - Fix Issue8280 - urllib2's Request method will remove fragements in the url.
   This is how it is supposed to work, wget and curl do the same.  Previous
   behavior was wrong.


More information about the Python-checkins mailing list