[Python-checkins] cpython: Issue #25638: Optimized ElementTree.iterparse(); it is now 2x faster.

serhiy.storchaka python-checkins at python.org
Sun Dec 6 19:31:36 EST 2015


https://hg.python.org/cpython/rev/dd67c8c53aea
changeset:   99484:dd67c8c53aea
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Mon Dec 07 02:31:11 2015 +0200
summary:
  Issue #25638: Optimized ElementTree.iterparse(); it is now 2x faster.

ElementTree.XMLParser._setevents now accepts any object with an append
method, not just a list.

files:
  Lib/xml/etree/ElementTree.py    |  92 +++++++-------------
  Misc/NEWS                       |   2 +
  Modules/_elementtree.c          |  35 ++++---
  Modules/clinic/_elementtree.c.h |   7 +-
  4 files changed, 56 insertions(+), 80 deletions(-)


diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py
--- a/Lib/xml/etree/ElementTree.py
+++ b/Lib/xml/etree/ElementTree.py
@@ -95,6 +95,7 @@
 import re
 import warnings
 import io
+import collections
 import contextlib
 
 from . import ElementPath
@@ -1198,16 +1199,37 @@
     Returns an iterator providing (event, elem) pairs.
 
     """
+    # Use the internal, undocumented _parser argument for now; When the
+    # parser argument of iterparse is removed, this can be killed.
+    pullparser = XMLPullParser(events=events, _parser=parser)
+    def iterator():
+        try:
+            while True:
+                yield from pullparser.read_events()
+                # load event buffer
+                data = source.read(16 * 1024)
+                if not data:
+                    break
+                pullparser.feed(data)
+            root = pullparser._close_and_return_root()
+            yield from pullparser.read_events()
+            it.root = root
+        finally:
+            if close_source:
+                source.close()
+
+    class IterParseIterator(collections.Iterator):
+        __next__ = iterator().__next__
+    it = IterParseIterator()
+    it.root = None
+    del iterator, IterParseIterator
+
     close_source = False
     if not hasattr(source, "read"):
         source = open(source, "rb")
         close_source = True
-    try:
-        return _IterParseIterator(source, events, parser, close_source)
-    except:
-        if close_source:
-            source.close()
-        raise
+
+    return it
 
 
 class XMLPullParser:
@@ -1217,9 +1239,7 @@
         # upon in user code. It will be removed in a future release.
         # See http://bugs.python.org/issue17741 for more details.
 
-        # _elementtree.c expects a list, not a deque
-        self._events_queue = []
-        self._index = 0
+        self._events_queue = collections.deque()
         self._parser = _parser or XMLParser(target=TreeBuilder())
         # wire up the parser for event reporting
         if events is None:
@@ -1257,64 +1277,14 @@
         retrieved from the iterator.
         """
         events = self._events_queue
-        while True:
-            index = self._index
-            try:
-                event = events[self._index]
-                # Avoid retaining references to past events
-                events[self._index] = None
-            except IndexError:
-                break
-            index += 1
-            # Compact the list in a O(1) amortized fashion
-            # As noted above, _elementree.c needs a list, not a deque
-            if index * 2 >= len(events):
-                events[:index] = []
-                self._index = 0
-            else:
-                self._index = index
+        while events:
+            event = events.popleft()
             if isinstance(event, Exception):
                 raise event
             else:
                 yield event
 
 
-class _IterParseIterator:
-
-    def __init__(self, source, events, parser, close_source=False):
-        # Use the internal, undocumented _parser argument for now; When the
-        # parser argument of iterparse is removed, this can be killed.
-        self._parser = XMLPullParser(events=events, _parser=parser)
-        self._file = source
-        self._close_file = close_source
-        self.root = self._root = None
-
-    def __next__(self):
-        try:
-            while 1:
-                for event in self._parser.read_events():
-                    return event
-                if self._parser._parser is None:
-                    break
-                # load event buffer
-                data = self._file.read(16 * 1024)
-                if data:
-                    self._parser.feed(data)
-                else:
-                    self._root = self._parser._close_and_return_root()
-            self.root = self._root
-        except:
-            if self._close_file:
-                self._file.close()
-            raise
-        if self._close_file:
-            self._file.close()
-        raise StopIteration
-
-    def __iter__(self):
-        return self
-
-
 def XML(text, parser=None):
     """Parse XML document from string constant.
 
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -109,6 +109,8 @@
 Library
 -------
 
+- Issue #25638: Optimized ElementTree.iterparse(); it is now 2x faster.
+
 - Issue #25761: Improved detecting errors in broken pickle data.
 
 - Issue #25717: Restore the previous behaviour of tolerating most fstat()
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@@ -2289,7 +2289,7 @@
     PyObject *element_factory;
 
     /* element tracing */
-    PyObject *events; /* list of events, or NULL if not collecting */
+    PyObject *events_append; /* the append method of the list of events, or NULL */
     PyObject *start_event_obj; /* event objects (NULL to ignore) */
     PyObject *end_event_obj;
     PyObject *start_ns_event_obj;
@@ -2324,7 +2324,7 @@
         }
         t->index = 0;
 
-        t->events = NULL;
+        t->events_append = NULL;
         t->start_event_obj = t->end_event_obj = NULL;
         t->start_ns_event_obj = t->end_ns_event_obj = NULL;
     }
@@ -2374,7 +2374,7 @@
     Py_CLEAR(self->start_ns_event_obj);
     Py_CLEAR(self->end_event_obj);
     Py_CLEAR(self->start_event_obj);
-    Py_CLEAR(self->events);
+    Py_CLEAR(self->events_append);
     Py_CLEAR(self->stack);
     Py_CLEAR(self->data);
     Py_CLEAR(self->last);
@@ -2455,13 +2455,14 @@
                          PyObject *node)
 {
     if (action != NULL) {
-        PyObject *res = PyTuple_Pack(2, action, node);
+        PyObject *res;
+        PyObject *event = PyTuple_Pack(2, action, node);
+        if (event == NULL)
+            return -1;
+        res = PyObject_CallFunctionObjArgs(self->events_append, event, NULL);
+        Py_DECREF(event);
         if (res == NULL)
             return -1;
-        if (PyList_Append(self->events, res) < 0) {
-            Py_DECREF(res);
-            return -1;
-        }
         Py_DECREF(res);
     }
     return 0;
@@ -3039,7 +3040,7 @@
     if (PyErr_Occurred())
         return;
 
-    if (!target->events || !target->start_ns_event_obj)
+    if (!target->events_append || !target->start_ns_event_obj)
         return;
 
     if (!uri)
@@ -3062,7 +3063,7 @@
     if (PyErr_Occurred())
         return;
 
-    if (!target->events)
+    if (!target->events_append)
         return;
 
     treebuilder_append_event(target, target->end_ns_event_obj, Py_None);
@@ -3551,7 +3552,7 @@
 /*[clinic input]
 _elementtree.XMLParser._setevents
 
-    events_queue: object(subclass_of='&PyList_Type')
+    events_queue: object
     events_to_report: object = None
     /
 
@@ -3561,12 +3562,12 @@
 _elementtree_XMLParser__setevents_impl(XMLParserObject *self,
                                        PyObject *events_queue,
                                        PyObject *events_to_report)
-/*[clinic end generated code: output=1440092922b13ed1 input=59db9742910c6174]*/
+/*[clinic end generated code: output=1440092922b13ed1 input=abf90830a1c3b0fc]*/
 {
     /* activate element event reporting */
     Py_ssize_t i, seqlen;
     TreeBuilderObject *target;
-    PyObject *events_seq;
+    PyObject *events_append, *events_seq;
 
     if (!TreeBuilder_CheckExact(self->target)) {
         PyErr_SetString(
@@ -3579,9 +3580,11 @@
 
     target = (TreeBuilderObject*) self->target;
 
-    Py_INCREF(events_queue);
-    Py_XDECREF(target->events);
-    target->events = events_queue;
+    events_append = PyObject_GetAttrString(events_queue, "append");
+    if (events_append == NULL)
+        return NULL;
+    Py_XDECREF(target->events_append);
+    target->events_append = events_append;
 
     /* clear out existing events */
     Py_CLEAR(target->start_event_obj);
diff --git a/Modules/clinic/_elementtree.c.h b/Modules/clinic/_elementtree.c.h
--- a/Modules/clinic/_elementtree.c.h
+++ b/Modules/clinic/_elementtree.c.h
@@ -668,12 +668,13 @@
     PyObject *events_queue;
     PyObject *events_to_report = Py_None;
 
-    if (!PyArg_ParseTuple(args, "O!|O:_setevents",
-        &PyList_Type, &events_queue, &events_to_report))
+    if (!PyArg_UnpackTuple(args, "_setevents",
+        1, 2,
+        &events_queue, &events_to_report))
         goto exit;
     return_value = _elementtree_XMLParser__setevents_impl(self, events_queue, events_to_report);
 
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=25b8bf7e7f2151ca input=a9049054013a1b77]*/
+/*[clinic end generated code: output=19d94e2d2726d3aa input=a9049054013a1b77]*/

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list