[Python-checkins] python/dist/src/Modules pyexpat.c,2.62,2.63

fdrake@users.sourceforge.net fdrake@users.sourceforge.net
Fri, 28 Jun 2002 15:56:51 -0700


Update of /cvsroot/python/python/dist/src/Modules
In directory usw-pr-cvs1:/tmp/cvs-serv32623/Modules

Modified Files:
	pyexpat.c 
Log Message:
Added character data buffering to pyexpat parser objects.

Setting the buffer_text attribute to true causes the parser to collect
character data, waiting as long as possible to report it to the Python
callback.  This can save an enormous number of callbacks from C to
Python, which can be a substantial performance improvement.

buffer_text defaults to false.


Index: pyexpat.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Modules/pyexpat.c,v
retrieving revision 2.62
retrieving revision 2.63
diff -C2 -d -r2.62 -r2.63
*** pyexpat.c	28 Jun 2002 22:29:01 -0000	2.62
--- pyexpat.c	28 Jun 2002 22:56:48 -0000	2.63
***************
*** 61,68 ****
--- 61,74 ----
      int specified_attributes;   /* Report only specified attributes. */
      int in_callback;            /* Is a callback active? */
+     XML_Char *buffer;           /* Buffer used when accumulating characters */
+                                 /* NULL if not enabled */
+     int buffer_size;            /* Size of buffer, in XML_Char units */
+     int buffer_used;            /* Buffer units in use */
      PyObject *intern;           /* Dictionary to intern strings */
      PyObject **handlers;
  } xmlparseobject;
  
+ #define CHARACTER_DATA_BUFFER_SIZE 8192
+ 
  staticforward PyTypeObject Xmlparsetype;
  
***************
*** 314,317 ****
--- 320,402 ----
  }
  
+ /* Call the CharacterData handler with len XML_Chars from buffer.  Return 0
+  * on success, -1 on exception; flag_error() is called first if needed.
+  */
+ static int
+ call_character_handler(xmlparseobject *self, const XML_Char *buffer, int len)
+ {
+     PyObject *args;
+     PyObject *temp;
+ 
+     args = PyTuple_New(1);
+     if (args == NULL)
+         return -1;              /* note: flag_error() is not called here */
+ #ifdef Py_USING_UNICODE
+     temp = (self->returns_unicode 
+             ? conv_string_len_to_unicode(buffer, len) 
+             : conv_string_len_to_utf8(buffer, len));
+ #else
+     temp = conv_string_len_to_utf8(buffer, len);
+ #endif
+     if (temp == NULL) {         /* conversion to a Python object failed */
+         Py_DECREF(args);
+         flag_error(self);
+         return -1;
+     }
+     PyTuple_SET_ITEM(args, 0, temp);
+     /* args owns the reference now; temp is only borrowed, so treat it as unused. */
+     self->in_callback = 1;      /* a callback is active while the handler runs */
+     temp = call_with_frame(getcode(CharacterData, "CharacterData", __LINE__),
+                            self->handlers[CharacterData], args);
+     /* temp is an owned reference again (the handler's result), or NULL */
+     self->in_callback = 0;
+     Py_DECREF(args);
+     if (temp == NULL) {         /* the handler call did not produce a result */
+         flag_error(self);
+         return -1;
+     }
+     Py_DECREF(temp);            /* the handler's return value is discarded */
+     return 0;
+ }
+ 
+ static int
+ flush_character_buffer(xmlparseobject *self)
+ {   /* Report any accumulated character data through the handler. */
+     int rc;
+     if (self->buffer == NULL || self->buffer_used == 0)
+         return 0;               /* buffering not enabled, or nothing pending */
+     rc = call_character_handler(self, self->buffer, self->buffer_used);
+     self->buffer_used = 0;      /* buffer is emptied even if the call failed */
+     return rc;                  /* 0 on success, -1 on exception */
+ }
+ 
+ static void
+ my_CharacterDataHandler(void *userData, const XML_Char *data, int len) 
+ {   /* Forward character data at once, or accumulate it in self->buffer. */
+     xmlparseobject *self = (xmlparseobject *) userData;
+     if (self->buffer == NULL)   /* buffering not enabled: report immediately */
+         call_character_handler(self, data, len);  /* return value not checked */
+     else {
+         if ((self->buffer_used + len) > self->buffer_size) {
+             if (flush_character_buffer(self) < 0)   /* no room: flush first */
+                 return;         /* flush failed; the new data is not stored */
+             /* handler might have changed; drop the rest on the floor
+              * if there isn't a handler anymore
+              */
+             if (!have_handler(self, CharacterData))
+                 return;
+         }
+         if (len > self->buffer_size) {  /* larger than the entire buffer */
+             call_character_handler(self, data, len);  /* bypass the buffer */
+             self->buffer_used = 0;
+         }
+         else {                  /* fits: append after the pending data */
+             memcpy(self->buffer + self->buffer_used,
+                    data, len * sizeof(XML_Char));
+             self->buffer_used += len;
+         }
+     }
+ }
+ 
  static void
  my_StartElementHandler(void *userData,
***************
*** 324,327 ****
--- 409,414 ----
          int i, max;
  
+         if (flush_character_buffer(self) < 0)
+             return;
          /* Set max to the number of slots filled in atts[]; max/2 is
           * the number of attributes we need to process.
***************
*** 403,406 ****
--- 490,495 ----
  \
      if (have_handler(self, NAME)) { \
+         if (flush_character_buffer(self) < 0) \
+             return RETURN; \
          args = Py_BuildValue PARAM_FORMAT ;\
          if (!args) { flag_error(self); return RETURN;} \
***************
*** 439,454 ****
               ("(NO&)", string_intern(self, target), STRING_CONV_FUNC,data))
  
- #ifndef Py_USING_UNICODE
- VOID_HANDLER(CharacterData,
-              (void *userData, const XML_Char *data, int len),
-              ("(N)", conv_string_len_to_utf8(data,len)))
- #else
- VOID_HANDLER(CharacterData,
-              (void *userData, const XML_Char *data, int len),
-              ("(N)", (self->returns_unicode
-                       ? conv_string_len_to_unicode(data,len)
-                       : conv_string_len_to_utf8(data,len))))
- #endif
- 
  VOID_HANDLER(UnparsedEntityDecl,
               (void *userData,
--- 528,531 ----
***************
*** 674,677 ****
--- 751,757 ----
          return set_error(self);
      }
+     if (flush_character_buffer(self) < 0) {
+         return NULL;
+     }
      return PyInt_FromLong(rv);
  }
***************
*** 891,894 ****
--- 971,985 ----
      if (new_parser == NULL)
          return NULL;
+     new_parser->buffer_size = self->buffer_size;
+     new_parser->buffer_used = 0;
+     if (self->buffer != NULL) {
+         new_parser->buffer = malloc(new_parser->buffer_size);
+         if (new_parser->buffer == NULL) {
+             PyObject_GC_Del(new_parser);
+             return PyErr_NoMemory();
+         }
+     }
+     else
+         new_parser->buffer = NULL;
      new_parser->returns_unicode = self->returns_unicode;
      new_parser->ordered_attributes = self->ordered_attributes;
***************
*** 914,921 ****
  
      /* allocate and clear handlers first */
!     for(i = 0; handler_info[i].name != NULL; i++)
          /* do nothing */;
  
!     new_parser->handlers = malloc(sizeof(PyObject *)*i);
      if (!new_parser->handlers) {
          Py_DECREF(new_parser);
--- 1005,1012 ----
  
      /* allocate and clear handlers first */
!     for (i = 0; handler_info[i].name != NULL; i++)
          /* do nothing */;
  
!     new_parser->handlers = malloc(sizeof(PyObject *) * i);
      if (!new_parser->handlers) {
          Py_DECREF(new_parser);
***************
*** 1054,1057 ****
--- 1145,1151 ----
      self->returns_unicode = 1;
  #endif
+     self->buffer = NULL;
+     self->buffer_size = CHARACTER_DATA_BUFFER_SIZE;
+     self->buffer_used = 0;
      self->ordered_attributes = 0;
      self->specified_attributes = 0;
***************
*** 1082,1086 ****
  #endif
  
!     for(i = 0; handler_info[i].name != NULL; i++)
          /* do nothing */;
  
--- 1176,1180 ----
  #endif
  
!     for (i = 0; handler_info[i].name != NULL; i++)
          /* do nothing */;
  
***************
*** 1119,1122 ****
--- 1213,1220 ----
          self->handlers = NULL;
      }
+     if (self->buffer != NULL) {
+         free(self->buffer);
+         self->buffer = NULL;
+     }
      Py_XDECREF(self->intern);
  #if PY_MAJOR_VERSION == 1 && PY_MINOR_VERSION < 6
***************
*** 1180,1183 ****
--- 1278,1289 ----
                                    XML_GetErrorByteIndex(self->itself));
      }
+     if (name[0] == 'b') {
+         if (strcmp(name, "buffer_size") == 0)
+             return PyInt_FromLong((long) self->buffer_size);
+         if (strcmp(name, "buffer_text") == 0)
+             return get_pybool(self->buffer != NULL);
+         if (strcmp(name, "buffer_used") == 0)
+             return PyInt_FromLong((long) self->buffer_used);
+     }
      if (strcmp(name, "ordered_attributes") == 0)
          return get_pybool(self->ordered_attributes);
***************
*** 1207,1210 ****
--- 1313,1319 ----
          PyList_Append(rc, PyString_FromString("ErrorColumnNumber"));
          PyList_Append(rc, PyString_FromString("ErrorByteIndex"));
+         PyList_Append(rc, PyString_FromString("buffer_size"));
+         PyList_Append(rc, PyString_FromString("buffer_text"));
+         PyList_Append(rc, PyString_FromString("buffer_used"));
          PyList_Append(rc, PyString_FromString("ordered_attributes"));
          PyList_Append(rc, PyString_FromString("returns_unicode"));
***************
*** 1247,1250 ****
--- 1356,1378 ----
          return -1;
      }
+     if (strcmp(name, "buffer_text") == 0) {
+         if (PyObject_IsTrue(v)) {
+             if (self->buffer == NULL) {
+                 self->buffer = malloc(self->buffer_size);
+                 if (self->buffer == NULL) {
+                     PyErr_NoMemory();
+                     return -1;
+                 }
+                 self->buffer_used = 0;
+             }
+         }
+         else if (self->buffer != NULL) {
+             if (flush_character_buffer(self) < 0)
+                 return -1;
+             free(self->buffer);
+             self->buffer = NULL;
+         }
+         return 0;
+     }
      if (strcmp(name, "ordered_attributes") == 0) {
          if (PyObject_IsTrue(v))
***************
*** 1275,1278 ****
--- 1403,1415 ----
          return 0;
      }
+     if (strcmp(name, "CharacterDataHandler") == 0) {
+         /* If we're changing the character data handler, flush all
+          * cached data with the old handler.  There may be no single
+          * "right" thing to do here, but this probably won't
+          * happen in practice.
+          */
+         if (flush_character_buffer(self) < 0)
+             return -1;
+     }
      if (sethandler(self, name, v)) {
          return 0;
***************
*** 1659,1672 ****
      {"UnparsedEntityDeclHandler",
       (xmlhandlersetter)XML_SetUnparsedEntityDeclHandler,
!      (xmlhandler)my_UnparsedEntityDeclHandler },
      {"NotationDeclHandler",
       (xmlhandlersetter)XML_SetNotationDeclHandler,
!      (xmlhandler)my_NotationDeclHandler },
      {"StartNamespaceDeclHandler",
       (xmlhandlersetter)XML_SetStartNamespaceDeclHandler,
!      (xmlhandler)my_StartNamespaceDeclHandler },
      {"EndNamespaceDeclHandler",
       (xmlhandlersetter)XML_SetEndNamespaceDeclHandler,
!      (xmlhandler)my_EndNamespaceDeclHandler },
      {"CommentHandler",
       (xmlhandlersetter)XML_SetCommentHandler,
--- 1796,1809 ----
      {"UnparsedEntityDeclHandler",
       (xmlhandlersetter)XML_SetUnparsedEntityDeclHandler,
!      (xmlhandler)my_UnparsedEntityDeclHandler},
      {"NotationDeclHandler",
       (xmlhandlersetter)XML_SetNotationDeclHandler,
!      (xmlhandler)my_NotationDeclHandler},
      {"StartNamespaceDeclHandler",
       (xmlhandlersetter)XML_SetStartNamespaceDeclHandler,
!      (xmlhandler)my_StartNamespaceDeclHandler},
      {"EndNamespaceDeclHandler",
       (xmlhandlersetter)XML_SetEndNamespaceDeclHandler,
!      (xmlhandler)my_EndNamespaceDeclHandler},
      {"CommentHandler",
       (xmlhandlersetter)XML_SetCommentHandler,
***************
*** 1689,1693 ****
      {"ExternalEntityRefHandler",
       (xmlhandlersetter)XML_SetExternalEntityRefHandler,
!      (xmlhandler)my_ExternalEntityRefHandler },
      {"StartDoctypeDeclHandler",
       (xmlhandlersetter)XML_SetStartDoctypeDeclHandler,
--- 1826,1830 ----
      {"ExternalEntityRefHandler",
       (xmlhandlersetter)XML_SetExternalEntityRefHandler,
!      (xmlhandler)my_ExternalEntityRefHandler},
      {"StartDoctypeDeclHandler",
       (xmlhandlersetter)XML_SetStartDoctypeDeclHandler,