[Python-checkins] python/dist/src/Python ceval.c,2.347,2.348 compile.c,2.272,2.273 sysmodule.c,2.112,2.113

jhylton@users.sourceforge.net jhylton@users.sourceforge.net
Wed, 05 Feb 2003 15:13:04 -0800


Update of /cvsroot/python/python/dist/src/Python
In directory sc8-pr-cvs1:/tmp/cvs-serv3401/Python

Modified Files:
	ceval.c compile.c sysmodule.c 
Log Message:
Small function call optimization and special build option for call stats.

-DCALL_PROFILE: Count the number of function calls executed.

When this symbol is defined, the ceval mainloop and helper functions
count the number of function calls made.  They keep detailed statistics
about what kind of object was called and whether the call hit any of
the special fast paths in the code.

Optimization:

When we take the fast_function() path, which seems to be taken for
most function calls, and there is minimal frame setup to do, avoid
calling PyEval_EvalCodeEx().  That function does a lot of
work to handle keyword args and star args, free variables,
generators, etc.  The inlined version simply allocates the frame and
copies the argument values into the frame.

The optimization gets a little help from compile.c which adds a
CO_NOFREE flag to code objects that don't have free variables or cell
variables.  This change allows fast_function() to get into the fast
path with fewer tests.

I measure a couple of percent speedup in pystone with this change, but
there's surely more that can be done.


Index: ceval.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Python/ceval.c,v
retrieving revision 2.347
retrieving revision 2.348
diff -C2 -d -r2.347 -r2.348
*** ceval.c	19 Jan 2003 05:08:13 -0000	2.347
--- ceval.c	5 Feb 2003 23:12:57 -0000	2.348
***************
*** 88,91 ****
--- 88,147 ----
  #endif
  
+ /* Function call profile */
+ #ifdef CALL_PROFILE
+ #define PCALL_NUM 11
+ static int pcall[PCALL_NUM];
+ 
+ #define PCALL_ALL 0
+ #define PCALL_FUNCTION 1
+ #define PCALL_FAST_FUNCTION 2
+ #define PCALL_FASTER_FUNCTION 3
+ #define PCALL_METHOD 4
+ #define PCALL_BOUND_METHOD 5
+ #define PCALL_CFUNCTION 6
+ #define PCALL_TYPE 7
+ #define PCALL_GENERATOR 8
+ #define PCALL_OTHER 9
+ #define PCALL_POP 10
+ 
+ /* Notes about the statistics
+ 
+    PCALL_FAST stats
+ 
+    FAST_FUNCTION means no argument tuple needs to be created.
+    FASTER_FUNCTION means that the fast-path frame setup code is used.
+ 
+    If there is a method call where the call can be optimized by changing
+    the argument tuple and calling the function directly, it gets recorded
+    twice.
+ 
+    As a result, the relationship among the statistics appears to be
+    PCALL_ALL == PCALL_FUNCTION + PCALL_METHOD - PCALL_BOUND_METHOD +
+                 PCALL_CFUNCTION + PCALL_TYPE + PCALL_GENERATOR + PCALL_OTHER
+    PCALL_FUNCTION > PCALL_FAST_FUNCTION > PCALL_FASTER_FUNCTION
+    PCALL_METHOD > PCALL_BOUND_METHOD
+ */
+ 
+ #define PCALL(POS) pcall[POS]++
+ 
+ PyObject *
+ PyEval_GetCallStats(PyObject *self)
+ {
+ 	return Py_BuildValue("iiiiiiiiii", 
+ 			     pcall[0], pcall[1], pcall[2], pcall[3],
+ 			     pcall[4], pcall[5], pcall[6], pcall[7],
+ 			     pcall[8], pcall[9]);
+ }
+ #else
+ #define PCALL(O)
+ 
+ PyObject *
+ PyEval_GetCallStats(PyObject *self)
+ {
+ 	Py_INCREF(Py_None);
+ 	return Py_None;
+ }
+ #endif
+ 
  static PyTypeObject gentype;
  
***************
*** 476,479 ****
--- 532,536 ----
  PyEval_EvalCode(PyCodeObject *co, PyObject *globals, PyObject *locals)
  {
+ 	/* XXX raise SystemError if globals is NULL */
  	return PyEval_EvalCodeEx(co,
  			  globals, locals,
***************
*** 1981,1984 ****
--- 2038,2042 ----
  
  		case CALL_FUNCTION:
+ 			PCALL(PCALL_ALL);
  			x = call_function(&stack_pointer, oparg);
  			PUSH(x);
***************
*** 1996,1999 ****
--- 2054,2058 ----
  		    int n = na + 2 * nk;
  		    PyObject **pfunc, *func;
+ 		    PCALL(PCALL_ALL);
  		    if (flags & CALL_FLAG_VAR)
  			    n++;
***************
*** 2318,2324 ****
  	}
  
! 	f = PyFrame_New(tstate,			/*back*/
! 			co,			/*code*/
! 			globals, locals);
  	if (f == NULL)
  		return NULL;
--- 2377,2382 ----
  	}
  
! 	assert(globals != NULL);
! 	f = PyFrame_New(tstate, co, globals, locals);
  	if (f == NULL)
  		return NULL;
***************
*** 2521,2524 ****
--- 2579,2584 ----
  		f->f_back = NULL;
  
+ 		PCALL(PCALL_GENERATOR);
+ 
  		/* Create a new generator that owns the ready to run frame
  		 * and return that as the value. */
***************
*** 3199,3208 ****
  	PyObject *x, *w;
  
! 	/* Always dispatch PyCFunction first, because
! 	   these are presumed to be the most frequent
! 	   callable object.
  	*/
  	if (PyCFunction_Check(func) && nk == 0) {
  		int flags = PyCFunction_GET_FLAGS(func);
  		if (flags & (METH_NOARGS | METH_O)) {
  			PyCFunction meth = PyCFunction_GET_FUNCTION(func);
--- 3259,3268 ----
  	PyObject *x, *w;
  
! 	/* Always dispatch PyCFunction first, because these are
! 	   presumed to be the most frequent callable object.
  	*/
  	if (PyCFunction_Check(func) && nk == 0) {
  		int flags = PyCFunction_GET_FLAGS(func);
+ 		PCALL(PCALL_CFUNCTION);
  		if (flags & (METH_NOARGS | METH_O)) {
  			PyCFunction meth = PyCFunction_GET_FUNCTION(func);
***************
*** 3230,3233 ****
--- 3290,3295 ----
  			/* optimize access to bound methods */
  			PyObject *self = PyMethod_GET_SELF(func);
+ 			PCALL(PCALL_METHOD);
+ 			PCALL(PCALL_BOUND_METHOD);
  			Py_INCREF(self);
  			func = PyMethod_GET_FUNCTION(func);
***************
*** 3246,3252 ****
--- 3308,3316 ----
  	}
  	
+ 	/* What does this do? */
  	while ((*pp_stack) > pfunc) {
  		w = EXT_POP(*pp_stack);
  		Py_DECREF(w);
+ 		PCALL(PCALL_POP);
  	}
  	return x;
***************
*** 3255,3258 ****
--- 3319,3327 ----
  /* The fast_function() function optimize calls for which no argument
     tuple is necessary; the objects are passed directly from the stack.
+    For the simplest case -- a function that takes only positional
+    arguments and is called with only positional arguments -- it
+    inlines the most primitive frame setup code from
+    PyEval_EvalCodeEx(), which vastly reduces the checks that must be
+    done before evaluating the frame.
  */
  
***************
*** 3260,3278 ****
  fast_function(PyObject *func, PyObject ***pp_stack, int n, int na, int nk)
  {
! 	PyObject *co = PyFunction_GET_CODE(func);
  	PyObject *globals = PyFunction_GET_GLOBALS(func);
  	PyObject *argdefs = PyFunction_GET_DEFAULTS(func);
- 	PyObject *closure = PyFunction_GET_CLOSURE(func);
  	PyObject **d = NULL;
  	int nd = 0;
  
  	if (argdefs != NULL) {
  		d = &PyTuple_GET_ITEM(argdefs, 0);
  		nd = ((PyTupleObject *)argdefs)->ob_size;
  	}
! 	return PyEval_EvalCodeEx((PyCodeObject *)co, globals,
! 			  (PyObject *)NULL, (*pp_stack)-n, na,
! 			  (*pp_stack)-2*nk, nk, d, nd,
! 			  closure);
  }
  
--- 3329,3380 ----
  fast_function(PyObject *func, PyObject ***pp_stack, int n, int na, int nk)
  {
! 	PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func);
  	PyObject *globals = PyFunction_GET_GLOBALS(func);
  	PyObject *argdefs = PyFunction_GET_DEFAULTS(func);
  	PyObject **d = NULL;
  	int nd = 0;
  
+ 	PCALL(PCALL_FUNCTION);
+ 	PCALL(PCALL_FAST_FUNCTION);
+ 	if (argdefs == NULL && co->co_argcount == n && 
+ 	    co->co_flags == (CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE)) {
+ 		PyFrameObject *f;
+ 		PyObject *retval = NULL;
+ 		PyThreadState *tstate = PyThreadState_GET();
+ 		PyObject **fastlocals, **stack;
+ 		int i;
+ 
+ 		PCALL(PCALL_FASTER_FUNCTION);
+ 		assert(globals != NULL);
+ 		/* XXX Perhaps we should create a specialized
+ 		   PyFrame_New() that doesn't take locals, but does
+ 		   take builtins without sanity checking them.
+ 		*/
+ 		f = PyFrame_New(tstate, co, globals, NULL);
+ 		if (f == NULL)
+ 			return NULL;
+ 
+ 		fastlocals = f->f_localsplus;
+ 		stack = (*pp_stack) - n;
+ 
+ 		for (i = 0; i < n; i++) {
+ 			Py_INCREF(*stack);
+ 			fastlocals[i] = *stack++;
+ 		}
+ 		retval = eval_frame(f);
+ 		assert(tstate != NULL);
+ 		++tstate->recursion_depth;
+ 		Py_DECREF(f);
+ 		--tstate->recursion_depth;
+ 		return retval;
+ 	}
  	if (argdefs != NULL) {
  		d = &PyTuple_GET_ITEM(argdefs, 0);
  		nd = ((PyTupleObject *)argdefs)->ob_size;
  	}
! 	return PyEval_EvalCodeEx(co, globals,
! 				 (PyObject *)NULL, (*pp_stack)-n, na,
! 				 (*pp_stack)-2*nk, nk, d, nd,
! 				 PyFunction_GET_CLOSURE(func));
  }
  
***************
*** 3372,3375 ****
--- 3474,3491 ----
  	if (callargs == NULL)
  		goto call_fail;
+ #ifdef CALL_PROFILE
+ 	/* At this point, we have to look at the type of func to
+ 	   update the call stats properly.  Do it here so as to avoid
+ 	   exposing the call stats machinery outside ceval.c
+ 	*/
+ 	if (PyFunction_Check(func))
+ 		PCALL(PCALL_FUNCTION);
+ 	else if (PyMethod_Check(func))
+ 		PCALL(PCALL_METHOD);
+ 	else if (PyType_Check(func))
+ 		PCALL(PCALL_TYPE);
+ 	else
+ 		PCALL(PCALL_OTHER);
+ #endif
  	result = PyObject_Call(func, callargs, kwdict);
   call_fail:
***************
*** 3427,3430 ****
--- 3543,3560 ----
  	if (callargs == NULL)
  		goto ext_call_fail;
+ #ifdef CALL_PROFILE
+ 	/* At this point, we have to look at the type of func to
+ 	   update the call stats properly.  Do it here so as to avoid
+ 	   exposing the call stats machinery outside ceval.c
+ 	*/
+ 	if (PyFunction_Check(func))
+ 		PCALL(PCALL_FUNCTION);
+ 	else if (PyMethod_Check(func))
+ 		PCALL(PCALL_METHOD);
+ 	else if (PyType_Check(func))
+ 		PCALL(PCALL_TYPE);
+ 	else
+ 		PCALL(PCALL_OTHER);
+ #endif
  	result = PyObject_Call(func, callargs, kwdict);
        ext_call_fail:

Index: compile.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Python/compile.c,v
retrieving revision 2.272
retrieving revision 2.273
diff -C2 -d -r2.272 -r2.273
*** compile.c	16 Jan 2003 15:39:07 -0000	2.272
--- compile.c	5 Feb 2003 23:13:00 -0000	2.273
***************
*** 386,389 ****
--- 386,392 ----
  		Py_INCREF(lnotab);
  		co->co_lnotab = lnotab;
+ 		if (PyTuple_GET_SIZE(freevars) == 0 &&
+ 		    PyTuple_GET_SIZE(cellvars) == 0)
+ 		    co->co_flags |= CO_NOFREE;
  	}
  	return co;

Index: sysmodule.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Python/sysmodule.c,v
retrieving revision 2.112
retrieving revision 2.113
diff -C2 -d -r2.112 -r2.113
*** sysmodule.c	8 Jan 2003 14:33:48 -0000	2.112
--- sysmodule.c	5 Feb 2003 23:13:00 -0000	2.113
***************
*** 563,566 ****
--- 563,588 ----
  }
  
+ PyDoc_STRVAR(callstats_doc,
+ "callstats() -> tuple of integers\n\
+ \n\
+ Return a tuple of function call statistics, if CALL_PROFILE was defined\n\
+ when Python was built.  Otherwise, return None.\n\
+ \n\
+ When enabled, this function returns detailed, implementation-specific\n\
+ details about the number of function calls executed. The return value is\n\
+ a 11-tuple where the entries in the tuple are counts of:\n\
+ 0. all function calls\n\
+ 1. calls to PyFunction_Type objects\n\
+ 2. PyFunction calls that do not create an argument tuple\n\
+ 3. PyFunction calls that do not create an argument tuple\n\
+    and bypass PyEval_EvalCodeEx()\n\
+ 4. PyMethod calls\n\
+ 5. PyMethod calls on bound methods\n\
+ 6. PyType calls\n\
+ 7. PyCFunction calls\n\
+ 8. generator calls\n\
+ 9. All other calls\n\
+ 10. Number of stack pops performed by call_function()"
+ );
  
  #ifdef Py_TRACE_REFS
***************
*** 576,579 ****
--- 598,603 ----
  static PyMethodDef sys_methods[] = {
  	/* Might as well keep this in alphabetic order */
+ 	{"callstats", (PyCFunction)PyEval_GetCallStats, METH_NOARGS, 
+ 	 callstats_doc},
  	{"displayhook",	sys_displayhook, METH_O, displayhook_doc},
  	{"exc_info",	(PyCFunction)sys_exc_info, METH_NOARGS, exc_info_doc},
***************
*** 581,586 ****
  	{"exit",	sys_exit, METH_VARARGS, exit_doc},
  #ifdef Py_USING_UNICODE
! 	{"getdefaultencoding", (PyCFunction)sys_getdefaultencoding, METH_NOARGS,
! 	 getdefaultencoding_doc}, 
  #endif
  #ifdef HAVE_DLOPEN
--- 605,610 ----
  	{"exit",	sys_exit, METH_VARARGS, exit_doc},
  #ifdef Py_USING_UNICODE
! 	{"getdefaultencoding", (PyCFunction)sys_getdefaultencoding, 
! 	 METH_NOARGS, getdefaultencoding_doc}, 
  #endif
  #ifdef HAVE_DLOPEN