xreadlines (was Re: while true: !!!)

Jeff Epler jepler at inetnebr.com
Wed Dec 13 18:57:45 EST 2000


It seems that the most common place that people write the 'while 1:
... if test: break' idiom is when iterating over lines from a file
where the file is not known to fit in memory.

In Python, there is an existing precedent for creating a special 'iterable'
object where the entire object would not fit in memory: the 'xrange' function
and its associated object type.

I take this cue to suggest the idea of an 'xreadlines' object.  I have
implemented 'xreadlines' in Python (where it was slower than all other
solutions, probably due to method call overhead) and in C (where it
holds its own with the fastest idiom I know,
        file = open(testfile)
        while 1:
                lines = file.readlines(1024)
                if not lines: break
                for line in lines:
                        pass
)

Attached are copies of xreadlinesmodule.c and test.py (the latter contains the
XReadlines class). test.py benchmarks 5 methods of reading the file's
contents.  Pick a long text file on your system, and run the test to find the
relative speeds of each method.

On a 433 MHz machine on the 45k-line, 400kbyte file /usr/dict/words:
% ./python test.py 
0.390071988106
1.04547798634
0.354212999344
0.365339994431
1.616106987

Jeff Epler
jepler at inetnebr.com

# ---- cut here for test.py
import xreadlines, time

class XReadlines:
	"""Pure-Python stand-in for the C xreadlines object.

	Wraps a file-like object and hands out its lines one at a time through
	__getitem__, fetching them from the file in batches with
	readlines(512 * 1024).
	"""

	def __init__(self, file):
		# Nothing is read until the first item is requested.
		self.file = file
		self.lines = []
		self.index = 0

	def __getitem__(self, i):
		"""Return the next line; the index i itself is not consulted.

		When the current batch is used up a new one is read.  An empty batch
		makes the lookup below raise IndexError, which is what terminates a
		for-loop over this object.
		"""
		if not self.index < len(self.lines):
			self.lines = self.file.readlines(512 * 1024)
			self.index = 0
		position = self.index
		line = self.lines[position]
		self.index = position + 1
		return line

def timer(func, *args, **kw):
	"""Call func(*args, **kw) once and print the elapsed time in seconds.

	The elapsed time is the difference between two time.time() readings taken
	immediately before and after the call.  The return value of func is
	discarded; if func raises, the exception propagates and nothing is printed.
	"""
	t0 = time.time()
	func(*args, **kw)
	t1 = time.time()
	# A single parenthesised expression prints identically under the print
	# statement and the print() function, so this line runs on either.
	print(t1 - t0)

# Path of the file that every benchmark below opens and reads.  Substitute any
# long text file available on your system.
testfile = "/usr/dict/words"
def test1():
	"""Benchmark: fetch every line with one readlines() call, then loop over the list."""
	for line in open(testfile).readlines():
		pass

def test2():
	"""Benchmark: call readline() repeatedly until it returns the empty string."""
	file = open(testfile)
	while 1:
		# An empty string from readline() is treated as end of file.
		if file.readline() == "": break

def test3():
	"""Benchmark: read batches with readlines(1024) and loop over each batch."""
	file = open(testfile)
	while 1:
		lines = file.readlines(1024)
		# An empty batch ends the outer loop.
		if not lines: break
		for line in lines:
			pass

def test4():
	"""Benchmark: loop over the object returned by the xreadlines extension module."""
	for line in xreadlines.xreadlines(open(testfile)):
		pass

def test5():
	"""Benchmark: loop over the pure-Python XReadlines class defined in this script."""
	for line in XReadlines(open(testfile)):
		pass

# Time each benchmark once, in order; one elapsed-seconds figure is printed per
# benchmark, so the n-th output line belongs to the n-th function listed here.
for benchmark in (test1, test2, test3, test4, test5):
	timer(benchmark)

# ---- test.py ends here

/* cut here for xreadlinesmodule.c */

#include "Python.h"

/* Instance layout of an xreadlines object: a file plus the current batch of lines. */
typedef struct {
	PyObject_HEAD
	PyObject *file;		/* object whose readlines() method supplies the lines */
	PyObject *lines;	/* most recent readlines() result; NULL until the first fetch */
	int lineslen;		/* PyList_Size() of lines, recorded when the batch is fetched */
	int lineno;		/* index within lines of the next item to hand out */
} PyXReadlinesObject;

/* Forward declaration: the type object is defined below the functions that refer to it. */
staticforward PyTypeObject XReadlinesObject_Type;

/*
 * tp_dealloc slot: drop the references held in op->file and op->lines (either
 * may be NULL) and free the object.  The work is bracketed by the
 * Py_TRASHCAN_SAFE_BEGIN / Py_TRASHCAN_SAFE_END macro pair.
 *
 * NOTE(review): this calls PyObject_GC_Fini(), but newreadlinesobject() in
 * this file makes no GC-init call and the type object fills in no slots past
 * tp_doc -- confirm that GC set-up and tear-down are paired correctly for the
 * Python version being targeted.
 */
static void
xreadlines_dealloc(PyXReadlinesObject *op) {
	Py_TRASHCAN_SAFE_BEGIN(op)
	PyObject_GC_Fini(op);
	Py_XDECREF(op->file);
	Py_XDECREF(op->lines);
	PyObject_DEL(op);
	Py_TRASHCAN_SAFE_END(op)
}

/*
 * Size hint passed to file.readlines() on every refill: 16K when SIZEOF_INT
 * is below 4, 512K otherwise.
 */
#if SIZEOF_INT < 4
#define BIGCHUNK  (512 * 32)
#else
#define BIGCHUNK  (512 * 1024)
#endif

/*
 * Allocate an xreadlines object wrapping `file`.
 *
 * The new object takes its own reference to `file` (a NULL file is
 * tolerated), starts without a cached batch and with both counters at zero.
 * Returns NULL when the allocation fails.
 */
static PyXReadlinesObject *
newreadlinesobject(PyObject *file) {
	PyXReadlinesObject *obj = PyObject_NEW(PyXReadlinesObject, &XReadlinesObject_Type);

	if (!obj)
		return NULL;

	/* No batch yet: the first item request triggers the first readlines(). */
	obj->lineslen = 0;
	obj->lineno = 0;
	obj->lines = NULL;

	Py_XINCREF(file);
	obj->file = file;
	return obj;
}

/*
 * Module-level function xreadlines(file): create an xreadlines object around
 * `file`.
 *
 * Takes exactly one argument of any object type.  Returns a new reference to
 * the xreadlines object, or NULL when argument parsing or allocation fails.
 */
static PyObject *
xreadlines(PyObject *self, PyObject *args) {
	PyObject *file;

	if (!PyArg_ParseTuple(args, "O:xreadlines", &file))
		return NULL;
	/*
	 * The freshly allocated object already carries the single reference that
	 * is handed to the caller.  Adding another one here would keep the object
	 * (and the file it holds) alive forever, and would dereference NULL when
	 * the allocation failed.
	 */
	return (PyObject *)newreadlinesobject(file);
}

/*
 * sq_item slot: return the next line of the wrapped file.
 *
 * The index argument `i` is not consulted; items are handed out strictly in
 * sequence whatever index the caller passes.  When the cached batch is used
 * up, a new one is fetched with file.readlines(BIGCHUNK).  An empty batch
 * makes PyList_GetItem() fail, and the IndexError it sets is what ends a
 * `for` loop over the object.
 *
 * Returns a new reference to the line, or NULL with an exception set.
 */
static PyObject *
xreadlines_item(PyXReadlinesObject *a, int i) {
	PyObject *ret;

	if (a->lineno >= a->lineslen) {
		PyObject *batch;
		PyObject *previous;

		batch = PyObject_CallMethod(a->file, "readlines", "(i)", BIGCHUNK);
		/* Propagate a failed call; the cached state is left untouched. */
		if (batch == NULL)
			return NULL;
		/* The PyList_* calls below are only valid on a real list. */
		if (!PyList_Check(batch)) {
			Py_DECREF(batch);
			PyErr_SetString(PyExc_TypeError,
					"xreadlines: readlines() must return a list");
			return NULL;
		}
		/* Install the new batch before releasing the old one. */
		previous = a->lines;
		a->lines = batch;
		a->lineno = 0;
		a->lineslen = PyList_Size(batch);
		Py_XDECREF(previous);
	}
	/* PyList_GetItem() returns a borrowed reference, or NULL + IndexError. */
	ret = PyList_GetItem(a->lines, a->lineno);
	if (ret == NULL)
		return NULL;
	a->lineno++;
	Py_INCREF(ret);
	return ret;
}

/*
 * Sequence protocol table.  Only sq_item is provided; the other slots listed
 * here are left empty.
 */
static PySequenceMethods xreadlines_as_sequence = {
	0, /*sq_length*/
	0, /*sq_concat*/
	0, /*sq_repeat*/
	(intargfunc)xreadlines_item, /*sq_item*/
};

/*
 * Type object for xreadlines instances.  Apart from the deallocator and the
 * sequence table, every slot listed is empty; slots after tp_doc are not
 * listed and are therefore zero-initialised.
 *
 * NOTE(review): the type sets Py_TPFLAGS_GC and adds PyGC_HEAD_SIZE to the
 * instance size, yet no slot past tp_doc (e.g. a traverse function) is filled
 * in -- confirm this is a valid GC configuration for the targeted Python
 * version.
 */
static PyTypeObject XReadlinesObject_Type = {
	PyObject_HEAD_INIT(&PyType_Type)
	0, /*ob_size*/
	"xreadlines", /*tp_name*/
	sizeof(PyXReadlinesObject) + PyGC_HEAD_SIZE, /*tp_basicsize*/
	0, /*tp_itemsize*/
	(destructor)xreadlines_dealloc, /*tp_dealloc*/
	0, /*tp_print*/
	0, /*tp_getattr*/
	0, /*tp_setattr*/
	0, /*tp_compare*/
	0, /*tp_repr*/
	0, /*tp_as_number*/
	&xreadlines_as_sequence, /*tp_as_sequence*/
	0,		/*tp_as_mapping*/
	0,		/*tp_hash*/
	0,		/*tp_call*/
	0,		/*tp_str*/
	0,		/*tp_getattro*/
	0,		/*tp_setattro*/
	0,		/*tp_as_buffer*/
	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_GC,	/*tp_flags*/
 	0,		/* tp_doc */
};

/*
 * Module function table: exposes xreadlines() with the METH_VARARGS calling
 * convention.  The all-NULL entry terminates the table.
 */
static PyMethodDef xreadlines_methods[] = {
	{"xreadlines", xreadlines, METH_VARARGS},
	{NULL, NULL}
};

/*
 * Module initialisation entry point: registers the "xreadlines" module
 * together with its function table.
 */
void
initxreadlines(void)
{
	/*
	 * Nothing further is added to the module, so the module object returned
	 * by Py_InitModule() is deliberately not kept.
	 */
	(void)Py_InitModule("xreadlines", xreadlines_methods);
}
/* xreadlinesmodule.c ends here */



More information about the Python-list mailing list