xreadlines (was Re: while true: !!!)
Jeff Epler
jepler at inetnebr.com
Wed Dec 13 18:57:45 EST 2000
It seems that the most common place that people write the 'while 1:
... if test: break' idiom is when iterating over lines from a file
where the file is not known to fit in memory.
In Python, there is an existing precedent for creating a special 'iterable'
object where the entire object would not fit in memory: the 'xrange' function
and its associated object type.
I take this cue to suggest the idea of an 'xreadlines' object. I have
implemented 'xreadlines' in Python (where it was slower than all other
solutions, probably due to method call overhead) and in C (where it
holds its own with the fastest idiom I know,
# Read the file in bounded chunks of lines; an empty chunk means end of file.
file = open(testfile)
while 1:
    lines = file.readlines(1024)
    if not lines: break
    for line in lines:
        pass
)
Attached is a copy of xreadlines.c and test.py (the latter contains the
XReadlines class). test.py benchmarks 5 methods of reading the file's
contents. Pick a long text file on your system, and run the test to find the
relative speeds of each method.
On a 433 MHz machine on the 45k-line, 400kbyte file /usr/dict/words:
% ./python test.py
0.390071988106
1.04547798634
0.354212999344
0.365339994431
1.616106987
Jeff Epler
jepler at inetnebr.com
# ---- cut here for test.py
import xreadlines, time
class XReadlines:
    """Pure-Python lazy sequence of the lines of a file object.

    Lines are fetched from the file in chunks via ``readlines(sizehint)``
    and handed out one at a time through ``__getitem__``, so a ``for`` loop
    over an instance visits every line without loading the whole file.
    """

    def __init__(self, file):
        self.lines = []    # chunk of lines most recently read from the file
        self.file = file   # object providing readlines(sizehint)
        self.index = 0     # position in self.lines of the next line to return

    def __getitem__(self, i):
        """Return the next line; raise IndexError once the file is exhausted.

        The requested index ``i` is ignored: each call simply returns the
        line following the one returned by the previous call.
        """
        if self.index >= len(self.lines):
            # Current chunk is used up: pull the next one from the file.
            self.lines = self.file.readlines(512 * 1024)
            self.index = 0
            if not self.lines:
                # An empty chunk means end of file; IndexError is what
                # terminates a for-loop driven by __getitem__.
                raise IndexError("end of file")
        ret = self.lines[self.index]
        self.index = self.index + 1
        return ret
def timer(func, *args, **kw):
    """Call ``func(*args, **kw)`` once and print the elapsed wall-clock seconds.

    The return value of ``func`` is discarded.
    """
    t0 = time.time()
    func(*args, **kw)
    t1 = time.time()
    # Parenthesised form prints the same text under the print statement
    # and the print function.
    print(t1-t0)
# Path of the text file that every benchmark function below reads.
testfile = "/usr/dict/words"
def test1():
    """Benchmark: read the whole file with readlines() and loop over the list."""
    for line in open(testfile).readlines():
        pass
def test2():
    """Benchmark: read one line at a time with readline() until it returns ""."""
    file = open(testfile)
    while 1:
        if file.readline() == "": break
def test3():
    """Benchmark: read chunks of lines with readlines(1024) and loop over each."""
    file = open(testfile)
    while 1:
        lines = file.readlines(1024)
        if not lines: break
        for line in lines:
            pass
def test4():
    """Benchmark: loop over the object returned by the C xreadlines module."""
    for line in xreadlines.xreadlines(open(testfile)):
        pass
def test5():
    """Benchmark: loop over the pure-Python XReadlines wrapper."""
    for line in XReadlines(open(testfile)):
        pass
# Time each reading strategy once, in the same order as they are defined.
for benchmark in (test1, test2, test3, test4, test5):
    timer(benchmark)
# ---- test.py ends here
/* cut here for xreadlinesmodule.c */
#include "Python.h"
/*
 * Instance layout of an xreadlines object: a wrapped file-like object plus
 * the chunk of lines most recently obtained from its readlines() method.
 */
typedef struct {
PyObject_HEAD
PyObject *file;   /* object whose "readlines" method is called; owned reference */
PyObject *lines;  /* current chunk returned by readlines(), or NULL before the first read */
int lineslen;     /* number of items in `lines` */
int lineno;       /* index within `lines` of the next item to hand out */
} PyXReadlinesObject;
/* Forward declaration; the type object itself is defined further down. */
staticforward PyTypeObject XReadlinesObject_Type;
/*
 * tp_dealloc: release the references held by an xreadlines object and free
 * its memory.
 *
 * newreadlinesobject() below never registers the object with the cycle
 * collector (there is no PyObject_GC_Init call), so there must be no
 * PyObject_GC_Fini here either: unlinking an object that was never linked
 * touches uninitialised collector bookkeeping.  PyObject_Del (the function
 * form, not the raw PyObject_DEL macro) is used so that the pointer is
 * adjusted for the GC header when the type carries Py_TPFLAGS_GC.
 *
 * The trashcan macros are not used: they exist to bound recursion while
 * tearing down deeply nested built-in containers, and this object only
 * holds two plain references.
 */
static void
xreadlines_dealloc(PyXReadlinesObject *op) {
    Py_XDECREF(op->file);
    Py_XDECREF(op->lines);
    PyObject_Del(op);
}

/*
 * Size hint passed to file.readlines() for each chunk.  A smaller value is
 * chosen when int is narrower than 4 bytes so the constant still fits.
 */
#if SIZEOF_INT < 4
#define BIGCHUNK (512 * 32)
#else
#define BIGCHUNK (512 * 1024)
#endif

/*
 * Allocate an xreadlines object wrapping `file`.
 *
 * Takes its own reference to `file`.  Returns a new reference, or NULL with
 * an exception set if allocation fails.  No lines are read until the first
 * item is requested.
 */
static PyXReadlinesObject *
newreadlinesobject(PyObject *file) {
    PyXReadlinesObject *op;

    op = PyObject_NEW(PyXReadlinesObject, &XReadlinesObject_Type);
    if (op == NULL)
        return NULL;
    Py_XINCREF(file);
    op->file = file;
    op->lines = NULL;
    op->lineno = op->lineslen = 0;
    return op;
}

/*
 * Module function xreadlines(file): return a new xreadlines object for
 * `file`.
 *
 * Returns a new reference, or NULL with an exception set on bad arguments
 * or allocation failure.
 */
static PyObject *
xreadlines(PyObject *self, PyObject *args) {
    PyObject *file;

    if (!PyArg_ParseTuple(args, "O:xreadlines", &file))
        return NULL;
    /*
     * newreadlinesobject() already returns a new reference (or NULL), so
     * it is passed straight to the caller.  An additional Py_INCREF here
     * would leak every object created and would dereference NULL when
     * allocation fails.
     */
    return (PyObject *)newreadlinesobject(file);
}
/*
 * sq_item: return the next line from the wrapped file.
 *
 * The index `i` is ignored; successive calls return successive lines.  When
 * the current chunk is exhausted a new one is fetched by calling
 * file.readlines(BIGCHUNK).
 *
 * Returns a new reference to the line, or NULL with an exception set:
 *   - whatever exception readlines() raised,
 *   - TypeError if readlines() did not return a list,
 *   - IndexError once readlines() returns an empty list (end of file),
 *     which is what terminates a for-loop over the object.
 */
static PyObject*
xreadlines_item(PyXReadlinesObject *a, int i) {
    PyObject *ret;

    if (a->lineno >= a->lineslen) {
        /*
         * Detach the old chunk before dropping it and before calling
         * out, so the object never holds a dangling pointer and stays in
         * a consistent "empty" state if anything below fails.
         */
        PyObject *old = a->lines;
        a->lines = NULL;
        a->lineno = a->lineslen = 0;
        Py_XDECREF(old);

        a->lines = PyObject_CallMethod(a->file, "readlines", "(i)", BIGCHUNK);
        if (a->lines == NULL)
            return NULL;        /* propagate the exception from readlines() */
        if (!PyList_Check(a->lines)) {
            PyErr_SetString(PyExc_TypeError,
                            "readlines() must return a list");
            return NULL;
        }
        a->lineslen = PyList_Size(a->lines);
        if (a->lineslen == 0) {
            PyErr_SetString(PyExc_IndexError, "end of file");
            return NULL;
        }
    }
    /* PyList_GetItem returns a borrowed reference; the caller needs its own. */
    ret = PyList_GetItem(a->lines, a->lineno++);
    Py_XINCREF(ret);
    return ret;
}
/*
 * Sequence protocol table for xreadlines objects.  Only the item slot is
 * filled in, pointing at xreadlines_item; the remaining slots are left 0.
 */
static PySequenceMethods xreadlines_as_sequence = {
0, /*sq_length*/
0, /*sq_concat*/
0, /*sq_repeat*/
(intargfunc)xreadlines_item, /*sq_item*/
};
/*
 * Type object for xreadlines instances.  It supplies a destructor and the
 * sequence-protocol table; every other slot listed here is left 0.
 *
 * NOTE(review): the type reserves PyGC_HEAD_SIZE and sets Py_TPFLAGS_GC,
 * but no traverse slot is filled in within this initializer -- confirm
 * whether cycle-collector participation is actually intended.
 */
static PyTypeObject XReadlinesObject_Type = {
PyObject_HEAD_INIT(&PyType_Type)
0, /*ob_size*/
"xreadlines", /*tp_name*/
sizeof(PyXReadlinesObject) + PyGC_HEAD_SIZE, /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)xreadlines_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
&xreadlines_as_sequence, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash*/
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_GC, /*tp_flags*/
0, /* tp_doc */
};
/* Module-level function table: exposes xreadlines() taking positional args. */
static PyMethodDef xreadlines_methods[] = {
{"xreadlines", xreadlines, METH_VARARGS},
{NULL, NULL} /* sentinel terminating the table */
};
/*
 * Module initialisation entry point: creates the "xreadlines" module and
 * registers the functions listed in xreadlines_methods.  The module object
 * returned by Py_InitModule is not needed afterwards, so it is not stored.
 */
void
initxreadlines(void)
{
    Py_InitModule("xreadlines", xreadlines_methods);
}
/* xreadlinesmodule.c ends here */
More information about the Python-list
mailing list