Implementing file reading in C/Python
MRAB
google at mrabarnett.plus.com
Thu Jan 8 22:05:19 EST 2009
Johannes Bauer wrote:
> Hello group,
>
> I've come from C/C++ and am now trying to code some Python because I
> absolutely love the language. However, I still have trouble getting
> Python code to run efficiently. Right now I have an easy task: Get a
> file, split it up into a million chunks, count the most prominent
> character in each chunk and output that value into a file - in other
> words: Say we have a 2 GB file, we evaluate what character is most
> prominent in filepos [0, 2048[ - say it's an "A", then put a 65 in there
> (ord("A")).
>
> I've first tried Python. Please don't beat me, it's slow as hell and
> probably a horrible solution:
>
> #!/usr/bin/python
> import sys
> import os
>
> f = open(sys.argv[1], "r")
> filesize = os.stat(sys.argv[1])[6]
>
> width = 1024
> height = 1024
> pixels = width * height
> blocksize = filesize / width / height
>
> print("Filesize : %d" % (filesize))
> print("Image size : %dx%d" % (width, height))
> print("Bytes per Pixel: %d" % (blocksize))
>
> picture = { }
> havepixels = 0
> while True:
> data = f.read(blocksize)
> if len(data) <= 0: break
>
> datamap = { }
> for i in range(len(data)):
> datamap[ord(data[i])] = datamap.get(data[i], 0) + 1
>
> maxchr = None
> maxcnt = None
> for (char, count) in datamap.items():
> if (maxcnt is None) or (count > maxcnt):
> maxcnt = count
> maxchr = char
>
> most = maxchr
>
> posx = havepixels % width
> posy = havepixels / width
>
> havepixels += 1
> if (havepixels % 1024) == 0:
> print("Progresss %s: %.1f%%" % (sys.argv[1], 100.0 * havepixels / pixels))
>
> picture[(posx, posy)] = most
>
> pic = open(sys.argv[1] + ".pgm", "w")
> pic.write("P2\n")
> pic.write("# CREATOR: Crappyass Python Script\n")
> pic.write("%d %d\n" % (width, height))
> pic.write("255\n")
> for y in range(height):
> for x in range(width):
> pos = (x, y)
> most = picture.get(pos, -1)
> pic.write("%d\n" % (most))
>
> As this was horribly slow (20 Minutes for a 2GB file) I coded the whole
> thing in C also:
>
> #include <stdio.h>
> #include <errno.h>
> #include <string.h>
> #include <stdlib.h>
>
> #define BLOCKSIZE 2048
>
> int main(int argc, char **argv) {
> unsigned int count[256];
> int width, height;
> FILE *f;
> FILE *in;
> width = 1024;
> height = 1024;
> char temp[2048];
>
> if (argc != 2) { fprintf(stderr, "Argument?\n"); exit(2); }
>
> in = fopen(argv[1], "r");
> if (!in) { perror("fopen"); exit(1); }
>
> snprintf(temp, 255, "%s.pgm", argv[1]);
> f = fopen(temp, "w");
> if (!f) { perror("fopen"); exit(1); }
>
> fprintf(f, "P2\n");
> fprintf(f, "# CREATOR: C\n");
> fprintf(f, "%d %d\n", width, height);
> fprintf(f, "255\n");
>
> width = 1024;
> height = 1024;
> while (fread(temp, 1, sizeof(temp), in) == sizeof(temp)) {
> int i;
> memset(count, 0, sizeof(count));
> for (i = 0; i < sizeof(temp); i++) {
> count[(int)temp[i]]++;
> }
>
> int greatest;
> int maxcount;
>
> greatest = 0;
> maxcount = count[0];
> for (i = 1; i < 256; i++) {
> if (count[i] > maxcount) {
> maxcount = count[i];
> greatest = i;
> }
> }
>
> fprintf(f, "%d\n", greatest);
> }
>
> fclose(f);
> fclose(in);
> return 0;
> }
>
> Which takes about 40 seconds. I want the niceness of Python but a little
> more speed than I'm getting (I'd settle for factor 2 or 3 slower, but
> factor 30 is just too much).
>
> Can anyone point out how to solve this efficiently in Python?
>
Have a look at psyco: http://psyco.sourceforge.net/
More information about the Python-list
mailing list