From montanaro at users.sourceforge.net Sun Sep 10 00:18:31 2006 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sat, 09 Sep 2006 15:18:31 -0700 Subject: [Spambayes-checkins] spambayes/spambayes ImageStripper.py,1.4,1.5 Message-ID: <20060909221833.594001E4007@bag.python.org> Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv30280 Modified Files: ImageStripper.py Log Message: Add crude support for multi-frame GIFs to PIL_decode_parts(). I made a few assumptions: 1. NetPBM support will eventually be ripped out. Everyone should be able to install PIL. Consequently, no attempt to update the NetPBM code was made. 2. The image with the fewest background pixels is probably the one containing the text. GIF image frames can be just part of the overall image, so this assumption will be violated in the future. For the time being it appears most spammers have a hard time setting frame duration properly (are they trying to induce epileptic seizures or sell stocks?), let alone carving up frames into pieces. We'll cross that bridge when we come to it. 3. If an image's info dict doesn't have a "duration" key it's assumed to be a single-frame image. Index: ImageStripper.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/ImageStripper.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** ImageStripper.py 14 Aug 2006 02:58:11 -0000 1.4 --- ImageStripper.py 9 Sep 2006 22:18:28 -0000 1.5 *************** *** 22,26 **** try: ! from PIL import Image except ImportError: Image = None --- 22,26 ---- try: ! from PIL import Image, ImageSequence except ImportError: Image = None *************** *** 189,192 **** --- 189,219 ---- continue else: + # Spammers are now using GIF image sequences. 
From examining a + # minuscule set of multi-frame GIFs it appears the frame with + # the fewest number of background pixels is the one with the + # text content. + + if "duration" in image.info: + # Big assumption? I don't know. If the image's info dict + # has a duration key assume it's a multi-frame image. This + # should save some needless construction of pixel + # histograms for single-frame images. + bgpix = 1e17 # ridiculously large number of pixels + try: + for frame in ImageSequence.Iterator(image): + # Assume the pixel with the largest value is the + # background. + bg = max(frame.histogram()) + if bg < bgpix: + image = frame + bgpix = bg + # I've empirically determined: + # * ValueError => GIF image isn't multi-frame. + # * IOError => Decoding error + except IOError: + tokens.add("invalid-image:%s" % part.get_content_type()) + continue + except ValueError: + pass image = image.convert("RGB") From montanaro at users.sourceforge.net Sun Sep 10 00:59:40 2006 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sat, 09 Sep 2006 15:59:40 -0700 Subject: [Spambayes-checkins] spambayes/spambayes ImageStripper.py,1.5,1.6 Message-ID: <20060909225943.984D41E4007@bag.python.org> Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv13966 Modified Files: ImageStripper.py Log Message: I don't want to support netpbm stuff, and it's highly unlikely anyone else will either, so be a man about it and rip it out now. 
Index: ImageStripper.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/ImageStripper.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** ImageStripper.py 9 Sep 2006 22:18:28 -0000 1.5 --- ImageStripper.py 9 Sep 2006 22:59:38 -0000 1.6 *************** *** 58,86 **** return "" - def find_decoders(): - # check for filters to convert to netpbm - for decode_jpeg in ["jpegtopnm", "djpeg"]: - if find_program(decode_jpeg): - break - else: - decode_jpeg = None - for decode_png in ["pngtopnm"]: - if find_program(decode_png): - break - else: - decode_png = None - for decode_gif in ["giftopnm"]: - if find_program(decode_gif): - break - else: - decode_gif = None - - decoders = { - "image/jpeg": decode_jpeg, - "image/gif": decode_gif, - "image/png": decode_png, - } - return decoders - def imconcatlr(left, right): """Concatenate two images left to right.""" --- 58,61 ---- *************** *** 101,167 **** return result - def pnmsize(pnmfile): - """Return dimensions of a PNM file.""" - f = open(pnmfile) - line1 = f.readline() - line2 = f.readline() - w, h = [int(n) for n in line2.split()] - return w, h - - def NetPBM_decode_parts(parts, decoders): - """Decode and assemble a bunch of images using NetPBM tools.""" - rows = [] - tokens = Set() - for part in parts: - decoder = decoders.get(part.get_content_type()) - if decoder is None: - continue - try: - bytes = part.get_payload(decode=True) - except: - tokens.add("invalid-image:%s" % part.get_content_type()) - continue - - if len(bytes) > options["Tokenizer", "max_image_size"]: - tokens.add("image:big") - continue # assume it's just a picture for now - - fd, imgfile = tempfile.mkstemp() - os.write(fd, bytes) - os.close(fd) - - fd, pnmfile = tempfile.mkstemp() - os.close(fd) - os.system("%s <%s >%s 2>dev.null" % (decoder, imgfile, pnmfile)) - w, h = pnmsize(pnmfile) - if not rows: - # first image - rows.append([pnmfile]) - elif 
pnmsize(rows[-1][-1])[1] != h: - # new image, different height => start new row - rows.append([pnmfile]) - else: - # new image, same height => extend current row - rows[-1].append(pnmfile) - - for (i, row) in enumerate(rows): - if len(row) > 1: - fd, pnmfile = tempfile.mkstemp() - os.close(fd) - os.system("pnmcat -lr %s > %s 2>/dev/null" % - (" ".join(row), pnmfile)) - for f in row: - os.unlink(f) - rows[i] = pnmfile - else: - rows[i] = row[0] - - fd, pnmfile = tempfile.mkstemp() - os.close(fd) - os.system("pnmcat -tb %s > %s 2>/dev/null" % (" ".join(rows), pnmfile)) - for f in rows: - os.unlink(f) - return [pnmfile], tokens - def PIL_decode_parts(parts): """Decode and assemble a bunch of images using PIL.""" --- 76,79 ---- *************** *** 298,304 **** pnmfiles, tokens = PIL_decode_parts(parts) else: ! if not find_program("pnmcat"): ! return "", Set() ! pnmfiles, tokens = NetPBM_decode_parts(parts, find_decoders()) if pnmfiles: --- 210,214 ---- pnmfiles, tokens = PIL_decode_parts(parts) else: ! return "", Set() if pnmfiles: From montanaro at users.sourceforge.net Sun Sep 10 01:02:09 2006 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sat, 09 Sep 2006 16:02:09 -0700 Subject: [Spambayes-checkins] spambayes CHANGELOG.txt,1.57,1.58 Message-ID: <20060909230210.D688C1E4007@bag.python.org> Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv15200 Modified Files: CHANGELOG.txt Log Message: note latest changes Index: CHANGELOG.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/CHANGELOG.txt,v retrieving revision 1.57 retrieving revision 1.58 diff -C2 -d -r1.57 -r1.58 *** CHANGELOG.txt 19 Aug 2006 00:26:38 -0000 1.57 --- CHANGELOG.txt 9 Sep 2006 23:02:06 -0000 1.58 *************** *** 1,4 **** --- 1,9 ---- [Note that all dates are in ISO 8601 format, e.g. 
YYYY-MM-DD to ease sorting] + Release 1.1a4 + + Skip Montanaro 2006-09-09 First crack at handling image sequences + Skip Montanaro 2006-09-09 Dump NetPBM decode support in favor of PIL + Release 1.1a3 ============= From montanaro at users.sourceforge.net Sun Sep 10 01:02:22 2006 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sat, 09 Sep 2006 16:02:22 -0700 Subject: [Spambayes-checkins] spambayes/spambayes Options.py,1.138,1.139 Message-ID: <20060909230223.E4B741E4007@bag.python.org> Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv15255 Modified Files: Options.py Log Message: netpbm -> pil Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v retrieving revision 1.138 retrieving revision 1.139 diff -C2 -d -r1.138 -r1.139 *** Options.py 14 Aug 2006 02:58:11 -0000 1.138 --- Options.py 9 Sep 2006 23:02:20 -0000 1.139 *************** *** 129,133 **** (hopefully) text content contained in any images in each message. The current support is minimal, relies on the installation of ! ocrad (http://www.gnu.org/software/ocrad/ocrad.html) and netpbm. It is almost certainly only useful in its current form on Unix-like machines."""), --- 129,133 ---- (hopefully) text content contained in any images in each message. The current support is minimal, relies on the installation of ! ocrad (http://www.gnu.org/software/ocrad/ocrad.html) and PIL. It is almost certainly only useful in its current form on Unix-like machines."""),