Problem with uudecode

Juho Saarikko sorry at but.no.spam
Tue May 25 12:42:17 EDT 2004


I made a Python script which takes Usenet message bodies from a database,
decodes uuencoded contents and inserts them as Large Object into a
PostGreSQL database. However, it appears that the to last few bytes
of uudecoded data are always mangled. Take a look of this hexdump output:

Originals (decoded with Pan, each line is from a different file):
000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
0011a10 ff54 00d9
00093e0 fb4f a80d ffd9 c200 ffef 00d9

Decoded by the script:
000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
0011a10 ff54 00d8
00093e0 fb4f a80d ffd9 c200 ffef 00d8

As you can see, one of the last two bytes gets altered in all cases.

The script also outputs the decoded file to disk for debugging purposes,
and the database large object and filesystem file match so it can't be a
PostGreSQL problem.

So, if anyone has any idea what is wrong, please tell me ? I can't found
any reason why the bytes would get mangled...

The script follows:

#!/usr/local/bin/python2.3

# Insert message contents into the database, for each message-id already there
#
# Copyright 2004 by Juho Saarikko
# License: GNU General Public License (GPL) version 2
# See www.gnu.org for details

from pyPgSQL import libpq
import nntplib
import sys
import string
import regex
import sha
import imghdr
import binascii
import StringIO
import os

def strip_trailing_dots(n):
  tmp = []
  for i in range(len(n)):
    if n[i][-1] == "," or n[i][-1] == ".":
      tmp.append(n[i][:-1])
    else:
      tmp.append(n[i])
  return tmp

def findmimetype(body, filename):
  tail4 = string.lower(filename[-5:])
  tail3 = string.lower(filename[-4:])
  if tail4 == ".jpeg":
    return "image/jpeg"
  if tail3 == ".jpg":
    return "image/jpeg"
  if tail3 == ".png":
    return "image/png"
  if tail3 == ".jpe":
    return "image/jpeg"
  if tail3 == ".gif":
    return "image/gif"
  return None

def insert_picture(conn, image, filename):
  hash = sha.new(image)
  qhash = libpq.PgQuoteBytea(hash.digest())
  candidates = conn.query("SELECT id, picture FROM pictures WHERE hash = " + qhash )
  if candidates.ntuples > 0:
    print "Found possible mathces " + str(candidates.ntuples)
    for x in range(candidates.ntuples):
      old = candidates.getvalue(x, 1)
      old.open("r")
      oldpic = old.read()
      old.close()
      if oldpic == image:
        print "Found a match"
        ret = (candidates.getvalue(x,0), 1)
        return ret
  mime = findmimetype(image, filename)
  print "attempting to get mimetype"
  if mime == None:
    print "No mimetype found"
    ret = (0, 0)
    return ret
  mime = libpq.PgQuoteString(mime)
  mimeres = conn.query("SELECT id FROM mimetypes WHERE mimetype = " + mime)
  if mimeres.ntuples == 0:
    conn.query("INSERT INTO mimetypes (mimetype) VALUES (" + mime + ")")
    mimeres = conn.query("SELECT id FROM mimetypes WHERE mimetype = " + mime)
  mimetype = mimeres.getvalue(0,0)
  picture = conn.lo_creat("rw")
  picture.open("rw")
  picture.write(image)
  picture.close()
  tmp = conn.query("INSERT INTO pictures (hash, mimetype, picture) VALUES (" + qhash + ", " +str(mimetype) + ", " + picture.name + ")")
  temp = conn.query("SELECT id FROM pictures WHERE OID = " + str(tmp.oidValue))
  id = temp.getvalue(0,0)
  ret = (id, 0)
  return ret

def try_decode_and_insert_uuencoded(conn, id):
  begin = regex.compile("begin [0-9]+ \(.*\)")
  conn.query("BEGIN")
  basedir = "kuvat"
  message = conn.query("SELECT data FROM fragments_bodies WHERE message = " + str(id) + " ORDER BY line")
#  print message.ntuples
  
  keywords = []
  picids = []
  newpicids = []
  n = 0
  s = ""
  picid = 0
  print 'Starting message id ' + str(id)
  while n < message.ntuples:
#    print "length of row " + str(n)
#    print str(message.getlength(n, 0))
#    print "Got length"
    abcddummy = message.getvalue(n, 0)
#    print "Got value"
    s = message.getvalue(n, 0)
#    print "Got s"
    if begin.match(s) > 0:
#    if match_beginning(s) > 0:
#      print "Begin matched"
      body = []
      file = begin.group(1)
#      file = get_file_name(s)
#      print "Starting to decode, at line " + str(n + 1)
      for k in range(n+1, message.ntuples):
#        print "Decodind row " + str(k)
        s = message.getvalue(k, 0)
        if s[:3] == "end":
          n = k + 1
          break
        try:
          body.append(binascii.a2b_uu(s))
        except:
          try:
            bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
            body.append(binascii.a2b_uu(s[:bytes]))
          except:
            print "Broken attachment in message " + str(id)
            conn.query("ROLLBACK")
            return
#      print "Got to end, at line " + str(n)
#      print "Attempting to join body"
      body = string.join(body, "")
#      print "Attempting to hash body"
#      hash = sha.new(body)
#      qhash = libpq.PgQuoteBytea(hash.digest())
#      qbody = libpq.PgQuoteBytea(body)
#      print "Attempting to find whether the pic already exists"
      print "Mimetype returned " + str(findmimetype(body, file))
#      temporary = open("dummy", "wb")
#      temporary.write(body)
#      temporary.close()
#      dummy.write("dsfds")
      print "Calling insert function"
      picid, exists = insert_picture(conn, body, file)
      print "Returned from insert function with value " + str(picid)
      if picid > 0:
#      already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
#      if already.ntuples == 0:
#        print "Attempting to find mimetype"
#        mimetype = findmimetype(body, file)
#        print "Found mimetype"
#        if mimetype != None:
#          o = conn.query("INSERT INTO pictures (picture, hash, mimetype) VALUES (" + qbody + ", " + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
#          already = conn.query("SELECT id FROM pictures WHERE OID = " + str(o.oidValue()));
#          already = conn.query("SELECT id FROM pictures WHERE data = " + qbody)
#           already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
#           print "Attempting to insert hash and mimetype"
#           conn.query("INSERT INTO pictures (hash, mimetype) VALUES (" + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
#           print "Attempting to get id"
#           already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
#           print "Attempting to get value"
#           picid = already.getvalue(0, 0)           
           print picid
           print "Attempting to OK dir"
           if os.access(basedir + "/tmp", os.F_OK) != 1:
             os.mkdir(basedir + "/tmp")
           fh = open(basedir + "/tmp/" + str(picid), "wb")
           fh.write(body)
           fh.close()
           print "File ok"
           picids.append(picid)
           if exists == 0:
             newpicids.append(picid)
           if file != "":
              keywords.append(file)
#      else:
#        picid = already.getvalue(0, 0)
#        if already.ntuples == 0:
#          conn.query("ROLLBACK")
#          return
#        picids.append(picid)
#      if already.ntuples == 0:
#        print "already.ntuples == 0, ROLLBACKing"
#        conn.query("ROLLBACK")
#        return
#      print "Appending picid"
#      picids.append(picid)
#     print "Picid appended"
    else:
      tmptmp = string.split(s)
      tmpkey = strip_trailing_dots(tmptmp)
      if len(tmpkey) > 0:
        for j in range(len(tmpkey)):
          keywords.append(tmpkey[j])
#    print "Adding 1 to n"
    n = n + 1
  if len(picids) > 0:
    print "Found " + str(len(picids)) + " pictures (" + str(len(newpicids)) + " new ones)"
#    print "Finding Subject"
    head = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ilike 'Subject')")
    if head.ntuples > 0:
#      print "Splitting Subject"
      blah = head.getvalue(0,0)
#      print str(blah)
      blahblah = string.split(str(blah))
#      print "Stripping"
      abctmpkey = strip_trailing_dots(blahblah)
#      print "Stripping done"
#      print "Really"
      tmpkey = abctmpkey
#      print "Subject split"
      if len(tmpkey) > 0:
        for j in range(len(tmpkey)):
          keywords.append(tmpkey[j])
    o = conn.query("INSERT INTO messages DEFAULT VALUES")
    mid = conn.query("SELECT id FROM messages WHERE OID = " + str(o.oidValue))
    messageid = mid.getvalue(0, 0)
    nresult = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ILIKE 'Newsgroups')")
    if nresult.ntuples > 0:
      for x in range(nresult.ntuples):
        newsgroups = string.split(nresult.getvalue(x, 0), ",")
        if len(newsgroups) > 0:
          for y in range (len(newsgroups)):
            newsgroup = libpq.PgQuoteString(newsgroups[y])
            ngroupres = conn.query("SELECT id FROM newsgroups WHERE name = " + newsgroup)
            if ngroupres.ntuples > 0:
              newsgid = ngroupres.getvalue(0, 0)
            else:
              conn.query("INSERT INTO newsgroups (name) VALUES (" + newsgroup + ")")
              ngrtmpres = conn.query("SELECT id FROM newsgroups WHERE name = " + newsgroup)
              newsgid = ngrtmpres.getvalue(0, 0)
            conn.query("INSERT INTO messages_ngroups_glue (message, newsgroup) VALUES (" + str(messageid) + ", " + str(newsgid) + ")")
        else:
          print "An empty Newsgroups: header at messag " + str(id)
          conn.query("ROLLBACK")
          return
    else:
      print "No Newsgroups: header at message " + str(id)
      conn.query("ROLLBACK")
      return 
    for x in range(len(picids)):
      conn.query("INSERT INTO messages_pictures_glue (message, picture) VALUES (" + str(messageid) + ", " + str(picids[x]) + ")")
    if len(keywords) > 0:
      for x in range(len(tmpkey)):
        qword = libpq.PgQuoteString(str(keywords[x]))
        tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
        if tmp.ntuples == 0:
          conn.query("INSERT INTO keywords_words (keyword) VALUES (" + qword + ")")
          tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
        keyid = str(tmp.getvalue(0, 0))
        for y in range(len(picids)):
          conn.query("INSERT INTO keywords_glue(word, picture) VALUES (" + keyid + ", " + str(picids[y]) + ")")
    dummyone = "SELECT fragments_header_contents.line, fragments_header_names.header,"
    dummytwo = " fragments_header_contents.contents FROM fragments_header_names, fragments_header_contents"
    dummythree = " WHERE fragments_header_contents.message = " + str(id)
    dummyfour = " AND fragments_header_contents.header = fragments_header_names.id"
    head = conn.query(dummyone + dummytwo + dummythree + dummyfour)
    if head.ntuples > 0:
      for h in range(head.ntuples):
        qhead = libpq.PgQuoteString(str(head.getvalue(h, 1)))
        qcont = libpq.PgQuoteString(str(head.getvalue(h, 2)))
        tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
        if tmp.ntuples == 0:
          conn.query("INSERT INTO header_names (header) VALUES (" + qhead + ")")
          tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
        headid = str(tmp.getvalue(0, 0))
        line = str(head.getvalue(0, 0))
        conn.query("INSERT INTO header_contents (header, message, line, contents) VALUES (" + headid + ", " + str(messageid) + ", " + line + ", " + qcont + ")")
    conn.query("DELETE FROM fragments_header_contents WHERE message = " + str(id))
    conn.query("DELETE FROM fragments_bodies WHERE message = " + str(id))
    conn.query("COMMIT")
    if len(newpicids) > 0:
      tmpdir = basedir + "/tmp/"
      for i in range(len(newpicids)):
        picid = newpicids[i]
        tmppicname = tmpdir + str(picid)
        permpicname = basedir + "/" + str(picid%1000) + "/" + str(picid)
        print tmppicname
        print permpicname
        if os.access(basedir + "/" + str(picid%1000), os.F_OK) != 1:
          os.mkdir(basedir + "/" + str(picid%1000))
        os.link(tmppicname, permpicname)
        os.unlink(tmpdir +str(picid))
  else:
    print "No pictures found"
    conn.query("ROLLBACK")
    return


database = libpq.PQconnectdb('dbname = kuvat')
items = database.query("SELECT message FROM whole_attachments")

# try_decode_and_insert_uuencoded(database, 5407)

for i in range(items.ntuples):
  try:
    print 'Starting call ' + str(i)
    try_decode_and_insert_uuencoded(database, items.getvalue(items.ntuples - 1 - i,0))
    print ' returned from call ' + str(i)
  except:
    print 'Some other error occurred at message " + str(i) + ", trying to continue...'





More information about the Python-list mailing list