converting file formats to txt

BJörn Lindqvist bjourne at gmail.com
Thu Jul 6 06:15:31 EDT 2006


On 4 Jul 2006 08:38:47 -0700, Gaurav Agarwal
<gaurav.agarwal1904 at gmail.com> wrote:
> Thanks Steven. Actually, I wanted to do text processing for my office,
> where I can view all files in the system and use the first three to
> give a summary of the document, instead of having somebody actually
> enter the summary. It seems there is no single piece of code that can
> act as a converter across formats, so I'll have to check out converters
> for individual formats.

I have some old code that does just that. It uses catdoc, pdftotext
and links to convert .doc, .pdf and .html files, respectively, to text.

##################################################################
import mimetypes
from subprocess import call, Popen, PIPE
import sys

class ConversionError(Exception):
    """Base class for the conversion errors defined in this module."""

class UnknownMimeType(ConversionError):
    """A MIME type was detected, but no converter is registered for it."""

class NotAMimeType(ConversionError):
    """No MIME type could be guessed for the document."""

class ParseError(ConversionError):
    """The external converter program exited with a non-zero status."""

def has_program(progname):
    """Return True if the ``which`` utility can locate `progname`.

    Parameters:
        progname -- name of the executable to look for.

    Returns True when ``which progname`` exits with status 0, and False
    otherwise, including when ``which`` itself cannot be started.
    """
    # Local import: the module's top-level imports do not include os.
    import os

    # Send the output of `which` to the null device instead of a PIPE that
    # nobody reads; this also keeps its error messages off the terminal.
    devnull = open(os.devnull, "w")
    try:
        try:
            status = call(["which", progname], stdout=devnull, stderr=devnull)
        except OSError:
            # `which` is not available, so the program cannot be confirmed.
            return False
    finally:
        devnull.close()
    return status == 0

def check_requirements():
    """Check that every external converter program is available.

    Looks up catdoc, pdftotext and links with has_program() and prints the
    names of any that are missing.

    Returns True when all three programs were found, False otherwise.
    """
    required = ("catdoc", "pdftotext", "links")
    missing = [prog for prog in required if not has_program(prog)]
    if missing:
        # A single-argument print(...) is valid on both Python 2 and
        # Python 3 and emits the same text as the old print statement.
        print("You need to have the programs: " + " ".join(missing))
        return False
    return True

# Runs at import time: the process exits with status 1 when any of the
# external converter programs is missing.
if not check_requirements():
    # Single-argument print(...) works on both Python 2 and Python 3.
    print("Needed external programs not found, quitting")
    sys.exit(1)

def get_catdoc_args(infile):
    """Build the catdoc command line used to convert `infile`.

    The ``-s 8859-1`` option is passed through unchanged (presumably the
    source character set; confirm against the catdoc manual).
    """
    command = ["catdoc", "-s", "8859-1"]
    command.append(infile)
    return command

def get_pdftotext_args(infile):
    """Build the pdftotext command line used to convert `infile`.

    The output argument is "-"; totext() reads the converted text from the
    program's standard output.
    """
    program = "pdftotext"
    output_target = "-"
    return [program, infile, output_target]

def get_links_args(infile):
    """Build the links command line that dumps `infile` as text."""
    return ["links"] + [infile, "-dump"]

def totext(document):
    """Convert a .doc, .pdf or .html document to text with an external tool.

    The MIME type is guessed from the file name by mimetypes.guess_type()
    and selects which converter command line is built and run.

    Parameters:
        document -- path of the file to convert.

    Returns whatever the converter program wrote to its standard output.

    Raises:
        NotAMimeType -- no MIME type could be guessed for `document`.
        UnknownMimeType -- the guessed MIME type has no converter.
        IOError -- the converter failed and `document` cannot be opened.
        ParseError -- the converter exited with a non-zero status.
    """
    filetype_to_args_map = {"application/msword": get_catdoc_args,
                            "application/pdf": get_pdftotext_args,
                            "text/html": get_links_args}

    ftype, _encoding = mimetypes.guess_type(document)
    if not ftype:
        raise NotAMimeType("Couldn't detect mimetype for %s" % document)
    try:
        argfunc = filetype_to_args_map[ftype]
    except KeyError:
        raise UnknownMimeType("Don't know how to handle %s documents" % ftype)

    p = Popen(argfunc(document), stdout=PIPE, stderr=PIPE)
    # communicate() drains stdout and stderr together. Reading only stdout
    # while stderr is also a pipe can block forever once the converter has
    # written enough error output to fill the stderr pipe.
    text, _errors = p.communicate()
    if p.returncode:
        # Force a better exception to be thrown if the file doesn't exist.
        # The handle is closed right away so it is not leaked.
        open(document).close()
        raise ParseError("Failed to parse %s" % document)
    return text

if __name__ == "__main__":
    # Demo: convert testpdf.pdf and print the extracted text. The
    # single-argument print(...) form works on both Python 2 and Python 3.
    print(totext("testpdf.pdf"))



-- 
mvh Björn



More information about the Python-list mailing list