converting file formats to txt

Gaurav Agarwal gaurav.agarwal1904 at gmail.com
Mon Jul 10 09:07:44 EDT 2006


Thanks, this was really helpful. I used catdoc, catppt, xls2csv, pdftotext
from xpdf, and ps2txt from ghostview!

BJörn Lindqvist wrote:
> On 4 Jul 2006 08:38:47 -0700, Gaurav Agarwal
> <gaurav.agarwal1904 at gmail.com> wrote:
> > Thanks Steven. Actually, I wanted to do text processing for my office
> > where I can view all files in the system and use the first three to
> > give a summary of the document, instead of having somebody actually
> > entering the summary. It seems there is no single piece of code that can
> > act as a converter across formats, so I'll have to check out converters
> > for individual formats.
>
> I have some old code that does just that. It uses pdftotext, catdoc
> and links to convert .doc, .pdf and .html to text.
>
> ##################################################################
> import mimetypes
> from subprocess import call, Popen, PIPE
> import sys
>
> class ConversionError(Exception):
>     pass
>
> class UnknownMimeType(ConversionError):
>     pass
>
> class NotAMimeType(ConversionError):
>     pass
>
> class ParseError(ConversionError):
>     pass
>
> def has_program(progname):
>     return call(["which", progname], stdout = PIPE) == 0
>
> def check_requirements():
>     missing = []
>     for prog in "catdoc", "pdftotext", "links":
>         if not has_program(prog):
>             missing.append(prog)
>     if missing:
>         print "You need to have the programs:", " ".join(missing)
>         return False
>     return True
>
> # Runs at module load, not only when executed as a script: exit with
> # status 1 if check_requirements() reports a missing program.
> if not check_requirements():
>     print "Needed external programs not found, quitting"
>     sys.exit(1)
>
> def get_catdoc_args(infile):
>     return ["catdoc", "-s", "8859-1", infile]
>
> def get_pdftotext_args(infile):
>     return ["pdftotext", infile, "-"]
>
> def get_links_args(infile):
>     return ["links", infile, "-dump"]
>
> def totext(document):
>     filetype_to_args_map = {"application/msword" : get_catdoc_args,
>                             "application/pdf" : get_pdftotext_args,
>                             "text/html" : get_links_args}
>
>     ftype, ign = mimetypes.guess_type(document)
>     if not ftype:
>         raise NotAMimeType, "Couldn't detect mimetype for %s" % document
>     try:
>         argfunc = filetype_to_args_map[ftype]
>     except KeyError:
>         s = "Don't know how to handle %s documents" % ftype
>         raise UnknownMimeType, s
>
>     p = Popen(argfunc(document), stdout = PIPE, stderr = PIPE)
>     text = p.stdout.read()
>     if p.wait():
>         # Force a better exception to be thrown if the file doesn't exist.
>         open(document)
>         raise ParseError, "Failed to parse %s" % document
>     return text
>
> if __name__ == "__main__":
>     print totext("testpdf.pdf")
> 
> 
> 
> -- 
> mvh Björn




More information about the Python-list mailing list