converting file formats to txt
Björn Lindqvist
bjourne at gmail.com
Thu Jul 6 06:15:31 EDT 2006
On 4 Jul 2006 08:38:47 -0700, Gaurav Agarwal
<gaurav.agarwal1904 at gmail.com> wrote:
> Thanks Steven. Actually, I wanted to do text processing for my office
> where I can view all files in the system and use the first three to
> give a summary of the document, instead of having somebody actually
> entering the summary. It seems there is no single piece of code that
> can act as a converter across formats, so I'll have to check out
> converters for individual formats.
I have some old code that does just that. It uses catdoc, pdftotext
and links to convert .doc, .pdf and .html files, respectively, to text.
##################################################################
import mimetypes
from subprocess import call, Popen, PIPE
import sys
class ConversionError(Exception):
    """Base class for the errors raised while converting a document."""
class UnknownMimeType(ConversionError):
    """Raised when a document's MIME type has no converter registered."""
class NotAMimeType(ConversionError):
    """Raised when no MIME type can be guessed for a document."""
class ParseError(ConversionError):
    """Raised when the external converter program reports a failure."""
def has_program(progname):
    """Return True if an executable file named *progname* is on ``PATH``.

    The lookup is done in-process instead of spawning ``which``.  That
    avoids depending on an external ``which`` binary and avoids handing a
    never-read pipe to ``subprocess.call``, which the subprocess
    documentation warns against.

    Args:
        progname: Bare program name to look for, e.g. ``"catdoc"``.

    Returns:
        True when some ``PATH`` directory contains a regular file with that
        name that the current user may execute, otherwise False.
    """
    import os

    search_path = os.environ.get("PATH", os.defpath)
    for directory in search_path.split(os.pathsep):
        # POSIX treats an empty PATH entry as the current directory.
        candidate = os.path.join(directory or os.curdir, progname)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return True
    return False
def check_requirements():
    """Check that catdoc, pdftotext and links are all available.

    Each program is probed with ``has_program``.  When any are missing,
    their names are printed on one line and False is returned; otherwise
    nothing is printed and True is returned.
    """
    required = ("catdoc", "pdftotext", "links")
    absent = [name for name in required if not has_program(name)]
    if not absent:
        return True
    print("You need to have the programs: %s" % " ".join(absent))
    return False
# Runs at import time: without the external converters nothing below can
# work, so report the problem and terminate with exit status 1.
if not check_requirements():
    print("Needed external programs not found, quitting")
    sys.exit(1)
def get_catdoc_args(infile):
    """Return the ``catdoc`` argument list for converting *infile*.

    The command is ``catdoc -s 8859-1 <infile>``.
    """
    command = ["catdoc", "-s", "8859-1"]
    command.append(infile)
    return command
def get_pdftotext_args(infile):
    """Return the ``pdftotext`` argument list for converting *infile*.

    The command is ``pdftotext <infile> -``; the trailing ``-`` is passed
    where pdftotext expects its output file.
    """
    command = ["pdftotext"]
    command.extend([infile, "-"])
    return command
def get_links_args(infile):
    """Return the ``links`` argument list for converting *infile*.

    The command is ``links <infile> -dump``.
    """
    command = ["links"]
    command.extend([infile, "-dump"])
    return command
def totext(document):
    """Convert *document* to text by running an external converter.

    The converter is selected from the MIME type that
    ``mimetypes.guess_type`` derives from the file name:
    ``application/msword`` uses catdoc, ``application/pdf`` uses pdftotext
    and ``text/html`` uses links.

    Args:
        document: Path of the file to convert.

    Returns:
        Whatever the converter wrote to its standard output, unmodified.

    Raises:
        NotAMimeType: No MIME type could be guessed for *document*.
        UnknownMimeType: The guessed MIME type has no converter.
        IOError: The converter failed and *document* cannot be opened
            (typically because it does not exist).
        ParseError: The converter exited with a non-zero status.
    """
    filetype_to_args_map = {
        "application/msword": get_catdoc_args,
        "application/pdf": get_pdftotext_args,
        "text/html": get_links_args,
    }
    # guess_type returns (type, encoding); only the type is needed.
    ftype = mimetypes.guess_type(document)[0]
    if not ftype:
        raise NotAMimeType("Couldn't detect mimetype for %s" % document)
    try:
        argfunc = filetype_to_args_map[ftype]
    except KeyError:
        raise UnknownMimeType("Don't know how to handle %s documents" % ftype)
    p = Popen(argfunc(document), stdout=PIPE, stderr=PIPE)
    # communicate() drains stdout and stderr together.  Reading only stdout
    # can block forever once the child fills the unread stderr pipe.
    text, _errors = p.communicate()
    if p.returncode:
        # Force a better exception to be thrown if the file doesn't exist.
        # The handle is closed straight away so it is not leaked.
        open(document).close()
        raise ParseError("Failed to parse %s" % document)
    return text
if __name__ == "__main__":
    # Manual smoke test: convert a sample PDF and print the extracted text.
    converted = totext("testpdf.pdf")
    print(converted)
--
mvh Björn
More information about the Python-list
mailing list