ANN: Python Decrypt PDF script -- builds on pdftools

Herb Schilling herbcle at en.com
Thu Aug 19 21:43:43 EDT 2004


>Hi,
>
>I wanted to extract the meta-data from an encrypted/protected PDF file
>and could not find any Python scripts to do this. So, I decided to
>write something myself, the result follows.
>
>This demonstration utility requires the `pdftools` files from
><http://www.boddie.org.uk/david/Projects/Python/pdftools/> but the
>decryption functions themselves should be usable with other Python PDF
>libraries.
>
>Documentation is marginal and all I can say is that it worked on the
>three PDF files I tested it on... :-)
>
>--Phil.
>
>P.S. The usual Usenet-mangling warning applies--yeah, I know--I should
>put it up on a web site somewhere... :-)
>
>#!/usr/bin/python
>#
># Decrypt PDF Info
>#
># Decrypts PDF files and displays meta-data associated with them. (If
># the file isn't encrypted the information is displayed as is.)
>#
># The results are similar to xpdf's `pdfinfo` utility.
>#
># It should be possible to decrypt all of the objects contained
># in the PDF, but this only reads the Document Information Dictionary.
>#
># (Note: All the PDF handling is provided by `pdftools`, this just
>#        adds the ability to deal with encrypted PDF files.)
>#
># Requires:
>#   + pdftools
>#     <http://www.boddie.org.uk/david/Projects/Python/pdftools/>
>#
># Based on:
>#   + `pdfdecrypt.pl`
>#     <http://www-2.cs.cmu.edu/~dst/Adobe/Gallery/pdfdecrypt.pl> [PDFPL]
>#
># Incorporates:
>#   + RC4 from CipherSaber implementation by Ka-Ping Yee <ping at lfw.org>
>#     <http://www.xs4all.nl/~cg/ciphersaber/comp/python.txt>
>#
># References:
>#   + <http://www-2.cs.cmu.edu/~dst/Adobe/Gallery/anon21jul01-pdf-encryption.txt> [PDFE]
>#
># Author:
>#   follower at myrealbox.com (Standing on *many* shoulders...)
>#
>import sys
>import md5
>import struct
>
>from pdftools import PDFdocument
>
def arcfour(input, key):
    """
    Perform the ARCFOUR (RC4) algorithm on a given input list of bytes
    with a key given as a list of bytes, and return the output as a list
    of bytes.

    `input` is any iterable of integers in the range 0-255.  `key` is a
    non-empty, indexable sequence of such integers.  Each input byte is
    XORed with the key stream, so applying the function twice with the
    same key restores the original input.

    Raises `ValueError` if `key` is empty.

    (From CipherSaber implementation by Ka-Ping Yee <ping at lfw.org>
     <http://www.xs4all.nl/~cg/ciphersaber/comp/python.txt>)
    """
    if not key:
        # Otherwise `i % len(key)` below fails with a ZeroDivisionError.
        raise ValueError("ARCFOUR key must not be empty")

    # Key scheduling: shuffle the identity permutation under key control.
    # list() is required because the permutation is modified in place and
    # range() is not a mutable list on Python 3.
    state = list(range(256))
    j = 0
    for i in range(256):
        j = (j + state[i] + key[i % len(key)]) % 256
        state[i], state[j] = state[j], state[i]

    # Key stream generation: one key stream byte per input byte.
    i, j, output = 0, 0, []
    for byte in input:
        i = (i + 1) % 256
        j = (j + state[i]) % 256
        state[i], state[j] = state[j], state[i]
        n = (state[i] + state[j]) % 256
        output.append(byte ^ state[n])
    return output
>
>
>_passwordPad = [
>   0x28, 0xbf, 0x4e, 0x5e, 0x4e, 0x75, 0x8a, 0x41,
>   0x64, 0x00, 0x4e, 0x56, 0xff, 0xfa, 0x01, 0x08,
>   0x2e, 0x2e, 0x00, 0xb6, 0xd0, 0x68, 0x3e, 0x80,
>   0x2f, 0x0c, 0xa9, 0xfe, 0x64, 0x53, 0x69, 0x7a]
>
>passwordPad = "".join([chr(b) for b in _passwordPad])
>
def calculateFileKey(fileId, ownerHash, userHash, permissions,
                     userPassword = ""):
    """
    Calculates the file key for the document as described in references
    (see [PDFE] and [PDFPL]).

    The key is the first 5 bytes of the MD5 digest of, in this order:
    the user password padded with `passwordPad` and cut to 32 bytes,
    `ownerHash`, `permissions` packed as a 4-byte little-endian integer,
    and `fileId`.

    `userHash` is accepted but does not take part in the calculation.
    `permissions` may be negative; it is packed as its 32-bit
    two's-complement bit pattern.
    """
    # Imported here because the file-level imports only provide the
    # legacy `md5` module, which no longer exists on Python 3.
    import hashlib

    def asBytes(value):
        # hashlib only accepts byte strings.  Text strings handled here
        # hold one byte value per character, so latin-1 maps them 1:1.
        if isinstance(value, bytes):
            return value
        return value.encode("latin-1")

    md = hashlib.md5()
    md.update((asBytes(userPassword) + asBytes(passwordPad))[:32])
    md.update(asBytes(ownerHash))
    # Permission flags are a signed 32-bit value and are commonly
    # negative, which the unsigned "<L" format rejects.  Masking yields
    # the identical four bytes without the range error.
    md.update(struct.pack("<L", permissions & 0xFFFFFFFF))
    md.update(asBytes(fileId))

    return md.digest()[:5]
>
>
def calculateObjectKey(fileKey, objectNumber, generationNumber):
    """
    Calculates the key for the object as described in references
    (see [PDFE] and [PDFPL]).

    The key is taken from the MD5 digest of `fileKey` followed by the
    low 3 bytes of `objectNumber` and the low 2 bytes of
    `generationNumber`, both little-endian.  Its length is
    `len(fileKey) + 5` bytes, capped at the 16 bytes of an MD5 digest;
    that is 10 bytes for a 5-byte file key.
    """
    # Imported here because the file-level imports only provide the
    # legacy `md5` module, which no longer exists on Python 3.
    import hashlib

    if not isinstance(fileKey, bytes):
        # hashlib only accepts byte strings; a text key holds one byte
        # value per character, so latin-1 maps it 1:1.
        fileKey = fileKey.encode("latin-1")

    md = hashlib.md5()
    md.update(fileKey)
    md.update(struct.pack("<L", objectNumber)[:3])
    md.update(struct.pack("<L", generationNumber)[:2])

    # Derive the length from the file key instead of hard-coding 10, so
    # that file keys longer than 40 bits also get the right key size.
    return md.digest()[:min(len(fileKey) + 5, 16)]
>
>
class NotEncryptedException(Exception):
    """
    Signals that the supplied PDF document is not encrypted.
    """
    pass
>
>
# Names of the encryption dictionary entries that getFileKey() reads.
KEY_PERMISSIONS = 'P'   # permission flags
KEY_USER_HASH = 'U'     # user password hash
KEY_OWNER_HASH = 'O'    # owner password hash
>
def getFileKey(doc, userPassword = ""):
    """
    Extracts the information required to calculate the file key
    from the supplied PDF document.

    In most cases `userPassword` can be left empty.

    Returns the file key computed by `calculateFileKey`.

    Raises `NotEncryptedException` if the document trailer has no
    `Encrypt` entry, and `AssertionError` if the key derived from
    `userPassword` does not decrypt the stored user hash back to
    `passwordPad`.
    """
    # Look for the encryption dictionary before touching anything else,
    # so an unencrypted document is reported as such even when its
    # trailer has no 'ID' entry.
    try:
        encryptDict = doc.dereference(doc.trailer_dict['Encrypt'])
    except KeyError:
        raise NotEncryptedException
    # TODO: Check encryption version is ok. (filter/standard/v/1/r/2)

    fileId = doc.trailer_dict['ID'][0] # Is the ID always repeated?

    ownerHash = encryptDict[KEY_OWNER_HASH]
    userHash = encryptDict[KEY_USER_HASH]
    # `permissions` should be "four-byte integer, LSB first."
    permissions = encryptDict[KEY_PERMISSIONS]

    fileKey = calculateFileKey(fileId, ownerHash, userHash, permissions,
                               userPassword)

    # Sanity check user password.  Raised explicitly, not with an
    # `assert` statement, so the check still runs under `python -O`.
    if decrypt(userHash, fileKey) != passwordPad:
        raise AssertionError(
            "file key check failed: wrong user password or "
            "unsupported encryption")

    return fileKey
>
>
def decrypt(text, key):
    """
    Decrypts the supplied object (as a string) with the supplied key.

    `text` and `key` may be strings of single-byte characters or byte
    strings.

    Returns "plain text" form of object as a string.
    """
    def byteValues(data):
        # Iterating a text string yields characters; iterating Python 3
        # bytes already yields integers.
        return [item if isinstance(item, int) else ord(item)
                for item in data]

    # arcfour() takes len() of the key and indexes it, so real lists are
    # needed; map() would only give a lazy iterator on Python 3.
    return "".join([chr(b)
                    for b in arcfour(byteValues(text), byteValues(key))])
>    
>
def showDocumentInfo(doc, fileKey):
    """
    Displays the content of the (optionally encrypted) Document
    Information Dictionary for the supplied PDF document.

    `fileKey` is the key returned by `getFileKey`; pass an empty value
    for an unencrypted document, in which case the entries are printed
    as stored.  Each entry is written to standard output as
    "<field>: <value>".
    """
    infoDictRef = doc.trailer_dict['Info']
    infoDict = doc.dereference(infoDictRef)

    # The object key is only needed, and only meaningful, when there is
    # a file key to derive it from.
    objectKey = None
    if fileKey:
        objectKey = calculateObjectKey(fileKey, infoDictRef.obj,
                                       infoDictRef.gen)

    for field, storedValue in infoDict.items():
        if objectKey is not None:
            value = decrypt(storedValue, objectKey)
        else:
            value = storedValue
        # A single parenthesised argument prints the same on Python 2
        # and Python 3.
        print("%s: %s" % (field, value))
>    
>
if __name__ == "__main__":
    # The path of the PDF file is the only, mandatory, argument.
    if len(sys.argv) < 2:
        raise SystemExit("Usage %s <filename.pdf>" % sys.argv[0])
    filename = sys.argv[1]

    doc = PDFdocument(filename)

    # An empty key tells showDocumentInfo() to print the entries as
    # stored, which is what an unencrypted document needs.
    try:
        fileKey = getFileKey(doc)
    except NotEncryptedException:
        fileKey = ""

    showDocumentInfo(doc, fileKey)


-- 
Herb Schilling



More information about the Python-list mailing list