duplicate file finder
Lowell Kirsh
lkirsh at cs.ubc.ca
Thu Feb 24 18:32:03 EST 2005
It looks pretty good, but I'll have to take a better look later. Out of
curiosity, why did you convert the first spaces to pipes rather than add
the code as an attachment?
Lowell
Christos TZOTZIOY Georgiou wrote:
> On Wed, 23 Feb 2005 01:56:02 -0800, rumours say that Lowell Kirsh
> <lkirsh at cs.ubc.ca> might have written:
>
>
>>Good idea about hashing part of the file before comparing entire files.
>>It will make the script longer but the speed increase will most likely
>>make it worth it.
>
>
> My dupefind module was one of the first modules I wrote in python (it was a
> shell script once...), so it's not appropriate to post. However, rewriting it
> was in my RSN list; after your post, I said, "what the heck" :)
>
> I wouldn't describe it as /clean/ Python code, so any comments on style,
> methodology (and of course, bugs!) are mostly welcome. If you have any ideas
> how to make it more "Pythonic" in style, perhaps we can even send it to ASPN.
> On the upside, it's a good demo of Python dynamism and therefore power.
>
> All first spaces are converted to pipe symbols to keep code indents, and sorry
> for the assignment operators lacking spacing at the left, but it helped me know
> what I meant in C, so I kept the habit in Python.
>
>
>
> """dupefinder.py -- a module to find duplicate files.
>
> find_duplicate_files(*paths):
> . A function returning an iterable of lists of duplicate files
> . To be used as
> . for duplicate_files in dupefinder.find_duplicate_files(dir1, dir2...):
> . # process every group of duplicates
> """
>
> import os, md5, errno, sys
>
# Errors that simply mean "skip this file": the file vanished between
# the directory scan and the stat/open, or we lack read permission.
IGNORED_ERROR_CODES = frozenset((errno.EACCES, errno.ENOENT))
>
class File(object):
    """A wrapper for files to facilitate their comparison.

    Interface:
    - .get_hash(level) returns the appropriate key for the current cmp level
    - .filename
    - .size

    Hashes are computed lazily and cached, so a file is only read when
    (and as far as) a comparison actually requires it."""

    __slots__ = "filename", "size", "_hashes",

    def __init__(self, filename):
        self.filename = filename
        self.size = os.path.getsize(filename)
        if self.size == 0:
            # All empty files are identical; pre-fill every level so no
            # reads are ever attempted.
            self._hashes = [0, None, None]
        else:
            self._hashes = [self.size]

    def __repr__(self):
        return "%s('%s')" % (self.__class__.__name__, self.filename)

    def get_hash(self, level):
        """Return the appropriate key for the given level of comparison.

        level == 0 returns file size
        level == 1 returns hash of first few kb
        level == 2 returns md5 sum of whole file

        BUG FIX: the original appended the full-file hash as soon as the
        partial hash existed, so get_hash(1) always md5'd the whole file,
        defeating the cheap-first strategy.  The while loop below computes
        exactly up to the requested level and no further."""
        while level >= len(self._hashes):
            if len(self._hashes) == 1:
                self._hashes.append(self._get_partial_hash())
            else:
                self._hashes.append(self._get_full_hash())
        return self._hashes[level]

    def _get_partial_hash(self):
        """Return a cheap hash of the first 8 KiB of the file.

        BUG FIX: opened in binary mode ("rb") for consistency with
        _get_full_hash; text mode corrupts the sample on platforms with
        newline translation."""
        fp = open(self.filename, "rb")
        try:
            return hash(fp.read(8192))
        finally:
            fp.close()

    def _get_full_hash(self):
        """Return the md5 digest of the entire file contents.

        BUG FIX: the file handle is now closed via try/finally even if a
        read fails (the original leaked it on error)."""
        # hashlib replaces the deprecated md5 module; digests are identical.
        try:
            from hashlib import md5 as _md5
        except ImportError:  # very old Pythons without hashlib
            from md5 import new as _md5
        fp = open(self.filename, "rb")
        try:
            full_hash = _md5()
            while 1:
                data = fp.read(65536)
                if not data:
                    break
                full_hash.update(data)
            return full_hash.digest()
        finally:
            fp.close()
>
class GenericFilesDict(dict):
    """A virtual wrapper for the dictionaries of file comparisons.
    Not to be used on its own, but through subclassing.

    Each subclass should define a _level class attribute to be used with the
    File.get_hash method, and a _next_class attribute pointing to the class
    managing the next level comparisons (None terminates the chain)."""
    __slots__ = ()

    def add_file(self, fileobj):
        """Add a File object to self keyed by the key for self._level.

        On the first collision the occupant (a lone File) is promoted to
        a _next_class instance containing both the pre-existing and the
        new file object; later collisions are delegated to it.

        FIX: dispatch on the presence of an add_file attribute instead of
        catching AttributeError around the call -- the original would also
        swallow an AttributeError raised *inside* a container's add_file
        and then mis-treat the container as a plain File."""
        key = fileobj.get_hash(self._level)
        if key not in self:
            self[key] = fileobj
            return
        occupant = self[key]
        if hasattr(occupant, "add_file"):
            # Already a next-level container: delegate.
            occupant.add_file(fileobj)
        else:
            # A lone File: promote the slot to the next comparison level.
            promoted = self[key] = self._next_class()
            promoted.add_file(occupant)
            promoted.add_file(fileobj)

    def __repr__(self):
        return "%s%s" % (self.__class__.__name__, dict.__repr__(self))

    def __iter__(self):
        """Yield all duplicate groups (instances whose _next_class is None).

        FIX: values()/distinct loop variables replace the Python-2-only
        iteritems() and the double use of the name 'item'."""
        for value in self.values():
            if not hasattr(value, "_next_class"):
                continue  # a lone File object: no duplicates at this key
            if value._next_class is None:
                yield value  # a complete group of duplicates
            else:
                # A nested comparison dict: recurse into its groups.
                for group in value:
                    yield group
>
class SameFiles(list):
    """A group of files found to be identical: the end of the search chain."""
    __slots__ = ()
    _level = 3
    _next_class = None  # None marks the final comparison level

    def add_file(self, fileobj):
        """Record one more member of this duplicate group."""
        self.append(fileobj)
>
class FilesByFullHash(GenericFilesDict):
    """A dictionary keyed on the md5 digest of the whole file.

    Every File stored here has already matched on size and on the hash
    of its first few kiB, so an equal digest means a duplicate."""
    __slots__ = ()
    _level = 2
    _next_class = SameFiles
>
class FilesByPartialHash(GenericFilesDict):
    """A dictionary keyed on the hash of the first few kiB of a file.

    Every File stored here has already matched on size."""
    __slots__ = ()
    _level = 1
    _next_class = FilesByFullHash
>
class FilesBySize(GenericFilesDict):
    """A dictionary keyed on file size: the first, cheapest filter."""
    __slots__ = ()
    _level = 0
    _next_class = FilesByPartialHash
>
def find_duplicate_files(*directories):
    """Find all duplicate files in the specified directories.

    The returned object can be iterated on, yielding a list of
    duplicate files every time.

    Files that raise EACCES or ENOENT while being examined (deleted
    files, broken symlinks, unreadable files) are silently skipped;
    any other OS error propagates."""
    dupefinder = FilesBySize()
    for path in directories:
        for dirname, dirnames, filenames in os.walk(path):
            for filename in filenames:
                try:
                    dupefinder.add_file(File(os.path.join(dirname, filename)))
                except EnvironmentError as exc:
                    # deleted file or b0rken symlink: skip it quietly
                    if exc.errno not in IGNORED_ERROR_CODES:
                        raise
    return dupefinder
>
> def pprint(obj, level=0):
> . """For testing purposes."""
> . indent= " "*level
> . if isinstance(obj, File):
> . print `obj`
> . elif isinstance(obj, list):
> . print "list"
> . for subobj in obj:
> . print indent, "-", repr(subobj)
> . else:
> . print obj.__class__.__name__
> . for key, subobj in obj.iteritems():
> . print indent, "+(%r)" % key,
> . pprint(subobj, level+1)
>
if __name__ == "__main__":
    # FIX: alias the stdlib module -- a bare "import pprint" shadowed
    # the pprint() helper defined above in this module.
    import pprint as pprint_module
    result = find_duplicate_files(*sys.argv[1:])
    for group in result:
        pprint_module.pprint(group)
>
More information about the Python-list
mailing list