duplicate file finder

Lowell Kirsh lkirsh at cs.ubc.ca
Thu Feb 24 18:32:03 EST 2005


It looks pretty good, but I'll have to take a closer look later. Out of 
curiosity, why did you convert the leading spaces to dots rather than 
attach the code?

Lowell

Christos TZOTZIOY Georgiou wrote:
> On Wed, 23 Feb 2005 01:56:02 -0800, rumours say that Lowell Kirsh
> <lkirsh at cs.ubc.ca> might have written:
> 
> 
>>Good idea about hashing part of the file before comparing entire files. 
>>It will make the script longer but the speed increase will most likely 
>>make it worth it.
> 
> 
> My dupefind module was one of the first modules I wrote in Python (it
> was a shell script once...), so it's not appropriate to post as-is.
> However, rewriting it was on my RSN ("real soon now") list; after your
> post, I said, "what the heck" :)
> 
> I wouldn't describe it as /clean/ Python code, so any comments on style,
> methodology (and of course, bugs!) are most welcome.  If you have any
> ideas on how to make it more "Pythonic" in style, perhaps we can even
> send it to ASPN.  On the upside, it's a good demo of Python's dynamism,
> and therefore its power.
> 
> All leading spaces are converted to dots to keep the code indents, and
> sorry for the assignment operators lacking spacing at the left; that
> habit helped me know what I meant in C, so I kept it in Python.
> 
> 
> 
> """dupefinder.py -- a module to find duplicate files.
> 
> find_duplicate_files(*paths):
> .   A function returning an iterable of lists of duplicate files
> .   To be used as
> .   for duplicate_files in dupefinder.find_duplicate_files(dir1, dir2...):
> .       # process every group of duplicates
> """
> 
> import os, md5, errno, sys
> 
> IGNORED_ERROR_CODES= frozenset( [errno.EACCES, errno.ENOENT] )
> 
> class File(object):
> .   """A wrapper for files to facilitate their comparison.
> .   
> .   Interface:
> .   -   .get_hash(level) returns appropriate key for the current cmp level
> .   -   .filename
> .   -   .size"""
> 
> .   __slots__= "filename", "size", "_hashes",
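> .   # (_hashes caches the comparison keys in order -- size, partial hash,
> .   # full md5 digest -- and is filled in lazily by get_hash)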
> 
> .   def __init__(self, filename):
> .       self.filename= filename
> .       self.size= os.path.getsize(filename)
> .       if self.size == 0:
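> .           # all empty files are trivially identical; skip hashing them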
> .           self._hashes= [0, None, None]
> .       else:
> .           self._hashes= [self.size]
> 
> .   def __repr__(self):
> .       return "%s(%r)" % (self.__class__.__name__, self.filename)
> 
> .   def get_hash(self, level):
> .       """Return the appropriate key for this level of comparison.
> .       level == 0 returns the file size
> .       level == 1 returns a hash of the first 8 KiB
> .       level == 2 returns the md5 digest of the whole file"""
> .       # hashes are computed lazily, one level at a time, so the
> .       # expensive whole-file md5 is only done for files whose size
> .       # and partial hash both collide with another file's
> .       if level >= 1 and len(self._hashes) == 1:
> .           self._hashes.append(self._get_partial_hash())
> .       if level >= 2 and len(self._hashes) == 2:
> .           self._hashes.append(self._get_full_hash())
> .       return self._hashes[level]
> 
> .   def _get_partial_hash(self):
> .       # read in binary mode, like the full hash, so the bytes match
> .       fp= open(self.filename, "rb")
> .       try:
> .           return hash(fp.read(8192))
> .       finally:
> .           fp.close()
> 
> .   def _get_full_hash(self):
> .       fp= open(self.filename, "rb")
> .       try:
> .           full_hash= md5.new()
> .           while 1:
> .               data= fp.read(65536)
> .               if not data: break
> .               full_hash.update(data)
> .       finally:
> .           fp.close()
> .       return full_hash.digest()
> 
> class GenericFilesDict(dict):
> .   """A virtual wrapper for the dictionaries of file comparisons.
> .   Not to be used on its own, but through subclassing.
> .   
> .   Each subclass should define a _level class attribute to be used with the
> .   File.get_hash method, and a _next_class attribute pointing to the class
> .   managing the next level comparisons."""
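> .   # In practice the levels nest like this: FilesBySize maps size ->
> .   # (File | FilesByPartialHash); that maps partial hash ->
> .   # (File | FilesByFullHash); that maps md5 digest -> (File | SameFiles).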
> .   __slots__= ()
> 
> .   def add_file(self, fileobj):
> .       """Add a File object to self, keyed by the appropriate key based on
> .       self._level. If another object already occupies that slot, replace
> .       it with a _next_class instance holding both the pre-existing and
> .       the new file object."""
> 
> .       this_hash= fileobj.get_hash(self._level)
> 
> .       try:
> .           result = self[this_hash]
> .       except KeyError:
> .           self[this_hash]= fileobj
> .           return
> 
> .       try: # there was something in slot [this_hash]
> .           result.add_file(fileobj) # so add fileobj to it
> .       except AttributeError: # it has no add_file method, so it's a File
> .           _= self[this_hash]= self._next_class() # make an instance
> .           _.add_file(result) # add pre-existing
> .           _.add_file(fileobj) # add new
> 
> .   def __repr__(self):
> .       return "%s%s" % (self.__class__.__name__, dict.__repr__(self))
> 
> .   def __iter__(self):
> .       "Recursively yield every SameFiles group (a subclass of list)."
> .       for value in self.itervalues():
> .           try: _next_class= value._next_class
> .           except AttributeError: continue # a lone File; no duplicates
> .           if _next_class is None: # value is a SameFiles group
> .               yield value
> .           else: # value is a deeper dict; recurse into it
> .               for group in value:
> .                   yield group
> 
> class SameFiles(list):
> .   "A list of duplicate files."
> .   __slots__= ()
> .   _level= 3
> .   _next_class= None # search stops here
> 
> .   def add_file(self, fileobj):
> .       self.append(fileobj)
> 
> class FilesByFullHash(GenericFilesDict):
> .   """A dictionary keyed on md5 hash of whole file.
> 
> .   The algorithm assures that all File objects in this dict
> .   have the same size and the same hash of first few kiB."""
> .   __slots__= ()
> .   _level= 2
> .   _next_class= SameFiles
> 
> class FilesByPartialHash(GenericFilesDict):
> .   """A dictionary keyed on hash of first few kiB of file.
> 
> .   The algorithm assures that all File objects in this dict
> .   have the same size."""
> .   __slots__= ()
> .   _level= 1
> .   _next_class= FilesByFullHash
> 
> class FilesBySize(GenericFilesDict):
> .   """A dictionary keyed on file size."""
> .   __slots__= ()
> .   _level= 0
> .   _next_class= FilesByPartialHash
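> 
> # The comparison chain, cheapest test first: FilesBySize (level 0: size)
> # -> FilesByPartialHash (level 1: hash of first 8 KiB) -> FilesByFullHash
> # (level 2: whole-file md5) -> SameFiles (a list of confirmed duplicates).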
> 
> def find_duplicate_files(*directories):
> .   """Find all duplicate files in the specified directories.
> 
> .   The returned object can be iterated over, yielding one list of
> .   duplicate files at a time."""
> .   dupefinder= FilesBySize()
> ##  count= 0
> .   for path in directories:
> .       for dirname, dirnames, filenames in os.walk(path):
> .           for filename in filenames:
> .               try:
> .                   dupefinder.add_file(File(os.path.join(dirname, filename)))
> .               except EnvironmentError, exc:
> .                   # EACCES/ENOENT: unreadable, deleted, or broken symlink
> .                   if exc.errno not in IGNORED_ERROR_CODES:
> .                       raise
> ##              count += 1
> ##              if count & 15 == 0:
> ##                  sys.stderr.write("\r%d" % count)
> 
> .   return dupefinder
> 
> def pprint_tree(obj, level=0):
> .   """Dump the nested structure; for testing purposes."""
> .   indent= "  "*level
> .   if isinstance(obj, File):
> .       print repr(obj)
> .   elif isinstance(obj, list):
> .       print "list"
> .       for subobj in obj:
> .           print indent, "-", repr(subobj)
> .   else:
> .       print obj.__class__.__name__
> .       for key, subobj in obj.iteritems():
> .           print indent, "+(%r)" % key,
> .           pprint_tree(subobj, level+1)
> 
> if __name__=="__main__":
> .   result= find_duplicate_files(*sys.argv[1:])
> .   import pprint
> .   for group in result:
> .       pprint.pprint(group)
> 
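
For anyone who wants to try it: a minimal usage sketch, assuming the module
is saved as dupefinder.py somewhere on your path (the directory names below
are made up). It's Python 2, print statements and all:

    import dupefinder

    # each group is a SameFiles list of File objects with identical content
    for group in dupefinder.find_duplicate_files("photos", "backup/photos"):
        print "%d copies of %d bytes:" % (len(group), group[0].size)
        for dupe in group:
            print "  ", dupe.filename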


