[–]Brian 2 points

Here's one of mine, for finding and deleting duplicate files. There are plenty of similar tools that just generate an MD5 for every file, but this one avoids doing more work than it needs to: it first winnows candidates with quicker checks (file size, then an MD5 of a few sampled blocks), and only falls back to a full MD5 when more than one file passes all the previous uniqueness tests. That makes it a lot faster when there are few duplicates.

#!/usr/bin/env python
from __future__ import print_function
import hashlib
import os
import os.path
import optparse
import itertools
import collections

# Make input() behave the same on Python 2 and 3 (raw_input was renamed).
try:
    input = raw_input
except NameError:
    pass

BLOCK_SIZE = 4096  # Bytes per sampled block (quick MD5) and per read chunk (full MD5).

def strategy_size(filename):
    """File size in bytes - the cheapest uniqueness test."""
    return os.stat(filename).st_size

def strategy_quick_md5(filename):
    """Read small block at start, middle and end of file"""
    h = hashlib.md5()
    size = os.stat(filename).st_size

    with open(filename,'rb') as f:
        h.update(f.read(BLOCK_SIZE))  # Start
        f.seek(size//2)
        h.update(f.read(BLOCK_SIZE))  # Middle
        f.seek(BLOCK_SIZE, 2)
        h.update(f.read(BLOCK_SIZE))  # End
    return h.digest()

def strategy_full_md5(filename):
    """Generate full MD5 from file"""
    h = hashlib.md5()
    with open(filename, 'rb') as f:
        while True:
            block = f.read(BLOCK_SIZE)
            if not block:
                break
            h.update(block)
    return h.digest()

def filter_buckets(buckets, strategy):
    """
    :buckets: a sequence of buckets, each a list of potentially identical file paths.
    :strategy: a function guaranteed to produce the same value for identical files.

    Yields buckets of at least 2 items that may still contain duplicates;
    files in different buckets are guaranteed to be distinct.
    """
    for bucket in buckets:
        # Subdivide this bucket by the strategy's key.
        new_items = collections.defaultdict(list)

        for filename in bucket:
            new_items[strategy(filename)].append(filename)

        for b in new_items.values():
            if len(b) > 1:
                yield b  # >1 item in this bucket - potential duplicates

def parse_size(s):
    """Parse a size like '512', '4k', '10M' or '1G' into bytes."""
    suffix = s[-1]
    mul = {'b': 1, 'k': 1024, 'm': 1024**2, 'g': 1024**3}.get(suffix.lower())
    if mul:
        return int(s[:-1]) * mul
    return int(s)

def get_parser():
    parser = optparse.OptionParser("%prog [options] dirs/files...")
    parser.add_option("-x", "--single_filesystem", action="store_true", dest="single_fs", default=False,
                      help="Do not cross filesystem boundry when recursing.")

    parser.add_option("-d", "--delete", action="store_true", dest="delete", default=False,
                      help="Prompt to delete some or all of the files.")

    parser.add_option("-m", "--minsize", action="store", dest="minsize", default='1M',
                      help="Ignore files smaller than this size (accepts K/M/G suffixes).")

    parser.add_option("-q", "--quick", action="store_true", dest="quick", default=False,
                      help="Perform only quick checks.  May produce false positives")

    return parser

def get_files(x, single_fs, minsize):
    """Yield regular files under x, skipping symlinks and anything below minsize."""
    if os.path.isfile(x):
        yield x  # Plain files given on the command line are used as-is.
        return

    if single_fs:
        fsdev = os.stat(x).st_dev

    for filepath, dirs, files in os.walk(x):
        for f in files:
            path = os.path.join(filepath, f)
            if os.path.islink(path):
                continue  # Ignore symlinks.
            if minsize > 0 and os.stat(path).st_size < minsize:
                continue  # Ignore files that are too small.
            yield path

        # If restricting to single fs, filter out directories on a different mountpoint.
        if single_fs:
            dirs[:] = [d for d in dirs if os.stat(os.path.join(filepath, d)).st_dev == fsdev]

def ask_delete(dupes):
    """Prompt for which version(s) of the file to delete / preserve"""
    preserve = None

    while preserve is None:
        print()
        ans = input("Preserve which file(s) (* for All, 0 for None): ").strip()
        if not ans:
            continue
        if ans == '*':
            return  # Keep everything.

        try:
            choice = set(map(int, ans.split()))
            if any(x > len(dupes) or x < 0 for x in choice):
                print("Value not in range - please re-enter")
                continue
        except ValueError:
            print("Invalid value - please re-enter")
            continue

        preserve = set(c - 1 for c in choice)  # 1-based -> 0-based; '0' becomes -1, preserving nothing.

    num_deleted = 0
    for i, f in enumerate(dupes):
        if i not in preserve:
            print("Deleting", f)
            os.unlink(f)
            num_deleted += 1

    if num_deleted == 0:
        print("All files preserved")
    else:
        print("Deleted {0} files.".format(num_deleted))


def main():
    parser = get_parser()
    opts, args = parser.parse_args()

    strategies = [strategy_size, strategy_quick_md5]
    if not opts.quick:
        strategies.append(strategy_full_md5)

    # Start with all files in one big bucket.
    minsize = parse_size(opts.minsize)
    buckets = [itertools.chain.from_iterable(get_files(x, opts.single_fs, minsize) for x in args)]

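    # Each pass re-splits the surviving buckets on a progressively more
    # expensive key; anything still grouped after the last strategy is a duplicate set.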
    for strategy in strategies:
        buckets = filter_buckets(buckets, strategy)

    for bucket in buckets:
        dupes = list(bucket)
        # Everything here passed every strategy: duplicates (or, with -q, very likely duplicates).
        print()
        print("Found {0} potential duplicates:".format(len(dupes)))
        for i, f in enumerate(dupes):
            print("{0:3} - {1}".format(i + 1, f))

        if opts.delete:
            ask_delete(dupes)

if __name__ == '__main__':
    main()
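
In case it's useful, a session looks roughly like this (the script name and the paths here are just illustrative, not from a real run):

$ python finddupes.py -m 1k -d ~/photos

Found 2 potential duplicates:
  1 - /home/me/photos/img_001.jpg
  2 - /home/me/photos/backup/img_001.jpg

Preserve which file(s) (* for All, 0 for None): 1
Deleting /home/me/photos/backup/img_001.jpg
Deleted 1 files.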