# HG changeset patch # User Jeff Hammel # Date 1487553931 28800 # Node ID aa9a3850ed56b143f6aa1cbc4f60addee4c45055 # Parent 5a74c7ae19cd93384d15d5c4d5e394680c2f487c make it work diff -r 5a74c7ae19cd -r aa9a3850ed56 python/find_duplicate_files.py --- a/python/find_duplicate_files.py Sun Feb 19 09:03:52 2017 -0800 +++ b/python/find_duplicate_files.py Sun Feb 19 17:25:31 2017 -0800 @@ -7,8 +7,10 @@ # imports import argparse +import csv +import difflib +import json import os -import subprocess import sys @@ -19,6 +21,9 @@ kwargs.setdefault('description', __doc__) argparse.ArgumentParser.__init__(self, **kwargs) self.add_argument('directory') + self.add_argument('--identical-sizes', dest='identical_sizes', + action='store_true', default=False, + help="print out all matches with identical sizes and exit") self.options = None def parse_args(self, *args, **kw): @@ -40,8 +45,43 @@ parser = DuplicateFilesParser() options = parser.parse_args(args) - # get all files - raise NotImplementedError('TODO') # -> record TODO items + # get all file sizes + sizes = {} + directory = options.directory + for dirpath, dirnames, files in os.walk(directory, topdown=True): + for path in files: + path = os.path.join(dirpath, path) + sizes.setdefault(os.path.getsize(path), []).append(path) + + # filter out those with identical sizes + identical_sizes = {k: v for k, v in sizes.items() + if len(v) > 1} + if options.identical_sizes: + print(json.dumps(identical_sizes, indent=2, sort_keys=True)) + + + # now that we've narrowed it down, let's find the identical files + duplicate_files = [] + for row in identical_sizes.values(): + + while len(row) > 1: + duplicates = [] + ref_file = row.pop() + ref = open(ref_file).read() + for index, path in reversed(list(enumerate(row))): + comp = open(path).read() + if ref == comp: + if not duplicates: + duplicates.append(ref_file) + duplicates.append(path) + row.pop(index) + if duplicates: + duplicate_files.append(duplicates) + + + # output CSV + writer = csv.writer(sys.stdout) + writer.writerows(duplicate_files) if __name__ == '__main__': main()