Mercurial > hg > config
comparison python/find_duplicate_files.py @ 826:aa9a3850ed56
make it work
| author | Jeff Hammel <k0scist@gmail.com> |
|---|---|
| date | Sun, 19 Feb 2017 17:25:31 -0800 |
| parents | bea4dd61ae45 |
| children |
comparison
equal
deleted
inserted
replaced
| 825:5a74c7ae19cd | 826:aa9a3850ed56 |
|---|---|
| 5 find duplicate files in a directory | 5 find duplicate files in a directory |
| 6 """ | 6 """ |
| 7 | 7 |
| 8 # imports | 8 # imports |
| 9 import argparse | 9 import argparse |
| | 10 import csv |
| | 11 import difflib |
| | 12 import json |
| 10 import os | 13 import os |
| 11 import subprocess | |
| 12 import sys | 14 import sys |
| 13 | 15 |
| 14 | 16 |
| 15 class DuplicateFilesParser(argparse.ArgumentParser): | 17 class DuplicateFilesParser(argparse.ArgumentParser): |
| 16 """CLI option parser""" | 18 """CLI option parser""" |
| 17 | 19 |
| 18 def __init__(self, **kwargs): | 20 def __init__(self, **kwargs): |
| 19 kwargs.setdefault('description', __doc__) | 21 kwargs.setdefault('description', __doc__) |
| 20 argparse.ArgumentParser.__init__(self, **kwargs) | 22 argparse.ArgumentParser.__init__(self, **kwargs) |
| 21 self.add_argument('directory') | 23 self.add_argument('directory') |
| | 24 self.add_argument('--identical-sizes', dest='identical_sizes', |
| | 25 action='store_true', default=False, |
| | 26 help="print out all matches with identical sizes and exit") |
| 22 self.options = None | 27 self.options = None |
| 23 | 28 |
| 24 def parse_args(self, *args, **kw): | 29 def parse_args(self, *args, **kw): |
| 25 options = argparse.ArgumentParser.parse_args(self, *args, **kw) | 30 options = argparse.ArgumentParser.parse_args(self, *args, **kw) |
| 26 self.validate(options) | 31 self.validate(options) |
| 38 | 43 |
| 39 # parse command line options | 44 # parse command line options |
| 40 parser = DuplicateFilesParser() | 45 parser = DuplicateFilesParser() |
| 41 options = parser.parse_args(args) | 46 options = parser.parse_args(args) |
| 42 | 47 |
| 43 # get all files | 48 # get all file sizes |
| 44 raise NotImplementedError('TODO') # -> record TODO items | 49 sizes = {} |
| | 50 directory = options.directory |
| | 51 for dirpath, dirnames, files in os.walk(directory, topdown=True): |
| | 52 for path in files: |
| | 53 path = os.path.join(dirpath, path) |
| | 54 sizes.setdefault(os.path.getsize(path), []).append(path) |
| | 55 |
| | 56 # filter out those with identical sizes |
| | 57 identical_sizes = {k: v for k, v in sizes.items() |
| | 58 if len(v) > 1} |
| | 59 if options.identical_sizes: |
| | 60 print(json.dumps(identical_sizes, indent=2, sort_keys=True)) |
| | 61 |
| | 62 |
| | 63 # now that we've narrowed it down, let's find the identical files |
| | 64 duplicate_files = [] |
| | 65 for row in identical_sizes.values(): |
| | 66 |
| | 67 while len(row) > 1: |
| | 68 duplicates = [] |
| | 69 ref_file = row.pop() |
| | 70 ref = open(ref_file).read() |
| | 71 for index, path in reversed(list(enumerate(row))): |
| | 72 comp = open(path).read() |
| | 73 if ref == comp: |
| | 74 if not duplicates: |
| | 75 duplicates.append(ref_file) |
| | 76 duplicates.append(path) |
| | 77 row.pop(index) |
| | 78 if duplicates: |
| | 79 duplicate_files.append(duplicates) |
| | 80 |
| | 81 |
| | 82 # output CSV |
| | 83 writer = csv.writer(sys.stdout) |
| | 84 writer.writerows(duplicate_files) |
| 45 | 85 |
| 46 if __name__ == '__main__': | 86 if __name__ == '__main__': |
| 47 main() | 87 main() |
