Mercurial > hg > config
comparison python/find_duplicate_files.py @ 826:aa9a3850ed56
make it work
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 19 Feb 2017 17:25:31 -0800 |
parents | bea4dd61ae45 |
children |
comparison
equal
deleted
inserted
replaced
825:5a74c7ae19cd | 826:aa9a3850ed56 |
---|---|
5 find duplicate files in a directory | 5 find duplicate files in a directory |
6 """ | 6 """ |
7 | 7 |
8 # imports | 8 # imports |
9 import argparse | 9 import argparse |
10 import csv | |
11 import difflib | |
12 import json | |
10 import os | 13 import os |
11 import subprocess | |
12 import sys | 14 import sys |
13 | 15 |
14 | 16 |
15 class DuplicateFilesParser(argparse.ArgumentParser): | 17 class DuplicateFilesParser(argparse.ArgumentParser): |
16 """CLI option parser""" | 18 """CLI option parser""" |
17 | 19 |
18 def __init__(self, **kwargs): | 20 def __init__(self, **kwargs): |
19 kwargs.setdefault('description', __doc__) | 21 kwargs.setdefault('description', __doc__) |
20 argparse.ArgumentParser.__init__(self, **kwargs) | 22 argparse.ArgumentParser.__init__(self, **kwargs) |
21 self.add_argument('directory') | 23 self.add_argument('directory') |
24 self.add_argument('--identical-sizes', dest='identical_sizes', | |
25 action='store_true', default=False, | |
26 help="print out all matches with identical sizes and exit") | |
22 self.options = None | 27 self.options = None |
23 | 28 |
24 def parse_args(self, *args, **kw): | 29 def parse_args(self, *args, **kw): |
25 options = argparse.ArgumentParser.parse_args(self, *args, **kw) | 30 options = argparse.ArgumentParser.parse_args(self, *args, **kw) |
26 self.validate(options) | 31 self.validate(options) |
38 | 43 |
39 # parse command line options | 44 # parse command line options |
40 parser = DuplicateFilesParser() | 45 parser = DuplicateFilesParser() |
41 options = parser.parse_args(args) | 46 options = parser.parse_args(args) |
42 | 47 |
43 # get all files | 48 # get all file sizes |
44 raise NotImplementedError('TODO') # -> record TODO items | 49 sizes = {} |
50 directory = options.directory | |
51 for dirpath, dirnames, files in os.walk(directory, topdown=True): | |
52 for path in files: | |
53 path = os.path.join(dirpath, path) | |
54 sizes.setdefault(os.path.getsize(path), []).append(path) | |
55 | |
56 # filter out those with identical sizes | |
57 identical_sizes = {k: v for k, v in sizes.items() | |
58 if len(v) > 1} | |
59 if options.identical_sizes: | |
60 print(json.dumps(identical_sizes, indent=2, sort_keys=True)) | |
61 | |
62 | |
63 # now that we've narrowed it down, let's find the identical files | |
64 duplicate_files = [] | |
65 for row in identical_sizes.values(): | |
66 | |
67 while len(row) > 1: | |
68 duplicates = [] | |
69 ref_file = row.pop() | |
70 ref = open(ref_file).read() | |
71 for index, path in reversed(list(enumerate(row))): | |
72 comp = open(path).read() | |
73 if ref == comp: | |
74 if not duplicates: | |
75 duplicates.append(ref_file) | |
76 duplicates.append(path) | |
77 row.pop(index) | |
78 if duplicates: | |
79 duplicate_files.append(duplicates) | |
80 | |
81 | |
82 # output CSV | |
83 writer = csv.writer(sys.stdout) | |
84 writer.writerows(duplicate_files) | |
45 | 85 |
# script entry point
if __name__ == '__main__':
    main()