comparison python/find_duplicate_files.py @ 826:aa9a3850ed56

make it work
author Jeff Hammel <k0scist@gmail.com>
date Sun, 19 Feb 2017 17:25:31 -0800
parents bea4dd61ae45
children (none)
comparison: 825:5a74c7ae19cd -> 826:aa9a3850ed56
--- a/python/find_duplicate_files.py
+++ b/python/find_duplicate_files.py
@@ -5,22 +5,27 @@
 find duplicate files in a directory
 """
 
 # imports
 import argparse
+import csv
+import difflib
+import json
 import os
-import subprocess
 import sys
 
 
 class DuplicateFilesParser(argparse.ArgumentParser):
     """CLI option parser"""
 
     def __init__(self, **kwargs):
         kwargs.setdefault('description', __doc__)
         argparse.ArgumentParser.__init__(self, **kwargs)
         self.add_argument('directory')
+        self.add_argument('--identical-sizes', dest='identical_sizes',
+                          action='store_true', default=False,
+                          help="print out all matches with identical sizes and exit")
         self.options = None
 
     def parse_args(self, *args, **kw):
         options = argparse.ArgumentParser.parse_args(self, *args, **kw)
         self.validate(options)
@@ -38,10 +43,45 @@
 
     # parse command line options
     parser = DuplicateFilesParser()
     options = parser.parse_args(args)
 
-    # get all files
-    raise NotImplementedError('TODO') # -> record TODO items
+    # get all file sizes
+    sizes = {}
+    directory = options.directory
+    for dirpath, dirnames, files in os.walk(directory, topdown=True):
+        for path in files:
+            path = os.path.join(dirpath, path)
+            sizes.setdefault(os.path.getsize(path), []).append(path)
+
+    # filter out those with identical sizes
+    identical_sizes = {k: v for k, v in sizes.items()
+                       if len(v) > 1}
+    if options.identical_sizes:
+        print(json.dumps(identical_sizes, indent=2, sort_keys=True))
+        return
+
+    # now that we've narrowed it down, let's find the identical files
+    duplicate_files = []
+    for row in identical_sizes.values():
+        while len(row) > 1:
+            duplicates = []
+            ref_file = row.pop()
+            with open(ref_file, 'rb') as f:
+                ref = f.read()
+            for index, path in reversed(list(enumerate(row))):
+                with open(path, 'rb') as f:
+                    comp = f.read()
+                if ref == comp:
+                    if not duplicates:
+                        duplicates.append(ref_file)
+                    duplicates.append(path)
+                    row.pop(index)
+            if duplicates:
+                duplicate_files.append(duplicates)
+
+    # output CSV
+    writer = csv.writer(sys.stdout)
+    writer.writerows(duplicate_files)
 
 if __name__ == '__main__':
     main()
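
For reference, a usage sketch (the directory path here is hypothetical; the
flag and positional argument come from the parser above):

    python find_duplicate_files.py ~/photos
    python find_duplicate_files.py --identical-sizes ~/photos

The first form writes one CSV row per group of duplicate files to stdout;
the second dumps the size-grouped candidates as JSON and exits.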
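
A note on the approach: grouping by size first means file contents are only
read for size-collision candidates, but the comparison loop above still reads
each candidate fully into memory and re-reads surviving files on later passes
of the while loop. A common refinement (not part of this changeset) is to
hash the same-size candidates and group by digest. A minimal sketch, assuming
the same list-of-paths buckets produced by identical_sizes above; the helper
name and chunk size are illustrative:

    import hashlib

    def duplicate_groups(paths, chunk_size=1 << 20):
        """Group same-size paths by the SHA-1 digest of their contents."""
        groups = {}
        for path in paths:
            digest = hashlib.sha1()
            with open(path, 'rb') as f:
                # read in chunks so large files never sit wholly in memory
                for chunk in iter(lambda: f.read(chunk_size), b''):
                    digest.update(chunk)
            groups.setdefault(digest.hexdigest(), []).append(path)
        # only digests shared by more than one path indicate duplicates
        return [group for group in groups.values() if len(group) > 1]

Each bucket from identical_sizes could be passed through this helper and the
resulting groups fed to the same csv.writer as above.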