annotate python/find_duplicate_files.py @ 895:8d3df8c0c730

wordstream is a requirement
author Jeff Hammel <k0scist@gmail.com>
date Fri, 13 Aug 2021 15:16:21 -0700
parents aa9a3850ed56
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
711
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2 # -*- coding: utf-8 -*-
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4 """
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 find duplicate files in a directory
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6 """
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 # imports
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
9 import argparse
826
aa9a3850ed56 make it work
Jeff Hammel <k0scist@gmail.com>
parents: 801
diff changeset
10 import csv
aa9a3850ed56 make it work
Jeff Hammel <k0scist@gmail.com>
parents: 801
diff changeset
11 import difflib
aa9a3850ed56 make it work
Jeff Hammel <k0scist@gmail.com>
parents: 801
diff changeset
12 import json
711
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
13 import os
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
14 import sys
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
15
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
16
799
dbd2562cb03e remove old way of doing things; note TODO on replacing
Jeff Hammel <k0scist@gmail.com>
parents: 711
diff changeset
class DuplicateFilesParser(argparse.ArgumentParser):
    """Command-line parser for the duplicate file finder.

    Accepts one positional ``directory`` argument and an optional
    ``--identical-sizes`` flag.  After a successful `parse_args` call the
    parsed namespace is also available as ``self.options``.
    """

    def __init__(self, **kwargs):
        """Build the parser; `kwargs` are forwarded to `ArgumentParser`."""
        # fall back to the module docstring when no description is supplied
        if 'description' not in kwargs:
            kwargs['description'] = __doc__
        super(DuplicateFilesParser, self).__init__(**kwargs)

        self.add_argument('directory')
        self.add_argument(
            '--identical-sizes',
            action='store_true',
            dest='identical_sizes',
            default=False,
            help="print out all matches with identical sizes and exit",
        )

        # populated by `parse_args`
        self.options = None

    def parse_args(self, *args, **kw):
        """Parse, validate, remember and return the options namespace."""
        parsed = super(DuplicateFilesParser, self).parse_args(*args, **kw)
        self.validate(parsed)
        self.options = parsed
        return parsed

    def validate(self, options):
        """Report an error unless ``options.directory`` is a directory."""
        directory = options.directory
        if os.path.isdir(directory):
            return
        self.error("Not a directory: {}".format(directory))
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
39
801
bea4dd61ae45 cleanup
Jeff Hammel <k0scist@gmail.com>
parents: 799
diff changeset
40
711
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def _read_bytes(path):
    """Return the raw contents of the file at `path`.

    The file is opened in binary mode so that no text decoding or newline
    translation takes place, and it is closed as soon as it has been read.
    """
    with open(path, 'rb') as f:
        return f.read()


def _find_duplicates(paths):
    """Group the files in `paths` by identical content.

    Returns a list of groups; each group is a list of two or more paths
    whose contents are byte-for-byte equal.  Files without a match are
    omitted.  `paths` itself is not modified.
    """
    remaining = list(paths)
    groups = []
    while len(remaining) > 1:
        ref_file = remaining.pop()
        ref = _read_bytes(ref_file)
        matches = []
        # walk backwards so that popping by index does not shift the
        # positions of the entries still to be visited
        for index in range(len(remaining) - 1, -1, -1):
            if _read_bytes(remaining[index]) == ref:
                matches.append(remaining.pop(index))
        if matches:
            groups.append([ref_file] + matches)
    return groups


def main(args=sys.argv[1:]):
    """CLI entry point.

    Walks the given directory, buckets files by size, and then compares the
    contents of same-sized files.  Each set of identical files is written
    to stdout as one CSV row.  With ``--identical-sizes`` the same-size
    buckets are printed as JSON and nothing else is done.
    """

    # parse command line options
    parser = DuplicateFilesParser()
    options = parser.parse_args(args)

    # get all file sizes
    sizes = {}
    for dirpath, _dirnames, files in os.walk(options.directory, topdown=True):
        for name in files:
            path = os.path.join(dirpath, name)
            sizes.setdefault(os.path.getsize(path), []).append(path)

    # only files sharing a size with another file can be duplicates
    identical_sizes = {size: paths for size, paths in sizes.items()
                       if len(paths) > 1}
    if options.identical_sizes:
        print(json.dumps(identical_sizes, indent=2, sort_keys=True))
        # the option is documented as "print ... and exit"
        return

    # now that we've narrowed it down, let's find the identical files
    duplicate_files = []
    for paths in identical_sizes.values():
        duplicate_files.extend(_find_duplicates(paths))

    # output CSV
    writer = csv.writer(sys.stdout)
    writer.writerows(duplicate_files)
711
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
85
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# run the CLI only when executed as a script, not when imported
if __name__ == '__main__':
    main()