Mercurial > hg > config
annotate python/find_duplicate_files.py @ 925:a92db57f62f8 default tip
add lxml
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Mon, 20 Jan 2025 09:20:00 -0800 |
parents | aa9a3850ed56 |
children |
rev | line source |
---|---|
711
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
3 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
4 """ |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
5 find duplicate files in a directory |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
6 """ |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
7 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
8 # imports |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
9 import argparse |
826 | 10 import csv |
11 import difflib | |
12 import json | |
711
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
13 import os |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
14 import sys |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
15 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
16 |
799
dbd2562cb03e
remove old way of doing things; note TODO on replacing
Jeff Hammel <k0scist@gmail.com>
parents:
711
diff
changeset
|
class DuplicateFilesParser(argparse.ArgumentParser):
    """Command-line parser for the duplicate file finder"""

    def __init__(self, **kwargs):
        """
        Build the parser.

        Keyword arguments are passed through to `argparse.ArgumentParser`;
        `description` falls back to this module's docstring when not given.
        """
        if 'description' not in kwargs:
            kwargs['description'] = __doc__
        super(DuplicateFilesParser, self).__init__(**kwargs)

        # positional: the directory tree to scan
        self.add_argument('directory')
        self.add_argument(
            '--identical-sizes',
            dest='identical_sizes',
            action='store_true',
            default=False,
            help="print out all matches with identical sizes and exit"
        )

        # set by `parse_args` once a parse has been validated
        self.options = None

    def parse_args(self, *args, **kw):
        """Parse the arguments, validate them, store them on `self.options`
        and return them."""
        parsed = super(DuplicateFilesParser, self).parse_args(*args, **kw)
        self.validate(parsed)
        self.options = parsed
        return parsed

    def validate(self, options):
        """Report a parser error unless `options.directory` is a directory"""
        if os.path.isdir(options.directory):
            return
        self.error("Not a directory: {}".format(options.directory))
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
39 |
801 | 40 |
711
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def main(args=sys.argv[1:]):
    """CLI

    Walk the given directory, group the files found by size, then compare
    the contents of same-sized files byte-for-byte.  Each set of identical
    files is written to stdout as one CSV row.

    With `--identical-sizes`, the mapping of size -> paths for all files
    that share a size is printed as JSON and no content comparison is done.

    args -- list of command line arguments (without the program name)
    """

    # local import: `filecmp` is only needed for the comparison step
    import filecmp

    # parse command line options
    parser = DuplicateFilesParser()
    options = parser.parse_args(args)

    # get all file sizes
    sizes = {}
    for dirpath, _dirnames, filenames in os.walk(options.directory,
                                                 topdown=True):
        for name in filenames:
            path = os.path.join(dirpath, name)
            try:
                size = os.path.getsize(path)
            except OSError as e:
                # e.g. a broken symlink or a file removed during the walk;
                # it cannot be compared, so note it and carry on
                sys.stderr.write("Skipping {}: {}\n".format(path, e))
                continue
            sizes.setdefault(size, []).append(path)

    # only files that share a size with another file can be duplicates
    identical_sizes = {size: paths for size, paths in sizes.items()
                       if len(paths) > 1}
    if options.identical_sizes:
        print(json.dumps(identical_sizes, indent=2, sort_keys=True))
        # the option is documented as "print ... and exit"
        return

    # now that we've narrowed it down, let's find the identical files
    duplicate_files = []
    for paths in identical_sizes.values():

        # work on a copy so the size mapping itself is left intact
        candidates = list(paths)
        while len(candidates) > 1:
            ref_file = candidates.pop()
            duplicates = [ref_file]
            unmatched = []
            for path in reversed(candidates):
                # shallow=False forces a binary, chunked content comparison;
                # no text decoding and no whole-file reads into memory
                if filecmp.cmp(ref_file, path, shallow=False):
                    duplicates.append(path)
                else:
                    unmatched.append(path)

            # restore the original relative order for the next pass
            unmatched.reverse()
            candidates = unmatched

            if len(duplicates) > 1:
                duplicate_files.append(duplicates)

    # output CSV
    writer = csv.writer(sys.stdout)
    writer.writerows(duplicate_files)
711
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
85 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# run the command line interface when executed as a script
if __name__ == '__main__':
    main()