Mercurial > hg > config
annotate python/find_duplicate_files.py @ 759:f632a9850bb8
lookie, i can has x platform
| author | Jeff Hammel <k0scist@gmail.com> | 
|---|---|
| date | Wed, 16 Sep 2015 15:09:19 -0700 | 
| parents | ab831c7621e9 | 
| children | dbd2562cb03e | 
| rev | line source | 
|---|---|
| 
711
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
1 #!/usr/bin/env python | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
2 # -*- coding: utf-8 -*- | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
3 | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
4 """ | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
5 find duplicate files in a directory | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
6 """ | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
7 | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
8 # imports | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
9 import argparse | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
10 import os | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
11 import subprocess | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
12 import sys | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
13 | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
14 # module globals | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
15 __all__ = ['main', 'Parser'] | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
16 | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
class Parser(argparse.ArgumentParser):
    """CLI option parser.

    Accepts one positional argument, ``directory``, and rejects it (via
    ``self.error``, which prints usage and exits) when it is not an
    existing directory.  The most recently parsed namespace is kept on
    ``self.options``.
    """

    def __init__(self, **kwargs):
        """Build the parser; keyword arguments go to ``ArgumentParser``."""
        # use the module docstring as the --help description unless the
        # caller supplies one
        kwargs.setdefault('description', __doc__)
        argparse.ArgumentParser.__init__(self, **kwargs)
        self.add_argument('directory',
                          help="directory to search for duplicate files")
        # populated by parse_args(); None until arguments have been parsed
        self.options = None

    def parse_args(self, *args, **kw):
        """Parse and validate arguments, returning the namespace.

        The validated namespace is also stored on ``self.options``.
        """
        options = argparse.ArgumentParser.parse_args(self, *args, **kw)
        self.validate(options)
        self.options = options
        return options

    def validate(self, options):
        """Validate options: ``options.directory`` must be a directory."""
        if not os.path.isdir(options.directory):
            self.error("Not a directory: {}".format(options.directory))
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
35 | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
def _files_by_size(directory):
    """Map each size in bytes to the sorted names of files of that size.

    Only regular files directly inside `directory` are considered.
    Names starting with '.' are skipped, matching what a plain ``ls -l``
    listing would show.
    """
    sizes = {}
    # os.listdir order is arbitrary; sort so the report is deterministic
    for filename in sorted(os.listdir(directory)):
        if filename.startswith('.'):
            continue
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            # directories and other non-regular entries are not candidates
            continue
        try:
            size = os.path.getsize(path)
        except OSError:
            # entry vanished or became unreadable after listing; skip it
            continue
        sizes.setdefault(size, []).append(filename)
    return sizes


def main(args=sys.argv[1:]):
    """CLI: print groups of files in a directory that share the same size.

    `args` is the list of command line arguments (without the program
    name).  For every size shared by two or more files, prints the size,
    then the file names one per line, then a blank separator.

    Equal size is only a hint that files may be duplicates; file contents
    are not compared.
    """

    # parse command line options
    parser = Parser()
    options = parser.parse_args(args)

    # gather sizes with the os module rather than parsing `ls -l` output:
    # works on every platform, keeps file names containing whitespace
    # intact, and yields text (not bytes) names
    sizes = _files_by_size(options.directory)

    # keep only the sizes shared by more than one file
    duplicates = {size: filenames
                  for size, filenames in sizes.items()
                  if len(filenames) > 1}

    for size in sorted(duplicates):
        print('{} : '.format(size))
        print('\n'.join(duplicates[size]))
        print('\n')
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
62 | 
| 
 
ab831c7621e9
hacky way to note duplicate files
 
Jeff Hammel <k0scist@gmail.com> 
parents:  
diff
changeset
 | 
# run the CLI only when executed as a script, not when imported
if __name__ == '__main__':
    main()
