annotate python/find_duplicate_files.py @ 711:ab831c7621e9

hacky way to note duplicate files
author Jeff Hammel <k0scist@gmail.com>
date Thu, 16 Oct 2014 11:25:49 -0700
parents
children dbd2562cb03e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
711
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2 # -*- coding: utf-8 -*-
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4 """
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 find duplicate files in a directory
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6 """
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 # imports
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
9 import argparse
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
10 import os
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
11 import subprocess
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
12 import sys
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
13
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
14 # module globals
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
15 __all__ = ['main', 'Parser']
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
16
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
class Parser(argparse.ArgumentParser):
    """CLI option parser.

    Accepts a single positional argument, ``directory``, and rejects it
    (via ``ArgumentParser.error``) when it is not an existing directory.
    The most recently parsed namespace is kept on ``self.options``.
    """

    def __init__(self, **kwargs):
        """Create the parser.

        Keyword arguments are forwarded to ``argparse.ArgumentParser``;
        ``description`` defaults to this module's docstring.
        """
        kwargs.setdefault('description', __doc__)
        # explicit base-class call (rather than zero-argument super())
        # keeps the class usable on Python 2 as well as Python 3
        argparse.ArgumentParser.__init__(self, **kwargs)
        self.add_argument('directory',
                          help="directory to scan for duplicate files")
        # populated by parse_args()
        self.options = None

    def parse_args(self, *args, **kw):
        """Parse, validate, remember and return the options namespace.

        ``self.options`` is only updated when validation succeeds, since
        ``validate`` exits the process on failure.
        """
        options = argparse.ArgumentParser.parse_args(self, *args, **kw)
        self.validate(options)
        self.options = options
        return options

    def validate(self, options):
        """Validate options.

        Calls ``self.error`` (which prints usage and raises
        ``SystemExit``) when ``options.directory`` is not a directory.
        """
        if not os.path.isdir(options.directory):
            self.error("Not a directory: {}".format(options.directory))
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
35
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def main(args=None):
    """CLI: report groups of same-sized files in a directory.

    This is a size-only heuristic: files are grouped by byte size and
    every group with two or more members is printed.  File contents are
    never compared, so a reported group is a set of *candidates*.

    ``args`` is the argument list without the program name.  When it is
    None, argparse reads ``sys.argv[1:]`` at call time.

    Only regular, non-hidden files directly inside the directory are
    considered; subdirectories, symlinks and dotfiles are skipped
    (dotfiles were likewise not listed by the ``ls -l`` call this
    replaces).
    """

    # parse command line options
    parser = Parser()
    options = parser.parse_args(args)

    # Group file names by size.  The directory is read with os.listdir
    # instead of parsing ``ls -l`` output, which truncated names that
    # contain whitespace, depended on an external binary and yielded
    # bytes (breaking the str join below) under Python 3.
    sizes = {}
    for name in sorted(os.listdir(options.directory)):
        if name.startswith('.'):
            continue
        path = os.path.join(options.directory, name)
        if os.path.islink(path) or not os.path.isfile(path):
            continue
        sizes.setdefault(os.path.getsize(path), []).append(name)

    # keep only the sizes shared by at least two files
    duplicates = {size: names
                  for size, names in sizes.items()
                  if len(names) > 1}

    # report, smallest size first
    for size in sorted(duplicates):
        print('{} : '.format(size))
        print('\n'.join(duplicates[size]))
        print('\n')
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
62
ab831c7621e9 hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# run the CLI only when executed as a script, not when imported
if __name__ == '__main__':
    main()