Mercurial > hg > config
annotate python/find_duplicate_files.py @ 740:25622fb5906d
example code for counting + duplicates
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Wed, 27 May 2015 15:55:29 -0700 |
parents | ab831c7621e9 |
children | dbd2562cb03e |
rev | line source |
---|---|
711
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
3 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
4 """ |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
5 find duplicate files in a directory |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
6 """ |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
7 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
8 # imports |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
9 import argparse |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
10 import os |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
11 import subprocess |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
12 import sys |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
13 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
14 # module globals |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
15 __all__ = ['main', 'Parser'] |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
16 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
17 class Parser(argparse.ArgumentParser): |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
18 """CLI option parser""" |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
19 def __init__(self, **kwargs): |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
20 kwargs.setdefault('description', __doc__) |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
21 argparse.ArgumentParser.__init__(self, **kwargs) |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
22 self.add_argument('directory') |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
23 self.options = None |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
24 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
25 def parse_args(self, *args, **kw): |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
26 options = argparse.ArgumentParser.parse_args(self, *args, **kw) |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
27 self.validate(options) |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
28 self.options = options |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
29 return options |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
30 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
31 def validate(self, options): |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
32 """validate options""" |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
33 if not os.path.isdir(options.directory): |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
34 self.error("Not a directory: {}".format(options.directory)) |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
35 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
36 def main(args=sys.argv[1:]): |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
37 """CLI""" |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
38 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
39 # parse command line options |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
40 parser = Parser() |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
41 options = parser.parse_args(args) |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
42 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
43 output = subprocess.check_output(['ls', '-l', options.directory]).strip() |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
44 rows = [row.strip().split() for row in output.splitlines()[1:]] |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
45 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
46 sizes = {} |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
47 for row in rows: |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
48 size = int(row[4]) |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
49 filename = row[-1] |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
50 sizes.setdefault(size, []).append(filename) |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
51 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
52 duplicates = {} |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
53 for size, filenames in sizes.items(): |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
54 if len(filenames) < 2: |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
55 continue |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
56 duplicates[size] = filenames |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
57 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
58 for size in sorted(duplicates.keys()): |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
59 print ('{} : '.format(size)) |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
60 print ('\n'.join(duplicates[size])) |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
61 print ('\n') |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
62 |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
63 if __name__ == '__main__': |
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
64 main() |