comparison python/find_duplicate_files.py @ 711:ab831c7621e9

hacky way to note duplicate files
author Jeff Hammel <k0scist@gmail.com>
date Thu, 16 Oct 2014 11:25:49 -0700
parents
children dbd2562cb03e
comparison
equal deleted inserted replaced
710:7f910ce4da04 711:ab831c7621e9
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 """
5 find duplicate files in a directory
6 """
7
8 # imports
9 import argparse
10 import os
11 import subprocess
12 import sys
13
# module globals
# names exported by a star-import of this module
__all__ = ['main', 'Parser']
16
class Parser(argparse.ArgumentParser):
    """Command-line parser that takes a single positional ``directory``.

    The most recently parsed (and validated) namespace is kept on
    ``self.options``; it is ``None`` until ``parse_args`` succeeds.
    """

    def __init__(self, **kwargs):
        # fall back to the module docstring when no description is supplied
        if 'description' not in kwargs:
            kwargs['description'] = __doc__
        super(Parser, self).__init__(**kwargs)
        self.options = None
        self.add_argument('directory')

    def parse_args(self, *args, **kw):
        """Parse the arguments, validate them, remember and return them."""
        parsed = super(Parser, self).parse_args(*args, **kw)
        self.validate(parsed)
        self.options = parsed
        return parsed

    def validate(self, options):
        """Abort via ``self.error`` unless ``options.directory`` is a directory."""
        path = options.directory
        if os.path.isdir(path):
            return
        self.error("Not a directory: {}".format(path))
35
def _entries_by_size(directory):
    """Map each size in bytes to the sorted names of entries with that size."""
    by_size = {}
    for name in sorted(os.listdir(directory)):
        # skip dot-files: the `ls -l` listing this replaces never showed them
        if name.startswith('.'):
            continue
        try:
            # lstat rather than stat, so a symlink is measured as itself
            # (the size `ls -l` reports) and is listed under its own name
            size = os.lstat(os.path.join(directory, name)).st_size
        except OSError:
            # the entry disappeared or became unreadable after listdir()
            continue
        by_size.setdefault(size, []).append(name)
    return by_size


def main(args=None):
    """CLI entry point: print the directory entries that share a size.

    Entries are grouped purely by size in bytes; contents are never
    compared, so a group is a set of *candidate* duplicates.

    The directory is read with ``os.listdir``/``os.lstat`` instead of
    parsing ``ls -l`` output. Names containing whitespace are therefore
    reported whole, symlinks are reported under their own name, and the
    result is ``str`` on both Python 2 and Python 3.

    :param args: command-line arguments without the program name;
                 defaults to ``sys.argv[1:]``, read at call time.
    """
    if args is None:
        args = sys.argv[1:]

    # parse command line options (exits with an error if not a directory)
    parser = Parser()
    options = parser.parse_args(args)

    sizes = _entries_by_size(options.directory)

    # only sizes shared by two or more entries are of interest
    duplicates = {size: names for size, names in sizes.items()
                  if len(names) > 1}

    # report groups in ascending size order, one name per line
    for size in sorted(duplicates):
        print('{} : '.format(size))
        print('\n'.join(duplicates[size]))
        print('\n')
62
# run the CLI only when executed as a script, not when imported
if __name__ == '__main__':
    main()