changeset 711:ab831c7621e9

hacky way to note duplicate files
author Jeff Hammel <k0scist@gmail.com>
date Thu, 16 Oct 2014 11:25:49 -0700 (2014-10-16)
parents 7f910ce4da04
children 1d066bfdb744
files python/find_duplicate_files.py
diffstat 1 files changed, 64 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/python/find_duplicate_files.py	Thu Oct 16 11:25:49 2014 -0700
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+find duplicate files in a directory
+"""
+
+# imports
+import argparse
+import os
+import subprocess
+import sys
+
+# module globals
+__all__ = ['main', 'Parser']
+
+class Parser(argparse.ArgumentParser):
+    """argument parser with one positional: the directory to inspect"""
+    def __init__(self, **kwargs):
+        kwargs.setdefault('description', __doc__)  # module docstring
+        argparse.ArgumentParser.__init__(self, **kwargs)
+        self.options = None  # set by each successful parse_args() call
+        self.add_argument('directory')
+
+    def parse_args(self, *args, **kw):
+        parsed = argparse.ArgumentParser.parse_args(self, *args, **kw)
+        self.validate(parsed)  # errors out before anything is stored
+        self.options = parsed
+        return parsed
+
+    def validate(self, options):
+        """call self.error() unless options.directory is a directory"""
+        if not os.path.isdir(options.directory):
+            self.error("Not a directory: {}".format(options.directory))
+
+def main(args=sys.argv[1:]):
+    """CLI: print names of directory entries that share a size, by size"""
+
+    # parse command line options
+    parser = Parser()
+    options = parser.parse_args(args)
+
+    # stat entries directly instead of parsing `ls -l` text, which split
+    # names on whitespace (keeping only the last token) and is not portable;
+    # dotfiles are skipped and lstat is used to match what `ls -l` listed
+    sizes = {}
+    for filename in sorted(os.listdir(options.directory)):
+        if filename.startswith('.'):
+            continue
+        path = os.path.join(options.directory, filename)
+        sizes.setdefault(os.lstat(path).st_size, []).append(filename)
+
+    # a size held by a single entry cannot be a duplicate
+    duplicates = {size: filenames for size, filenames in sizes.items()
+                  if len(filenames) > 1}
+
+    # report each group in ascending size order
+    for size in sorted(duplicates):
+        print ('{} : '.format(size))
+        print ('\n'.join(duplicates[size]))
+        print ('\n')
+
+if __name__ == '__main__':  # run the CLI only when executed as a script
+    main()