changeset 826:aa9a3850ed56

make it work
author Jeff Hammel <k0scist@gmail.com>
date Sun, 19 Feb 2017 17:25:31 -0800 (2017-02-20)
parents 5a74c7ae19cd
children a5a339b7fd82
files python/find_duplicate_files.py
diffstat 1 files changed, 43 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/python/find_duplicate_files.py	Sun Feb 19 09:03:52 2017 -0800
+++ b/python/find_duplicate_files.py	Sun Feb 19 17:25:31 2017 -0800
@@ -7,8 +7,10 @@
 
 # imports
 import argparse
+import csv
+import difflib
+import json
 import os
-import subprocess
 import sys
 
 
@@ -19,6 +21,9 @@
         kwargs.setdefault('description', __doc__)
         argparse.ArgumentParser.__init__(self, **kwargs)
         self.add_argument('directory')
+        self.add_argument('--identical-sizes', dest='identical_sizes',
+                          action='store_true', default=False,
+                          help="print out all matches with identical sizes and exit")
         self.options = None
 
     def parse_args(self, *args, **kw):
@@ -40,8 +45,43 @@
     parser = DuplicateFilesParser()
     options = parser.parse_args(args)
 
-    # get all files
-    raise NotImplementedError('TODO') # -> record TODO items
+    # get all file sizes
+    sizes = {}
+    directory = options.directory
+    for dirpath, dirnames, files in os.walk(directory, topdown=True):
+        for path in files:
+            path = os.path.join(dirpath, path)
+            sizes.setdefault(os.path.getsize(path), []).append(path)
+
+    # filter out those with identical sizes
+    identical_sizes = {k: v for k, v in sizes.items()
+                       if len(v) > 1}
+    if options.identical_sizes:
+        print(json.dumps(identical_sizes, indent=2, sort_keys=True))
+
+
+    # now that we've narrowed it down, let's find the identical files
+    duplicate_files = []
+    for row in identical_sizes.values():
+
+        while len(row) > 1:
+            duplicates = []
+            ref_file = row.pop()
+            ref = open(ref_file).read()
+            for index, path in reversed(list(enumerate(row))):
+                comp = open(path).read()
+                if ref == comp:
+                    if not duplicates:
+                        duplicates.append(ref_file)
+                    duplicates.append(path)
+                    row.pop(index)
+            if duplicates:
+                duplicate_files.append(duplicates)
+
+
+    # output CSV
+    writer = csv.writer(sys.stdout)
+    writer.writerows(duplicate_files)
 
 if __name__ == '__main__':
     main()