view python/find_duplicate_files.py @ 925:a92db57f62f8 default tip

add lxml
author Jeff Hammel <k0scist@gmail.com>
date Mon, 20 Jan 2025 09:20:00 -0800
parents aa9a3850ed56
children
line wrap: on
line source

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
find duplicate files in a directory
"""

# imports
import argparse
import csv
import difflib
import json
import os
import sys


class DuplicateFilesParser(argparse.ArgumentParser):
    """Argument parser for the duplicate-file finder.

    Accepts a positional ``directory`` and an ``--identical-sizes`` flag;
    the most recently parsed options are kept on ``self.options``.
    """

    def __init__(self, **kwargs):
        # fall back to the module docstring when no description is supplied
        if 'description' not in kwargs:
            kwargs['description'] = __doc__
        super(DuplicateFilesParser, self).__init__(**kwargs)
        self.add_argument('directory')
        self.add_argument(
            '--identical-sizes',
            dest='identical_sizes',
            action='store_true',
            default=False,
            help="print out all matches with identical sizes and exit",
        )
        self.options = None

    def parse_args(self, *args, **kw):
        """Parse the arguments, validate them, and remember the result."""
        parsed = super(DuplicateFilesParser, self).parse_args(*args, **kw)
        self.validate(parsed)
        self.options = parsed
        return parsed

    def validate(self, options):
        """Report an error via ``self.error`` unless ``options.directory``
        is an existing directory."""
        directory = options.directory
        if os.path.isdir(directory):
            return
        self.error("Not a directory: {}".format(directory))


def _read_bytes(path):
    """Return the entire contents of ``path`` as bytes, closing the file."""
    # binary mode: text mode would decode the data and translate newlines,
    # which can fail on non-text files and can make different files compare
    # equal
    with open(path, 'rb') as handle:
        return handle.read()


def main(args=sys.argv[1:]):
    """Command-line entry point: report groups of byte-identical files.

    Walks the given directory recursively, buckets the files by size and
    then compares file contents within each bucket.  Every group of
    identical files is written to stdout as one CSV row.

    With ``--identical-sizes`` the size buckets that hold more than one
    file are printed as JSON and the function returns without comparing
    any file contents.

    :param args: command-line arguments, excluding the program name
    """

    # parse command line options
    parser = DuplicateFilesParser()
    options = parser.parse_args(args)

    # group every file below the directory by its size in bytes
    sizes = {}
    for dirpath, _dirnames, filenames in os.walk(options.directory,
                                                 topdown=True):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            sizes.setdefault(os.path.getsize(path), []).append(path)

    # only files that share a size can possibly be duplicates
    identical_sizes = {size: paths for size, paths in sizes.items()
                       if len(paths) > 1}
    if options.identical_sizes:
        print(json.dumps(identical_sizes, indent=2, sort_keys=True))
        # the option is documented as "print ... and exit", so stop here
        return

    # now that we've narrowed it down, let's find the identical files
    duplicate_files = []
    for candidates in identical_sizes.values():
        remaining = list(candidates)
        while len(remaining) > 1:
            # take the last candidate as the reference and compare every
            # other remaining file against it
            ref_file = remaining.pop()
            ref = _read_bytes(ref_file)
            duplicates = [ref_file]
            unmatched = []
            for path in reversed(remaining):
                if _read_bytes(path) == ref:
                    duplicates.append(path)
                else:
                    unmatched.append(path)
            # files that did not match go on to the next round, kept in
            # their original relative order
            unmatched.reverse()
            remaining = unmatched
            if len(duplicates) > 1:
                duplicate_files.append(duplicates)

    # output CSV: one row per group of identical files
    writer = csv.writer(sys.stdout)
    writer.writerows(duplicate_files)

# run the CLI only when executed as a script, not when imported as a module
if __name__ == '__main__':
    main()