view lemuriformes/deduplicate.py @ 14:756dbd3e391e

JSON deduplication module
author Jeff Hammel <k0scist@gmail.com>
date Sun, 10 Dec 2017 17:10:25 -0800
parents
children
line wrap: on
line source

#!/usr/bin/env python

"""
deduplicate JSON data
"""


import argparse
import json
import sys
from .cast import isstring
from .cli import ConfigurationParser
from .serialize import dump_json


def _comparison_key(value):
    """
    return a hashable stand-in for `value` used to detect repeats

    - unhashable values (e.g. the lists and dicts that nested JSON
      decodes to) are replaced by a tagged, canonical JSON string;
      strings nested inside them are compared case-sensitively
    - strings are lower-cased so comparison is case-insensitive
    - anything else is returned unchanged
    """
    try:
        hash(value)
    except TypeError:
        # an unhashable value cannot be stored in a set; serialize it with
        # sorted keys so equal dicts compare equal regardless of key order.
        # The tag keeps it from colliding with a plain string value.
        return ('__unhashable__', json.dumps(value, sort_keys=True))
    if isstring(value):
        return value.lower()
    return value


def deduplicate(data):
    """
    remove duplicate items from `data`, keeping first occurrences in order

    data -- a list of dicts

    An item is dropped when, for *any* of its keys, its value has
    already been seen under that same key in a previously kept item.
    String values are compared case-insensitively.  Values that are
    not hashable (nested lists / dicts) are compared by their
    canonical JSON form instead of raising `TypeError`.

    Returns a new list containing the kept items (the same dict
    objects as in `data`); `data` itself is not modified.
    """
    retval = []
    seen = {}  # key -> set of comparison values from kept items
    for item in data:
        # normalize each value once; reused for both the check and the record
        normalized = [(key, _comparison_key(value))
                      for key, value in item.items()]

        # `()` as the default avoids building a throwaway set per lookup
        if any(value in seen.get(key, ()) for key, value in normalized):
            continue

        # only kept items contribute their values to `seen`
        for key, value in normalized:
            seen.setdefault(key, set()).add(value)
        retval.append(item)
    return retval


def main(args=sys.argv[1:]):
    """command-line entry point: load JSON, deduplicate it, write it out"""

    # build the command line interface and parse the arguments
    cli = ConfigurationParser(description=__doc__)
    cli.add_argument('data', type=argparse.FileType('r'),
                     help="JSON data file of lists of dicts")
    cli.add_argument('-o', '--output', dest='output',
                     type=argparse.FileType('w'), default=sys.stdout,
                     help="where to write clean data to, or stdout by default")
    parsed = cli.parse_args(args)

    # read the records from the input file
    records = json.load(parsed.data)

    # deduplicate and serialize what is left
    serialized = dump_json(deduplicate(records))

    # emit the result followed by a trailing newline
    parsed.output.write(serialized + '\n')

# run the command line interface only when this file is executed as a
# script, not when it is imported as a module
if __name__ == '__main__':
    main()