#!/usr/bin/env python

# Recovered from an HG changeset patch whose line breaks had been lost:
#   User Jeff Hammel
#   Date 1512954625 28800
#   Node ID 756dbd3e391e33ea24573d3a05581ab6f07ff5fb
#   Parent 2227ff372388faa25f746ade3be18527a324fd96
#   JSON deduplication module
#   (diff -r 2227ff372388 -r 756dbd3e391e, adds lemuriformes/deduplicate.py)

"""
deduplicate JSON data
"""


import argparse
import json
import sys
from .cast import isstring
from .cli import ConfigurationParser
from .serialize import dump_json


# Private tag for fingerprints of unhashable values. Because it is a unique
# object, a tagged fingerprint can never compare equal to a real field value.
_UNHASHABLE = object()


def _fingerprint(value):
    """
    return a hashable stand-in for `value`, used to detect repeats

    - strings (as judged by `isstring`) are lower-cased, so the
      comparison is case-insensitive
    - unhashable values (e.g. nested lists / dicts) are replaced by a
      tagged, key-sorted JSON serialization
    - any other value is returned unchanged
    """
    if isstring(value):
        return value.lower()
    try:
        hash(value)
    except TypeError:
        # A list or dict cannot be stored in a set; compare it by its
        # canonical JSON text instead of raising.
        return (_UNHASHABLE, json.dumps(value, sort_keys=True))
    return value


def deduplicate(data):
    """
    data -- a list of dicts

    Returns a new list holding, in their original order, the items
    that are not considered duplicates of an earlier kept item.

    An item counts as a duplicate when ANY of its values has already
    been seen under the same key in a previously kept item (strings
    are compared case-insensitively).  Values of dropped items are not
    recorded.  `data` itself is not modified.
    """
    retval = []
    seen = {}  # key -> set of fingerprints already claimed for that key
    for item in data:
        # normalize once; reused for both the check and the bookkeeping
        normalized = [(key, _fingerprint(value))
                      for key, value in item.items()]

        # the empty tuple default avoids building a throwaway set per lookup
        if any(mark in seen.get(key, ()) for key, mark in normalized):
            continue

        for key, mark in normalized:
            seen.setdefault(key, set()).add(mark)
        retval.append(item)
    return retval


def main(args=sys.argv[1:]):
    """CLI"""

    # parse command line
    parser = ConfigurationParser(description=__doc__)
    parser.add_argument('data', type=argparse.FileType('r'),
                        help="JSON data file of lists of dicts")
    parser.add_argument('-o', '--output', dest='output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="where to write clean data to, or stdout by default")
    options = parser.parse_args(args)

    # load
    data = json.load(options.data)

    # deduplicate
    cleaned = deduplicate(data)

    # output
    options.output.write(dump_json(cleaned) + '\n')


if __name__ == '__main__':
    main()