Mercurial > hg > Lemuriformes
changeset 14:756dbd3e391e
JSON deduplication module
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 10 Dec 2017 17:10:25 -0800 |
parents | 2227ff372388 |
children | 0d1b8bb1d97b |
files | lemuriformes/deduplicate.py |
diffstat | 1 files changed, 60 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python

"""
deduplicate JSON data
"""


import argparse
import json
import sys
from .cast import isstring
from .cli import ConfigurationParser
from .serialize import dump_json


def _comparison_key(value):
    """
    Return the hashable form of `value` used for duplicate detection.

    Strings (as judged by `isstring`) are lower-cased, so comparison is
    case-insensitive.  Unhashable values -- e.g. the lists and dicts that
    nested JSON produces -- cannot be stored in a set, so they are
    replaced by a tagged, key-sorted JSON dump of themselves.
    """
    if isstring(value):
        return value.lower()
    try:
        hash(value)
    except TypeError:
        # the tag keeps the dump from colliding with an ordinary string value
        return ('__json__', json.dumps(value, sort_keys=True))
    return value


def deduplicate(data):
    """
    Return the items of `data` with duplicates removed, in original order.

    data -- a list of dicts

    An item counts as a duplicate as soon as ANY one of its values matches
    the value already recorded under the same key by a previously kept
    item; string values are compared case-insensitively.  Only kept items
    contribute their values to later comparisons.  The items themselves
    are returned unmodified.
    """
    retval = []
    seen = {}  # key -> set of comparison values from the items kept so far
    for item in data:
        # normalise each value once; reused for both the check and the record
        normalized = [(key, _comparison_key(value))
                      for key, value in item.items()]
        if any(value in seen.get(key, ()) for key, value in normalized):
            continue
        for key, value in normalized:
            seen.setdefault(key, set()).add(value)
        retval.append(item)
    return retval


def main(args=None):
    """
    CLI

    args -- list of command line arguments; defaults to `sys.argv[1:]`,
            read at call time rather than at import time
    """

    if args is None:
        args = sys.argv[1:]

    # parse command line
    # (ConfigurationParser is the project's parser; it is used here through
    # the argparse-style add_argument / parse_args calls only)
    parser = ConfigurationParser(description=__doc__)
    parser.add_argument('data', type=argparse.FileType('r'),
                        help="JSON data file of lists of dicts")
    parser.add_argument('-o', '--output', dest='output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="where to write clean data to, or stdout by default")
    options = parser.parse_args(args)

    # load
    try:
        data = json.load(options.data)
    finally:
        # FileType hands back sys.stdin for '-'; leave that stream open
        if options.data is not sys.stdin:
            options.data.close()

    # deduplicate
    cleaned = deduplicate(data)

    # output
    try:
        options.output.write(dump_json(cleaned) + '\n')
    finally:
        # never close the process-wide stdout
        if options.output is not sys.stdout:
            options.output.close()


if __name__ == '__main__':
    main()