Mercurial > hg > Lemuriformes
view lemuriformes/deduplicate.py @ 18:56596902e9ae default tip
add some setup + tests
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 10 Dec 2017 17:57:03 -0800 |
parents | 756dbd3e391e |
children |
line wrap: on
line source
#!/usr/bin/env python

""" deduplicate JSON data """

import argparse
import json
import sys
from .cast import isstring
from .cli import ConfigurationParser
from .serialize import dump_json

# Tags comparison keys built for unhashable values so that they can
# never compare equal to an ordinary (hashable) value.
_UNHASHABLE = object()


def _comparison_key(value):
    """
    Return a hashable stand-in for `value`, used only for comparison.

    - values accepted by `isstring` are lowercased, so that matching
      is case-insensitive
    - unhashable values (e.g. the lists and dicts found in nested JSON)
      are replaced by their canonical JSON text, since they cannot be
      stored in a set directly
    - every other value is returned unchanged
    """

    if isstring(value):
        return value.lower()
    try:
        hash(value)
    except TypeError:
        # sort_keys makes equal dicts serialize to identical text
        return (_UNHASHABLE, json.dumps(value, sort_keys=True))
    return value


def deduplicate(data):
    """
    Remove duplicate items from `data`, keeping first occurrences.

    data -- a list of dicts

    An item is dropped if, for any of its keys, its value was already
    seen under that same key in a previously kept item.  Only kept
    items contribute to the seen values.  Strings are compared
    case-insensitively; nested lists and dicts are compared by value.

    Returns a new list holding the kept items (the same dict objects,
    in their original order); `data` itself is not modified.
    """

    retval = []
    seen = {}  # key -> set of comparison keys taken from kept items

    for item in data:
        # normalize once; reused for both the check and the bookkeeping
        normalized = [(key, _comparison_key(value))
                      for key, value in item.items()]

        if any(value in seen.get(key, ()) for key, value in normalized):
            continue  # shares a value with an item we already kept

        for key, value in normalized:
            seen.setdefault(key, set()).add(value)
        retval.append(item)

    return retval


def main(args=sys.argv[1:]):
    """CLI"""

    # parse command line
    parser = ConfigurationParser(description=__doc__)
    parser.add_argument('data', type=argparse.FileType('r'),
                        help="JSON data file of lists of dicts")
    parser.add_argument('-o', '--output', dest='output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="where to write clean data to, or stdout by default")
    options = parser.parse_args(args)

    try:
        # load
        data = json.load(options.data)

        # deduplicate
        cleaned = deduplicate(data)

        # output
        options.output.write(dump_json(cleaned) + '\n')
    finally:
        # release the handles opened during argument parsing, but
        # never close the interpreter's own standard streams
        for stream in (options.data, options.output):
            if stream is not sys.stdin and stream is not sys.stdout:
                stream.close()


if __name__ == '__main__':
    main()