Mercurial > hg > Lemuriformes
comparison lemuriformes/deduplicate.py @ 14:756dbd3e391e
JSON deduplication module
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 10 Dec 2017 17:10:25 -0800 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
13:2227ff372388 | 14:756dbd3e391e |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 deduplicate JSON data | |
5 """ | |
6 | |
7 | |
8 import argparse | |
9 import json | |
10 import sys | |
11 from .cast import isstring | |
12 from .cli import ConfigurationParser | |
13 from .serialize import dump_json | |
14 | |
15 | |
def _dedup_key(value):
    """
    return a hashable stand-in for `value` to use when comparing fields

    strings are lower-cased so that comparison is case-insensitive;
    unhashable values (e.g. the nested lists and dicts JSON can contain)
    are replaced by a tagged canonical JSON string; anything else is
    returned as-is
    """
    if isstring(value):
        return value.lower()
    try:
        hash(value)
    except TypeError:
        # sets cannot hold unhashable values, so compare by canonical
        # serialization instead; the tuple tag keeps the stand-in from
        # colliding with a genuine string field holding the same text
        return ('__json__', json.dumps(value, sort_keys=True))
    return value


def deduplicate(data):
    """
    data -- a list of dicts

    returns a new list holding, in their original order, the items of
    `data` for which no field repeats a value already seen under the
    same key in a previously kept item.  String values are compared
    case-insensitively.  Dropped items do not contribute their values
    to the record of what has been seen.
    """
    retval = []
    seen = {}  # key -> set of normalized values from the kept items
    for item in data:
        # normalize once so the check and the bookkeeping always agree
        normalized = [(key, _dedup_key(value))
                      for key, value in item.items()]
        if any(value in seen.get(key, ()) for key, value in normalized):
            # at least one field duplicates a kept item: drop this one
            continue
        for key, value in normalized:
            seen.setdefault(key, set()).add(value)
        retval.append(item)
    return retval
36 | |
37 | |
def main(args=sys.argv[1:]):
    """
    CLI

    args -- command line arguments (defaults to those of the process)

    reads a JSON list of dicts from the `data` file, deduplicates it,
    and writes the result followed by a newline to `--output`
    (stdout by default)
    """

    # parse command line
    parser = ConfigurationParser(description=__doc__)
    parser.add_argument('data', type=argparse.FileType('r'),
                        help="JSON data file of lists of dicts")
    parser.add_argument('-o', '--output', dest='output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="where to write clean data to, or stdout by default")
    options = parser.parse_args(args)

    # argparse.FileType opens the files while parsing and never closes
    # them, so close them here; the standard streams are left open
    # since they do not belong to this function
    try:
        # load
        try:
            data = json.load(options.data)
        finally:
            if options.data is not sys.stdin:
                options.data.close()

        # deduplicate
        cleaned = deduplicate(data)

        # output
        options.output.write(dump_json(cleaned) + '\n')
    finally:
        if options.output is not sys.stdout:
            options.output.close()
58 | |
# run the command line interface when executed as a script
if __name__ == "__main__":
    main()