14
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 deduplicate JSON data
|
|
5 """
|
|
6
|
|
7
|
|
8 import argparse
|
|
9 import json
|
|
10 import sys
|
|
11 from .cast import isstring
|
|
12 from .cli import ConfigurationParser
|
|
13 from .serialize import dump_json
|
|
14
|
|
15
|
|
def deduplicate(data):
    """
    Drop items that repeat an already-seen value under the same key.

    data -- a list of dicts

    Items are visited in order.  An item is kept only when none of its
    values has already been recorded, under the same key, by an item kept
    earlier; the values of every kept item are then recorded.  String
    values (as judged by `isstring`) are compared lowercased.  Unhashable
    values such as nested lists or dicts are compared by their sorted-key
    JSON serialization rather than raising TypeError on set membership.

    Returns a new list of the kept items, in their original order.
    """

    # unique to this call, so a serialized stand-in can never compare
    # equal to a genuine value found in the data
    serialized_tag = object()

    def comparable(value):
        """return a hashable stand-in for `value`"""
        if isstring(value):
            return value.lower()
        try:
            hash(value)
        except TypeError:
            # sets only accept hashable members; nested containers are
            # not hashable, so fall back to a canonical text form
            return (serialized_tag, json.dumps(value, sort_keys=True))
        return value

    retval = []
    seen = {}  # key -> set of comparable values taken from kept items
    for item in data:
        # normalize once; reused for both the lookup and the recording
        pairs = [(key, comparable(value)) for key, value in item.items()]
        if any(value in seen.get(key, ()) for key, value in pairs):
            continue
        for key, value in pairs:
            seen.setdefault(key, set()).add(value)
        retval.append(item)
    return retval
|
|
36
|
|
37
|
|
def main(args=None):
    """
    CLI: load JSON from a file, deduplicate it, and write the result.

    args -- command line arguments without the program name;
            sys.argv[1:] is used when omitted
    """

    # resolve the default at call time; a `sys.argv[1:]` default value
    # would be evaluated only once, when the module is imported
    if args is None:
        args = sys.argv[1:]

    # parse command line
    parser = ConfigurationParser(description=__doc__)
    parser.add_argument('data', type=argparse.FileType('r'),
                        help="JSON data file of lists of dicts")
    parser.add_argument('-o', '--output', dest='output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="where to write clean data to, or stdout by default")
    options = parser.parse_args(args)

    try:
        # load
        data = json.load(options.data)

        # deduplicate
        cleaned = deduplicate(data)

        # output
        options.output.write(dump_json(cleaned) + '\n')
    finally:
        # argparse.FileType never closes the files it opens; close them
        # here so the output is flushed, but leave the standard streams
        # open for the rest of the process
        for stream in (options.data, options.output):
            if stream is not sys.stdin and stream is not sys.stdout:
                stream.close()
|
|
58
|
|
# run the command line interface when executed as a script
if __name__ == '__main__':
    main()
|