comparison lemuriformes/deduplicate.py @ 14:756dbd3e391e

JSON deduplication module
author Jeff Hammel <k0scist@gmail.com>
date Sun, 10 Dec 2017 17:10:25 -0800
parents
children
comparison
equal deleted inserted replaced
13:2227ff372388 14:756dbd3e391e
1 #!/usr/bin/env python
2
3 """
4 deduplicate JSON data
5 """
6
7
8 import argparse
9 import json
10 import sys
11 from .cast import isstring
12 from .cli import ConfigurationParser
13 from .serialize import dump_json
14
15
def _dedup_token(value):
    """
    return the hashable form of `value` used to detect duplicates
    """
    if isstring(value):
        # strings are compared case-insensitively
        return value.lower()
    try:
        hash(value)
    except TypeError:
        # nested JSON containers (lists, dicts) are unhashable and so
        # cannot be stored in a set; compare them by a canonical
        # serialization instead, tagged with the type name so that it
        # can never be equal to a plain string value
        return (type(value).__name__, json.dumps(value, sort_keys=True))
    return value


def deduplicate(data):
    """
    drop items that repeat a value already seen under the same key

    data -- a list of dicts

    Items are considered in order.  An item is dropped if, for any one
    of its keys, its value was already recorded under that key by a
    previously kept item; string values are compared
    case-insensitively.  Values of dropped items are not recorded.

    Returns a new list holding the kept items (the dicts themselves are
    not copied).
    """
    retval = []
    seen = {}  # key -> set of tokens recorded from the kept items
    for item in data:
        # compute each comparison token once; it is needed both for
        # the duplicate check and for recording
        tokens = [(key, _dedup_token(value)) for key, value in item.items()]
        if any(token in seen.get(key, ()) for key, token in tokens):
            continue
        for key, token in tokens:
            seen.setdefault(key, set()).add(token)
        retval.append(item)
    return retval
36
37
def main(args=sys.argv[1:]):
    """
    CLI: read a JSON file, deduplicate its contents and write the result

    args -- command line arguments, without the program name; note that
            the default is captured once, when this module is imported
    """

    # parse command line
    parser = ConfigurationParser(description=__doc__)
    parser.add_argument('data', type=argparse.FileType('r'),
                        help="JSON data file of lists of dicts")
    parser.add_argument('-o', '--output', dest='output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="where to write clean data to, or stdout by default")
    options = parser.parse_args(args)

    try:
        # load
        data = json.load(options.data)

        # deduplicate
        cleaned = deduplicate(data)

        # output
        options.output.write(dump_json(cleaned) + '\n')
    finally:
        # argparse.FileType opened these files during parsing; close
        # them so the input handle is released and the output is
        # flushed to disk.  '-' maps to the standard streams, which
        # are not ours to close.
        if options.data is not sys.stdin:
            options.data.close()
        if options.output is not sys.stdout:
            options.output.close()


if __name__ == '__main__':
    main()