annotate lemuriformes/deduplicate.py @ 14:756dbd3e391e

JSON deduplication module
author Jeff Hammel <k0scist@gmail.com>
date Sun, 10 Dec 2017 17:10:25 -0800
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
14
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3 """
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4 deduplicate JSON data
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 """
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 import argparse
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
9 import json
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
10 import sys
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
11 from .cast import isstring
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
12 from .cli import ConfigurationParser
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
13 from .serialize import dump_json
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
14
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
15
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def deduplicate(data):
    """
    Drop records that repeat a value already seen under the same key.

    data -- a list of dicts

    Records are visited in order.  A record is kept only when none of
    its values has already been recorded, under the same key, by a
    previously kept record; values for which `isstring` is true are
    compared lowercased.  Rejected records contribute nothing to the
    recorded values.  Values are stored in sets, so they must be
    hashable.

    Returns a new list holding the kept records (the same dict objects,
    in their original order).
    """
    seen = {}  # key -> set of normalized values taken from kept records
    unique = []
    for record in data:
        # normalize once; reused for the duplicate check and the bookkeeping
        normalized = [(field, entry.lower() if isstring(entry) else entry)
                      for field, entry in record.items()]

        # a single clash on any key is enough to reject the whole record
        if any(entry in seen.get(field, set())
               for field, entry in normalized):
            continue

        for field, entry in normalized:
            seen.setdefault(field, set()).add(entry)
        unique.append(record)
    return unique
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
36
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
37
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def main(args=sys.argv[1:]):
    """
    CLI: read a JSON file, deduplicate it, and write the result.

    args -- command line arguments (without the program name);
            defaults to the process arguments captured at import time

    The input is parsed with `json.load`, passed through `deduplicate`,
    serialized with `dump_json` and written, followed by a newline, to
    the `-o/--output` destination (stdout by default).
    """

    # parse command line
    # NOTE(review): ConfigurationParser is used here through the
    # argparse.ArgumentParser API (add_argument / parse_args) -- confirm
    parser = ConfigurationParser(description=__doc__)
    parser.add_argument('data', type=argparse.FileType('r'),
                        help="JSON data file of lists of dicts")
    parser.add_argument('-o', '--output', dest='output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="where to write clean data to, or stdout by default")
    options = parser.parse_args(args)

    try:
        # load
        data = json.load(options.data)

        # deduplicate
        cleaned = deduplicate(data)

        # output
        options.output.write(dump_json(cleaned) + '\n')
    finally:
        # argparse.FileType opens the files but never closes them; close
        # them here so the output is flushed even when main() is called
        # from other code.  The standard streams ('-' or the default) are
        # shared with the rest of the process and are left open.
        for handle, standard in ((options.data, sys.stdin),
                                 (options.output, sys.stdout)):
            if handle is not standard:
                handle.close()
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
58
756dbd3e391e JSON deduplication module
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# run the command line interface only when executed as a script
if __name__ == '__main__':
    main()