changeset 14:756dbd3e391e

JSON deduplication module
author Jeff Hammel <k0scist@gmail.com>
date Sun, 10 Dec 2017 17:10:25 -0800
parents 2227ff372388
children 0d1b8bb1d97b
files lemuriformes/deduplicate.py
diffstat 1 files changed, 60 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lemuriformes/deduplicate.py	Sun Dec 10 17:10:25 2017 -0800
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+"""
+deduplicate JSON data
+"""
+
+
+import argparse
+import json
+import sys
+from .cast import isstring
+from .cli import ConfigurationParser
+from .serialize import dump_json
+
+
+def deduplicate(data):
+    """
+    drop items that repeat a value already seen under the same key
+
+    data -- a list of dicts
+
+    Items are examined in order.  An item is kept only when none of its
+    values has already been recorded under the same key by a previously
+    kept item; values of dropped items are not recorded.  Values for which
+    `isstring` is true are lowercased before comparison.
+
+    Unhashable values (such as the lists and dicts found in decoded JSON)
+    are compared by their key-sorted JSON serialization instead of raising
+    `TypeError` on the set lookup.  An unhashable value that cannot be
+    serialized by `json.dumps` still raises.
+
+    Returns a new list holding the kept items themselves, in input order.
+    """
+
+    # per-call tag so a serialized value can never equal an ordinary value
+    unhashable = object()
+
+    def comparable(value):
+        """return the hashable form of `value` used to detect repeats"""
+        if isstring(value):
+            return value.lower()
+        try:
+            hash(value)
+        except TypeError:
+            return (unhashable, json.dumps(value, sort_keys=True))
+        return value
+
+    retval = []
+    seen = {}  # key -> set of comparable values taken from kept items
+    for item in data:
+        pairs = [(key, comparable(value)) for key, value in item.items()]
+        if any(mark in seen.get(key, ()) for key, mark in pairs):
+            # at least one value repeats an earlier kept item: drop this one
+            continue
+        for key, mark in pairs:
+            seen.setdefault(key, set()).add(mark)
+        retval.append(item)
+    return retval
+
+
+def main(args=sys.argv[1:]):
+    """command line entry point: load JSON, deduplicate it, write the result"""
+
+    # describe and parse the command line
+    cli = ConfigurationParser(description=__doc__)
+    cli.add_argument(
+        'data',
+        type=argparse.FileType('r'),
+        help="JSON data file of lists of dicts"
+    )
+    cli.add_argument(
+        '-o', '--output',
+        dest='output',
+        type=argparse.FileType('w'),
+        default=sys.stdout,
+        help="where to write clean data to, or stdout by default"
+    )
+    opts = cli.parse_args(args)
+
+    # load the input and deduplicate it
+    unique = deduplicate(json.load(opts.data))
+
+    # write the serialized result followed by a trailing newline
+    opts.output.write(dump_json(unique) + '\n')
+
+# run the command line interface when this module is executed as a script
+if __name__ == '__main__':
+    main()