diff lemuriformes/uniques.py @ 17:4793f99b73e0

[lemuriformes] utility functions
author Jeff Hammel <k0scist@gmail.com>
date Sun, 10 Dec 2017 17:42:52 -0800
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lemuriformes/uniques.py	Sun Dec 10 17:42:52 2017 -0800
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+count uniques in each CSV file column
+"""
+
+import argparse
+import csv
+import sys
+from collections import OrderedDict
+from .columns import read_columns
+
+
+def uniques(fp):
+    """get unique counts for columns in CSV file `fp`"""
+
+    # read columns
+    columns = read_columns(fp, type=OrderedDict)
+
+    # convert to sets
+    for key, value in columns.iteritems():
+        columns[key] = set(value)
+
+    return columns
+
+
+def main(args=sys.argv[1:]):
+    """CLI"""
+
+    # parse command line
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('input',
+                        type=argparse.FileType('r'),
+                        help="input CSV file with headers")
+    options = parser.parse_args(args)
+
+    # determine sets
+    columns = uniques(options.input)
+
+    # output uniques
+    writer = csv.writer(sys.stdout)
+    for key, value in columns.iteritems():
+        writer.writerow([key, len(value)])
+
+
+if __name__ == '__main__':
+    main()  # run the CLI when this module is executed as a script