diff lemuriformes/issubset.py @ 15:0d1b8bb1d97b

SQL + data related functionality
author Jeff Hammel <k0scist@gmail.com>
date Sun, 10 Dec 2017 17:16:52 -0800
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lemuriformes/issubset.py	Sun Dec 10 17:16:52 2017 -0800
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+determine if one CSV column is a subset of another.
+If it is a subset, output nothing and exit 0.
+If it is not a subset, output all elements that are
+in the asserted subset but not the total set
+and exit 1.
+Exits 2 on error.
+"""
+
+import os
+import sys
+from collections import OrderedDict
+from .cli import ConfigurationParser
+from .uniques import uniques
+
+def main(args=sys.argv[1:]):
+    """Command-line entry point for the column subset check.
+
+    Prints the values of the asserted subset that are missing from the total
+    set and exits 1 if there are any; otherwise returns normally.
+    """
+    # positional arguments, in order; each one takes a PATH and a COLUMN
+    sets = OrderedDict()
+    sets['subset'] = "the `PATH` to the CSV and `COLUMN` of the asserted subset"
+    sets['total'] = "the `PATH` to the CSV and `COLUMN` of the total set"
+
+    parser = ConfigurationParser(description=__doc__)
+    for name, helptext in sets.items():
+        parser.add_argument(name, nargs=2, help=helptext)
+    options = parser.parse_args(args)
+
+    # store each argument back as a [resolved path, column] list and
+    # collect the distinct files that actually exist
+    filenames = set()
+    for name in sets:
+        pair = list(getattr(options, name))
+        pair[0] = os.path.realpath(pair[0])
+        setattr(options, name, pair)
+        if not os.path.isfile(pair[0]):
+            parser.error("Not a file: {}".format(pair[0]))
+        filenames.add(pair[0])
+
+    # call `uniques` once per distinct file, keyed by its resolved path
+    columns = {}
+    for path in filenames:
+        columns[path] = uniques(path)
+
+    # every requested column must be present in the result for its file
+    for name in sets:
+        path, column = getattr(options, name)
+        if column not in columns[path]:
+            parser.error("Column '{}' not found in file '{}'".format(column, path))
+
+    # values claimed by the asserted subset that the total set lacks
+    subset_path, subset_column = options.subset
+    total_path, total_column = options.total
+    missing = columns[subset_path][subset_column].difference(columns[total_path][total_column])
+    if missing:
+        print("\n".join(str(value) for value in sorted(missing)))
+        sys.exit(1)
+
+
+if __name__ == '__main__':  # only run the CLI when executed as a script
+    main()