Mercurial > hg > Lemuriformes
diff lemuriformes/issubset.py @ 15:0d1b8bb1d97b
SQL + data related functionality
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 10 Dec 2017 17:16:52 -0800 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
determine if one CSV column is a subset of another.
If it is a subset, output nothing and exit 0.
If it is not a subset, output all elements that are
in the asserted subset but not the total set
and exits 1.
Exits 2 on error.
"""

import os
import sys
from collections import OrderedDict
from .cli import ConfigurationParser
from .uniques import uniques


def main(args=sys.argv[1:]):
    """Command-line entry point for the CSV column subset check.

    Args:
        args: command-line arguments; expected to hold two
            ``PATH COLUMN`` pairs, first the asserted subset and then
            the total set.

    Returns:
        None when every value of the subset column is also present in
        the total column.

    Exits:
        Calls ``sys.exit(1)`` after printing the sorted values that are
        in the subset column but not in the total column, one per line.
        Invalid input (a path that is not a file, or a column missing
        from its file) is reported through ``parser.error``.
    """

    # positional arguments, in command-line order, mapped to their help text
    sets = OrderedDict([('subset', "the `PATH` to the CSV and `COLUMN` of the asserted subset"),
                        ('total', "the `PATH` to the CSV and `COLUMN` of the total set")])

    # parse command line
    # NOTE: the module docstring doubles as the user-visible description
    parser = ConfigurationParser(description=__doc__)
    for key, description in sets.items():
        parser.add_argument(key, nargs=2, help=description)
    options = parser.parse_args(args)

    # sanity
    filenames = set()
    for key in sets:
        # - ensure the values are lists, so the path can be replaced in place
        value = list(getattr(options, key))
        # - make files absolute, so the same file given twice is read only once
        value[0] = os.path.realpath(value[0])
        setattr(options, key, value)
        # - ensure files exist
        filename = value[0]
        if not os.path.isfile(filename):
            parser.error("Not a file: {}".format(filename))
        filenames.add(filename)

    # read the files
    # NOTE(review): from its use below, `uniques` appears to return a mapping
    # of column name -> set of values for that column; confirm in .uniques
    columns = {filename: uniques(filename)
               for filename in filenames}

    # assert that the columns are in the files they have been ascribed to
    for key in sets:
        filename, column = getattr(options, key)
        if column not in columns[filename]:
            parser.error("Column '{}' not found in file '{}'".format(column, filename))

    # calculate the difference
    subset_file, subset_column = options.subset
    total_file, total_column = options.total
    difference = columns[subset_file][subset_column].difference(
        columns[total_file][total_column])
    if not difference:
        # a genuine subset: output nothing; falling off the end exits 0
        return
    print("\n".join(str(i) for i in sorted(difference)))
    sys.exit(1)


if __name__ == '__main__':
    main()