view lemuriformes/issubset.py @ 15:0d1b8bb1d97b

SQL + data related functionality
author Jeff Hammel <k0scist@gmail.com>
date Sun, 10 Dec 2017 17:16:52 -0800
parents
children
line wrap: on
line source

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
determine if one CSV column is a subset of another.
If it is a subset, output nothing and exit 0.
If it is not a subset, output all elements that are
in the asserted subset but not in the total set
and exit 1.
Exit 2 on error.
"""

import os
import sys
from collections import OrderedDict
from .cli import ConfigurationParser
from .uniques import uniques

def main(args=sys.argv[1:]):
    """
    Command-line entry point.

    Takes two positional arguments, `subset` and `total`, each made of
    a CSV `PATH` followed by a `COLUMN` name.  Every value found in the
    subset column but absent from the total column is printed, one per
    line in sorted order, and the process exits with status 1.  When
    nothing is missing the function simply returns `None`.

    A path that is not a regular file, or a column that is missing from
    its file, is reported through `parser.error`.

    :param args: command line arguments to parse
    """

    # positional arguments, in declaration order, mapped to their help text
    sets = OrderedDict()
    sets['subset'] = "the `PATH` to the CSV and `COLUMN` of the asserted subset"
    sets['total'] = "the `PATH` to the CSV and `COLUMN` of the total set"

    # build the parser and read the command line
    parser = ConfigurationParser(description=__doc__)
    for name in sets:
        parser.add_argument(name, nargs=2, help=sets[name])
    options = parser.parse_args(args)

    # normalize each (PATH, COLUMN) pair and collect the distinct files
    filenames = set()
    for name in sets:
        pair = list(getattr(options, name))
        # resolve the path so one file given under two spellings is read once
        pair[0] = os.path.realpath(pair[0])
        setattr(options, name, pair)
        path = pair[0]
        if not os.path.isfile(path):
            parser.error("Not a file: {}".format(path))
        filenames.add(path)

    # load every distinct file exactly once; the result of `uniques` is
    # used below as a per-column lookup
    columns = {}
    for path in filenames:
        columns[path] = uniques(path)

    # each requested column must be present in the file it was paired with
    for name in sets:
        path, column = getattr(options, name)
        if column in columns[path]:
            continue
        parser.error("Column '{}' not found in file '{}'".format(column, path))

    # values of the asserted subset that the total set lacks
    subset_path, subset_column = options.subset
    total_path, total_column = options.total
    candidate_values = columns[subset_path][subset_column]
    reference_values = columns[total_path][total_column]
    difference = candidate_values.difference(reference_values)

    if difference:
        # not a subset: list the offending values and signal failure
        print("\n".join(str(item) for item in sorted(difference)))
        sys.exit(1)


# run the command line interface when this module is executed as a
# script; importing the module does not invoke `main`
if __name__ == '__main__':
    main()