Mercurial > hg > Lemuriformes
annotate lemuriformes/uniques.py @ 17:4793f99b73e0
[lemuriformes] utility functions
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 10 Dec 2017 17:42:52 -0800 |
parents | |
children |
rev | line source |
---|---|
17
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
3 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
4 """ |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
5 count uniques in each CSV file column |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
6 """ |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
7 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
8 import argparse |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
9 import csv |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
10 import sys |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
11 from collections import OrderedDict |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
12 from .columns import read_columns |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
13 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
14 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
15 def uniques(fp): |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
16 """get unique counts for columns in CSV file `fp`""" |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
17 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
18 # read columns |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
19 columns = read_columns(fp, type=OrderedDict) |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
20 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
21 # convert to sets |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
22 for key, value in columns.iteritems(): |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
23 columns[key] = set(value) |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
24 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
25 return columns |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
26 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
27 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
28 def main(args=sys.argv[1:]): |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
29 """CLI""" |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
30 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
31 # parse command line |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
32 parser = argparse.ArgumentParser(description=__doc__) |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
33 parser.add_argument('input', |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
34 type=argparse.FileType('r'), |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
35 help="input CSV file with headers") |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
36 options = parser.parse_args(args) |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
37 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
38 # determine sets |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
39 columns = uniques(options.input) |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
40 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
41 # output uniques |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
42 writer = csv.writer(sys.stdout) |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
43 for key, value in columns.iteritems(): |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
44 writer.writerow([key, len(value)]) |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
45 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
46 |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
47 if __name__ == '__main__': |
4793f99b73e0
[lemuriformes] utility functions
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
48 main() |