annotate numerics/table.py @ 179:f63194f81f7d

stubbing table splitting
author Jeff Hammel <k0scist@gmail.com>
date Tue, 09 Aug 2016 14:03:33 -0700
parents
children 69543d62ae7a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
179
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2 # -*- coding: utf-8 -*-
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4 """
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 parse header-based CSV
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6 """
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 # imports
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
9 import argparse
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
10 import csv
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
11 import json
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
12 import os
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
13 import sys
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
14 import time
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
15
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# text types: a value of one of these types is treated as a path
# rather than as an open file object
try:
    string = (str, unicode)
except NameError:
    # Python 3 has no separate `unicode` type; `str` is the only text type
    string = (str,)
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
17
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
18
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def duplicates(*values):
    """
    return all duplicates in `values`

    values -- any number of hashable items

    Returns a list holding each item that occurs more than once,
    listed a single time, in order of first appearance in `values`.
    """

    # count every value in one pass instead of calling
    # `values.count()` once per distinct value
    counts = {}
    for value in values:
        counts[value] = counts.get(value, 0) + 1

    retval = []
    reported = set()  # values already placed in `retval`
    for value in values:
        if counts[value] > 1 and value not in reported:
            reported.add(value)
            retval.append(value)
    return retval
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
29
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
30
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def read_table(fp, verbose=False):
    """
    read table with header and return list of dictionaries

    fp -- path to a CSV file, or an object `csv.reader` can iterate
          (e.g. an open file)
    verbose -- if true, print the column count and the header

    Returns one dictionary per data row, keyed by the (whitespace-stripped)
    header fields; every cell value is whitespace-stripped as well.

    Raises AssertionError if the input has no rows, if the rows do not all
    have the same number of columns, or if the header repeats a field name.
    """

    if isinstance(fp, string):
        # given a path: open it and parse the resulting file object
        with open(fp, 'r') as _fp:
            return read_table(_fp, verbose)

    # read CSV
    data = [row for row in csv.reader(fp)]

    # check data
    if not data:
        # without this, the column check below would report an empty
        # list of column counts, which reads as a malformed message
        raise AssertionError("Expected: a header row, instead got: no rows")
    columns = set(len(row) for row in data)
    if len(columns) != 1:
        raise AssertionError("Expected: a constant number of columns, instead got: {}".format(', '.join([str(column)
                                                                                                          for column in sorted(columns)])))
    columns = columns.pop()
    if verbose:
        # single-argument parenthesised form prints identically
        # on Python 2 and Python 3
        print("{} columns".format(columns))
    data = [[item.strip() for item in row]
            for row in data]

    # xform to JSON-format structure
    header = data.pop(0)
    if verbose:
        print("Header:\n{header}".format(header=json.dumps(header, indent=1)))
    duplicate_fields = duplicates(*header)
    if duplicate_fields:
        raise AssertionError("Duplicate header fields found: {duplicates}".format(duplicates=', '.join(duplicate_fields)))
    return [dict(zip(header, row))
            for row in data]
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
61
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
62
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
class TableParser(argparse.ArgumentParser):
    """CLI option parser"""

    def __init__(self, **kwargs):
        """
        set up the parser; `kwargs` are passed to `argparse.ArgumentParser`,
        with the formatter class and description defaulted if not given
        """
        kwargs.setdefault('formatter_class', argparse.RawTextHelpFormatter)
        kwargs.setdefault('description', __doc__)
        argparse.ArgumentParser.__init__(self, **kwargs)
        self.add_arguments()
        # set by `parse_args`; `read_table` requires it
        self.options = None

    def add_arguments(self):
        """register the command line arguments"""
        self.add_argument('input', type=argparse.FileType('r'),
                          help="input CSV file")
        self.add_argument('-o', '--output', dest='output',
                          type=argparse.FileType('w'), default=sys.stdout,
                          help="output file to write to, or stdout by default")
        self.add_argument('-c', '--column', dest='columns', nargs='+',
                          help="column names to output")
        self.add_argument('--format', dest='format',
                          choices=('json', 'csv'), default='json',
                          help="output in this format")
        self.add_argument('-v', '--verbose', dest='verbose',
                          action='store_true', default=False,
                          help="be verbose")

    def parse_args(self, *args, **kw):
        """parse and validate the arguments; store and return the options"""
        options = argparse.ArgumentParser.parse_args(self, *args, **kw)
        self.validate(options)
        self.options = options
        return options

    def validate(self, options):
        """validate options"""

    def read_table(self):
        """
        read the input table named by the parsed options

        Returns a list of dictionaries, one per data row.  If columns were
        requested with `-c/--column`, each row holds only those columns.

        Raises AssertionError if `parse_args` has not been called yet.
        Requested columns absent from the table are reported via `self.error`.
        """

        # explicit raise rather than `assert`, which is stripped under -O
        if not self.options:
            raise AssertionError("parse_args() must be called before read_table()")

        # inside a method body this name resolves to the
        # module-level `read_table` function, not to this method
        data = read_table(self.options.input,
                          verbose=self.options.verbose)

        columns = self.options.columns
        # the rows are the only record of the header, so with no data
        # rows there is nothing to validate against or to filter
        if columns and data:
            header = data[0]
            missing = [column
                       for column in columns
                       if column not in header]
            if missing:
                self.error("Columns not found in header: {0}".format(", ".join(missing)))
            data = [dict((column, row[column]) for column in columns)
                    for row in data]
        return data
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
115
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def main(args=None):
    """
    CLI

    args -- list of command line arguments; defaults to `sys.argv[1:]`,
            looked up when `main` is called rather than at import time
    """

    if args is None:
        args = sys.argv[1:]

    # parse command line options
    parser = TableParser()
    options = parser.parse_args(args)

    # read table
    data = parser.read_table()
    if options.input is not sys.stdin:
        # the input file was opened by argparse; release it once read
        options.input.close()

    # output in the requested format
    if options.verbose:
        print("Output {format}:".format(format=options.format))
    try:
        if options.format == 'json':
            options.output.write(json.dumps(data, indent=1))
        elif options.format == 'csv':
            # column order: the requested columns if any, otherwise the
            # keys of the first row (the rows are plain dictionaries)
            if options.columns:
                header = options.columns
            elif data:
                header = list(data[0])
            else:
                header = []
            writer = csv.writer(options.output)
            # NOTE(review): only data rows are written, no header row,
            # as in the original loop -- confirm this is intended
            for row in data:
                writer.writerow([row[column] for column in header])
    finally:
        if options.output is not sys.stdout:
            options.output.close()


if __name__ == '__main__':
    main()