annotate numerics/table.py @ 180:69543d62ae7a

more split stubbing
author Jeff Hammel <k0scist@gmail.com>
date Tue, 09 Aug 2016 14:34:31 -0700
parents f63194f81f7d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
179
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2 # -*- coding: utf-8 -*-
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4 """
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 parse header-based CSV
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6 """
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 # imports
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
9 import argparse
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
10 import csv
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
11 import json
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
12 import os
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
13 import sys
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
14 import time
180
69543d62ae7a more split stubbing
Jeff Hammel <k0scist@gmail.com>
parents: 179
diff changeset
15 from collections import OrderedDict
179
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
16
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# types treated as "a path" rather than an open file by `read_table`;
# `unicode` only exists on Python 2, so fall back to `str` alone elsewhere
try:
    string = (str, unicode)
except NameError:
    string = (str,)
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
18
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
19
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def duplicates(*values):
    """
    return the values that occur more than once in `values`,
    each listed a single time, in order of first appearance
    """

    # tally how often each value occurs
    occurrences = {}
    for item in values:
        occurrences[item] = occurrences.get(item, 0) + 1

    # keep repeated values, reporting each only once
    repeated = []
    reported = set()
    for item in values:
        if occurrences[item] < 2 or item in reported:
            continue
        reported.add(item)
        repeated.append(item)
    return repeated
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
30
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
31
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def read_table(fp, verbose=False):
    """
    read table with header and return list of dictionaries

    `fp` -- an open file-like object yielding CSV lines,
            or a path (string) to a CSV file to open
    `verbose` -- if true, print the column count and the header to stdout

    Returns a list with one `OrderedDict` per data row, keyed by the
    header fields in column order; all items are whitespace-stripped.
    Returns an empty list if the input contains no rows at all.

    Raises `AssertionError` if the rows do not all have the same number
    of columns or if the header contains duplicate field names.
    """

    if isinstance(fp, string):
        # given a path: open it and parse the resulting file object
        with open(fp, 'r') as _fp:
            return read_table(_fp, verbose)

    # read CSV
    data = list(csv.reader(fp))
    if not data:
        # completely empty input: there is no header and no rows;
        # without this the column check below would complain about
        # a non-constant number of columns with an empty list of counts
        return []

    # check data
    columns = set(len(row) for row in data)
    if len(columns) != 1:
        counts = ', '.join(str(column) for column in sorted(columns))
        raise AssertionError("Expected: a constant number of columns, instead got: {}".format(counts))
    columns = columns.pop()
    if verbose:
        print("{} columns".format(columns))
    data = [[item.strip() for item in row]
            for row in data]

    # xform to JSON-format structure
    header = data.pop(0)
    if verbose:
        print("Header:\n{header}".format(header=json.dumps(header, indent=1)))
    duplicate_fields = duplicates(*header)
    if duplicate_fields:
        raise AssertionError("Duplicate header fields found: {duplicates}".format(duplicates=', '.join(duplicate_fields)))
    return [OrderedDict(zip(header, row))
            for row in data]
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
62
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
63
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
class TableParser(argparse.ArgumentParser):
    """
    CLI option parser

    After `parse_args` has been called the parsed options are kept on
    `self.options`, which `read_table` uses to load (and optionally
    restrict the columns of) the input CSV.
    """

    def __init__(self, **kwargs):
        kwargs.setdefault('formatter_class', argparse.RawTextHelpFormatter)
        kwargs.setdefault('description', __doc__)
        argparse.ArgumentParser.__init__(self, **kwargs)
        self.add_arguments()
        self.options = None  # set by `parse_args`

    def add_arguments(self):
        """register the command line arguments"""
        self.add_argument('input', type=argparse.FileType('r'),
                          help="input CSV file")
        self.add_argument('-o', '--output', dest='output',
                          type=argparse.FileType('w'), default=sys.stdout,
                          help="output file to write to, or stdout by default")
        self.add_argument('-c', '--column', dest='columns', nargs='+',
                          help="column names to output")
        self.add_argument('--format', dest='format',
                          choices=('json', 'csv'), default='json',
                          help="output in this format")
        self.add_argument('-v', '--verbose', dest='verbose',
                          action='store_true', default=False,
                          help="be verbose")

    def parse_args(self, *args, **kw):
        """parse and validate the arguments, remember and return the options"""
        options = argparse.ArgumentParser.parse_args(self, *args, **kw)
        self.validate(options)
        self.options = options
        return options

    def validate(self, options):
        """validate options"""

    def read_table(self):
        """
        read the table from the parsed `input` option

        Returns a list of `OrderedDict` rows; if `--column` was given,
        each row holds only those columns, in the order requested.
        Exits via `self.error` if no data rows are found or if a
        requested column is not in the header.
        `parse_args` must have been called first.
        """

        assert self.options is not None, "parse_args() must be called before read_table()"

        data = read_table(self.options.input,
                          verbose=self.options.verbose)
        if not data:
            # report the file by name where available rather than by object repr
            source = getattr(self.options.input, 'name', self.options.input)
            self.error("No data found: {}".format(source))
        if self.options.columns:
            # every row shares the header's keys, so the first row suffices
            available = data[0].keys()
            missing = [column
                       for column in self.options.columns
                       if column not in available]
            if missing:
                self.error("Columns not found in header: {0}".format(", ".join(missing)))
            header = self.options.columns
            data = [OrderedDict((column, row[column]) for column in header)
                    for row in data]
        return data
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
118
f63194f81f7d stubbing table splitting
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def main(args=sys.argv[1:]):
    """
    CLI entry point: read the input CSV and write it out as JSON or CSV

    `args` -- command line arguments; defaults to `sys.argv[1:]`
    """

    # parse command line options
    parser = TableParser()
    options = parser.parse_args(args)

    # read table
    data = parser.read_table()

    # output in the requested format
    if options.verbose:
        print("Output {format}:".format(format=options.format))
    if options.format == 'json':
        options.output.write(json.dumps(data, indent=1))
    elif options.format == 'csv':
        # column order comes from the first row's keys
        # (rows are ordered dictionaries sharing the same keys);
        # only data rows are written, no header row
        header = list(data[0].keys()) if data else []
        writer = csv.writer(options.output)
        for row in data:
            writer.writerow([row[column] for column in header])


if __name__ == '__main__':
    main()