Mercurial > hg > numerics
annotate numerics/read.py @ 107:19a5c2fb52bb
add transpose functionality
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 15 Mar 2015 10:02:48 -0700 |
parents | 32a849b8f1f0 |
children | 5790bcb30bd8 |
rev | line source |
---|---|
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
3 |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
4 """ |
103
067aa27050a3
limping along towards bar charts
Jeff Hammel <k0scist@gmail.com>
parents:
101
diff
changeset
|
5 read CSV |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
6 """ |
103
067aa27050a3
limping along towards bar charts
Jeff Hammel <k0scist@gmail.com>
parents:
101
diff
changeset
|
7 # TODO: support other formats |
067aa27050a3
limping along towards bar charts
Jeff Hammel <k0scist@gmail.com>
parents:
101
diff
changeset
|
8 |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
9 |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
10 # imports |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
11 import argparse |
24 | 12 import csv |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
13 import os |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
14 import sys |
107
19a5c2fb52bb
add transpose functionality
Jeff Hammel <k0scist@gmail.com>
parents:
105
diff
changeset
|
15 from .data import transpose |
80 | 16 from .write import CSVWriter |
43
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
17 |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
18 # module globals |
103
067aa27050a3
limping along towards bar charts
Jeff Hammel <k0scist@gmail.com>
parents:
101
diff
changeset
|
19 __all__ = ['CSVSchema', 'read_csv', 'CSVParser', 'main'] |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
20 string = (str, unicode) |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
21 |
43
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
22 |
20 | 23 class CSVSchema(object): |
24 """read CSV with a schema""" | |
25 | |
26 def __init__(self, columns): | |
27 self.columns = columns | |
28 | |
29 def read(self, f): | |
30 | |
31 if isinstance(f, string): | |
32 with open(f) as fp: | |
33 return self.read(fp) | |
34 | |
35 retval = [] | |
36 reader = csv.reader(f) | |
37 for row in reader: | |
38 retval.append(dict(zip(self.columns, row))) | |
39 return retval | |
40 | |
41 __call__ = read | |
42 | |
43 | |
44 def aggregate_columns(directory, schema): | |
45 | |
46 # check for missing files | |
47 missing = [path for path in schema | |
48 if not os.path.exists(os.path.join(directory, path))] | |
49 assert not missing, "Missing files: {}".format(', '.join(missing)) | |
50 | |
51 # read records | |
52 records = {filename: CSVSchema(columns).read(os.path.join(directory, filename)) | |
53 for filename, columns in schema.items()} | |
54 | |
55 | |
56 # check lengths | |
57 lengths = [len(value) for value in records.values()] | |
58 assert len(set(lengths)) == 1, "Differing lengths found for files" | |
59 | |
60 # build new rows | |
61 retval = [] | |
62 for row in zip(*records.values()): | |
63 new_row = {} | |
64 for record in row: | |
65 for key, value in record.items(): | |
66 if new_row.get(key, value) != value: | |
67 raise AssertionError("{} != {}".format(new_row.get(key), value)) | |
68 new_row[key] = value | |
69 retval.append(new_row) | |
70 | |
71 return retval | |
72 | |
73 | |
11 | 74 def read_csv(*fp): |
19 | 75 """read a series of CSV files""" |
76 | |
11 | 77 retval = [] |
78 for f in fp: | |
79 | |
80 if isinstance(f, string): | |
81 with open(f) as _f: | |
82 retval.extend(read_csv(_f)) | |
83 continue | |
84 | |
85 reader = csv.reader(f) | |
86 retval.extend([row for row in reader]) | |
87 | |
88 return retval | |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
89 |
43
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
90 |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
91 class CSVParser(argparse.ArgumentParser): |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
92 """CLI option parser""" |
28 | 93 |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
94 def __init__(self, **kwargs): |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
95 kwargs.setdefault('description', __doc__) |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
96 argparse.ArgumentParser.__init__(self, **kwargs) |
43
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
97 self.add_argument('csv', nargs='*', |
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
98 help="CSV files to read, or read from stdin") |
25 | 99 self.add_argument('-+', '--add', dest='added_columns', nargs='+', |
100 help="append this column") | |
28 | 101 self.add_argument('-c', '--col', '--columns', dest='columns', |
102 nargs='+', type=int, | |
103 help="column numbers to output, starting with 0") | |
25 | 104 self.add_argument('-o', '--output', dest='output', |
105 type=argparse.FileType('a'), default=sys.stdout, | |
106 help='output destination, or stdout') | |
30
75270e7a051b
add ability to add an index and fix a few bugs
Jeff Hammel <k0scist@gmail.com>
parents:
28
diff
changeset
|
107 self.add_argument('--index', dest='index', |
75270e7a051b
add ability to add an index and fix a few bugs
Jeff Hammel <k0scist@gmail.com>
parents:
28
diff
changeset
|
108 action='store_true', default=False, |
75270e7a051b
add ability to add an index and fix a few bugs
Jeff Hammel <k0scist@gmail.com>
parents:
28
diff
changeset
|
109 help="prepend each row with numeric index") |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
110 self.options = None |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
111 |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
112 def parse_args(self, *args, **kw): |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
113 options = argparse.ArgumentParser.parse_args(self, *args, **kw) |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
114 self.validate(options) |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
115 self.options = options |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
116 return options |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
117 |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
118 def validate(self, options): |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
119 """validate options""" |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
120 |
101
b7d4b7f84883
simple script to compute means
Jeff Hammel <k0scist@gmail.com>
parents:
80
diff
changeset
|
121 if not options.csv: |
b7d4b7f84883
simple script to compute means
Jeff Hammel <k0scist@gmail.com>
parents:
80
diff
changeset
|
122 options.csv = [sys.stdin] |
b7d4b7f84883
simple script to compute means
Jeff Hammel <k0scist@gmail.com>
parents:
80
diff
changeset
|
123 |
45
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
124 def read(self): |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
125 """read and process CSV""" |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
126 |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
127 data = read_csv(*self.options.csv) |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
128 |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
129 if self.options.added_columns: |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
130 # add columns |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
131 for row in data: |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
132 row.extend(options.added_columns) |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
133 |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
134 if self.options.columns: |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
135 # filter by column |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
136 data = [[row[column] for column in self.options.columns] |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
137 for row in data] |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
138 |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
139 if self.options.index: |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
140 # prepend numeric index |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
141 for index, row in enumerate(data): |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
142 row.insert(0, index) |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
143 |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
144 # return processed data |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
145 return data |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
146 |
105
32a849b8f1f0
stubbing; it looks like most of this work is upstreaming
Jeff Hammel <k0scist@gmail.com>
parents:
103
diff
changeset
|
147 def columns(self): |
32a849b8f1f0
stubbing; it looks like most of this work is upstreaming
Jeff Hammel <k0scist@gmail.com>
parents:
103
diff
changeset
|
148 """return columns vs `data`'s rows""" |
107
19a5c2fb52bb
add transpose functionality
Jeff Hammel <k0scist@gmail.com>
parents:
105
diff
changeset
|
149 return transpose(self.read()) |
105
32a849b8f1f0
stubbing; it looks like most of this work is upstreaming
Jeff Hammel <k0scist@gmail.com>
parents:
103
diff
changeset
|
150 |
45
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
151 |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
152 def main(args=sys.argv[1:]): |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
153 """CLI""" |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
154 |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
155 # parse command line options |
24 | 156 parser = CSVParser() |
107
19a5c2fb52bb
add transpose functionality
Jeff Hammel <k0scist@gmail.com>
parents:
105
diff
changeset
|
157 parser.add_argument('--transpose', dest='transpose', |
19a5c2fb52bb
add transpose functionality
Jeff Hammel <k0scist@gmail.com>
parents:
105
diff
changeset
|
158 action='store_true', default=False, |
19a5c2fb52bb
add transpose functionality
Jeff Hammel <k0scist@gmail.com>
parents:
105
diff
changeset
|
159 help="transpose columns and rows") |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
160 options = parser.parse_args(args) |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
161 |
43
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
162 if not options.csv: |
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
163 # read from stdin |
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
164 options.csv = [sys.stdin] |
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
165 |
11 | 166 # read CSV |
107
19a5c2fb52bb
add transpose functionality
Jeff Hammel <k0scist@gmail.com>
parents:
105
diff
changeset
|
167 if options.transpose: |
19a5c2fb52bb
add transpose functionality
Jeff Hammel <k0scist@gmail.com>
parents:
105
diff
changeset
|
168 data = parser.columns() |
19a5c2fb52bb
add transpose functionality
Jeff Hammel <k0scist@gmail.com>
parents:
105
diff
changeset
|
169 else: |
19a5c2fb52bb
add transpose functionality
Jeff Hammel <k0scist@gmail.com>
parents:
105
diff
changeset
|
170 data = parser.read() |
28 | 171 |
25 | 172 # write CSV |
80 | 173 writer = CSVWriter(options.output) |
174 writer.write(data) | |
175 | |
11 | 176 |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
177 if __name__ == '__main__': |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
178 main() |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
179 |