Mercurial > hg > numerics
annotate numerics/read.py @ 69:5dceb1d05a29
fix syntax error
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sat, 28 Feb 2015 16:59:08 -0800 |
parents | ef915968d104 |
children | 8bfa28ff74ce |
rev | line source |
---|---|
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
3 |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
4 """ |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
5 read CSV, etc |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
6 """ |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
7 |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
8 # imports |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
9 import argparse |
24 | 10 import csv |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
11 import os |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
12 import sys |
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
13 |
43
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
14 |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# module globals
# names exported by a star-import of this module
__all__ = ['main', 'CSVParser']
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# types treated as file paths (as opposed to already-open file objects)
try:
    # Python 2: both byte strings and unicode strings may name a file
    string = (str, unicode)
except NameError:
    # Python 3: `unicode` no longer exists; `str` is the only text type
    string = (str,)
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
18 |
43
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
19 |
class CSVSchema(object):
    """read CSV rows into dicts keyed by a fixed list of column names"""

    def __init__(self, columns):
        # column names, paired positionally with the cells of each row
        self.columns = columns

    def read(self, f):
        """
        return a list with one dict per CSV row of `f`

        `f` is either a path (an instance of the module-level `string`
        types) or an object that `csv.reader` can iterate over.
        Cells are paired with `self.columns` via `zip`, so pairing stops
        at the shorter of the two.
        """

        if isinstance(f, string):
            # a path was given: open it and parse the resulting file object
            with open(f) as handle:
                return self.read(handle)

        return [dict(zip(self.columns, cells)) for cells in csv.reader(f)]

    # calling the instance is the same as calling `read`
    __call__ = read
39 | |
40 | |
def aggregate_columns(directory, schema):
    """
    merge row-aligned CSV files from `directory` into one list of rows

    directory -- directory containing the CSV files
    schema -- mapping of file name to the list of column names for that file

    Returns a list of dicts, one per row position, combining the columns
    of every file.  An empty schema yields an empty list.

    Raises AssertionError if a file named in `schema` is missing, if the
    files do not all contain the same number of rows, or if a column that
    appears in several files holds different values for the same row.
    """

    # check for missing files
    # (explicit raise rather than `assert` so the check survives `python -O`)
    missing = [path for path in schema
               if not os.path.exists(os.path.join(directory, path))]
    if missing:
        raise AssertionError("Missing files: {}".format(', '.join(missing)))

    # read records
    records = {filename: CSVSchema(columns).read(os.path.join(directory, filename))
               for filename, columns in schema.items()}

    # check lengths; zero files trivially agree, so only >1 distinct
    # length is an error
    lengths = set(len(rows) for rows in records.values())
    if len(lengths) > 1:
        raise AssertionError("Differing lengths found for files")

    # build new rows
    retval = []
    for row in zip(*records.values()):
        new_row = {}
        for record in row:
            for key, value in record.items():
                # a key seen in an earlier file must carry the same value
                if new_row.get(key, value) != value:
                    raise AssertionError("{} != {}".format(new_row.get(key), value))
                new_row[key] = value
        retval.append(new_row)

    return retval
69 | |
70 | |
def read_csv(*fp):
    """
    read a series of CSV files

    Each item of `fp` is either a path (an instance of the module-level
    `string` types) or an object `csv.reader` can iterate over.
    Returns the rows of all inputs, in order, as a single list.
    """

    rows = []
    for source in fp:

        if not isinstance(source, string):
            # already an open file (or other iterable of lines)
            rows.extend(csv.reader(source))
            continue

        # a path: open it and parse the resulting file object
        with open(source) as handle:
            rows.extend(read_csv(handle))

    return rows
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
86 |
43
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
87 |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
class CSVParser(argparse.ArgumentParser):
    """CLI option parser"""

    def __init__(self, **kwargs):
        """set up the parser's arguments; `kwargs` go to ArgumentParser"""
        kwargs.setdefault('description', __doc__)
        argparse.ArgumentParser.__init__(self, **kwargs)
        self.add_argument('csv', nargs='*',
                          help="CSV files to read, or read from stdin")
        self.add_argument('-+', '--add', dest='added_columns', nargs='+',
                          help="append this column")
        self.add_argument('-c', '--col', '--columns', dest='columns',
                          nargs='+', type=int,
                          help="column numbers to output, starting with 0")
        self.add_argument('-o', '--output', dest='output',
                          type=argparse.FileType('a'), default=sys.stdout,
                          help='output destination, or stdout')
        self.add_argument('--index', dest='index',
                          action='store_true', default=False,
                          help="prepend each row with numeric index")

        # populated by `parse_args`; `read` relies on it
        self.options = None

    def parse_args(self, *args, **kw):
        """parse, validate, and remember the options; returns the namespace"""
        options = argparse.ArgumentParser.parse_args(self, *args, **kw)
        self.validate(options)
        self.options = options
        return options

    def validate(self, options):
        """validate options"""
        # currently a no-op hook called by `parse_args`

    def read(self):
        """
        read and process CSV

        Reads every source in `self.options.csv` and applies the
        processing options.  `parse_args` must have been called first
        so that `self.options` is set.
        """

        data = read_csv(*self.options.csv)

        # return processed data
        return self._process(data)

    def _process(self, data):
        """apply --add, --columns and --index to `data` (a list of row lists)"""

        options = self.options

        if options.added_columns:
            # add columns (rows are extended in place)
            for row in data:
                row.extend(options.added_columns)

        if options.columns:
            # filter by column; runs after --add so added columns are selectable
            data = [[row[column] for column in options.columns]
                    for row in data]

        if options.index:
            # prepend numeric index
            for index, row in enumerate(data):
                row.insert(0, index)

        return data
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
140 |
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
141 |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def main(args=sys.argv[1:]):
    """
    CLI

    Parses `args`, reads the named CSV files (or stdin when none are
    given), applies the processing options and writes the resulting
    rows as CSV to the chosen output.
    """

    # parse command line options
    parser = CSVParser()
    options = parser.parse_args(args)

    if not options.csv:
        # read from stdin
        options.csv = [sys.stdin]

    try:
        # read CSV
        data = parser.read()

        # write CSV
        writer = csv.writer(options.output)
        writer.writerows(data)
    finally:
        # the parser opened the output file for us; close it so the data
        # is flushed, but leave the process-wide stdout alone
        if options.output is not sys.stdout:
            options.output.close()
11 | 160 |
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# run the CLI only when executed as a script, not when imported
if __name__ == '__main__':
    main()
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
163 |