annotate numerics/read.py @ 80:8bfa28ff74ce

use that thing we made
author Jeff Hammel <k0scist@gmail.com>
date Sun, 01 Mar 2015 09:29:38 -0800
parents ef915968d104
children b7d4b7f84883
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2 # -*- coding: utf-8 -*-
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4 """
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 read CSV, etc
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6 """
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 # imports
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
9 import argparse
24
f42808450199 bugfixes
Jeff Hammel <k0scist@gmail.com>
parents: 20
diff changeset
10 import csv
5
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
11 import os
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
12 import sys
80
8bfa28ff74ce use that thing we made
Jeff Hammel <k0scist@gmail.com>
parents: 45
diff changeset
13 from .write import CSVWriter
43
bcf9ec537bda read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents: 30
diff changeset
14
5
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
15 # module globals
26
e01d84d29cd0 remove obselete
Jeff Hammel <k0scist@gmail.com>
parents: 25
diff changeset
# names exported by a star-import of this module
__all__ = ['main', 'CSVParser']
5
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# text types used by the readers in this module to tell a path from an
# open file object; `unicode` only exists on Python 2, so fall back to
# `str` alone when it is not defined
try:
    string = (str, unicode)
except NameError:
    string = (str,)
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
18
43
bcf9ec537bda read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents: 30
diff changeset
19
20
Jeff Hammel <k0scist@gmail.com>
parents: 19
diff changeset
class CSVSchema(object):
    """parse CSV rows into dicts keyed by a fixed list of column names"""

    def __init__(self, columns):
        # column names, paired positionally with the cells of each row
        self.columns = columns

    def read(self, f):
        """
        return a list of dicts, one per CSV row
        - f : path to a CSV file, or an already open file-like object
        """

        if not isinstance(f, string):
            # already an open file: parse it directly;
            # `zip` stops at the shorter of (columns, row)
            return [dict(zip(self.columns, cells))
                    for cells in csv.reader(f)]

        # a path was given: open it and parse the resulting file object
        with open(f) as handle:
            return self.read(handle)

    # calling the schema is the same as calling `read`
    __call__ = read
Jeff Hammel <k0scist@gmail.com>
parents: 19
diff changeset
39
Jeff Hammel <k0scist@gmail.com>
parents: 19
diff changeset
40
Jeff Hammel <k0scist@gmail.com>
parents: 19
diff changeset
def aggregate_columns(directory, schema):
    """
    merge several equal-length CSV files into one list of row dicts

    - directory : directory containing the CSV files
    - schema : dict mapping each file name (relative to `directory`)
               to the list of column names for that file

    Row N of the result combines row N of every file.  A column name
    that appears in more than one file must hold the same value in each.
    An empty schema yields an empty list.

    Raises AssertionError if a file is missing, if the files have
    differing numbers of rows, or if a shared column disagrees.
    """

    # check for missing files
    # (explicit raise rather than `assert`, which `python -O` strips)
    missing = [path for path in schema
               if not os.path.exists(os.path.join(directory, path))]
    if missing:
        raise AssertionError("Missing files: {}".format(', '.join(missing)))

    # read records
    records = {filename: CSVSchema(columns).read(os.path.join(directory, filename))
               for filename, columns in schema.items()}

    # check lengths; zero files trivially agree with each other
    lengths = set(len(rows) for rows in records.values())
    if len(lengths) > 1:
        raise AssertionError("Differing lengths found for files")

    # build new rows
    retval = []
    for row in zip(*records.values()):
        new_row = {}
        for record in row:
            for key, value in record.items():
                # a column shared between files must agree on its value
                if new_row.get(key, value) != value:
                    raise AssertionError("{} != {}".format(new_row.get(key), value))
                new_row[key] = value
        retval.append(new_row)

    return retval
Jeff Hammel <k0scist@gmail.com>
parents: 19
diff changeset
69
Jeff Hammel <k0scist@gmail.com>
parents: 19
diff changeset
70
11
5609225fb254 read csv
Jeff Hammel <k0scist@gmail.com>
parents: 5
diff changeset
def read_csv(*fp):
    """
    concatenate the rows of several CSV sources into one list
    - fp : paths and/or open file-like objects, read in the order given
    """

    rows = []
    for source in fp:
        if isinstance(source, string):
            # a path: open it and read the resulting file object
            with open(source) as handle:
                rows.extend(read_csv(handle))
        else:
            # an open file: every parsed row is appended as a list of cells
            rows.extend(csv.reader(source))
    return rows
5
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
86
43
bcf9ec537bda read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents: 30
diff changeset
87
5
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
class CSVParser(argparse.ArgumentParser):
    """CLI option parser"""

    def __init__(self, **kwargs):
        kwargs.setdefault('description', __doc__)
        argparse.ArgumentParser.__init__(self, **kwargs)
        self.add_argument('csv', nargs='*',
                          help="CSV files to read, or read from stdin")
        self.add_argument('-+', '--add', dest='added_columns', nargs='+',
                          help="append this column")
        self.add_argument('-c', '--col', '--columns', dest='columns',
                          nargs='+', type=int,
                          help="column numbers to output, starting with 0")
        self.add_argument('-o', '--output', dest='output',
                          type=argparse.FileType('a'), default=sys.stdout,
                          help='output destination, or stdout')
        self.add_argument('--index', dest='index',
                          action='store_true', default=False,
                          help="prepend each row with numeric index")
        # options from the most recent `parse_args` call; None until then
        self.options = None

    def parse_args(self, *args, **kw):
        """parse the arguments, validate them and keep them on `self.options`"""
        options = argparse.ArgumentParser.parse_args(self, *args, **kw)
        self.validate(options)
        self.options = options
        return options

    def validate(self, options):
        """validate options"""

    def read(self):
        """
        read and process CSV

        `parse_args` must have been called first: the sources and the
        processing switches are taken from `self.options`.
        Returns the processed rows as a list of lists.
        """

        return self._process(read_csv(*self.options.csv))

    def _process(self, data):
        """apply the --add, --columns and --index options to `data` rows"""

        if self.options.added_columns:
            # add columns
            # (was `options.added_columns`: an undefined name in this scope)
            for row in data:
                row.extend(self.options.added_columns)

        if self.options.columns:
            # filter by column; runs after --add so that the
            # added columns can be selected as well
            data = [[row[column] for column in self.options.columns]
                    for row in data]

        if self.options.index:
            # prepend numeric index
            for index, row in enumerate(data):
                row.insert(0, index)

        # return processed data
        return data
ef915968d104 put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents: 43
diff changeset
140
ef915968d104 put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents: 43
diff changeset
141
5
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def main(args=None):
    """
    CLI
    - args : command line arguments without the program name;
             defaults to `sys.argv[1:]` as it is when `main` is called
    """

    # resolve the default at call time; a `sys.argv[1:]` default in the
    # signature would be frozen when the module is imported
    if args is None:
        args = sys.argv[1:]

    # parse command line options
    parser = CSVParser()
    options = parser.parse_args(args)

    try:
        if not options.csv:
            # read from stdin; `parser.options` is this same object,
            # so `parser.read()` sees the replacement
            options.csv = [sys.stdin]

        # read CSV
        data = parser.read()

        # write CSV
        writer = CSVWriter(options.output)
        writer.write(data)
    finally:
        # release a file opened for --output; leave stdout alone
        if options.output is not sys.stdout:
            options.output.close()
8bfa28ff74ce use that thing we made
Jeff Hammel <k0scist@gmail.com>
parents: 45
diff changeset
159
11
5609225fb254 read csv
Jeff Hammel <k0scist@gmail.com>
parents: 5
diff changeset
160
5
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# run the command line interface when executed as a script
if __name__ == '__main__':
    main()
d5447d401c44 serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
163