#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
parse header-based CSV
"""

# imports
|
|
9 import argparse
|
|
10 import csv
|
|
11 import json
|
|
12 import os
|
|
13 import sys
|
|
14 import time
|
180
|
15 from collections import OrderedDict
|
179
|
16
|
|
# tuple of text types accepted wherever a path (rather than a file object)
# may be passed; used with `isinstance`
try:
    string = (str, unicode)
except NameError:
    # Python 3: `unicode` is gone and `str` is the only text type
    string = (str,)
|
|
18
|
|
19
|
|
def duplicates(*values):
    """return all duplicates in `values`

    Each value that occurs more than once is returned exactly once,
    ordered by where it first appears in `values`.
    Values must be hashable.
    """

    # count occurrences in a single pass; an ordered mapping remembers
    # the position of first appearance so no second lookup pass is needed
    counts = OrderedDict()
    for value in values:
        counts[value] = counts.get(value, 0) + 1
    return [value for value, count in counts.items() if count > 1]
|
|
30
|
|
31
|
|
def read_table(fp, verbose=False):
    """read table with header and return list of dictionaries

    `fp` is either a path (any of the module-level `string` types) or an
    iterable of CSV lines such as an open file.  The first row is taken
    as the header; every following row becomes an `OrderedDict` mapping
    header field -> cell value.  All cells, including the header fields,
    are stripped of surrounding whitespace.

    If `verbose` is true, the column count and the header are printed.

    Returns an empty list when there are no data rows, which includes
    completely empty input.

    Raises `AssertionError` if the rows do not all have the same number
    of columns, or if the header contains duplicate field names.
    """

    if isinstance(fp, string):
        # given a path: open it and re-enter with the file object
        with open(fp, 'r') as _fp:
            return read_table(_fp, verbose)

    # read CSV
    data = list(csv.reader(fp))
    if not data:
        # completely empty input: there is no header and there are no rows
        return []

    # check data
    columns = {len(row) for row in data}
    if len(columns) != 1:
        raise AssertionError("Expected: a constant number of columns, instead got: {}".format(
            ', '.join(str(column) for column in sorted(columns))))
    columns = columns.pop()
    if verbose:
        print("{} columns".format(columns))
    data = [[item.strip() for item in row]
            for row in data]

    # xform to JSON-format structure
    header = data.pop(0)
    if verbose:
        print("Header:\n{header}".format(header=json.dumps(header, indent=1)))
    duplicate_fields = duplicates(*header)
    if duplicate_fields:
        raise AssertionError("Duplicate header fields found: {duplicates}".format(duplicates=', '.join(duplicate_fields)))
    return [OrderedDict(zip(header, row))
            for row in data]
|
|
62
|
|
63
|
|
class TableParser(argparse.ArgumentParser):
    """CLI option parser

    After `parse_args()` has run, the parsed namespace is kept on
    `self.options` so that `read_table()` can be called without arguments.
    """

    def __init__(self, **kwargs):
        kwargs.setdefault('formatter_class', argparse.RawTextHelpFormatter)
        kwargs.setdefault('description', __doc__)
        argparse.ArgumentParser.__init__(self, **kwargs)
        self.add_arguments()
        self.options = None  # set by `parse_args`

    def add_arguments(self):
        """register the command line arguments"""
        self.add_argument('input', type=argparse.FileType('r'),
                          help="input CSV file")
        self.add_argument('-o', '--output', dest='output',
                          type=argparse.FileType('w'), default=sys.stdout,
                          help="output file to write to, or stdout by default")
        self.add_argument('-c', '--column', dest='columns', nargs='+',
                          help="column names to output")
        self.add_argument('--format', dest='format',
                          choices=('json', 'csv'), default='json',
                          help="output in this format")
        self.add_argument('-v', '--verbose', dest='verbose',
                          action='store_true', default=False,
                          help="be verbose")

    def parse_args(self, *args, **kw):
        """parse, validate, remember and return the options"""
        options = argparse.ArgumentParser.parse_args(self, *args, **kw)
        self.validate(options)
        self.options = options
        return options

    def validate(self, options):
        """validate options"""

    def read_table(self):
        """read the input table, restricted to the requested columns

        Returns a list of `OrderedDict` rows.  When `--column` was given,
        each row holds only those columns, in the order requested.

        Reports an error through `self.error` if the input has no data
        rows or if a requested column is not in the header.

        Raises `AssertionError` if called before `parse_args()`.
        """

        # explicit raise rather than `assert`, so the guard survives `-O`
        if self.options is None:
            raise AssertionError("parse_args() must be called before read_table()")

        data = read_table(self.options.input,
                          verbose=self.options.verbose)
        if not data:
            # prefer the file name; fall back to the object itself
            self.error("No data found: {}".format(
                getattr(self.options.input, 'name', self.options.input)))
        if self.options.columns:
            header = data[0].keys()
            missing = [column
                       for column in self.options.columns
                       if column not in header]
            if missing:
                self.error("Columns not found in header: {0}".format(", ".join(missing)))
            header = self.options.columns
            data = [OrderedDict(zip(header,
                                    [row[column] for column in header]))
                    for row in data]
        return data
|
|
118
|
|
def main(args=None):
    """CLI

    `args` is the list of command line arguments.  When it is None (the
    default), argparse reads `sys.argv[1:]` at call time.

    Reads the input table and writes it to the selected output, either
    as JSON or as CSV data rows.
    """

    # parse command line options
    parser = TableParser()
    options = parser.parse_args(args)

    # read table
    data = parser.read_table()

    # output in the requested format
    if options.verbose:
        print("Output {format}:".format(format=options.format))
    if options.format == 'json':
        options.output.write(json.dumps(data, indent=1))
    elif options.format == 'csv':
        writer = csv.writer(options.output)
        for row in data:
            # rows are ordered mappings, so the values are already in
            # column order; only data rows are written, no header row
            writer.writerow(list(row.values()))
|
|
138
|
|
139
|
|
# run the CLI only when executed as a script, not when imported
if __name__ == '__main__':
    main()
|