179
|
1 #!/usr/bin/env python
|
|
2 # -*- coding: utf-8 -*-
|
|
3
|
|
4 """
|
|
5 parse header-based CSV
|
|
6 """
|
|
7
|
|
8 # imports
|
|
9 import argparse
|
|
10 import csv
|
|
11 import json
|
|
12 import os
|
|
13 import sys
|
|
14 import time
|
|
15
|
|
# String types accepted wherever a path may be given instead of a file object.
# `unicode` only exists on Python 2; on Python 3 `str` alone covers text.
try:
    string = (str, unicode)
except NameError:
    string = (str,)
|
|
17
|
|
18
|
|
def duplicates(*values):
    """Return the values that occur more than once in `values`.

    Each duplicated value appears exactly once in the result, ordered by
    its first occurrence in `values`.  Values must be hashable.

    :param values: items to inspect, passed as positional arguments
    :returns: list of the duplicated values; empty when there are none
    """

    # tally every value in a single pass instead of calling
    # `values.count()` once per distinct value
    counts = {}
    for value in values:
        counts[value] = counts.get(value, 0) + 1

    # second pass keeps first-occurrence order; the set makes the
    # "already reported" check constant time
    reported = set()
    retval = []
    for value in values:
        if counts[value] > 1 and value not in reported:
            reported.add(value)
            retval.append(value)
    return retval
|
|
29
|
|
30
|
|
def read_table(fp, verbose=False):
    """Read a CSV table with a header row and return a list of dictionaries.

    :param fp: an open file-like object yielding CSV lines, or a path
               (any type in `string`) which is opened for reading
    :param verbose: when true, print the column count and the header
                    to stdout
    :returns: one dict per data row, mapping each header field to the
              whitespace-stripped cell of that row
    :raises AssertionError: if the rows (header included) do not all have
                            the same number of columns -- this includes
                            empty input -- or if the header contains
                            duplicate field names
    """

    if isinstance(fp, string):
        # given a path: open it and re-enter with the file object
        with open(fp, 'r') as _fp:
            return read_table(_fp, verbose)

    # read CSV
    data = list(csv.reader(fp))

    # check data: every row must have the same width
    columns = {len(row) for row in data}
    if len(columns) != 1:
        raise AssertionError(
            "Expected: a constant number of columns, instead got: {}".format(
                ', '.join(str(column) for column in sorted(columns))))
    columns = columns.pop()
    if verbose:
        # single-argument call form prints identically on Python 2 and 3
        print("{} columns".format(columns))
    data = [[item.strip() for item in row]
            for row in data]

    # xform to JSON-format structure; the first row is the header
    header = data.pop(0)
    if verbose:
        print("Header:\n{header}".format(header=json.dumps(header, indent=1)))
    duplicate_fields = duplicates(*header)
    if duplicate_fields:
        # duplicate names would silently overwrite each other in the dicts
        raise AssertionError(
            "Duplicate header fields found: {duplicates}".format(
                duplicates=', '.join(duplicate_fields)))
    return [dict(zip(header, row))
            for row in data]
|
|
61
|
|
62
|
|
class TableParser(argparse.ArgumentParser):
    """CLI option parser"""

    def __init__(self, **kwargs):
        """Create the parser and register its arguments.

        `kwargs` are passed to `argparse.ArgumentParser`; the formatter
        class and description get defaults when not supplied.
        """
        kwargs.setdefault('formatter_class', argparse.RawTextHelpFormatter)
        kwargs.setdefault('description', __doc__)
        argparse.ArgumentParser.__init__(self, **kwargs)
        self.add_arguments()
        # populated by `parse_args`; `read_table` requires it
        self.options = None

    def add_arguments(self):
        """register the command line arguments on this parser"""
        self.add_argument('input', type=argparse.FileType('r'),
                          help="input CSV file")
        self.add_argument('-o', '--output', dest='output',
                          type=argparse.FileType('w'), default=sys.stdout,
                          help="output file to write to, or stdout by default")
        self.add_argument('-c', '--column', dest='columns', nargs='+',
                          help="column names to output")
        self.add_argument('--format', dest='format',
                          choices=('json', 'csv'), default='json',
                          help="output in this format")
        self.add_argument('-v', '--verbose', dest='verbose',
                          action='store_true', default=False,
                          help="be verbose")

    def parse_args(self, *args, **kw):
        """parse and validate arguments; store them on `self.options`"""
        options = argparse.ArgumentParser.parse_args(self, *args, **kw)
        self.validate(options)
        self.options = options
        return options

    def validate(self, options):
        """validate options

        Called by `parse_args` with the parsed options before they are
        stored; this implementation performs no checks.
        """

    def read_table(self):
        """Read the input table, restricted to the requested columns.

        `parse_args` must have been called first.

        :returns: list of row dictionaries as produced by the module-level
                  `read_table`; when `--column` was given each row holds
                  only those columns
        :raises AssertionError: if no options have been parsed yet
        """

        # explicit raise instead of `assert`, which `python -O` strips
        if not self.options:
            raise AssertionError("parse_args() must be called before read_table()")

        data = read_table(self.options.input,
                          verbose=self.options.verbose)

        if self.options.columns:
            data = self._select_columns(data, self.options.columns)
        return data

    def _select_columns(self, data, columns):
        """Return `data` with every row reduced to the keys in `columns`.

        Reports names in `columns` that are not keys of the rows through
        `self.error`.  The available names are taken from the first row,
        so with no rows there is nothing to check or project and `data`
        is returned as is.
        """

        if not data:
            return data

        # the header is not returned by the module-level `read_table`;
        # the first row's keys stand in for it here
        available = data[0]
        missing = [column
                   for column in columns
                   if column not in available]
        if missing:
            self.error("Columns not found in header: {0}".format(", ".join(missing)))
        return [{column: row[column] for column in columns}
                for row in data]
|
|
115
|
|
def main(args=sys.argv[1:]):
    """CLI

    Parse `args`, read the input table and write it to the selected
    output as JSON or CSV.

    :param args: command line arguments, without the program name
    """

    # parse command line options
    parser = TableParser()
    options = parser.parse_args(args)

    try:
        # read table
        data = parser.read_table()

        # write in the requested format
        if options.verbose:
            print("Output {format}:".format(format=options.format))
        if options.format == 'json':
            options.output.write(json.dumps(data, indent=1))
        elif options.format == 'csv':
            # column order: the requested columns if any, otherwise the
            # keys of the first row (no rows -> nothing to write)
            if options.columns:
                header = options.columns
            elif data:
                header = list(data[0])
            else:
                header = []
            writer = csv.writer(options.output)
            if header:
                # emit the header row so the columns stay identifiable
                writer.writerow(header)
            for row in data:
                writer.writerow([row[column] for column in header])
    finally:
        # close files opened by argparse, but leave the standard streams
        for stream in (options.input, options.output):
            if stream is not sys.stdin and stream is not sys.stdout:
                stream.close()
|
|
135
|
|
136
|
|
# run the CLI only when executed as a script, not when imported
if __name__ == '__main__':
    main()
|