# HG changeset patch
# User Jeff Hammel
# Date 1412032775 25200
# Node ID 178dc9a943872edc39d8ed1b8855b19a1a394ce9
# Parent 276beb743a596b5946b4b9382d4bf3ef9837999d
# port
#
# diff -r 276beb743a59 -r 178dc9a94387 numerics/read.py
# --- a/numerics/read.py Sun Sep 21 20:13:17 2014 -0700
# +++ b/numerics/read.py Mon Sep 29 16:19:35 2014 -0700
# @@ -15,6 +15,57 @@
#
# The definitions below are the lines added by this changeset, restored to
# one statement per line.  Leading patch context, kept for reference:
#     __all__ = ['main', 'ReadParser']
#     string = (str, unicode)

import csv
import os
import sys

try:
    string = (str, unicode)  # Python 2: byte strings and unicode strings
except NameError:
    string = (str,)  # Python 3: `unicode` no longer exists


def _open_csv(path):
    """open `path` the way the `csv` module requires on this interpreter"""
    if sys.version_info[0] >= 3:
        # newline='' lets csv.reader handle line endings (and newlines
        # embedded in quoted fields) itself
        return open(path, newline='')
    # the Python 2 csv module expects files opened in binary mode
    return open(path, 'rb')


class CSVSchema(object):
    """read CSV with a schema"""

    def __init__(self, columns):
        """
        - columns : ordered column names; name i labels field i of each row
        """
        self.columns = columns

    def read(self, f):
        """
        read CSV data and return a list of dicts, one per row

        - f : path to a CSV file, or an iterable of CSV lines
              (e.g. an already-open file object)

        Each row is zipped with `self.columns`, so surplus fields (or
        surplus column names) are dropped without error.
        """

        if isinstance(f, string):
            # a path was given: open it and recurse on the file object
            with _open_csv(f) as fp:
                return self.read(fp)

        return [dict(zip(self.columns, row)) for row in csv.reader(f)]

    __call__ = read


def aggregate_columns(directory, schema):
    """
    merge several row-aligned CSV files into a single list of row dicts

    - directory : directory containing the CSV files
    - schema : mapping of file name (relative to `directory`) to the list
               of column names for that file

    Row i of the result combines row i of every file.  A column name that
    appears in more than one file must carry the same value in each of them
    on every row.

    Returns a list of dicts; an empty `schema` yields an empty list.
    Raises AssertionError if a file is missing, if the files have differing
    numbers of rows, or if a shared column has conflicting values.
    """

    # NOTE: checks are raised explicitly rather than written as `assert`
    # statements so that they are not stripped under `python -O`

    # check for missing files
    missing = [path for path in schema
               if not os.path.exists(os.path.join(directory, path))]
    if missing:
        raise AssertionError("Missing files: {}".format(', '.join(missing)))

    # read records
    records = {filename: CSVSchema(columns).read(os.path.join(directory, filename))
               for filename, columns in schema.items()}

    # check lengths; zero files trivially agree, so only >1 distinct
    # length is an error
    lengths = set(len(rows) for rows in records.values())
    if len(lengths) > 1:
        raise AssertionError("Differing lengths found for files")

    # build new rows
    retval = []
    for row in zip(*records.values()):
        new_row = {}
        for record in row:
            for key, value in record.items():
                # a key already seen for this row must agree across files
                if new_row.get(key, value) != value:
                    raise AssertionError("{} != {}".format(new_row.get(key), value))
                new_row[key] = value
        retval.append(new_row)

    return retval


# Trailing patch context (unchanged by this changeset; its body lies outside
# the hunk and is therefore not reproduced here):
#     def ensure_dir(directory):
#         """ensure a directory exists"""