Mercurial > hg > numerics
changeset 20:178dc9a94387
port
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Mon, 29 Sep 2014 16:19:35 -0700 (2014-09-29) |
parents | 276beb743a59 |
children | fef3f407113f |
files | numerics/read.py |
diffstat | 1 files changed, 51 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# Reconstructed from the single-line unified diff of numerics/read.py:
#   --- a/numerics/read.py Sun Sep 21 20:13:17 2014 -0700
#   +++ b/numerics/read.py Mon Sep 29 16:19:35 2014 -0700
#   @@ -15,6 +15,57 @@
# Diff context that belongs to code outside this hunk (not redefined here):
#   before: __all__ = ['main', 'ReadParser']
#   after:  def ensure_dir(directory): """ensure a directory exists"""

# types treated as a file path by `CSVSchema.read`
try:
    string = (str, unicode)
except NameError:
    # Python 3 has no `unicode` builtin; `str` already covers text
    string = (str,)


class CSVSchema(object):
    """read CSV rows into dicts keyed by a fixed list of column names"""

    def __init__(self, columns):
        """
        columns -- column names, in the order the fields appear in a row
        """
        self.columns = columns

    def read(self, f):
        """
        read CSV data and return a list with one dict per row

        f -- a path (any type in `string`), which is opened and read,
             or an iterable of lines as accepted by `csv.reader`

        Each row is paired with `self.columns` via `zip`, so surplus
        fields or surplus column names are dropped silently.
        """
        if isinstance(f, string):
            with open(f) as fp:
                return self.read(fp)

        return [dict(zip(self.columns, row)) for row in csv.reader(f)]

    __call__ = read


def aggregate_columns(directory, schema):
    """
    merge several CSV files from one directory row by row

    directory -- directory containing the CSV files
    schema -- mapping of file name -> list of column names for that file

    Returns a list of dicts; the Nth dict combines the Nth row of every
    file.  An empty schema gives an empty list.

    Raises AssertionError if a file is missing, if the files have
    differing numbers of rows, or if two files disagree on the value of
    a shared column within the same row.
    """
    # The checks below raise explicitly instead of using `assert`
    # statements so that they still run under `python -O`.

    # check for missing files
    missing = [path for path in schema
               if not os.path.exists(os.path.join(directory, path))]
    if missing:
        raise AssertionError("Missing files: {}".format(', '.join(missing)))

    # read records
    records = {filename: CSVSchema(columns).read(os.path.join(directory, filename))
               for filename, columns in schema.items()}

    # check lengths; `> 1` (rather than `== 1`) lets an empty schema through
    lengths = set(len(rows) for rows in records.values())
    if len(lengths) > 1:
        raise AssertionError("Differing lengths found for files")

    # build new rows
    retval = []
    for row in zip(*records.values()):
        new_row = {}
        for record in row:
            for key, value in record.items():
                # a column shared between files must agree row by row
                if new_row.get(key, value) != value:
                    raise AssertionError("{} != {}".format(new_row.get(key), value))
                new_row[key] = value
        retval.append(new_row)

    return retval