changeset 20:178dc9a94387

port
author Jeff Hammel <k0scist@gmail.com>
date Mon, 29 Sep 2014 16:19:35 -0700
parents 276beb743a59
children fef3f407113f
files numerics/read.py
diffstat 1 files changed, 51 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/numerics/read.py	Sun Sep 21 20:13:17 2014 -0700
+++ b/numerics/read.py	Mon Sep 29 16:19:35 2014 -0700
@@ -15,6 +15,57 @@
 __all__ = ['main', 'ReadParser']
 string = (str, unicode)
 
+class CSVSchema(object):
+    """CSV reader that labels each row's fields with fixed column names"""
+
+    def __init__(self, columns):
+        # column names, applied positionally to the fields of every row
+        self.columns = columns
+
+    def read(self, f):
+        """
+        return the rows of `f` as a list of dicts keyed by column name
+
+        f -- a path (any type in the module-level `string` tuple), which
+             is opened and read, or an object that `csv.reader` can
+             consume directly
+
+        Fields are paired with `self.columns` by position using `zip`,
+        so whichever of the two is longer is silently truncated.
+        """
+
+        if not isinstance(f, string):
+            # already something csv.reader can iterate over
+            return [dict(zip(self.columns, fields))
+                    for fields in csv.reader(f)]
+
+        # a path was given: open it and parse the resulting file object
+        with open(f) as handle:
+            return self.read(handle)
+
+    # instances are callable as a shorthand for `read`
+    __call__ = read
+
+
+def aggregate_columns(directory, schema):
+    """
+    merge the rows of several CSV files into combined records
+
+    directory -- directory containing the CSV files
+    schema -- mapping of file name (relative to `directory`) to the
+              list of column names for that file
+
+    Returns a list of dicts; the i-th dict combines the i-th row of
+    every file.  An empty schema yields an empty list.
+
+    Raises AssertionError if a file named in the schema does not exist,
+    if the files contain differing numbers of rows, or if two files
+    give different values for the same column name in the same row.
+    """
+
+    # check for missing files
+    # (raised explicitly rather than via `assert` so that the check is
+    # not stripped when python runs with -O)
+    missing = [path for path in schema
+               if not os.path.exists(os.path.join(directory, path))]
+    if missing:
+        raise AssertionError("Missing files: {}".format(', '.join(missing)))
+
+    # read records
+    records = {filename: CSVSchema(columns).read(os.path.join(directory, filename))
+               for filename, columns in schema.items()}
+
+    # check lengths; with no files at all there is nothing to compare
+    lengths = set(len(rows) for rows in records.values())
+    if len(lengths) > 1:
+        raise AssertionError("Differing lengths found for files")
+
+    # build new rows
+    retval = []
+    for row in zip(*records.values()):
+        new_row = {}
+        for record in row:
+            for key, value in record.items():
+                # a column present in several files must agree across them
+                if new_row.get(key, value) != value:
+                    raise AssertionError("{} != {}".format(new_row.get(key), value))
+                new_row[key] = value
+        retval.append(new_row)
+
+    return retval
+
+
 
 def ensure_dir(directory):
     """ensure a directory exists"""