Mercurial > hg > Lemuriformes
comparison lemuriformes/csv2sqlite.py @ 11:afc259799019
[CSV] add script to convert to SQL
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 10 Dec 2017 13:58:55 -0800 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
10:ad1bf59eedb0 | 11:afc259799019 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 convert CSV files to a SQLite DB file | |
5 """ | |
6 | |
7 # For SQLite types see | |
8 # http://www.sqlite.org/datatype3.html | |
9 | |
10 import argparse | |
11 import csv | |
12 import os | |
13 import sqlite3 | |
14 import sys | |
15 from collections import OrderedDict | |
16 from .cast import string | |
17 from .sql import SQLConnection | |
18 | |
19 | |
20 def read_csv(csv_file): | |
21 """read csv file with header and return list of dicts""" | |
22 | |
23 reader = csv.DictReader(csv_file) | |
24 return [row for row in reader] | |
25 | |
26 | |
27 def path_root(path): | |
28 """return basename file root sans extension""" | |
29 | |
30 return os.path.splitext(os.path.basename(path))[0] | |
31 | |
32 | |
33 class SQLiteConnection(SQLConnection): | |
34 """connection class to SQLite database""" | |
35 | |
36 # mapping of python types to SQLite types | |
37 types = {int: "INTEGER", | |
38 float: "REAL", | |
39 str: "TEXT"} | |
40 | |
41 def __init__(self, db_file): | |
42 if not os.path.exists(db_file): | |
43 with open(db_file, 'wb') as f: | |
44 # touch file | |
45 pass | |
46 self.db_file = db_file | |
47 self.conn = sqlite3.connect(self.db_file) | |
48 | |
49 def __call__(self, sql, *args): | |
50 c = self.conn.cursor() | |
51 c.execute(sql, args) | |
52 retval = c.fetchall() | |
53 self.conn.commit() | |
54 return retval | |
55 | |
56 def __del__(self): | |
57 self.conn.close() | |
58 | |
59 def tables(self): | |
60 """ | |
61 return tables in the SQLite database | |
62 Ref: https://stackoverflow.com/questions/82875/how-to-list-the-tables-in-an-sqlite-database-file-that-was-opened-with-attach | |
63 """ | |
64 sql = "SELECT name FROM sqlite_master WHERE type='table';" | |
65 results = self(sql) | |
66 return [row[0] for row in results] | |
67 | |
68 def create_table(self, table, **columns): | |
69 """add a table to the database""" | |
70 | |
71 columns = ', '.join(["{key} {value}".format(key=key, value=value) | |
72 for key, value in columns.items()]) | |
73 self("CREATE TABLE IF NOT EXISTS {table} ({columns});".format(table=table, | |
74 columns=columns)) | |
75 | |
76 def schema(self, table): | |
77 """get dictionary of typing for a table""" | |
78 # http://www.sqlite.org/pragma.html#pragma_table_info | |
79 | |
80 sql = "PRAGMA table_info('{}')".format(table) | |
81 results = self(sql) | |
82 # TODO: get data types from e.g. | |
83 # [(0, u'how', u'', 0, None, 0), (1, u'are', u'', 0, None, 0), (2, u'you', u'', 0, None, 0)] | |
84 # ... now | |
85 # [(0, u'how', u'TEXT', 0, None, 0), (1, u'you', u'TEXT', 0, None, 0), (2, u'are', u'TEXT', 0, None, 0)] | |
86 return OrderedDict([(result[1], result[2]) | |
87 for result in results]) | |
88 | |
89 def columns(self, table): | |
90 """return ordered list of column names""" | |
91 | |
92 return self.schema(table).keys() | |
93 | |
94 def select(self, table): | |
95 """just `select *` for now""" | |
96 | |
97 sql = "SELECT * FROM {table};" | |
98 sql = sql.format(table=table) | |
99 data = self(sql) | |
100 columns = self.columns(table) | |
101 return [OrderedDict(zip(columns, row)) | |
102 for row in data] | |
103 | |
104 select_all = select | |
105 | |
106 | |
107 class CSV2SQLite(object): | |
108 """ | |
109 convert Comma Separated Value input to SQLite | |
110 """ | |
111 # TODO: allow conversion to arbitrary SQL | |
112 | |
113 def __init__(self, output): | |
114 self.conn = SQLiteConnection(output) | |
115 | |
116 def __call__(self, *csv_files, **csv_dict): | |
117 | |
118 # allow tables of default(ing) name | |
119 csv_dict.update({csv_file: None | |
120 for csv_file in csv_files}) | |
121 | |
122 for csv_file, tablename in csv_dict.items(): | |
123 self.csv2table(csv_file, tablename=tablename) | |
124 | |
125 def csv2table(self, csv_file, tablename=None): | |
126 if isinstance(csv_file, string): | |
127 with open(csv_file) as f: | |
128 return self.csv2table(f, tablename=tablename) | |
129 if tablename is None: | |
130 # TODO: allow lookup from scheme | |
131 tablename = path_root(csv_file.name) | |
132 # read csv | |
133 data = read_csv(csv_file) | |
134 assert data | |
135 | |
136 # infer schema from data | |
137 # TODO: allow lookup from scheme | |
138 columns = {column: "TEXT" for column in data[0].keys()} | |
139 | |
140 # create table | |
141 self.conn.create_table(tablename, **columns) | |
142 | |
143 # inseert data | |
144 for row in data: | |
145 self.conn.insert_row(tablename, **row) | |
146 | |
147 | |
148 def main(args=sys.argv[1:]): | |
149 """CLI""" | |
150 | |
151 # parse command line | |
152 parser = argparse.ArgumentParser(description=__doc__) | |
153 parser.add_argument('input', nargs='+', | |
154 type=argparse.FileType('r'), | |
155 help="input CSV files; table names taken from file names") | |
156 parser.add_argument('-o', '--output', dest='output', | |
157 required=True, | |
158 help="output SQLite file") | |
159 options = parser.parse_args(args) | |
160 | |
161 # overwrite the file | |
162 # TODO: deprecate and allow appending | |
163 with open(options.output, 'w') as f: | |
164 pass | |
165 | |
166 # instantiate converter | |
167 conn = CSV2SQLite(options.output) | |
168 | |
169 # convert input CSV to SQLite tables | |
170 conn(*options.input) | |
171 | |
172 | |
173 if __name__ == '__main__': | |
174 main() |