Mercurial > hg > GlobalNeighbors
diff globalneighbors/schema.py @ 0:5dba84370182
initial commit; half-working prototype
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sat, 24 Jun 2017 12:03:39 -0700 |
parents | |
children | 1b94f3bf97e5 |
line wrap: on
line diff
"""
Schema for cities 1000
From http://download.geonames.org/export/dump/
"""

from collections import OrderedDict


# column descriptions
descriptions = {
    'geonameid': "integer id of record in geonames database",
    'name': "name of geographical point (utf8) varchar(200)",
    'asciiname': "name of geographical point in plain ascii characters, varchar(200)",
    'alternatenames': "alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)",
    'latitude': "latitude in decimal degrees (wgs84)",
    'longitude': "longitude in decimal degrees (wgs84)",
    'feature class': "see http://www.geonames.org/export/codes.html, char(1)",
    'feature code': "see http://www.geonames.org/export/codes.html, varchar(10)",
    'country code': "ISO-3166 2-letter country code, 2 characters",
    'cc2': "alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters",
    'admin1 code': "fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)",
    'admin2 code': "code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)",
    'admin3 code': "code for third level administrative division, varchar(20)",
    'admin4 code': "code for fourth level administrative division, varchar(20)",
    'population': "bigint (8 byte int)",
    'elevation': "in meters, integer",
    'dem': "digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.",
    'timezone': "the iana timezone id (see file timeZone.txt) varchar(40)",
    'modification date': "date of last modification in yyyy-MM-dd format",
}


# schema of python types to cast to;
# the order matters: `cast_row` pairs row values with these
# entries positionally
types = OrderedDict([
    ('geonameid', int),
    ('name', str),
    ('asciiname', str),
    ('alternatenames', str),
    ('latitude', float),
    ('longitude', float),
    ('feature class', str),
    ('feature code', str),
    ('country code', str),
    ('cc2', str),
    ('admin1 code', str),
    ('admin2 code', str),
    ('admin3 code', str),
    ('admin4 code', str),
    ('population', int),
    ('elevation', int),
    ('dem', int),
    ('timezone', str),
    ('modification date', str),
])


# Fields we care about; we'll discard the rest
fields = (
    'geonameid',
    'name',
    'asciiname',
    'latitude',
    'longitude',
    'country code',
    'population',
)

# Keys we care about
name = 'asciiname'
primary_key = 'geonameid'


def cast_row(row, types=types, nullable=('elevation',)):
    """
    cast an iterable `row` of data to
    a `dict` according to `types` casting rule

    row -- iterable of raw values, one per entry of `types`,
           in the same order
    types -- ordered mapping of column name -> callable used
             to cast the raw value
    nullable -- column names whose value becomes `None` when it
                is empty and cannot be cast

    Returns a `dict` of column name -> cast value.

    Raises `AssertionError` if `row` and `types` differ in length;
    re-raises the `ValueError` of any other failed cast.
    """

    # generators and iterators have no `len`; materialize those,
    # but leave real sequences untouched
    if not hasattr(row, '__len__'):
        row = list(row)

    if len(row) != len(types):
        raise AssertionError("Length of row {} != length of types {}: {}".format(len(row), len(types), row))

    retval = {}
    for value, (key, _type) in zip(row, types.items()):
        try:
            retval[key] = _type(value)
        except ValueError:
            # Only an *empty* value in a nullable column is forgiven
            # (e.g. elevation isn't always recorded)
            if value or key not in nullable:
                raise
            retval[key] = None
    return retval