diff globalneighbors/schema.py @ 0:5dba84370182

initial commit; half-working prototype
author Jeff Hammel <k0scist@gmail.com>
date Sat, 24 Jun 2017 12:03:39 -0700
parents
children 1b94f3bf97e5
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/globalneighbors/schema.py	Sat Jun 24 12:03:39 2017 -0700
@@ -0,0 +1,93 @@
+"""
+Schema for the GeoNames cities1000 dump
+From http://download.geonames.org/export/dump/
+"""
+
+from collections import OrderedDict
+
+
+# Human-readable description of each column, keyed by column name
+descriptions = {
+    "geonameid": "integer id of record in geonames database",
+    "name": "name of geographical point (utf8) varchar(200)",
+    "asciiname": "name of geographical point in plain ascii characters, varchar(200)",
+    "alternatenames": "alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)",
+    "latitude": "latitude in decimal degrees (wgs84)",
+    "longitude": "longitude in decimal degrees (wgs84)",
+    "feature class": "see http://www.geonames.org/export/codes.html, char(1)",
+    "feature code": "see http://www.geonames.org/export/codes.html, varchar(10)",
+    "country code": "ISO-3166 2-letter country code, 2 characters",
+    "cc2": "alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters",
+    "admin1 code": "fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)",
+    "admin2 code": "code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)",
+    "admin3 code": "code for third level administrative division, varchar(20)",
+    "admin4 code": "code for fourth level administrative division, varchar(20)",
+    "population": "bigint (8 byte int)",
+    "elevation": "in meters, integer",
+    "dem": "digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.",
+    "timezone": "the iana timezone id (see file timeZone.txt) varchar(40)",
+    "modification date": "date of last modification in yyyy-MM-dd format",
+}
+
+
+# Column name -> Python callable used to cast the raw value, in column order
+types = OrderedDict((
+    ("geonameid", int),
+    ("name", str),
+    ("asciiname", str),
+    ("alternatenames", str),
+    ("latitude", float),
+    ("longitude", float),
+    ("feature class", str),
+    ("feature code", str),
+    ("country code", str),
+    ("cc2", str),
+    ("admin1 code", str),
+    ("admin2 code", str),
+    ("admin3 code", str),
+    ("admin4 code", str),
+    ("population", int),
+    ("elevation", int),
+    ("dem", int),
+    ("timezone", str),
+    ("modification date", str),
+))
+
+
+# Columns that are kept; all remaining columns are discarded
+fields = (
+    "geonameid",
+    "name",
+    "asciiname",
+    "latitude",
+    "longitude",
+    "country code",
+    "population",
+)
+
+# Names of the columns used as the name and as the primary key
+name = "asciiname"
+primary_key = "geonameid"
+
+
+def cast_row(row, types=types, nullable=('elevation',)):
+    """
+    Cast a sequence `row` of raw values to a `dict` keyed by column name.
+
+    `types` maps column name -> casting callable, in column order; raises
+    AssertionError unless `row` has exactly one value per column.  An empty
+    value that fails to cast in a `nullable` column is stored as `None`
+    (elevation isn't always recorded); any other ValueError propagates.
+    """
+    if len(row) != len(types):
+        raise AssertionError("Length of row {} != length of types {}: {}".format(len(row), len(types), row))
+
+    retval = {}
+    for value, (key, _type) in zip(row, types.items()):
+        try:
+            retval[key] = _type(value)
+        except ValueError:
+            if value or key not in nullable:
+                raise
+            retval[key] = None
+    return retval