view globalneighbors/schema.py @ 1:1b94f3bf97e5

* limit distance function * start gridding * improve unicode handling
author Jeff Hammel <k0scist@gmail.com>
date Sat, 24 Jun 2017 14:02:14 -0700
parents 5dba84370182
children
line wrap: on
line source

"""
Schema for the GeoNames `cities1000` dataset
From http://download.geonames.org/export/dump/
"""

from collections import OrderedDict


# Human-readable description of each column, keyed by column name
descriptions = {
    'geonameid':
        "integer id of record in geonames database",
    'name':
        "name of geographical point (utf8) varchar(200)",
    'asciiname':
        "name of geographical point in plain ascii characters, varchar(200)",
    'alternatenames':
        "alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)",
    'latitude':
        "latitude in decimal degrees (wgs84)",
    'longitude':
        "longitude in decimal degrees (wgs84)",
    'feature class':
        "see http://www.geonames.org/export/codes.html, char(1)",
    'feature code':
        "see http://www.geonames.org/export/codes.html, varchar(10)",
    'country code':
        "ISO-3166 2-letter country code, 2 characters",
    'cc2':
        "alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters",
    'admin1 code':
        "fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)",
    'admin2 code':
        "code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)",
    'admin3 code':
        "code for third level administrative division, varchar(20)",
    'admin4 code':
        "code for fourth level administrative division, varchar(20)",
    'population':
        "bigint (8 byte int)",
    'elevation':
        "in meters, integer",
    'dem':
        "digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.",
    'timezone':
        "the iana timezone id (see file timeZone.txt) varchar(40)",
    'modification date':
        "date of last modification in yyyy-MM-dd format",
}


# Python type each column is cast to; the ordering matters because
# `cast_row` pairs these entries positionally with the values of a row
types = OrderedDict((
    ('geonameid', int),
    ('name', str),
    ('asciiname', str),
    ('alternatenames', str),
    ('latitude', float),
    ('longitude', float),
    ('feature class', str),
    ('feature code', str),
    ('country code', str),
    ('cc2', str),
    ('admin1 code', str),
    ('admin2 code', str),
    ('admin3 code', str),
    ('admin4 code', str),
    ('population', int),
    ('elevation', int),
    ('dem', int),
    ('timezone', str),
    ('modification date', str),
))


# Columns of interest; the remaining columns get discarded
fields = ('geonameid', 'name', 'asciiname',
          'latitude', 'longitude',
          'country code', 'population')

# Columns with a special role: the record's key and its name
primary_key = 'geonameid'
name = 'asciiname'

# Columns whose values should be unicode text
unicode_fields = ('name', 'asciiname')

def cast_row(row, types=types):
    """
    Cast an iterable `row` of raw values to a `dict` keyed by column name.

    Each value is converted with the callable paired with its column in
    `types`.  Columns named in the module-level `unicode_fields` come back
    as text: UTF-8 byte strings are decoded, and values that are already
    text are left untouched (so this works on Python 2 and Python 3).

    row -- sized sequence of raw column values, ordered like `types`
    types -- ordered mapping of column name -> casting callable

    Returns a `dict` of column name -> cast value.  An empty 'elevation'
    value that cannot be cast is stored as `None`.

    Raises `AssertionError` when `row` and `types` differ in length; any
    other `ValueError` from a cast (or from UTF-8 decoding) propagates.
    """

    if len(row) != len(types):
        raise AssertionError("Length of row {} != length of types {}: {}".format(len(row), len(types), row))

    retval = {}
    for value, (key, _type) in zip(row, types.items()):
        if isinstance(value, bytes) and not isinstance(value, str):
            # Only true on Python 3 (on Python 2 `bytes` is `str`): decode
            # first, since `str(b'...')` would store the repr, not the text
            value = value.decode('utf-8')
        try:
            converted = _type(value)
        except ValueError:
            # Allow exceptions as elevation isn't always recorded
            # TODO make this configurable
            if not value and key in ('elevation',):
                converted = None
            else:
                raise
        if key in unicode_fields and isinstance(converted, bytes):
            # Python 2 `str` is a byte string; decode it to unicode.
            # Text values have no bytes to decode and are kept as they are
            converted = converted.decode('utf-8')
        retval[key] = converted
    return retval