view globalneighbors/ @ 18:87ae70245201

ubuntu wants .conf, redhat wants ini of course
author Jeff Hammel <>
date Sun, 25 Jun 2017 15:13:29 -0700
parents 1b94f3bf97e5
line wrap: on
line source

"""Schema for cities 1000."""

from collections import OrderedDict

# Column descriptions: maps each column name to a human-readable
# explanation of what that column holds.
descriptions = {
    'geonameid'         : "integer id of record in geonames database",
    'name'              : "name of geographical point (utf8) varchar(200)",
    'asciiname'         : "name of geographical point in plain ascii characters, varchar(200)",
    'alternatenames'    : "alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)",
    'latitude'          : "latitude in decimal degrees (wgs84)",
    'longitude'         : "longitude in decimal degrees (wgs84)",
    'feature class'     : "see, char(1)",
    'feature code'      : "see, varchar(10)",
    'country code'      : "ISO-3166 2-letter country code, 2 characters",
    'cc2'               : "alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters",
    'admin1 code'       : "fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)",
    'admin2 code'       : "code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)",
    'admin3 code'       : "code for third level administrative division, varchar(20)",
    'admin4 code'       : "code for fourth level administrative division, varchar(20)",
    'population'        : "bigint (8 byte int)",
    'elevation'         : "in meters, integer",
    'dem'               : "digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.",
    'timezone'          : "the iana timezone id (see file timeZone.txt) varchar(40)",
    'modification date' : "date of last modification in yyyy-MM-dd format",
}

# Schema of python types to cast to: an ordered mapping of column name to
# the callable used to convert that column's raw value.  The order is
# significant: `cast_row` pairs these entries positionally with the
# values of a row.
types = OrderedDict([
    ('geonameid',         int),
    ('name',              str),
    ('asciiname',         str),
    ('alternatenames',    str),
    ('latitude',          float),
    ('longitude',         float),
    ('feature class',     str),
    ('feature code',      str),
    ('country code',      str),
    ('cc2',               str),
    ('admin1 code',       str),
    ('admin2 code',       str),
    ('admin3 code',       str),
    ('admin4 code',       str),
    ('population',        int),
    ('elevation',         int),
    ('dem',               int),
    ('timezone',          str),
    ('modification date', str),
])

# Fields we care about; we'll discard the rest
# NOTE(review): this tuple was left unterminated; it is closed here around
# the one entry that is present.  Confirm whether further column names
# belong in it.
fields = (
    'country code',
)

# Fields whose values should end up as unicode text
unicode_fields = (
    "name",
    "asciiname",
)

# Keys we care about: the primary-key column and the name column
primary_key = "geonameid"
name = "asciiname"

def cast_row(row, types=types):
    """
    cast an iterable `row` of data to
    a `dict` according to `types` casting rule

    Values of `row` are paired positionally with the items of `types`
    and each value is passed through the matching callable.

    :param row: sized sequence of raw values, one per entry of `types`
    :param types: ordered mapping of key -> casting callable
    :returns: `dict` mapping each key of `types` to its cast value;
              an empty 'elevation' value is stored as `None`
    :raises AssertionError: if `row` and `types` differ in length
    :raises ValueError: if a value cannot be cast (other than an
                        empty 'elevation')
    """

    if len(row) != len(types):
        raise AssertionError("Length of row {} != length of types {}: {}".format(len(row), len(types), row))

    retval = {}
    for value, (key, _type) in zip(row, types.items()):
        try:
            retval[key] = _type(value)
        except ValueError:
            # Allow exceptions as elevation isn't always recorded
            # TODO make this configurable
            if not value and key in ('elevation',):
                retval[key] = None
            else:
                # any other failed cast is a real data error; do not
                # silently drop the key from the result
                raise
        # Only byte strings need decoding: on Python 2 `bytes is str`, so
        # this decodes exactly as before, while text that is already
        # unicode (Python 3 `str`) is left untouched.
        if key in unicode_fields and isinstance(retval[key], bytes):
            retval[key] = retval[key].decode('utf-8')
    return retval