view tests/test_data.py @ 22:e69cb496324e

we have a data dump
author Jeff Hammel <k0scist@gmail.com>
date Sun, 25 Jun 2017 17:45:19 -0700
parents 5dba84370182
children
line wrap: on
line source

#!/usr/bin/env python

"""
test data integrity; ensure we know what
data we're dealing with
"""

import os
import unittest
from globalneighbors.locations import locations
from globalneighbors.read import read_cities
from globalneighbors.schema import primary_key

# absolute path of the directory that contains this test module
here = os.path.split(os.path.abspath(__file__))[0]

# the data files used by the tests live in a `data` directory next to it
data = os.path.join(here, 'data')

# number of records the tests expect to read from the full data file
full_tsv_lines = 149092

class DataIntegrityTest(unittest.TestCase):
    """
    Sanity checks on the city data files shipped in the tests' `data`
    directory: record count, which fields can (and cannot) serve as a
    unique key, and that coordinates fall inside the valid ranges.
    """

    # Data files the tests rely on.  Their existence is checked while the
    # class body executes, i.e. at import time, so a missing file aborts
    # loading this module rather than failing a single test.
    sample_tsv = os.path.join(data, 'sample.tsv')
    assert os.path.exists(sample_tsv)
    full_tsv = os.path.join(data, 'cities1000.txt')
    assert os.path.exists(full_tsv)

    def _read_full_cities(self):
        """
        read every city from the full data file and check that we got
        the expected number of records; returns the list of cities
        """

        with open(self.full_tsv) as f:
            cities = list(read_cities(f))
        self.assertEqual(len(cities), full_tsv_lines)
        return cities

    def test_primary_key(self):
        """ensure we have a unique key to identify cities"""

        cities = self._read_full_cities()
        n_cities = len(cities)

        # show we have duplicate names
        nameset = set(city['name'] for city in cities)
        self.assertNotEqual(len(nameset), n_cities)
        asciinameset = set(city['asciiname'] for city in cities)
        self.assertNotEqual(len(asciinameset), n_cities)

        # show we do have a unique key
        geonameids = set(city['geonameid'] for city in cities)
        self.assertEqual(len(geonameids), n_cities)

        # is ('asciiname', 'country code') unique?
        nameccset = set((city['asciiname'], city['country code'])
                        for city in cities)
        self.assertNotEqual(len(nameccset), n_cities)  # Nope!

    def test_latlon(self):
        """
        ensure all latitudes and longitudes are in the
        range lat=(-90..90) and lon=(-180..180)
        """

        # read cities
        cities = self._read_full_cities()

        # make a location map
        # (presumably a dict of geonameid -> (lat, lon); see `locations`)
        city_locations = locations(cities)

        # ensure our data is valid
        lat_min, lat_max = -90., 90.
        lon_min, lon_max = -180., 180.

        # `.items()` rather than `.iteritems()`: the latter only exists on
        # Python 2 dicts, while `.items()` works on both Python 2 and 3
        for geoid, (lat, lon) in city_locations.items():
            self.assertIsInstance(geoid, int)
            self.assertTrue(lat_min <= lat <= lat_max,
                            "latitude out of range for %r: %r" % (geoid, lat))
            self.assertTrue(lon_min <= lon <= lon_max,
                            "longitude out of range for %r: %r" % (geoid, lon))


# run the tests in this module when the file is executed as a script
if __name__ == '__main__':
    unittest.main()