view tests/test_distance.py @ 25:991bce6b6881 default tip

[knn] placeholder for planning session
author Jeff Hammel <k0scist@gmail.com>
date Sun, 17 Sep 2017 14:35:50 -0700
parents 27925261c137
children
line wrap: on
line source

#!/usr/bin/env python

"""
test distance calculation
"""

import math
import os
import random
import unittest
from globalneighbors import distance
from globalneighbors.constants import Rearth
from globalneighbors.locations import locations
from globalneighbors.read import read_cities
from globalneighbors.read import read_city_list
from globalneighbors.schema import primary_key

here = os.path.dirname(os.path.abspath(__file__))
data = os.path.join(here, 'data')
full_tsv_lines = 149092


class DistanceTests(unittest.TestCase):

    # created with
    # head -n 10 cities1000.txt > GlobalNeighbors/tests/data/sample.tsv
    test_tsv = os.path.join(data, 'sample.tsv')
    test_tsv_lines = 10

    # full dataset:  test with caution
    full_tsv = os.path.join(data, 'cities1000.txt')
    full_tsv_lines = 149092

    # here's a smaller one
    moderate_tsv = os.path.join(data, '10000cities.tsv')

    def test_haversine(self):

        # a simple canned case
        # equator to pole
        lat1 = 0.
        lat2 = 90.
        lon2 = 70.  # undefined, technically
        expected_distance = 0.5*math.pi

        for lon1 in range(-135, 135, 15):
            radians = [distance.deg_to_rad(degrees)
                       for degrees in (lat1, lon2, lat2, lon2)]
            error = (distance.haversine(*radians) == expected_distance)
            assert error < 1e-4

    def test_distance(self):
        """test distance between two known cities"""

        # Source:https://en.wikipedia.org/wiki/List_of_cities_by_latitude
        # http://www.distancefromto.net/distance-from-new-york-to-chicago-us
        chicago = (40.71278,
                   -74.00594)

        new_york = (41.85003,
                    -87.65005)
        ref_distance = 1149.

        args = [distance.deg_to_rad(i) for i in
                list(chicago) + list(new_york)]
        calculated = distance.haversine(*args, r=Rearth)

        # Allow some error for circular projection approximation
        error = abs(calculated - ref_distance)/ref_distance

        assert error < 0.01

    def test_distances(self):
        """"ensure disances monotonically decay"""

        # parse the data
        assert os.path.exists(self.test_tsv)
        cities = read_city_list(self.test_tsv)
        assert len(cities) == self.test_tsv_lines
        city_locations = locations(cities)
        assert len(city_locations) == self.test_tsv_lines

        # calculate all the neighbors
        # WARNING: n*2 algorithm Too computationally intensive
        # for full data set
        for key, value in distance.calculate_distances(city_locations,
                                                       r=Rearth):
            # for now, just make sure we can iterate over them
            pass

    def test_neighbors(self):

        # parse the data
        tsv = os.path.join(data, 'sample.tsv')
        assert os.path.exists(tsv)
        cities = read_city_list(tsv)
        city_locations = locations(cities)

        # calculate the neighbors
        neighbors = distance.calculate_neighbors(city_locations,
                                                 k=self.test_tsv_lines)
        assert len(neighbors) == self.test_tsv_lines

        # ensure distance increases for each thing
        for src, value in neighbors.items():
            distances = [i[-1] for i in value]
            assert len(distances) == self.test_tsv_lines - 1
            for i in range(1, len(distances)):
                assert distances[i] >= distances[i-1]

    def test_10000cities(self):
        """a moderate size test"""

        assert os.path.exists(self.moderate_tsv)
        with open(self.moderate_tsv) as f:
            cities = locations(read_cities(f))

        # test over different values of # of neighbors
        for k in (10, 100, 1000):
            neighbors = distance.calculate_neighbors(cities,
                                                     k=k)

            # ensure you have no more neighbors than you ask for
            assert max([len(value) for value in neighbors.values()]) <= k

            # assert distances increase
            for key, value in neighbors.items():
                distances = [i[-1] for i in value]
                assert distances == sorted(distances)
                neighbors = [i[0] for i in value]
                assert key not in neighbors
                assert len(set(neighbors)) == len(neighbors)

    def test_insert_distances(self):
        """test insert distances algorithm"""

        values = [(i, random.random())
                  for i in range(1500)]
        for k in (10, 100, 1000):
            _distances = []
            for i, value in values:
                distance.insert_distance(_distances,
                                         i,
                                         value,
                                         k)
            # since k is < 1500
            assert len(_distances) == k
            ordered = [value[-1] for value in _distances]
            assert sorted(ordered) == ordered


if __name__ == '__main__':
    unittest.main()