view globalneighbors/distance.py @ 22:e69cb496324e

we have a data dump
author Jeff Hammel <k0scist@gmail.com>
date Sun, 25 Jun 2017 17:45:19 -0700
parents 811adc9736eb
children
line wrap: on
line source

"""
distance functionality
"""

import argparse
import bisect
import json
import sys
import time
from math import asin, sin, cos, sqrt, pi, fabs
from .cli import CitiesParser
from .constants import Rearth
from .locations import locations
from .read import read_cities
from .schema import fields

DEGREESTORADIANS = pi/180.


def haversine(lat1, lon1, lat2, lon2, r=1):
    """
    see
    https://en.wikipedia.org/wiki/Haversine_formula
    Coordinates in radians
    """
    return 2*r*asin(
        sqrt(sin(0.5*(lat2-lat1))**2
             +cos(lat1)*cos(lat2)*(sin(0.5*(lon2-lon1))**2)
        ))


def deg_to_rad(degrees):
    return degrees*DEGREESTORADIANS


def insert_distance(distances, i, new_distance, k):
    """
    insert `(i, new_distance)` into `distances`
    keeping distances in order with maximum of size `k`
    """

    if len(distances) == k and new_distance >= distances[-1][-1]:
        return

     # TODO: Binary Search Tree
    for _index, (geoid, old_distance) in enumerate(distances):
        if new_distance < old_distance:
            distances.insert(_index, (i, new_distance))
            if len(distances) == k+1:
                distances.pop()
            break
    else:
        distances.append((i, new_distance))


class KeyWrapper(object):
    """wrapper for python's `bisect` methods"""
    def __init__(self, iterable, key):
        self.it = iterable
        self.key = key

    def __getitem__(self, i):
        return self.key(self.it[i])

    def __len__(self):
        return len(self.it)


def insert_distance_bisect(distances, i, new_distance, k):

    if not distances:
        distances.append((i, new_distance))
        return
    if new_distance >= distances[-1][-1]:
        if len(distances) < k:
            distances.append((i, new_distance))
        return

    point = bisect.bisect_left(KeyWrapper(distances,
                                          key=lambda x: x[-1]),
                               new_distance)
    distances.insert(point, (i, new_distance))
    if len(distances) == k+1:
        distances.pop()

def calculate_distances(locations, r=Rearth):
    """
    WARNING! This is an N-squared approach
    """

    # convert to rad
    rad_locations = [(location, tuple([deg_to_rad(i)
                                       for i in latlon]))
                      for location, latlon
                      in locations.items()]

    # use haversince function on N-body problem
    for index, loc1 in enumerate(rad_locations):
        id1, (lat1, lon1) = loc1
        for loc2 in rad_locations[index+1:]:
            id2, (lat2, lon2) = loc2
            key = (id1, id2) if id2 > id1 else (id2, id1)
            yield (key,
                   haversine(lat1, lon1, lat2, lon2, r=r))


def calculate_neighbors(locations,
                        k=10,
                        lat_tol=1.,
                        lon_tol=1.,
                        output=None,
                        neighbors=None):
    """
    calculate `k` nearest neighbors for each location

    locations -- dict of `geoid: (lat, lon)`
    """
    neighbors = neighbors or {}
    items = locations.items()  # copy
    index = 0
    n_items = len(items)
    start = int(time.time())
    while items:
        index += 1
        if output and not index % output:
            # output status counter
            now = int(time.time())
            duration =  now - start
            start = now
            print ('{},{},{},{}'.format(index,
                                        len(items),
                                        n_items,
                                        duration))
        id1, (lat1, lon1) = items.pop()
        for loc2 in items:
            id2, (lat2, lon2) = loc2

            # filter out locations based on latlon boxing
            if fabs(lat2 - lat1) > lat_tol:
                 continue
            if fabs(lon2 - lon1) > lon_tol:
                 continue

            # determine distance
            args = [deg_to_rad(i) for i in
                    (lat1, lon1, lat2, lon2)]
            new_distance = haversine(*args, r=Rearth)

            # insert in order
            ids = (id1, id2)
            for i in (0, 1):
                distances = neighbors.setdefault(ids[i], [])
                insert_distance_bisect(distances,
                                       ids[i-1],
                                       new_distance,
                                       k)

    return neighbors


def write_neighbors(fp, neighbors):
    for key, value in neighbors.iteritems():
        fp.write("{key} {value}\n".format(key=key,
                                        value=json.dumps(value)))

def main(args=sys.argv[1:]):
    """CLI"""

    # parse command line arguments
    description = """write nearest neighborfiles"""
    parser = CitiesParser(description=description)
    parser.add_argument('output', type=argparse.FileType('w'),
                        help="output file to dump JSON to")
    parser.add_argument('--latlon', dest='latlon', type=float,
                        nargs=2, metavar=("LAT", "LON"),
                        default=(1., 1.),
                        help="tolerance of latitude and longitude in degrees [DEFAULT: %(default)s]")
    parser.add_argument('--counter', '--output-counter',
                        dest='output_counter',
                        type=int, default=100,
                        help="how often to output progress updates [DEFAULT: %(default)s]")
    parser.add_argument('-k', dest='k',
                        type=int, default=50,
                        help="number of neighbors to determine [DEFAULT: %(default)s]")
    options = parser.parse_args(args)

    # get locations
    city_locations = locations(read_cities(options.cities, fields=fields))
    options.cities.close()
    options.output.close()


    # calculate neighbors
    neighbors = calculate_neighbors(city_locations,
                                    k=options.k,
                                    lat_tol=options.latlon[0],
                                    lon_tol=options.latlon[-1],
                                    output=options.output_counter)

    # output
    print ("Outputting neighbors")
    sys.stdout.flush()
    import pdb; pdb.set_trace()
    with open(options.output.name, 'w') as f:
        f.write(json.dumps(neighbors))

if __name__ == '__main__':
    main()