Mercurial > hg > GlobalNeighbors
view globalneighbors/distance.py @ 25:991bce6b6881 default tip
[knn] placeholder for planning session
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 17 Sep 2017 14:35:50 -0700 |
parents | e69cb496324e |
children |
line wrap: on
line source
""" distance functionality """ import argparse import bisect import json import sys import time from math import asin, sin, cos, sqrt, pi, fabs from .cli import CitiesParser from .constants import Rearth from .locations import locations from .read import read_cities from .schema import fields DEGREESTORADIANS = pi/180. def haversine(lat1, lon1, lat2, lon2, r=1): """ see https://en.wikipedia.org/wiki/Haversine_formula Coordinates in radians """ return 2*r*asin( sqrt(sin(0.5*(lat2-lat1))**2 +cos(lat1)*cos(lat2)*(sin(0.5*(lon2-lon1))**2) )) def deg_to_rad(degrees): return degrees*DEGREESTORADIANS def insert_distance(distances, i, new_distance, k): """ insert `(i, new_distance)` into `distances` keeping distances in order with maximum of size `k` """ if len(distances) == k and new_distance >= distances[-1][-1]: return # TODO: Binary Search Tree for _index, (geoid, old_distance) in enumerate(distances): if new_distance < old_distance: distances.insert(_index, (i, new_distance)) if len(distances) == k+1: distances.pop() break else: distances.append((i, new_distance)) class KeyWrapper(object): """wrapper for python's `bisect` methods""" def __init__(self, iterable, key): self.it = iterable self.key = key def __getitem__(self, i): return self.key(self.it[i]) def __len__(self): return len(self.it) def insert_distance_bisect(distances, i, new_distance, k): if not distances: distances.append((i, new_distance)) return if new_distance >= distances[-1][-1]: if len(distances) < k: distances.append((i, new_distance)) return point = bisect.bisect_left(KeyWrapper(distances, key=lambda x: x[-1]), new_distance) distances.insert(point, (i, new_distance)) if len(distances) == k+1: distances.pop() def calculate_distances(locations, r=Rearth): """ WARNING! This is an N-squared approach """ # convert to rad rad_locations = [(location, tuple([deg_to_rad(i) for i in latlon])) for location, latlon in locations.items()] # use haversince function on N-body problem for index, loc1 in enumerate(rad_locations): id1, (lat1, lon1) = loc1 for loc2 in rad_locations[index+1:]: id2, (lat2, lon2) = loc2 key = (id1, id2) if id2 > id1 else (id2, id1) yield (key, haversine(lat1, lon1, lat2, lon2, r=r)) def calculate_neighbors(locations, k=10, lat_tol=1., lon_tol=1., output=None, neighbors=None): """ calculate `k` nearest neighbors for each location locations -- dict of `geoid: (lat, lon)` """ neighbors = neighbors or {} items = locations.items() # copy index = 0 n_items = len(items) start = int(time.time()) while items: index += 1 if output and not index % output: # output status counter now = int(time.time()) duration = now - start start = now print ('{},{},{},{}'.format(index, len(items), n_items, duration)) id1, (lat1, lon1) = items.pop() for loc2 in items: id2, (lat2, lon2) = loc2 # filter out locations based on latlon boxing if fabs(lat2 - lat1) > lat_tol: continue if fabs(lon2 - lon1) > lon_tol: continue # determine distance args = [deg_to_rad(i) for i in (lat1, lon1, lat2, lon2)] new_distance = haversine(*args, r=Rearth) # insert in order ids = (id1, id2) for i in (0, 1): distances = neighbors.setdefault(ids[i], []) insert_distance_bisect(distances, ids[i-1], new_distance, k) return neighbors def write_neighbors(fp, neighbors): for key, value in neighbors.iteritems(): fp.write("{key} {value}\n".format(key=key, value=json.dumps(value))) def main(args=sys.argv[1:]): """CLI""" # parse command line arguments description = """write nearest neighborfiles""" parser = CitiesParser(description=description) parser.add_argument('output', type=argparse.FileType('w'), help="output file to dump JSON to") parser.add_argument('--latlon', dest='latlon', type=float, nargs=2, metavar=("LAT", "LON"), default=(1., 1.), help="tolerance of latitude and longitude in degrees [DEFAULT: %(default)s]") parser.add_argument('--counter', '--output-counter', dest='output_counter', type=int, default=100, help="how often to output progress updates [DEFAULT: %(default)s]") parser.add_argument('-k', dest='k', type=int, default=50, help="number of neighbors to determine [DEFAULT: %(default)s]") options = parser.parse_args(args) # get locations city_locations = locations(read_cities(options.cities, fields=fields)) options.cities.close() options.output.close() # calculate neighbors neighbors = calculate_neighbors(city_locations, k=options.k, lat_tol=options.latlon[0], lon_tol=options.latlon[-1], output=options.output_counter) # output print ("Outputting neighbors") sys.stdout.flush() import pdb; pdb.set_trace() with open(options.output.name, 'w') as f: f.write(json.dumps(neighbors)) if __name__ == '__main__': main()