Mercurial > hg > GlobalNeighbors
changeset 22:e69cb496324e
we have a data dump
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 25 Jun 2017 17:45:19 -0700 |
parents | 22c384fe954d |
children | 6891c5523b69 |
files | globalneighbors/distance.py globalneighbors/neighbors.py globalneighbors/web.py tests/test_write.py |
diffstat | 4 files changed, 106 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/globalneighbors/distance.py Sun Jun 25 16:28:56 2017 -0700
+++ b/globalneighbors/distance.py Sun Jun 25 17:45:19 2017 -0700
@@ -158,6 +158,11 @@
     return neighbors
 
 
+def write_neighbors(fp, neighbors):
+    for key, value in neighbors.iteritems():
+        fp.write("{key} {value}\n".format(key=key,
+                                          value=json.dumps(value)))
+
 def main(args=sys.argv[1:]):
     """CLI"""
 
@@ -179,11 +184,11 @@
                         help="number of neighbors to determine [DEFAULT: %(default)s]")
     options = parser.parse_args(args)
 
-    # parse cities
-    cities = list(read_cities(options.cities, fields=fields))
+    # get locations
+    city_locations = locations(read_cities(options.cities, fields=fields))
+    options.cities.close()
+    options.output.close()
 
-    # get locations
-    city_locations = locations(cities)
 
     # calculate neighbors
     neighbors = calculate_neighbors(city_locations,
@@ -195,7 +200,9 @@
     # output
     print ("Outputting neighbors")
     sys.stdout.flush()
-    options.output.write(json.dumps(neighbors))
+    import pdb; pdb.set_trace()
+    with open(options.output.name, 'w') as f:
+        f.write(json.dumps(neighbors))
 
 if __name__ == '__main__':
     main()
```
```diff
--- a/globalneighbors/neighbors.py Sun Jun 25 16:28:56 2017 -0700
+++ b/globalneighbors/neighbors.py Sun Jun 25 17:45:19 2017 -0700
@@ -2,10 +2,15 @@
 read neighbors file; this should be in the form of:
 
 
-`{geoid: [(geoid_closest_neighbor, distance),
-          (geoid_2nd_closest_neighbor, distance),
-          ...]
- }`
+`geoid [(geoid_closest_neighbor, distance), (geoid_2nd_closest_neighbor, distance), ...]`
+
+*PER LINE* this format was chosen because it is easier to
+iteratively read and write vs JSON.
+
+While CSV could be made to fit this model, because
+there are both distances and geo IDs as pairs, it is not
+the most natural fit. So we'll settle for our own data model.
+No, it's not the best, but so be it (for now).
 """
 
 import json
@@ -21,6 +26,9 @@
 
     retval = {}
     for line in f:
-        data = json.loads(line)
-        retval.update(data)
+        key, value = line.split(None, 1)
+        key = int(key)
+        data = json.loads(value)
+        data = [tuple(item) for item in data]
+        retval[key] = data
     return retval
```
```diff
--- a/globalneighbors/web.py Sun Jun 25 16:28:56 2017 -0700
+++ b/globalneighbors/web.py Sun Jun 25 17:45:19 2017 -0700
@@ -108,6 +108,19 @@
                         body=json.dumps(self.cities(
                             startswith=request.GET.get('term'))))
 
+class NeighborsHandler(Handler):
+
+    content_type = 'application/json'
+
+    def __init__(self, neighbors):
+        self.neighbors = neighbors
+
+    def GET(self, request):
+        geoid = request.GET.get('geoid')
+        neighbors = self.neighbors.get(geoid, [])
+        return Response(content_type=self.content_type,
+                        body=json.dumps(neighbors))
+
 class GlobalHandler(Handler):
     """WSGI HTTP Handler"""
 
@@ -126,7 +139,9 @@
                                            fields=fields)
         self.locations = locations(self.cities)
         if neighbors_file:
-            pass  # TODO
+            self.neighbors = read_neighbors_file(neighbors_file)
+        else:
+            self.neighbors = None
 
         # get country codes
         self.country_codes = sorted(set([city['country code']
@@ -166,6 +181,8 @@
             if not city:
                 return
             variables = dict(city=city)
+            if self.neighbors:
+                import pdb; pdb.set_trace()
             return Response(content_type=self.content_type,
                             body=self.citypage.render(variables))
         except ValueError:
```
```diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_write.py Sun Jun 25 17:45:19 2017 -0700
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+
+"""
+test writing + reading distances
+"""
+
+import os
+import shutil
+import tempfile
+import unittest
+from common import datafile
+from globalneighbors.distance import calculate_neighbors
+from globalneighbors.distance import write_neighbors
+from globalneighbors.locations import locations
+from globalneighbors.neighbors import read_neighbors_file
+from globalneighbors.read import read_cities
+
+
+class TestDistanceReadWrite(unittest.TestCase):
+
+    def test_10000(self):
+        """test 10000 cities"""
+
+        # read locations
+        citiesfile = datafile('10000cities.tsv')
+        assert os.path.exists(citiesfile)
+        with open(citiesfile) as f:
+            city_locations = locations(read_cities(f))
+
+        # calculate neighbors
+        neighbors = calculate_neighbors(city_locations,
+                                        k=50,
+                                        lat_tol=2.,
+                                        lon_tol=2.)
+
+        # make a staging area
+        tmpdir = tempfile.mkdtemp()
+        try:
+            # write the neighbors
+            outfile = os.path.join(tmpdir, 'neighbors.dat')
+            assert not os.path.exists(outfile)
+            with open(outfile, 'w') as f:
+                write_neighbors(f, neighbors)
+            assert os.path.exists(outfile)
+
+            # read the neighbors
+            with open(outfile) as f:
+                new_neighbors = read_neighbors_file(f)
+        finally:
+            shutil.rmtree(tmpdir, ignore_errors=True)
+
+        # they should be equal
+        assert len(neighbors) == len(new_neighbors)
+        assert sorted(neighbors.keys()) == sorted(new_neighbors.keys())
+        for key in neighbors.keys():
+            valueA = neighbors[key]
+            valueB = new_neighbors[key]
+            assert valueA == valueB
+
+
+if __name__ == '__main__':
+    unittest.main()
```