Mercurial > hg > tvii
changeset 87:9d5a5e9f5c3b
add kmeans + dataset
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 17 Dec 2017 14:05:57 -0800 |
parents | b56d329c238d |
children | 596dac7f3e98 |
files | tests/test_kmeans.py tvii/dataset/__init__.py tvii/dataset/circle.py tvii/dataset/cli.py tvii/dataset/gauss.py tvii/dataset/line.py tvii/dataset/rand.py tvii/kmeans.py |
diffstat | 7 files changed, 391 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test_kmeans.py Sun Dec 17 14:05:57 2017 -0800 @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +""" +tests K means algorithm +""" + +import unittest +from tvii import kmeans +from nettwerk.dataset.circle import CircularRandom + + +class TestKMeans(unittest.TestCase): + + def test_dualing_gaussians(self): + """tests two gaussian distributions; first, cut overlap""" + # TODO + + def test_circles(self): + """test with two circles of points""" + + # generate two non-overlapping circles + n_points = 10000 # per circle + p1 = CircularRandom((-1.5, 0), 1)(n_points) + p2 = CircularRandom((1.5, 0), 1)(n_points) + + # run kmeans + classes, centroids = kmeans.kmeans(p1+p2, 2) + + # sanity + assert len(centroids) == 2 + assert len(classes) == 2 + + # the centroids should have opposite x values + xprod = centroids[0][0] * centroids[1][0] + assert xprod < 0. + assert abs(xprod + 2.25) < 0.1 + + # assert we're kinda close + for c in centroids: + c = [abs(i) for i in c] + assert abs(c[0]-1.5) < 0.1 + assert abs(c[1]) < 0.1 + + # its a pretty clean break; our points should be exact, most likely + if centroids[0][0] < 0.: + left = 0 + right = 1 + else: + left = 1 + right = 0 + assert sorted(p1) == sorted(classes[left]) + assert sorted(p2) == sorted(classes[right]) + + def test_help(self): + """smoketest for CLI""" + + try: + kmeans.main(['--help']) + except SystemExit: + # this is expected + pass + + +if __name__ == '__main__': + unittest.main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tvii/dataset/circle.py Sun Dec 17 14:05:57 2017 -0800 @@ -0,0 +1,74 @@ +#!/usr/bin/env python + +""" +random points in a circle +""" +# TODO: a `D`-sphere + +import math +import random +import sys +from .cli import DatasetGenerationParser + + +class CircularRandom(object): + + twopi = 2.*math.pi + + def __init__(self, center, radius=1.): + """ + center -- (x,y) center of circle + """ + + assert radius > 0 + self.center = (x, y) = center + self.radius = radius + + def __call__(self, n_points): + """samples `n_points`""" + + return [self.point() for _ in xrange(n_points)] + + def point(self): + """samples a single point""" + # TODO: because area grows as 1/r, this will bias + # points towards the center. This should be corrected + + r = self.radius*random.random() + theta = self.twopi*random.random() + x = math.cos(theta) + y = math.sqrt(1. - x*x) + if theta > math.pi: + y = -y + return (self.center[0] + x*r, + self.center[1] + y*r) + + +def main(args=sys.argv[1:]): + """CLI""" + + # parse command line + parser = DatasetGenerationParser(description=__doc__) + parser.add_argument('--center', dest='center', + nargs=2, metavar=('X', 'Y'), + type=float, default=(0., 0.), + help="center of circle [DEFAULT: %(default)s]") + parser.add_argument('-R', '--radius', dest='radius', + type=float, default=1., + help="radius of circle [DEFAULT: %(default)s]") + options = parser.parse_args(args) + + # instantiate sampler + circle = CircularRandom(options.center, options.radius) + + # get writer + writer = parser.writer() + + # sample + points = circle(options.number) + + # output + writer.writerows(points) + +if __name__ == '__main__': + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tvii/dataset/cli.py Sun Dec 17 14:05:57 2017 -0800 @@ -0,0 +1,29 @@ +""" +command line abstraction for dataset generations +""" + +import csv +import argparse +import sys +from ..cli import CLIParser + + +class DatasetGenerationParser(CLIParser): + """argument parser for dataset generation""" + + def add_arguments(self): + self.add_argument('-N', '--number', dest='number', + type=int, default=100, + help="number of points [DEFAULT: %(default)s]") + self.add_argument('-D', '--dimensions', dest='dimensions', + type=int, default=2, + help="number of dimensions of data [DEFAULT: %(default)s]") + self.add_argument('-o', '--output', dest='output', + type=argparse.FileType('w'), default=sys.stdout, + help="where to write the data to, or stdout by default") + + def writer(self): + """returns CSV writer as specified by arguments""" + + assert self.options + return csv.writer(self.options.output)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tvii/dataset/gauss.py Sun Dec 17 14:05:57 2017 -0800 @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +""" +generate a Gaussian dataset +""" + +import random +import sys +from .cli import DatasetGenerationParser + + +class GaussianSampler(object): + + def __init__(self, D, sigma=1., center=None): + assert D > -1 + self.D = D + if center is None: + self.center = [0]*self.D + else: + assert len(center) == self.D + self.center = center + self.sigma = sigma + + def __call__(self): + """yield one point of a Gaussian distribution""" + + return [random.gauss(x, self.sigma) + for x in self.center] + + +def main(args=sys.argv[1:]): + """CLI""" + + # parse command line + parser = DatasetGenerationParser(description=__doc__) + options = parser.parse_args(args) + + # point generator + generator = GaussianSampler(D=options.dimensions) + points = [generator() for _ in xrange(options.number)] + + # output + parser.writer().writerows(points) + +if __name__ == '__main__': + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tvii/dataset/line.py Sun Dec 17 14:05:57 2017 -0800 @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- + +""" +generate a line segment +""" + +import sys +from .cli import DatasetGenerationParser + + +def x_range(x_min, x_max, n): + """1-D range function""" + + dx = (x_max - x_min)/(n - 1.) + return [x_min + dx*i for i in range(n)] + + +class Line(object): + """linear dataset""" + + # TODO: should woprk in `D` dimensions + def __init__(self, W, b): + self.W = W + self.b = b + + def y(self, x): + return self.W*x + self.b + + def __call__(self, *x): + return [self.y(_x) for _x in x] + + +def main(args=sys.argv[1:]): + """CLI""" + + # parse command line + parser = DatasetGenerationParser(description=__doc__) + parser.add_argument('W', type=float, + help="slope") + parser.add_argument('b', type=float, + help="offset") + parser.add_argument('-x', dest='x_range', type=float, + nargs=2, metavar=('MIN', 'MAX'), default=(0., 1.), + help="range of `x`") + options = parser.parse_args(args) + + # instantiate line generator + line = Line(options.W, options.b) + + # generate `x` + x = x_range(options.x_range[0], options.x_range[1], options.number) + + # generate `y` + y = line(*x) + + # output points + parser.writer().writerows(zip(x, y))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tvii/dataset/rand.py Sun Dec 17 14:05:57 2017 -0800 @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +generate a random dataset +""" + +import csv +import random +import sys +from .cli import DatasetGenerationParser + + +class RandomDataset(object): + + def __init__(self, N, D, bounds=(0., 1.)): + assert len(bounds) == 2 + self.N = N + self.D = D + self.bounds = bounds + self.length = bounds[-1] - bounds[0] + + self.points = [self() for _ in xrange(self.N)] + + def translate(self, fraction): + return self.length*fraction + self.bounds[0] + + def __call__(self): + """generate one `D`-dimensional point of data""" + + return [self.translate(random.random()) for i in xrange(self.D)] + + +def main(args=sys.argv[1:]): + """CLI""" + + # parse command line + parser = DatasetGenerationParser(description=__doc__) + options = parser.parse_args(args) + + # make some random points + points = RandomDataset(N=options.number, D=options.dimensions) + + # output them + parser.writer().writerows(points.points) + + +if __name__ == '__main__': + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tvii/kmeans.py Sun Dec 17 14:05:57 2017 -0800 @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +K-means unsupervised learning algorithm +""" + +import csv +import os +import random +import sys +from .centroid import centroid +from .cli import CLIParser +from .distance import distance +from .read import read + + +def kmeans(x, k): + """ + applies K-means algorithm to data set `x` + to determine `k` classes of the problem + """ + + # initialization: + # pick `k` arbitrary centroids + assert k <= len(x) + centroids = random.sample(x, k) + oldcentroids = None + + while centroids != oldcentroids: + # ???convergence? + + # - divide `x` into `k` classes based on distance + classes = [[] for i in range(k)] + for point in x: + closest, d = min([(index, distance(point, c)) + for index, c in enumerate(centroids)], + key=lambda x: x[1]) + classes[closest].append(point) + + # - move centroids to the center of the points + oldcentroids = centroids + centroids = [centroid(*pts) for pts in classes] + + return (classes, centroids) + + + +def main(args=sys.argv[1:]): + """CLI""" + + # parse command line + parser = CLIParser(description=__doc__) + parser.add_argument('points', type=read, + help="points to consider") + parser.add_argument('--k', dest='k', + type=int, default=2, + help="number of classes to discern [DEFAULT: %(default)s]") + options = parser.parse_args(args) + + # run kmeans + classes, centroids = kmeans(options.points, options.k) + + # output centroids + # TODO: if an output flag is specified then output the different classes + writer = csv.writer(sys.stdout) + for c in centroids: + writer.writerow(c) + +if __name__ == '__main__': + main()