changeset 87:9d5a5e9f5c3b

add kmeans + dataset
author Jeff Hammel <k0scist@gmail.com>
date Sun, 17 Dec 2017 14:05:57 -0800
parents b56d329c238d
children 596dac7f3e98
files tests/test_kmeans.py tvii/dataset/__init__.py tvii/dataset/circle.py tvii/dataset/cli.py tvii/dataset/gauss.py tvii/dataset/line.py tvii/dataset/rand.py tvii/kmeans.py
diffstat 7 files changed, 391 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_kmeans.py	Sun Dec 17 14:05:57 2017 -0800
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+"""
+tests K means algorithm
+"""
+
+import unittest
+from tvii import kmeans
+from nettwerk.dataset.circle import CircularRandom
+
+
+class TestKMeans(unittest.TestCase):
+
+    def test_dualing_gaussians(self):
+        """tests two gaussian distributions;  first, cut overlap"""
+        # TODO
+
+    def test_circles(self):
+        """test with two circles of points"""
+
+        # generate two non-overlapping circles
+        n_points = 10000   # per circle
+        p1 = CircularRandom((-1.5, 0), 1)(n_points)
+        p2 = CircularRandom((1.5, 0), 1)(n_points)
+
+        # run kmeans
+        classes, centroids = kmeans.kmeans(p1+p2, 2)
+
+        # sanity
+        assert len(centroids) == 2
+        assert len(classes) == 2
+
+        # the centroids should have opposite x values
+        xprod = centroids[0][0] * centroids[1][0]
+        assert xprod < 0.
+        assert abs(xprod + 2.25) < 0.1
+
+        # assert we're kinda close
+        for c in centroids:
+            c = [abs(i) for i in c]
+            assert abs(c[0]-1.5) < 0.1
+            assert abs(c[1]) < 0.1
+
+        # its a pretty clean break; our points should be exact, most likely
+        if centroids[0][0] < 0.:
+            left = 0
+            right = 1
+        else:
+            left = 1
+            right = 0
+        assert sorted(p1) == sorted(classes[left])
+        assert sorted(p2) == sorted(classes[right])
+
+    def test_help(self):
+        """smoketest for CLI"""
+
+        try:
+            kmeans.main(['--help'])
+        except SystemExit:
+            # this is expected
+            pass
+
+
+if __name__ == '__main__':
+    unittest.main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tvii/dataset/circle.py	Sun Dec 17 14:05:57 2017 -0800
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+"""
+random points in a circle
+"""
+# TODO: a `D`-sphere
+
+import math
+import random
+import sys
+from .cli import DatasetGenerationParser
+
+
+class CircularRandom(object):
+
+    twopi = 2.*math.pi
+
+    def __init__(self, center, radius=1.):
+        """
+        center -- (x,y) center of circle
+        """
+
+        assert radius > 0
+        self.center = (x, y) = center
+        self.radius = radius
+
+    def __call__(self, n_points):
+        """samples `n_points`"""
+
+        return [self.point() for _ in xrange(n_points)]
+
+    def point(self):
+        """samples a single point"""
+        # TODO: because area grows as 1/r, this will bias
+        # points towards the center. This should be corrected
+
+        r = self.radius*random.random()
+        theta = self.twopi*random.random()
+        x = math.cos(theta)
+        y = math.sqrt(1. - x*x)
+        if theta > math.pi:
+            y = -y
+        return (self.center[0] + x*r,
+                self.center[1] + y*r)
+
+
+def main(args=sys.argv[1:]):
+    """CLI"""
+
+    # parse command line
+    parser = DatasetGenerationParser(description=__doc__)
+    parser.add_argument('--center', dest='center',
+                        nargs=2, metavar=('X', 'Y'),
+                        type=float, default=(0., 0.),
+                        help="center of circle [DEFAULT: %(default)s]")
+    parser.add_argument('-R', '--radius', dest='radius',
+                        type=float, default=1.,
+                        help="radius of circle [DEFAULT: %(default)s]")
+    options = parser.parse_args(args)
+
+    # instantiate sampler
+    circle = CircularRandom(options.center, options.radius)
+
+    # get writer
+    writer = parser.writer()
+
+    # sample
+    points = circle(options.number)
+
+    # output
+    writer.writerows(points)
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tvii/dataset/cli.py	Sun Dec 17 14:05:57 2017 -0800
@@ -0,0 +1,29 @@
+"""
+command line abstraction for dataset generations
+"""
+
+import csv
+import argparse
+import sys
+from ..cli import CLIParser
+
+
+class DatasetGenerationParser(CLIParser):
+    """argument parser for dataset generation"""
+
+    def add_arguments(self):
+        self.add_argument('-N', '--number', dest='number',
+                          type=int, default=100,
+                          help="number of points [DEFAULT: %(default)s]")
+        self.add_argument('-D', '--dimensions', dest='dimensions',
+                          type=int, default=2,
+                          help="number of dimensions of data [DEFAULT: %(default)s]")
+        self.add_argument('-o', '--output', dest='output',
+                          type=argparse.FileType('w'), default=sys.stdout,
+                          help="where to write the data to, or stdout by default")
+
+    def writer(self):
+        """returns CSV writer as specified by arguments"""
+
+        assert self.options
+        return csv.writer(self.options.output)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tvii/dataset/gauss.py	Sun Dec 17 14:05:57 2017 -0800
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+"""
+generate a Gaussian dataset
+"""
+
+import random
+import sys
+from .cli import DatasetGenerationParser
+
+
+class GaussianSampler(object):
+
+    def __init__(self, D, sigma=1., center=None):
+        assert D > -1
+        self.D = D
+        if center is None:
+            self.center = [0]*self.D
+        else:
+            assert len(center) == self.D
+            self.center = center
+        self.sigma = sigma
+
+    def __call__(self):
+        """yield one point of a Gaussian distribution"""
+
+        return [random.gauss(x, self.sigma)
+                for x in self.center]
+
+
+def main(args=sys.argv[1:]):
+    """CLI"""
+
+    # parse command line
+    parser = DatasetGenerationParser(description=__doc__)
+    options = parser.parse_args(args)
+
+    # point generator
+    generator = GaussianSampler(D=options.dimensions)
+    points = [generator() for _ in xrange(options.number)]
+
+    # output
+    parser.writer().writerows(points)
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tvii/dataset/line.py	Sun Dec 17 14:05:57 2017 -0800
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+
+"""
+generate a line segment
+"""
+
+import sys
+from .cli import DatasetGenerationParser
+
+
+def x_range(x_min, x_max, n):
+    """1-D range function"""
+
+    dx = (x_max - x_min)/(n - 1.)
+    return [x_min + dx*i for i in range(n)]
+
+
+class Line(object):
+    """linear dataset"""
+
+    # TODO: should woprk in `D` dimensions
+    def __init__(self, W, b):
+        self.W = W
+        self.b = b
+
+    def y(self, x):
+        return self.W*x + self.b
+
+    def __call__(self, *x):
+        return [self.y(_x) for _x in x]
+
+
+def main(args=sys.argv[1:]):
+    """CLI"""
+
+    # parse command line
+    parser = DatasetGenerationParser(description=__doc__)
+    parser.add_argument('W', type=float,
+                        help="slope")
+    parser.add_argument('b', type=float,
+                        help="offset")
+    parser.add_argument('-x', dest='x_range', type=float,
+                        nargs=2, metavar=('MIN', 'MAX'), default=(0., 1.),
+                        help="range of `x`")
+    options = parser.parse_args(args)
+
+    # instantiate line generator
+    line = Line(options.W, options.b)
+
+    # generate `x`
+    x = x_range(options.x_range[0], options.x_range[1], options.number)
+
+    # generate `y`
+    y = line(*x)
+
+    # output points
+    parser.writer().writerows(zip(x, y))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tvii/dataset/rand.py	Sun Dec 17 14:05:57 2017 -0800
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+generate a random dataset
+"""
+
+import csv
+import random
+import sys
+from .cli import DatasetGenerationParser
+
+
+class RandomDataset(object):
+
+    def __init__(self, N, D, bounds=(0., 1.)):
+        assert len(bounds) == 2
+        self.N = N
+        self.D = D
+        self.bounds = bounds
+        self.length = bounds[-1] - bounds[0]
+
+        self.points = [self() for _ in xrange(self.N)]
+
+    def translate(self, fraction):
+        return self.length*fraction + self.bounds[0]
+
+    def __call__(self):
+        """generate one `D`-dimensional point of data"""
+
+        return [self.translate(random.random()) for i in xrange(self.D)]
+
+
+def main(args=sys.argv[1:]):
+    """CLI"""
+
+    # parse command line
+    parser = DatasetGenerationParser(description=__doc__)
+    options = parser.parse_args(args)
+
+    # make some random points
+    points = RandomDataset(N=options.number, D=options.dimensions)
+
+    # output them
+    parser.writer().writerows(points.points)
+
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tvii/kmeans.py	Sun Dec 17 14:05:57 2017 -0800
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+K-means unsupervised learning algorithm
+"""
+
+import csv
+import os
+import random
+import sys
+from .centroid import centroid
+from .cli import CLIParser
+from .distance import distance
+from .read import read
+
+
+def kmeans(x, k):
+    """
+    applies K-means algorithm to data set `x`
+    to determine `k` classes of the problem
+    """
+
+    # initialization:
+    # pick `k` arbitrary centroids
+    assert k <= len(x)
+    centroids = random.sample(x, k)
+    oldcentroids = None
+
+    while centroids != oldcentroids:
+        # ???convergence?
+
+        # - divide `x` into `k` classes based on distance
+        classes = [[] for i in range(k)]
+        for point in x:
+            closest, d = min([(index, distance(point, c))
+                              for index, c in enumerate(centroids)],
+                             key=lambda x: x[1])
+            classes[closest].append(point)
+
+        # - move centroids to the center of the points
+        oldcentroids = centroids
+        centroids = [centroid(*pts) for pts in classes]
+
+    return (classes, centroids)
+
+
+
+def main(args=sys.argv[1:]):
+    """CLI"""
+
+    # parse command line
+    parser = CLIParser(description=__doc__)
+    parser.add_argument('points', type=read,
+                        help="points to consider")
+    parser.add_argument('--k', dest='k',
+                        type=int, default=2,
+                        help="number of classes to discern [DEFAULT: %(default)s]")
+    options = parser.parse_args(args)
+
+    # run kmeans
+    classes, centroids = kmeans(options.points, options.k)
+
+    # output centroids
+    # TODO:  if an output flag is specified then output the different classes
+    writer = csv.writer(sys.stdout)
+    for c in centroids:
+        writer.writerow(c)
+
+if __name__ == '__main__':
+    main()