annotate tvii/kmeans.py @ 87:9d5a5e9f5c3b

add kmeans + dataset
author Jeff Hammel <k0scist@gmail.com>
date Sun, 17 Dec 2017 14:05:57 -0800
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
87
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2 # -*- coding: utf-8 -*-
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4 """
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 K-means unsupervised learning algorithm
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6 """
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 import csv
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
9 import os
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
10 import random
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
11 import sys
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
12 from .centroid import centroid
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
13 from .cli import CLIParser
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
14 from .distance import distance
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
15 from .read import read
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
16
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
17
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def kmeans(x, k):
    """
    Apply the K-means algorithm to data set `x`
    to determine `k` classes of the problem.

    `x` is a sequence of points and `k` the number of classes to find.
    Points are only ever passed to `distance(point, centroid)` and
    `centroid(*points)`, so they must be whatever those helpers accept
    (presumably equal-length numeric sequences -- confirm against
    `.distance` and `.centroid`).

    Returns a tuple `(classes, centroids)`: `classes` is a list of `k`
    lists of points and `centroids` is a list of `k` centroids.  `classes`
    is the assignment computed against the centroids of the final
    iteration, which at convergence equal the returned ones.

    Raises `ValueError` if `k` is larger than the number of points.
    """

    # initialization:
    # pick `k` arbitrary centroids from the data itself
    if k > len(x):
        # explicit check rather than `assert`, which is stripped under -O
        raise ValueError(
            "cannot pick {} centroids from {} points".format(k, len(x)))
    centroids = random.sample(x, k)
    oldcentroids = None

    # converged once an update step leaves every centroid unchanged
    while centroids != oldcentroids:

        # - divide `x` into `k` classes based on distance
        classes = [[] for _ in range(k)]
        for point in x:
            distances = [distance(point, c) for c in centroids]
            # ties go to the lowest-numbered centroid
            closest = distances.index(min(distances))
            classes[closest].append(point)

        # - move centroids to the center of the points
        oldcentroids = centroids
        # A class can end up empty (e.g. two identical initial centroids:
        # ties always go to the first).  Keep its previous centroid rather
        # than calling `centroid()` with no points.
        centroids = [centroid(*pts) if pts else previous
                     for pts, previous in zip(classes, oldcentroids)]

    return (classes, centroids)
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
46
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
47
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
48
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def main(args=None):
    """
    CLI: run K-means on the given points and write the resulting
    centroids to stdout as CSV, one centroid per row.

    `args` is the list of command line arguments; when omitted,
    `sys.argv[1:]` is read at call time (not frozen at import time).
    """

    if args is None:
        args = sys.argv[1:]

    # parse command line
    parser = CLIParser(description=__doc__)
    # the positional value is converted by `read` (presumably loading the
    # points from the named file -- confirm against `.read`)
    parser.add_argument('points', type=read,
                        help="points to consider")
    parser.add_argument('--k', dest='k',
                        type=int, default=2,
                        help="number of classes to discern [DEFAULT: %(default)s]")
    options = parser.parse_args(args)

    # run kmeans
    classes, centroids = kmeans(options.points, options.k)

    # output centroids
    # TODO: if an output flag is specified then output the different classes
    writer = csv.writer(sys.stdout)
    writer.writerows(centroids)
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
69
9d5a5e9f5c3b add kmeans + dataset
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()