annotate wordstream/api.py @ 0:8af3412e907a

initial import of wordstream
author k0s <k0scist@gmail.com>
date Fri, 01 Jan 2010 19:35:32 -0500
parents
children df84e61ae1e4
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
1 import urllib2
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
2
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
3 class Corpus(dict):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
4
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
5 def __init__(self, corpus=None):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
6 dict.__init__(corpus or {})
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
7
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
8 def feed(self, word, association):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
9 self.setdefault(word, []).append(association)
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
10
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
11 def eat(self, word):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
12 if word in self and self[word]:
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
13 return self[word].pop()
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
14
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
15 def feed_stream(self, stream):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
16 if isinstance(stream, basestring):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
17 stream = stream.split()
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
18 while len(stream) > 1:
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
19 self.feed(stream[-2], stream[-1])
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
20 stream.pop()
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
21
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
22 def feed_stuff(self, *args):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
23 for arg in args:
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
24 if arg.startswith('https://') or arg.startswith('http://'):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
25 text = urllib2.urlopen(arg)
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
26 else:
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
27 text = file(arg).read()
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
28 self.feed_stream(text)
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
29
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
30 def save(self, filename):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
31 named = False
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
32 if isinstance(f, basestring):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
33 named = True
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
34 f = file(f)
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
35 for key in sorted(self.keys()):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
36 print >> f, "%s %s" % (key, ' '.join(self[key]))
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
37 if named:
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
38 f.close()
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
39
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
40 def load(self, f):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
41 if isinstance(f, basestring):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
42 f = file(f)
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
43
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
44
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
45 @classmethod
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
46 def restore(cls, filename):
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
47 corpus = cls()
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
48 corpus.load(filename)
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
49