Mercurial > hg > wordstream
comparison wordstream/api.py @ 0:8af3412e907a
initial import of wordstream
author | k0s <k0scist@gmail.com> |
---|---|
date | Fri, 01 Jan 2010 19:35:32 -0500 |
parents | |
children | df84e61ae1e4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8af3412e907a |
---|---|
1 import urllib2 | |
2 | |
class Corpus(dict):
    """Mapping of each word to the list of words that were seen after it.

    Associations are stored in plain lists.  ``feed`` appends to a word's
    list and ``eat`` pops from the end of it.
    """

    # ``str`` plus the unicode type: equivalent to ``basestring`` on
    # Python 2 and still valid on interpreters without ``basestring``.
    _STRING_TYPES = (str, type(u""))

    def __init__(self, corpus=None):
        """Create a corpus, optionally seeded from an existing mapping.

        :param corpus: mapping of word -> list of associations, or None
        """
        # ``dict.__init__`` is called unbound, so ``self`` must be passed
        # explicitly; without it the seed mapping is silently discarded.
        dict.__init__(self, corpus or {})

    def feed(self, word, association):
        """Record ``association`` as one more follower of ``word``."""
        self.setdefault(word, []).append(association)

    def eat(self, word):
        """Remove and return the last association stored for ``word``.

        Returns None when ``word`` is unknown or has no associations left.
        """
        followers = self.get(word)
        if followers:
            return followers.pop()
        return None

    def feed_stream(self, stream):
        """Feed every adjacent pair of words in ``stream``.

        :param stream: a whitespace-separated string, or an iterable of
                       words

        Pairs are fed from the end of the stream towards the start, so a
        later ``eat`` returns followers in their original reading order.
        The caller's sequence is left untouched.
        """
        if isinstance(stream, self._STRING_TYPES):
            words = stream.split()
        else:
            # Work on a copy so the caller's list is not drained.
            words = list(stream)
        for index in range(len(words) - 1, 0, -1):
            self.feed(words[index - 1], words[index])

    def feed_stuff(self, *args):
        """Feed the text found at each argument.

        :param args: ``http://`` / ``https://`` URLs or local file paths
        """
        for arg in args:
            if arg.startswith(('https://', 'http://')):
                response = urllib2.urlopen(arg)
                try:
                    # Read the body: the response object itself has no
                    # length and cannot be fed directly.
                    text = response.read()
                finally:
                    response.close()
            else:
                with open(arg) as source:
                    text = source.read()
            self.feed_stream(text)

    def save(self, filename):
        """Write the corpus as one ``word assoc assoc ...`` line per word.

        :param filename: path to write to, or an already-open writable
                         file-like object (which is left open)
        """
        named = isinstance(filename, self._STRING_TYPES)
        # Only a file opened here is owned (and closed) here.
        f = open(filename, 'w') if named else filename
        try:
            for key in sorted(self.keys()):
                f.write("%s %s\n" % (key, ' '.join(self[key])))
        finally:
            if named:
                f.close()

    def load(self, f):
        """Read lines in the format written by ``save`` into this corpus.

        :param f: path to read from, or an iterable of lines such as an
                  open file (which is left open)

        The first token of each line is the word and the remaining tokens
        are appended to its associations.  Blank lines are skipped.
        """
        named = isinstance(f, self._STRING_TYPES)
        if named:
            f = open(f)
        try:
            for line in f:
                tokens = line.split()
                if not tokens:
                    continue
                self.setdefault(tokens[0], []).extend(tokens[1:])
        finally:
            if named:
                f.close()

    @classmethod
    def restore(cls, filename):
        """Build and return a new corpus loaded from ``filename``."""
        corpus = cls()
        corpus.load(filename)
        return corpus
49 |