0
|
1 import urllib2
|
|
2
|
|
class Corpus(dict):
    """Dictionary mapping each word to a list of associated words.

    Associations are appended with :meth:`feed` and consumed (last in,
    first out) with :meth:`eat`. A corpus can be written with :meth:`save`
    as one ``"word assoc assoc ..."`` line per word and read back with
    :meth:`load` / :meth:`restore`.
    """

    # ``basestring`` exists only on Python 2; fall back to ``str`` elsewhere
    # so the path-vs-file-object checks below work on both interpreters.
    try:
        _text_types = (basestring,)  # noqa: F821
    except NameError:
        _text_types = (str,)

    def __init__(self, corpus=None):
        """Create a corpus, optionally pre-populated from *corpus*.

        *corpus* may be anything ``dict`` accepts (a mapping or an iterable
        of pairs); ``None`` or an empty value gives an empty corpus.
        """
        # ``self`` must be passed explicitly: without it the initial data was
        # loaded into the temporary argument dict and then thrown away.
        dict.__init__(self, corpus or {})

    def feed(self, word, association):
        """Append *association* to the list stored under *word*."""
        self.setdefault(word, []).append(association)

    def eat(self, word):
        """Remove and return the most recently fed association of *word*.

        Returns ``None`` when *word* is unknown or has no associations left.
        """
        if word in self and self[word]:
            return self[word].pop()
        return None

    def feed_stream(self, stream):
        """Feed every pair of adjacent tokens in *stream*.

        *stream* is either a string, which is split on whitespace, or an
        iterable of tokens. Pairs are fed from the end of the stream towards
        the start, so :meth:`eat` hands associations back in stream order.
        The caller's sequence is not modified.
        """
        if isinstance(stream, self._text_types):
            tokens = stream.split()
        else:
            # Work on a copy: the original popped from the caller's list.
            tokens = list(stream)
        for index in range(len(tokens) - 1, 0, -1):
            self.feed(tokens[index - 1], tokens[index])

    @staticmethod
    def _fetch_url(url):
        """Download *url* and return its body as text."""
        try:
            from urllib2 import urlopen  # Python 2
        except ImportError:
            from urllib.request import urlopen  # Python 3
        response = urlopen(url)
        try:
            payload = response.read()
        finally:
            response.close()
        if not isinstance(payload, str):
            # Python 3 returns bytes. NOTE(review): UTF-8 is assumed here;
            # the response's declared charset is not consulted.
            payload = payload.decode('utf-8', 'replace')
        return payload

    def feed_stuff(self, *args):
        """Feed the text behind each argument.

        Arguments starting with ``http://`` or ``https://`` are downloaded;
        anything else is treated as the path of a local file to read.
        """
        for arg in args:
            if arg.startswith(('https://', 'http://')):
                # Read the response body; the response object itself is not
                # a token sequence and cannot be fed directly.
                text = self._fetch_url(arg)
            else:
                with open(arg) as handle:
                    text = handle.read()
            self.feed_stream(text)

    def save(self, filename):
        """Write the corpus as one ``"word assoc assoc ..."`` line per word.

        *filename* is either a path, which is opened for writing and closed
        afterwards, or an already open writable file object, which is left
        open. Words are written in sorted order.
        """
        named = isinstance(filename, self._text_types)
        # Paths must be opened in write mode (the default mode is read-only).
        out = open(filename, 'w') if named else filename
        try:
            for key in sorted(self):
                out.write("%s %s\n" % (key, ' '.join(self[key])))
        finally:
            if named:
                out.close()

    def load(self, f):
        """Read lines in the format written by :meth:`save` into the corpus.

        *f* is either a path, which is opened and closed here, or an open
        file object / iterable of lines. The first token of each line is the
        word and the remaining tokens are appended to its associations;
        blank lines are skipped.
        """
        named = isinstance(f, self._text_types)
        source = open(f) if named else f
        try:
            for line in source:
                fields = line.split()
                if not fields:
                    continue
                self.setdefault(fields[0], []).extend(fields[1:])
        finally:
            if named:
                source.close()

    @classmethod
    def restore(cls, filename):
        """Return a new corpus loaded from *filename* (path or file object)."""
        corpus = cls()
        corpus.load(filename)
        return corpus
|
|
49
|