10
|
1 from urllib.request import urlopen
|
2
|
2 from random import shuffle
|
0
|
3
|
|
class Corpus(dict):
    """Word-association table: maps a word to a list of associated words.

    ``feed_stream`` records, for every pair of adjacent words in a text,
    the second word as an association of the first.  ``eat`` consumes
    those associations one at a time.
    """

    def __init__(self, corpus=None):
        """Create a corpus, optionally pre-populated from *corpus*.

        Args:
            corpus: A mapping (or anything ``dict`` accepts) of word to list
                of associations.  ``None`` or an empty value gives an empty
                corpus.
        """
        # Must go through super() so that *this* instance is initialised;
        # calling dict.__init__ on the argument itself would drop the data.
        super().__init__(corpus or {})

    def feed(self, word, association):
        """Append *association* to the list stored under *word*."""
        self.setdefault(word, []).append(association)

    def eat(self, word):
        """Remove and return the last association stored for *word*.

        Returns:
            The popped association, or ``None`` when *word* is unknown or
            its list is already empty.  An empty list is deleted from the
            corpus as a side effect.
        """
        if word not in self:
            return None
        associations = self[word]
        if associations:
            return associations.pop()
        # Nothing left for this word: drop the exhausted entry.
        del self[word]
        return None

    def feed_stream(self, stream):
        """Feed every pair of adjacent words in *stream* into the corpus.

        Args:
            stream: Either a string (split on whitespace) or an iterable of
                words.  The argument itself is never modified.
        """
        words = stream.split() if isinstance(stream, str) else list(stream)
        # Pairs are fed last-to-first so that ``eat`` (which pops from the
        # end of each list) hands associations back in text order.
        for word, following in reversed(list(zip(words, words[1:]))):
            self.feed(word, following)

    def feed_stuff(self, *args):
        """Feed the text found at each URL or local file path in *args*.

        Arguments starting with ``http://`` or ``https://`` are fetched with
        ``urlopen``; anything else is opened as a local file.
        """
        for arg in args:
            if arg.startswith(('https://', 'http://')):
                with urlopen(arg) as response:
                    # read() returns bytes; decode using the charset the
                    # server declared, falling back to UTF-8.
                    charset = response.headers.get_content_charset() or 'utf-8'
                    text = response.read().decode(charset, errors='replace')
            else:
                with open(arg) as source:
                    text = source.read()
            self.feed_stream(text)

    def scramble(self):
        """Shuffle every association list in place."""
        for associations in self.values():
            shuffle(associations)

    def save(self, filename):
        """Write the corpus as text, one ``word assoc assoc ...`` line per key.

        Args:
            filename: A path string, or an already-open writable text file.
                A file opened here is closed before returning; a file object
                passed in is left open for the caller.
        """
        if isinstance(filename, str):
            with open(filename, 'w') as handle:
                self._write_lines(handle)
        else:
            self._write_lines(filename)

    def load(self, f):
        """Merge entries written by ``save`` into this corpus.

        Args:
            f: A path string, or an already-open readable text file.  A file
                opened here is closed before returning.
        """
        if isinstance(f, str):
            with open(f) as handle:
                self._read_lines(handle)
        else:
            self._read_lines(f)

    @classmethod
    def restore(cls, filename):
        """Return a new corpus loaded from *filename* (path or open file)."""
        corpus = cls()
        corpus.load(filename)
        return corpus

    def _write_lines(self, handle):
        """Write one line per key, in sorted key order, to *handle*."""
        for key in sorted(self):
            handle.write("%s %s\n" % (key, ' '.join(self[key])))

    def _read_lines(self, handle):
        """Parse ``word assoc assoc ...`` lines from *handle* into the corpus."""
        for line in handle:
            tokens = line.split()
            if not tokens:
                continue  # skip blank lines
            key, *associations = tokens
            self.setdefault(key, []).extend(associations)