10
|
1 from urllib.request import urlopen
|
2
|
2 from random import shuffle
|
0
|
3
|
|
4 class Corpus(dict):
|
|
5
|
|
6 def __init__(self, corpus=None):
|
|
7 dict.__init__(corpus or {})
|
|
8
|
|
9 def feed(self, word, association):
|
|
10 self.setdefault(word, []).append(association)
|
|
11
|
|
12 def eat(self, word):
|
2
|
13 if word in self:
|
|
14 if self[word]:
|
|
15 return self[word].pop()
|
|
16 else:
|
|
17 del self[word]
|
0
|
18
|
|
19 def feed_stream(self, stream):
|
19
|
20 if isinstance(stream, bytes):
|
|
21 stream = stream.decode('utf-8')
|
18
|
22 if isinstance(stream, str):
|
0
|
23 stream = stream.split()
|
|
24 while len(stream) > 1:
|
|
25 self.feed(stream[-2], stream[-1])
|
|
26 stream.pop()
|
|
27
|
|
28 def feed_stuff(self, *args):
|
|
29 for arg in args:
|
|
30 if arg.startswith('https://') or arg.startswith('http://'):
|
10
|
31 with urlopen(arg) as response:
|
|
32 text = response.read()
|
0
|
33 else:
|
17
|
34 text = open(arg).read()
|
0
|
35 self.feed_stream(text)
|
|
36
|
2
|
37 def scramble(self):
|
|
38 for i in self:
|
|
39 shuffle(self[i])
|
|
40
|
17
|
41 def save(self, f):
|
0
|
42 named = False
|
17
|
43 if isinstance(f, str):
|
0
|
44 named = True
|
17
|
45 f = open(f)
|
0
|
46 for key in sorted(self.keys()):
|
17
|
47 f.write("%s %s\n" % (key, ' '.join(self[key])))
|
0
|
48 if named:
|
|
49 f.close()
|
17
|
50
|
0
|
51 def load(self, f):
|
17
|
52 if isinstance(f, str):
|
|
53 f = open(f)
|
0
|
54
|
|
55 @classmethod
|
|
56 def restore(cls, filename):
|
|
57 corpus = cls()
|
|
58 corpus.load(filename)
|