annotate wordstream/api.py @ 21:c05704d14958 default tip

py3
author Jeff Hammel <k0scist@gmail.com>
date Wed, 02 Feb 2022 16:15:36 +0000
parents 49ff2772891d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
Jeff Hammel <k0scist@gmail.com>
parents: 2
diff changeset
1 from urllib.request import urlopen
2
df84e61ae1e4 add dissociation
k0s <k0scist@gmail.com>
parents: 0
diff changeset
2 from random import shuffle
0
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
3
8af3412e907a initial import of wordstream
k0s <k0scist@gmail.com>
parents:
diff changeset
class Corpus(dict):
    """Mapping of a word to the list of words that have followed it.

    Keys are words; each value is a list of associations (words seen
    immediately after the key).  Associations are consumed from the end of
    the list by :meth:`eat`.
    """

    def __init__(self, corpus=None):
        """Create a corpus, optionally seeded from an existing mapping.

        :param corpus: mapping of word -> list of associations, or None
            for an empty corpus.
        """
        # The original called ``dict.__init__(corpus or {})``, which passed
        # the argument as ``self`` and left the new instance empty.
        super().__init__(corpus or {})

    def feed(self, word, association):
        """Record that ``association`` follows ``word``."""
        self.setdefault(word, []).append(association)

    def eat(self, word):
        """Pop and return the most recently stored association of ``word``.

        Returns None when ``word`` is unknown.  When ``word`` is present but
        its association list is already empty, the key is removed and None
        is returned.
        """
        if word not in self:
            return None
        associations = self[word]
        if associations:
            return associations.pop()
        del self[word]
        return None

    def feed_stream(self, stream):
        """Feed every consecutive pair of words in ``stream``.

        :param stream: ``bytes`` (decoded as UTF-8), ``str`` (split on
            whitespace) or any iterable of words.  The caller's object is
            never modified.
        """
        if isinstance(stream, bytes):
            stream = stream.decode('utf-8')
        if isinstance(stream, str):
            words = stream.split()
        else:
            # Work on a private copy: the caller's list must not be
            # consumed, and generators need materialising for indexing.
            words = list(stream)
        # Walk the pairs from the end of the stream to the start so that
        # each association list keeps the historical ordering (the earliest
        # occurrence ends up last and is therefore eaten first).
        for index in range(len(words) - 1, 0, -1):
            self.feed(words[index - 1], words[index])

    def feed_stuff(self, *args):
        """Feed the text found at each URL or file path in ``args``.

        Arguments starting with ``http://`` or ``https://`` are fetched with
        ``urlopen``; anything else is opened as a local text file.
        """
        for arg in args:
            if arg.startswith(('https://', 'http://')):
                with urlopen(arg) as response:
                    text = response.read()
            else:
                # Context manager so the file handle is closed promptly.
                with open(arg) as source:
                    text = source.read()
            self.feed_stream(text)

    def scramble(self):
        """Shuffle every association list in place."""
        for associations in self.values():
            shuffle(associations)

    def save(self, f):
        """Write the corpus as one ``word assoc1 assoc2 ...`` line per key.

        Keys are written in sorted order.

        :param f: a path (``str``), which is opened for writing and closed
            afterwards, or an already-open writable text file object, which
            is left open for the caller.
        """
        if isinstance(f, str):
            # Must be opened for writing; the default read mode made every
            # write raise io.UnsupportedOperation.
            with open(f, 'w') as handle:
                self.save(handle)
            return
        for key in sorted(self):
            f.write("%s %s\n" % (key, ' '.join(self[key])))

    def load(self, f):
        """Read lines in the format written by :meth:`save` into the corpus.

        Each non-blank line is split on whitespace: the first token is the
        word and the remaining tokens are appended to its associations.
        Existing entries are extended rather than replaced.

        :param f: a path (``str``), which is opened and closed here, or an
            iterable of text lines such as an open file object.
        """
        if isinstance(f, str):
            with open(f) as handle:
                self.load(handle)
            return
        for line in f:
            tokens = line.split()
            if not tokens:
                continue
            word, associations = tokens[0], tokens[1:]
            self.setdefault(word, []).extend(associations)

    @classmethod
    def restore(cls, filename):
        """Build and return a new corpus loaded from ``filename``."""
        corpus = cls()
        corpus.load(filename)
        # The original fell off the end and returned None.
        return corpus