Mercurial > hg > wordstream
view wordstream/api.py @ 0:8af3412e907a
initial import of wordstream
author | k0s <k0scist@gmail.com> |
---|---|
date | Fri, 01 Jan 2010 19:35:32 -0500 |
parents | |
children | df84e61ae1e4 |
line wrap: on
line source
try:
    import urllib2
except ImportError:  # Python 3: urlopen lives in urllib.request
    import urllib.request as urllib2

try:
    _string_types = basestring  # Python 2: covers both str and unicode
except NameError:
    _string_types = str  # Python 3


class Corpus(dict):
    """Mapping of a word to the list of words seen directly after it.

    Each value is used as a stack: `feed` appends an association and
    `eat` pops the most recently appended one.
    """

    def __init__(self, corpus=None):
        """Create a corpus, optionally pre-populated from `corpus`.

        `corpus` may be anything the `dict` constructor accepts (a mapping
        or an iterable of key/value pairs); `None` gives an empty corpus.
        """
        # `self` must be passed explicitly when calling the unbound
        # dict.__init__, otherwise the initial data is never stored.
        dict.__init__(self, corpus or {})

    def feed(self, word, association):
        """Record that `association` follows `word`."""
        self.setdefault(word, []).append(association)

    def eat(self, word):
        """Remove and return the latest association for `word`.

        Returns None when `word` is unknown or has no associations left.
        """
        associations = self.get(word)
        if associations:
            return associations.pop()
        return None

    def feed_stream(self, stream):
        """Feed every adjacent pair of words in `stream`.

        `stream` is either a string (split on whitespace) or an iterable
        of words. The caller's sequence is not modified.

        Pairs are fed from the end of the stream towards the start so
        that `eat`, which pops from the end of each list, yields
        associations in their original reading order.
        """
        if isinstance(stream, _string_types):
            words = stream.split()
        else:
            # Work on a copy so the caller's list is not consumed.
            words = list(stream)
        for index in range(len(words) - 1, 0, -1):
            self.feed(words[index - 1], words[index])

    def feed_stuff(self, *args):
        """Feed the text found at each location in `args`.

        Each argument is a string: one starting with 'https://' or
        'http://' is fetched with urlopen, anything else is opened as a
        local file path.
        """
        for arg in args:
            if arg.startswith(('https://', 'http://')):
                response = urllib2.urlopen(arg)
                try:
                    # Read the body; the response object itself is not text.
                    text = response.read()
                finally:
                    response.close()
                if not isinstance(text, _string_types):
                    # Python 3 returns bytes from urlopen().read().
                    text = text.decode('utf-8', 'replace')
            else:
                with open(arg) as source:
                    text = source.read()
            self.feed_stream(text)

    def save(self, filename):
        """Write the corpus, one '<word> <associations...>' line per key.

        `filename` is either a path (opened for writing and closed when
        done) or an already-open writable file-like object (left open).
        Keys are written in sorted order.
        """
        named = isinstance(filename, _string_types)
        f = open(filename, 'w') if named else filename
        try:
            for key in sorted(self):
                f.write("%s %s\n" % (key, ' '.join(self[key])))
        finally:
            # Only close what this method opened.
            if named:
                f.close()

    def load(self, f):
        """Read lines in the format written by `save` into this corpus.

        `f` is either a path or an open, line-iterable file-like object.
        The first token of each line is the word and the remaining tokens
        are its associations; blank lines are skipped. Associations are
        appended to any already present for that word.
        """
        named = isinstance(f, _string_types)
        stream = open(f) if named else f
        try:
            for line in stream:
                tokens = line.split()
                if not tokens:
                    continue
                self.setdefault(tokens[0], []).extend(tokens[1:])
        finally:
            # Only close what this method opened.
            if named:
                stream.close()

    @classmethod
    def restore(cls, filename):
        """Return a new corpus loaded from `filename` (path or file object)."""
        corpus = cls()
        corpus.load(filename)
        return corpus