Mercurial > hg > wordstream

import urllib2

class Corpus(dict):
    """Word-association store: maps a word to a list of associated words.

    ``feed_stream`` records, for every adjacent pair of words in a stream,
    the second word as an association of the first. ``eat`` hands those
    associations back out one at a time.
    """

    # ``basestring`` only exists on Python 2; fall back to ``str`` where it
    # is missing so the string checks below do not raise NameError.
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__(self, corpus=None):
        """Create a corpus, optionally seeded from the mapping *corpus*."""
        # ``self`` must be passed explicitly; without it the initial data
        # was applied to a throwaway dict and silently discarded.
        dict.__init__(self, corpus or {})

    def feed(self, word, association):
        """Record *association* as the newest association of *word*."""
        self.setdefault(word, []).append(association)

    def eat(self, word):
        """Remove and return the newest association of *word*.

        Returns None when *word* is unknown or has no associations left.
        """
        associations = self.get(word)
        if associations:
            return associations.pop()

    def feed_stream(self, stream):
        """Feed every adjacent word pair of *stream* into the corpus.

        *stream* is either a string, which is split on whitespace, or an
        iterable of words. The caller's sequence is left unmodified.
        """
        if isinstance(stream, self._string_types):
            words = stream.split()
        else:
            # Work on a copy so the caller's list is not consumed.
            words = list(stream)
        # Pairs are fed from the end of the stream towards the start, so the
        # earliest association of a word ends up last in its list and is the
        # first one returned by ``eat``.
        for index in range(len(words) - 1, 0, -1):
            self.feed(words[index - 1], words[index])

    def feed_stuff(self, *args):
        """Feed the contents of each argument into the corpus.

        Arguments starting with ``http://`` or ``https://`` are fetched with
        ``urllib2.urlopen``; anything else is opened as a local file path.
        """
        for arg in args:
            if arg.startswith(('https://', 'http://')):
                source = urllib2.urlopen(arg)
            else:
                source = open(arg)
            try:
                # Read the text out of the handle; ``feed_stream`` expects a
                # string or a sequence of words, not a file-like object.
                text = source.read()
            finally:
                source.close()
            self.feed_stream(text)

    def save(self, filename):
        """Write the corpus to *filename*, one word per line.

        *filename* is either a path, which is opened for writing and closed
        afterwards, or an already-open object with a ``write`` method, which
        is left open. Each line is the word followed by its associations,
        separated by single spaces, with words in sorted order.
        """
        named = isinstance(filename, self._string_types)
        f = open(filename, 'w') if named else filename
        try:
            for key in sorted(self):
                f.write("%s %s\n" % (key, ' '.join(self[key])))
        finally:
            # Only close handles this method opened itself.
            if named:
                f.close()

    def load(self, f):
        """Read associations in the format written by ``save``.

        *f* is either a path or an iterable of lines (such as an open file).
        The first token of each line is the word and the remaining tokens
        are appended to that word's associations; blank lines are skipped.
        """
        named = isinstance(f, self._string_types)
        source = open(f) if named else f
        try:
            for line in source:
                tokens = line.split()
                if tokens:
                    self.setdefault(tokens[0], []).extend(tokens[1:])
        finally:
            if named:
                source.close()

    @classmethod
    def restore(cls, filename):
        """Return a new corpus loaded from *filename* (see ``load``)."""
        corpus = cls()
        corpus.load(filename)
        return corpus
author	k0s <k0scist@gmail.com>
date	Fri, 01 Jan 2010 19:35:32 -0500
parents
children	df84e61ae1e4