view urlmatch.py @ 3:00266c7a7c3c

Since there is only one module, don't bother with the whole directory thing.
author Jeff Hammel <jhammel@mozilla.com>
date Fri, 17 Jun 2011 10:39:37 -0700
parents urlmatch/__init__.py@750dc780d3d8
children 23be092e6099
line wrap: on
line source

import urlparse

class UrlMatcher(object):
    """Match partial URLs against a registry of known URLs.

    Every registered URL is decomposed into comparable fields (see
    ``decompose``).  ``match`` decomposes its argument the same way and then
    narrows the registered URLs field by field, in the order stored in
    ``self.order``.
    """

    # ``basestring`` exists only on Python 2; fall back to ``str`` elsewhere
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, *urls):
        """Create a matcher and register each of ``urls``."""
        # fields are compared in this order by ``match``
        self.order = ('domain', 'scheme', 'path')
        # maps each registered URL string to its decomposed dict
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Break ``url`` into the fields used for matching.

        Returns a dict with three keys:

        * ``'domain'`` -- list of the dot-separated netloc labels.  When
          there is more than one label, the last two come first and the
          remaining labels follow in reverse order, e.g.
          ``'www.example.com'`` -> ``['example', 'com', 'www']``.  The netloc
          is split as-is; nothing is stripped from it beforehand.
        * ``'path'`` -- list of ``/``-separated path segments with leading
          and trailing slashes removed; ``[]`` for an empty path.
        * ``'scheme'`` -- the scheme string as returned by ``urlsplit``.
        """
        # imported here with a fallback because the module was renamed
        # between Python 2 (urlparse) and Python 3 (urllib.parse)
        try:
            from urlparse import urlsplit
        except ImportError:
            from urllib.parse import urlsplit

        # query and fragment are not used for matching
        scheme, netloc, path = urlsplit(url)[:3]

        # domain
        labels = netloc.split('.')
        if len(labels) > 1:
            # assert a TLD: the last two labels are the most significant
            # part, so they lead and the subdomains follow, innermost first
            labels = labels[-2:] + labels[:-2][::-1]

        # path
        stripped = path.strip('/')
        segments = stripped.split('/') if stripped else []

        # could do others

        return {'domain': labels, 'path': segments, 'scheme': scheme}

    def add(self, url):
        """Register ``url``; an already registered URL is left untouched."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Compare two URLs (unfinished).

        Either argument may be a URL string or an already decomposed
        value; strings are decomposed first.

        Raises:
            NotImplementedError: always, the comparison is not written yet.
        """
        # decompose the urls if necessary
        if isinstance(url1, self._string_types):
            url1 = self.decompose(url1)
        if isinstance(url2, self._string_types):
            url2 = self.decompose(url2)

        # TODO: finish
        raise NotImplementedError

    def match(self, url):
        """Return the registered URLs that ``url`` matches.

        ``url`` may omit the scheme (``'example.com/foo'``), in which case
        the scheme is not used for matching.  Empty fields of ``url`` are
        skipped.  A string field must be equal to the registered value; a
        list field must be a prefix of the registered list.

        Returns:
            A set of registered URL strings.  When a field eliminates every
            candidate an empty list (``[]``) is returned instead; this is
            kept as-is for backward compatibility.
        """
        if '://' not in url:
            # give a bogus scheme for urlparse: without one the host would
            # be parsed as part of the path.  Drop it again afterwards so
            # it takes no part in matching.
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        urls = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue
            exact = isinstance(value, self._string_types)
            length = len(value)
            # iterate over a copy, candidates are discarded while looping
            for key in list(urls):
                compare_value = self.urls[key].get(field)
                if (not compare_value
                        or (exact and value != compare_value)
                        # also covers registered values shorter than value
                        or compare_value[:length] != value):
                    urls.discard(key)
            if not urls:
                return []
        return urls

if __name__ == '__main__':
    # small demonstration: register a handful of URLs, then print the
    # matches for a few partial URLs
    matcher = UrlMatcher(
        'http://www.example.com/foo/bar/fleem',
        'http://www.example.com/foo/blah',
        'https://www.example.com/foo/',
        'https://www.example.net/foo/'
    )
    queries = (
        'example.com/foo/bar',
        'http://example.com/foo',
        'example.com',
        'example',
    )
    for query in queries:
        print(matcher.match(query))