Mercurial > hg > urlmatch

#!/usr/bin/env python

import urlparse

class UrlMatcher(object):
    """Match (possibly partial) urls against a collection of known urls.

    Every url is broken down into a dict with the keys:

    * ``'domain'`` -- list of host labels ordered from most to least
      significant, e.g. ``www.example.com`` -> ``['example', 'com', 'www']``
      (a host without a dot is kept as a single-item list)
    * ``'scheme'`` -- the scheme as a string, e.g. ``'http'``
    * ``'path'`` -- list of path segments, e.g. ``/foo/bar`` ->
      ``['foo', 'bar']``

    List-valued fields are matched by prefix; string-valued fields must be
    equal.  NOTE: the network location is split on ``'.'`` as-is, so a port
    or userinfo in the url ends up inside the domain labels.
    """

    # string types to test against; falls back to ``str`` on interpreters
    # that have no ``basestring`` builtin
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__(self, *urls):
        """Create a matcher, registering each of ``urls`` via ``add``."""
        # fields are compared in this order by ``match`` and ``diff``
        self.order = ('domain', 'scheme', 'path')
        # maps the original url string -> its decomposed dict
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Break ``url`` down into a dict of 'domain', 'path' and 'scheme'.

        The url must carry a scheme (``scheme://...``) for ``urlsplit`` to
        recognize the network location.
        """
        scheme, netloc, path, _query, _fragment = urlparse.urlsplit(url)

        # domain: registered name + TLD first, then subdomains from the
        # closest to the farthest; a dotless host is kept untouched
        labels = netloc.split('.')
        if len(labels) > 1:
            labels = labels[-2:] + list(reversed(labels[:-2]))

        # path: segments without the leading/trailing slashes;
        # an empty path yields an empty list
        stripped = path.strip('/')
        segments = stripped.split('/') if stripped else []

        # could do others (query, fragment)
        return {'domain': labels, 'path': segments, 'scheme': scheme}

    @classmethod
    def recompose(cls, url):
        """Reconstruct a url string from a deconstructed url dict.

        ``url`` must contain a non-empty 'domain'; 'scheme' and 'path' are
        optional.  An empty scheme is omitted instead of producing a
        malformed ``://host``.  A present 'path' always contributes a ``/``
        after the host, even when it is empty.

        Raises ``AssertionError`` if the domain is missing or empty.
        """
        domain = url.get('domain')
        assert domain, "a deconstructed url needs a non-empty 'domain'"

        # subdomains are stored closest-first, so flip them back in front
        # of the leading (name, TLD) labels; a single-label host falls out
        # of the same expression
        retval = '.'.join(list(reversed(domain[2:])) + list(domain[:2]))

        # add the scheme
        if url.get('scheme'):
            retval = url['scheme'] + '://' + retval

        # add the path
        if 'path' in url:
            retval += '/' + '/'.join(url['path'])

        return retval

    def add(self, url):
        """Register ``url`` (a string) unless it is already known."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Return the first difference between two urls, in match order.

        Each argument is either a url string or an already decomposed dict.

        Returns ``None`` when no difference is found.  When a field is
        present in only one of the urls, returns ``{field: (v1, v2)}`` for
        string-valued fields or ``{field: [(first1, first2)]}`` (leading
        items, ``None`` for the missing side) for list-valued fields.  A
        list field that is absent on one side and empty on the other is
        not considered a difference.

        Raises ``NotImplementedError`` when a field is present in both urls
        with different values (TODO: finish).
        """
        strings = self._string_types

        # decompose the urls if necessary
        if isinstance(url1, strings):
            url1 = self.decompose(url1)
        if isinstance(url2, strings):
            url2 = self.decompose(url2)

        for field in self.order:
            in_first = field in url1
            in_second = field in url2
            if not in_first and not in_second:
                continue

            if in_first and in_second:
                if url1[field] == url2[field]:
                    continue
                raise NotImplementedError(
                    'diff of differing %r values is not implemented' % field)

            # the field exists in exactly one of the urls
            value1 = url1.get(field)
            value2 = url2.get(field)
            if isinstance(value1, strings) or isinstance(value2, strings):
                return {field: (value1, value2)}

            # guard against empty sequences, which have no leading item
            first1 = value1[0] if value1 else None
            first2 = value2[0] if value2 else None
            if first1 is None and first2 is None:
                continue
            return {field: [(first1, first2)]}

        return None

    def match(self, url):
        """Return the set of known urls that ``url`` matches.

        ``url`` may omit the scheme, in which case the scheme is not
        compared.  Fields that are empty in ``url`` are ignored; for every
        other field a known url must have a non-empty value that equals
        (strings) or starts with (lists) the value from ``url``.

        Returns an empty list -- not an empty set -- when nothing matches;
        this is kept as-is for backward compatibility.
        """
        if '://' not in url:
            # give a bogus scheme for urlparse. boo!
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        candidates = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue

            exact = isinstance(value, self._string_types)
            width = len(value)
            # iterate over a copy since candidates shrinks as we go
            for key in list(candidates):
                known = self.urls[key].get(field)
                if not known:
                    candidates.discard(key)
                elif exact:
                    # strings (the scheme) must be equal, not just a prefix
                    if value != known:
                        candidates.discard(key)
                elif known[:width] != value:
                    # lists match when the query is a prefix of the known
                    # value; a shorter known value can never satisfy this
                    candidates.discard(key)

            if not candidates:
                return []
        return candidates

if __name__ == '__main__':
    # small demo: register a handful of urls, then print what each
    # (partial) query matches
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem',
                         'http://www.example.com/foo/blah',
                         'https://www.example.com/foo/',
                         'https://www.example.net/foo/')
    queries = ('example.com/foo/bar',
               'http://example.com/foo',
               'example.com',
               'example')
    for query in queries:
        print(matcher.match(query))
author	Jeff Hammel <jhammel@mozilla.com>
date	Tue, 28 Jun 2011 18:39:18 -0700
parents	ef0553c4bbcd
children