changeset 0:8bd0c3b2163e
create urlmatch package
author:    Jeff Hammel <jhammel@mozilla.com>
date:      Mon, 13 Jun 2011 21:25:06 -0700
parents:   (none)
children:  750dc780d3d8
files:     setup.py urlmatch.txt urlmatch/__init__.py
diffstat:  3 files changed, 132 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/setup.py	Mon Jun 13 21:25:06 2011 -0700
@@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+import sys, os
+
+version = '0.0'
+
+setup(name='urlmatch',
+      version=version,
+      description="match urls systematically",
+      long_description="""\
+""",
+      classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
+      keywords='url',
+      author='Jeff Hammel',
+      author_email='jhammel@mozilla.com',
+      url='http://k0s.org/mozilla/hg/urlmatch',
+      license='MPL',
+      packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
+      include_package_data=True,
+      zip_safe=False,
+      install_requires=[
+          # -*- Extra requirements: -*-
+      ],
+      entry_points="""
+      # -*- Entry points: -*-
+      """,
+      )
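As a quick, hypothetical sanity check (assuming the package has been installed, e.g. with python setup.py develop, and that UrlMatcher from urlmatch/__init__.py below is the intended entry point)::

    # hypothetical usage; not part of this changeset
    from urlmatch import UrlMatcher

    matcher = UrlMatcher('http://example.com/foo')
    print matcher.match('example.com')  # expect: set(['http://example.com/foo'])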
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/urlmatch.txt	Mon Jun 13 21:25:06 2011 -0700
@@ -0,0 +1,21 @@
+urlmatch
+========
+
+Test making a basic tree::
+
+    >>> urlmatcher = UrlMatcher()
+    >>> urls = ['http://example.com/foo/bar.html',
+    ...         'http://example.com/foo/baz.html',
+    ...         'http://example.com/foo/fleem.html']
+    >>> urlmatcher.add(*urls)
+    >>> urlmatcher.tree()
+    {'http://example.com/foo/': ['bar.html', 'baz.html', 'fleem.html']}
+
+Now a more complex tree::
+
+    >>> urlmatcher = UrlMatcher()
+    >>> urlmatcher.add(*['http://example.com/index.html',
+    ...                  'https://example.com/',
+    ...                  'http://gitcub.com/k0s'])
+    >>> urlmatcher.tree()
+    {'example.com': ['
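Note that neither tree() nor the variadic add() used above exists in urlmatch/__init__.py below (add() takes a single url); the doctest is a design sketch. A minimal, hypothetical implementation of the single-level grouping the first example implies (the function name and output shape are assumptions, not part of this changeset)::

    import posixpath

    def tree(urls):
        # hypothetical sketch: group urls under their longest shared
        # directory prefix, as the first doctest implies
        prefix = posixpath.commonprefix(list(urls))
        prefix = prefix[:prefix.rfind('/') + 1]  # cut back to a '/' boundary
        return {prefix: [url[len(prefix):] for url in urls]}

    urls = ['http://example.com/foo/bar.html',
            'http://example.com/foo/baz.html',
            'http://example.com/foo/fleem.html']
    print tree(urls)
    # {'http://example.com/foo/': ['bar.html', 'baz.html', 'fleem.html']}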
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/urlmatch/__init__.py	Mon Jun 13 21:25:06 2011 -0700
@@ -0,0 +1,85 @@
+import urlparse
+
+class UrlMatcher(object):
+
+    def __init__(self, *urls):
+        match_order=('domain', 'scheme', 'path')
+        self.order = match_order
+        self.urls = {}
+        for url in urls:
+            self.add(url)
+
+    def decompose(self, url):
+
+        # break it down
+        (scheme, netloc, path, query, fragment) = urlparse.urlsplit(url)
+        urldict = {}
+
+        # domain
+        netloc = netloc.split('.')
+        if len(netloc) == 1:
+            urldict['domain'] = netloc
+        else:
+            # assert a TLD
+            urldict['domain'] = [netloc[-2], netloc[-1]] + list(reversed(netloc[0:-2]))
+
+        # path
+        path = path.strip('/').split('/')
+        if path == ['']:
+            path = []
+        urldict['path'] = path
+
+        # scheme
+        urldict['scheme'] = scheme
+
+        # could do others
+
+        return urldict
+
+    def add(self, url):
+        if url not in self.urls:
+            self.urls[url] = self.decompose(url)
+
+    def match(self, url):
+        if '://' not in url:
+            # give a bogus scheme for urlparse. boo!
+            urldict = self.decompose('bogus://' + url)
+            urldict.pop('scheme')
+        else:
+            urldict = self.decompose(url)
+
+        order = self.order
+        urls = set(self.urls.keys())
+        for field in order:
+            value = urldict.get(field)
+            if not value:
+                # don't match trivial fields
+                continue
+            length = len(value)
+            deleted = set()
+            for key in list(urls)[:]:
+                compare_value = self.urls[key].get(field)
+                if not compare_value:
+                    urls.discard(key)
+                    continue
+                if isinstance(value, basestring) and value != compare_value:
+                    urls.discard(key)
+                    continue
+                if len(compare_value) < length:
+                    urls.discard(key)
+                    continue
+                if compare_value[:len(value)] != value:
+                    urls.discard(key)
+            if not urls:
+                return []
+        return urls
+
+if __name__ == '__main__':
+    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
+    matcher.add('http://www.example.com/foo/blah')
+    matcher.add('https://www.example.com/foo/')
+    matcher.add('https://www.example.net/foo/')
+    print matcher.match('example.com/foo/bar')
+    print matcher.match('http://example.com/foo')
+    print matcher.match('example.com')
+    print matcher.match('example')
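To make the matching model concrete: decompose() stores domain components most-significant-first (registered domain, then subdomain labels from right to left), so match() can treat every field of a partial URL as a prefix query against the stored urls. A short illustration (assuming the package is importable as urlmatch)::

    from urlmatch import UrlMatcher

    matcher = UrlMatcher()
    parts = matcher.decompose('http://www.example.com/foo/bar')
    print parts['scheme']  # http
    print parts['domain']  # ['example', 'com', 'www']
    print parts['path']    # ['foo', 'bar']

Given the four URLs registered in the __main__ block above, matcher.match('example.com/foo/bar') should keep only http://www.example.com/foo/bar/fleem: the query's domain ['example', 'com'] and path ['foo', 'bar'] are prefixes of that entry's fields, while the other three entries fail either the path comparison or the .net domain.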