urlmatch: urlmatch.py comparison

comparison urlmatch.py @ 3:00266c7a7c3c

Since there is only one module, don't bother with the whole directory thing.

author	Jeff Hammel <jhammel@mozilla.com>
date	Fri, 17 Jun 2011 10:39:37 -0700
parents	urlmatch/__init__.py@750dc780d3d8
children	23be092e6099

comparison

equal deleted inserted replaced

-:20dde2687cfb
+:00266c7a7c3c
+import urlparse
class UrlMatcher(object):
    """Match a (possibly partial) URL against a collection of known URLs.

    Each known URL is decomposed into comparable fields ('domain', 'scheme'
    and 'path'). A query URL matches a known URL when, for every
    non-empty field of the query, the known URL's field equals it (scheme)
    or starts with it (domain and path component lists).
    """

    # order in which the decomposed fields are compared by match()
    MATCH_ORDER = ('domain', 'scheme', 'path')

    def __init__(self, *urls):
        """Create a matcher pre-populated with `urls`."""
        self.order = self.MATCH_ORDER
        self.urls = {}  # original URL string -> decomposed field dict
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Break `url` into a dict with 'domain', 'path' and 'scheme' keys.

        * 'domain' is a list of host labels ordered most-significant first:
          'www.example.com' becomes ['example', 'com', 'www'].  A host with
          a single label is kept as a one-element list; an empty host
          becomes [].
        * 'path' is the list of path segments with leading/trailing slashes
          removed; an empty path becomes [].
        * 'scheme' is the scheme string as returned by urlsplit.

        The query string and fragment are ignored.
        """
        scheme, netloc, path = urlparse.urlsplit(url)[:3]
        urldict = {}

        # domain
        labels = netloc.split('.')
        if labels == ['']:
            # no host at all: keep the field trivial (falsy), exactly as
            # is done for an empty path below, so match() can skip it
            labels = []
        if len(labels) > 1:
            # assume the last label is a TLD: put "name, tld" first and
            # then the subdomains from the nearest one outwards
            labels = labels[-2:] + labels[:-2][::-1]
        urldict['domain'] = labels

        # path
        segments = path.strip('/').split('/')
        if segments == ['']:
            segments = []
        urldict['path'] = segments

        # scheme
        urldict['scheme'] = scheme

        # could do others
        return urldict

    def add(self, url):
        """Register `url` with the matcher; already-known URLs are kept."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Compare two URLs (strings or decomposed dicts).

        Not implemented yet: always raises NotImplementedError after
        normalising its arguments.
        """
        # decompose the urls if necessary
        if isinstance(url1, basestring):
            url1 = self.decompose(url1)
        if isinstance(url2, basestring):
            url2 = self.decompose(url2)
        # TODO: finish
        raise NotImplementedError

    @staticmethod
    def _field_matches(value, candidate):
        """Return True if a stored field `candidate` satisfies query `value`."""
        if not candidate:
            # the stored URL has nothing to compare against
            return False
        if isinstance(value, basestring):
            # strings (the scheme) must match exactly, not by prefix,
            # otherwise 'http' would match 'https'
            return value == candidate
        # lists (domain, path): the query must be a prefix of the stored
        # value; a shorter stored list can never compare equal here
        return candidate[:len(value)] == value

    def match(self, url):
        """Return the known URLs that `url` matches.

        `url` may omit the scheme ('example.com/foo'), in which case the
        scheme is not used for matching.  Fields that are empty in `url`
        are ignored.

        Returns a set of the matching URL strings, or an empty list when a
        compared field eliminates every candidate.
        """
        if '://' not in url:
            # give a bogus scheme for urlparse. boo!
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        urls = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue
            urls = set(key for key in urls
                       if self._field_matches(value,
                                              self.urls[key].get(field)))
            if not urls:
                return []
        return urls
if __name__ == '__main__':
    # small demonstration: register a few URLs, then print what a handful
    # of progressively vaguer queries match
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
    for known_url in ('http://www.example.com/foo/blah',
                      'https://www.example.com/foo/',
                      'https://www.example.net/foo/'):
        matcher.add(known_url)

    for query in ('example.com/foo/bar',
                  'http://example.com/foo',
                  'example.com',
                  'example'):
        print(matcher.match(query))

Mercurial > hg > urlmatch

comparison urlmatch.py @ 3:00266c7a7c3c