Mercurial > hg > urlmatch
view urlmatch.py @ 10:b02420253bfd default tip
add recomposition and a test for it
author | Jeff Hammel <jhammel@mozilla.com> |
---|---|
date | Tue, 28 Jun 2011 18:39:18 -0700 |
parents | ef0553c4bbcd |
children |
line wrap: on
line source
#!/usr/bin/env python

"""Match URLs against a registered set by domain, scheme and path."""

try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3

try:
    _STRING_TYPES = basestring  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3


class UrlMatcher(object):
    """match urls

    URLs are stored in decomposed form (see `decompose`) keyed by their
    original string; `match` returns the stored URLs that a (possibly
    partial) query URL is a prefix of, field by field.
    """

    def __init__(self, *urls):
        """Create a matcher pre-populated with `urls`."""
        match_order = ('domain', 'scheme', 'path')
        # fields are compared in this order by `match` and `diff`
        self.order = match_order
        # maps original url string -> decomposed dict
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Break `url` into a dict with 'domain', 'path' and 'scheme' keys.

        - 'domain' is a list of host labels ordered most-significant first:
          the last two labels of the netloc, followed by the remaining
          labels in reverse ('www.example.com' -> ['example', 'com', 'www']).
          A netloc without any '.' is kept as a one-element list.
        - 'path' is the list of '/'-separated segments with leading and
          trailing slashes removed; an empty path gives [].
        - 'scheme' is the scheme string as returned by urlsplit.

        Query and fragment are ignored.
        """
        # break it down
        scheme, netloc, path = urlparse.urlsplit(url)[:3]
        urldict = {}

        # domain
        labels = netloc.split('.')
        if len(labels) == 1:
            urldict['domain'] = labels
        else:
            # assumes the final label is a TLD, so the last two labels
            # are treated together as the most significant part
            urldict['domain'] = labels[-2:] + list(reversed(labels[:-2]))

        # path
        segments = path.strip('/').split('/')
        if segments == ['']:
            segments = []
        urldict['path'] = segments

        # scheme
        urldict['scheme'] = scheme

        # could do others
        return urldict

    @classmethod
    def recompose(cls, url):
        """reconstruct a deconstructed url

        `url` is a dict in the format produced by `decompose`. It must
        contain a non-empty 'domain'; 'scheme' and 'path' are optional.
        A single-label domain is returned as-is, without scheme or path.
        """
        # must have a domain
        assert 'domain' in url
        assert url['domain']
        domain = url['domain']

        # reconstruct domain
        if len(domain) == 1:
            return domain[0]  # what else to do?
        retval = '%s.%s' % tuple(domain[:2])
        if len(domain) > 2:
            # subdomain labels are stored reversed; restore host order
            retval = '.'.join(reversed(domain[2:])) + '.' + retval

        # add the scheme
        if 'scheme' in url:
            retval = url['scheme'] + '://' + retval

        # add the path
        if 'path' in url:
            retval += '/' + '/'.join(url['path'])

        return retval

    def add(self, url):
        """Register `url` (a string) unless it is already known."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Return the first field (in `self.order`) where the urls differ.

        Each argument may be a url string or an already decomposed dict.
        Returns None when no difference is found. When a field is present
        in only one url, returns ``{field: (value1, value2)}`` for string
        values or ``{field: [(first1, first2)]}`` for list values, where a
        missing side is None.

        Raises NotImplementedError when a field is present in both urls
        with different values (not implemented yet).
        """
        # decompose the urls if necessary
        if isinstance(url1, _STRING_TYPES):
            url1 = self.decompose(url1)
        if isinstance(url2, _STRING_TYPES):
            url2 = self.decompose(url2)

        # TODO: finish
        for field in self.order:
            in_first = field in url1
            in_second = field in url2
            if in_first and in_second:
                if url1[field] == url2[field]:
                    continue
                # differing values are not handled yet, for strings or lists
                raise NotImplementedError
            if not in_first and not in_second:
                continue

            # the field is present on exactly one side
            retval1 = url1.get(field)
            retval2 = url2.get(field)
            if (isinstance(retval1, _STRING_TYPES)
                    or isinstance(retval2, _STRING_TYPES)):
                return {field: (retval1, retval2)}
            if retval1 is not None:
                retval1 = retval1[0]
            if retval2 is not None:
                retval2 = retval2[0]
            return {field: [(retval1, retval2)]}

    @staticmethod
    def _field_matches(value, candidate):
        """Return True if query `value` matches stored `candidate`.

        Strings must be equal; lists match when `value` is a prefix of
        `candidate`. An empty or missing candidate never matches.
        """
        if not candidate:
            return False
        if isinstance(value, _STRING_TYPES):
            return value == candidate
        # a candidate shorter than value yields a shorter slice: no match
        return candidate[:len(value)] == value

    def match(self, url):
        """Return the registered urls that `url` matches.

        `url` may omit the scheme ('example.com/foo'); the scheme is then
        not used for matching. Empty fields of `url` are skipped. For each
        remaining field, a registered url is kept only if the query value
        equals (strings) or is a prefix of (lists) the stored value.

        Returns a set of the matching url strings, or an empty list when
        a field eliminates every candidate.
        """
        if '://' not in url:
            # give a bogus scheme for urlparse.  boo!
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        urls = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue
            urls = set(key for key in urls
                       if self._field_matches(value,
                                              self.urls[key].get(field)))
            if not urls:
                return []
        return urls


if __name__ == '__main__':
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
    matcher.add('http://www.example.com/foo/blah')
    matcher.add('https://www.example.com/foo/')
    matcher.add('https://www.example.net/foo/')
    print(matcher.match('example.com/foo/bar'))
    print(matcher.match('http://example.com/foo'))
    print(matcher.match('example.com'))
    print(matcher.match('example'))