Mercurial > hg > urlmatch
annotate urlmatch.py @ 6:0cd69fa6751c
add test for decomposition; stub diff, will have to move to 2-tuples to do this properly
| author | Jeff Hammel <jhammel@mozilla.com> | 
|---|---|
| date | Mon, 27 Jun 2011 07:16:46 -0700 | 
| parents | 23be092e6099 | 
| children | ef0553c4bbcd | 
| rev | line source | 
|---|---|
| 4 | 1 #!/usr/bin/env python | 
| 2 | |
try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3
| 4 | |
class UrlMatcher(object):
    """Match URLs against a stored collection of URLs.

    Every stored URL is decomposed into comparable fields (see `decompose`).
    `match` filters the stored URLs one field at a time, in the order given
    by `self.order`.
    """

    # `basestring` only exists on Python 2; fall back to `str` elsewhere so
    # the string checks below cannot raise NameError.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, *urls):
        """Create a matcher pre-populated with `urls`."""
        # fields are compared in this order by `match` and `diff`
        self.order = ('domain', 'scheme', 'path')
        # maps each stored URL string to its decomposed form
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Break `url` into the fields used for matching.

        Returns a dict with three keys:

        - 'domain': list of host labels, most significant first:
          'www.example.com' -> ['example', 'com', 'www'].  A host without
          dots is a one-element list; a missing host is an empty list.
        - 'path': list of path segments, ignoring leading/trailing slashes;
          an empty path is an empty list.
        - 'scheme': the scheme as a string ('' when the URL has none).

        The raw network location is split on '.', so a port or credentials,
        if present, stay attached to the label they appear in.
        """
        scheme, netloc, path = urlparse.urlsplit(url)[:3]

        # domain: normalise a missing host to [] the same way an empty path
        # is normalised below, so `match` treats it as a trivial field
        labels = netloc.split('.') if netloc else []
        if len(labels) > 1:
            # assume the last label is a TLD: registered name and TLD come
            # first, followed by the subdomains from right to left
            domain = labels[-2:] + labels[:-2][::-1]
        else:
            domain = labels

        # path
        segments = path.strip('/').split('/')
        if segments == ['']:
            segments = []

        # could do others (query, fragment)
        return {'domain': domain, 'path': segments, 'scheme': scheme}

    def add(self, url):
        """Store `url` (decomposed) unless it is already known."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Report the first field that only one of the two URLs provides.

        `url1` and `url2` may each be a URL string or an already decomposed
        dict.  Fields are inspected in `self.order`:

        - present in both and equal: skipped
        - present in both, different, string valued: raises
          NotImplementedError
        - present in both, different, list valued: not reported yet
        - present in only one: returns ``{field: value}``; a list value is
          reduced to its first element and an empty list is skipped

        Returns None when nothing is reported.

        TODO: finish -- reporting real differences needs both values
        (2-tuples) in the result.
        """
        # decompose the urls if necessary
        if isinstance(url1, self._string_types):
            url1 = self.decompose(url1)
        if isinstance(url2, self._string_types):
            url2 = self.decompose(url2)

        for field in self.order:
            in_first = field in url1
            in_second = field in url2

            if in_first and in_second:
                if url1[field] == url2[field]:
                    continue
                if isinstance(url1[field], self._string_types):
                    raise NotImplementedError(
                        "diff of differing %r values" % field)
                # differing list values are not reported yet
                continue

            if not (in_first or in_second):
                continue

            # look up only the side that actually has the field
            value = url1[field] if in_first else url2[field]
            if isinstance(value, self._string_types):
                return {field: value}
            if not value:
                # nothing to report for an empty list
                continue
            return {field: value[0]}

        return None

    def match(self, url):
        """Return the stored URLs that `url` matches.

        `url` may omit the scheme, in which case the scheme is not compared.
        String fields must be equal to the stored value; list fields must be
        a prefix of it.  Empty fields of `url` are not compared at all.

        Returns a set of stored URL strings, or an empty list as soon as no
        candidate is left.
        """
        if '://' not in url:
            # give a bogus scheme for urlparse. boo!
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        candidates = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue

            is_string = isinstance(value, self._string_types)
            length = len(value)
            survivors = set()
            for key in candidates:
                stored = self.urls[key].get(field)
                if not stored:
                    continue
                if is_string:
                    if stored == value:
                        survivors.add(key)
                elif stored[:length] == value:
                    # a shorter stored list can never equal the full value,
                    # so the prefix test also rejects too-short entries
                    survivors.add(key)

            candidates = survivors
            if not candidates:
                return []
        return candidates
| 101 | |
if __name__ == '__main__':
    # Small demonstration: store a few URLs, then print what each query
    # matches.  Single-argument print calls behave the same on Python 2
    # and Python 3.
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
    matcher.add('http://www.example.com/foo/blah')
    matcher.add('https://www.example.com/foo/')
    matcher.add('https://www.example.net/foo/')
    for query in ('example.com/foo/bar',
                  'http://example.com/foo',
                  'example.com',
                  'example'):
        print(matcher.match(query))
