Mercurial > hg > urlmatch
annotate urlmatch.py @ 7:ef0553c4bbcd
more stubbing
| author | Jeff Hammel <jhammel@mozilla.com> |
|---|---|
| date | Mon, 27 Jun 2011 11:07:38 -0700 |
| parents | 0cd69fa6751c |
| children | b02420253bfd |
| rev | line source |
|---|---|
| 4 | 1 #!/usr/bin/env python |
| 2 | |
try:
    import urlparse
except ImportError:  # Python 3: the module was folded into urllib.parse
    import urllib.parse as urlparse
| 4 | |
class UrlMatcher(object):
    """Match URLs against a registry of known URLs.

    Every registered URL is stored in decomposed form (see ``decompose``).
    ``match`` compares a query against the registry field by field, in the
    order given by ``self.order``.
    """

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        _string_types = (basestring,)  # noqa: F821
    except NameError:
        _string_types = (str,)

    def __init__(self, *urls):
        """Create a matcher pre-populated with ``urls``."""
        # order in which fields are examined by ``match`` and ``diff``
        self.order = ('domain', 'scheme', 'path')
        # registered URL string -> its decomposed dict
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Break ``url`` into a dict with 'domain', 'path' and 'scheme' keys.

        * 'domain': list of network-location labels.  With two or more
          labels the last two come first, followed by the remaining labels
          in reverse order ('www.example.com' -> ['example', 'com', 'www']),
          so that a bare 'example.com' is a prefix of its subdomains.  The
          network location is split on '.' verbatim; nothing is stripped
          from it beforehand.
        * 'path': list of path segments with leading/trailing slashes
          removed; an empty path gives [].
        * 'scheme': the scheme string as reported by ``urlsplit``.
        """
        scheme, netloc, path = urlparse.urlsplit(url)[:3]

        # domain
        labels = netloc.split('.')
        if len(labels) > 1:
            # assume the final label is a TLD
            domain = labels[-2:] + labels[:-2][::-1]
        else:
            domain = labels

        # path
        trimmed = path.strip('/')
        segments = trimmed.split('/') if trimmed else []

        # other components (query, fragment) could be added here
        return {'domain': domain, 'path': segments, 'scheme': scheme}

    def add(self, url):
        """Register ``url``; a URL that is already registered is left as is."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Report the first field on which two URLs differ (unfinished stub).

        ``url1`` and ``url2`` may each be a URL string or an already
        decomposed dict.  Fields are visited in ``self.order``:

        * present in both and equal, or absent from both: skipped;
        * present in both but different: not implemented yet;
        * present in only one: returns ``{field: (v1, v2)}`` when either
          value is a string, otherwise ``{field: [(v1_first, v2_first)]}``
          holding the first element of each present value (``None`` for
          the absent side).

        Returns ``None`` when no difference is found.

        Raises:
            NotImplementedError: a field is present in both URLs with
                different values.
        """
        # decompose the urls if necessary
        # (each argument must decompose *itself*; the previous code
        # referenced an undefined name here and raised NameError)
        if isinstance(url1, self._string_types):
            url1 = self.decompose(url1)
        if isinstance(url2, self._string_types):
            url2 = self.decompose(url2)

        for field in self.order:
            in_first = field in url1
            in_second = field in url2
            if not in_first and not in_second:
                continue
            if in_first and in_second:
                if url1[field] == url2[field]:
                    continue
                # TODO: finish -- describing *how* two values differ
                # probably requires moving to 2-tuples throughout
                raise NotImplementedError(
                    "diff of differing %r values is not implemented" % field)

            # the field is present on exactly one side
            value1 = url1.get(field)
            value2 = url2.get(field)
            if (isinstance(value1, self._string_types)
                    or isinstance(value2, self._string_types)):
                return {field: (value1, value2)}
            if value1 is not None:
                value1 = value1[0]
            if value2 is not None:
                value2 = value2[0]
            return {field: [(value1, value2)]}

        return None

    def _field_matches(self, wanted, candidate):
        """Tell whether a registered field value satisfies the queried one."""
        if not candidate:
            # a registered URL lacking the field can never match it
            return False
        if isinstance(wanted, self._string_types):
            # strings must match exactly ('http' must not match 'https')
            return wanted == candidate
        # sequences match when the query is a prefix of the candidate
        return candidate[:len(wanted)] == wanted

    def match(self, url):
        """Return the registered URLs that ``url`` matches.

        ``url`` may omit the scheme ('example.com/foo'); the scheme is then
        not used for matching.  Empty fields of the query are ignored.  For
        every remaining field a registered URL survives only if its value
        equals the query's (strings) or starts with it (lists).

        Returns a set of registered URL strings.  When a field eliminates
        every candidate an empty *list* is returned instead; this is kept
        for backward compatibility with existing callers.
        """
        if '://' in url:
            urldict = self.decompose(url)
        else:
            # give a bogus scheme so urlsplit recognises the network location
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')

        candidates = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue
            candidates = set(
                key for key in candidates
                if self._field_matches(value, self.urls[key].get(field)))
            if not candidates:
                return []
        return candidates
| 110 | |
if __name__ == '__main__':
    # Smoke demo: register a few URLs, then show what some queries match.
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
    matcher.add('http://www.example.com/foo/blah')
    matcher.add('https://www.example.com/foo/')
    matcher.add('https://www.example.net/foo/')
    # print(x) with a single argument behaves the same on Python 2 and 3
    for query in ('example.com/foo/bar',
                  'http://example.com/foo',
                  'example.com',
                  'example'):
        print(matcher.match(query))
