Mercurial > hg > urlmatch
annotate urlmatch.py @ 6:0cd69fa6751c
add test for decomposition; stub diff, will have to move to 2-tuples to do this properly
author | Jeff Hammel <jhammel@mozilla.com> |
---|---|
date | Mon, 27 Jun 2011 07:16:46 -0700 |
parents | 23be092e6099 |
children | ef0553c4bbcd |
rev | line source |
---|---|
4 | 1 #!/usr/bin/env python |
2 | |
try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3: same API, new location
4 | |
class UrlMatcher(object):
    """Match URLs against a registry of known URLs, one field at a time.

    Every registered URL is stored in decomposed form (see ``decompose``)
    under its original string in ``self.urls``.  Fields are compared in the
    order listed in ``self.order``.
    """

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere
    # so the isinstance checks below work on either interpreter.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, *urls):
        """Create a matcher and register each of ``urls``."""
        self.order = ('domain', 'scheme', 'path')
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Split ``url`` into its comparable fields.

        Returns a dict with three keys:

        * ``'domain'``: list of host labels.  With two or more labels the
          last two come first, followed by the remaining labels in reverse
          order (``www.example.com`` -> ``['example', 'com', 'www']``).
          A single label is returned as a one-element list.
        * ``'path'``: list of path segments with leading/trailing slashes
          removed; an empty path gives ``[]``.
        * ``'scheme'``: the scheme string as reported by ``urlsplit``.

        The query string and fragment are discarded.
        """
        scheme, netloc, path, _query, _fragment = urlparse.urlsplit(url)

        # domain
        labels = netloc.split('.')
        if len(labels) > 1:
            # assume the last label is a TLD: keep "name, tld" in front and
            # append the sub-domains from nearest to farthest
            domain = labels[-2:] + labels[:-2][::-1]
        else:
            domain = labels

        # path
        segments = path.strip('/').split('/')
        if segments == ['']:
            # ''.split('/') yields [''], which is not an empty path
            segments = []

        # could do others
        return {'domain': domain, 'path': segments, 'scheme': scheme}

    def add(self, url):
        """Register ``url``; a URL that is already registered is left as is."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Report the first field (in ``self.order``) the two URLs disagree on.

        ``url1`` and ``url2`` may each be a URL string or an already
        decomposed dict.  This method is still a stub:

        * a field present in only one of the two is returned as a
          one-entry dict ``{field: value}`` (the first element of the value
          when it is a list);
        * a string field that differs raises ``NotImplementedError``;
        * a list field that differs is not reported yet.

        Returns ``None`` when nothing reportable was found.
        """
        # decompose the urls if necessary
        if isinstance(url1, self._string_types):
            url1 = self.decompose(url1)
        if isinstance(url2, self._string_types):
            url2 = self.decompose(url2)

        # TODO: finish
        for field in self.order:
            if field in url1 and field in url2:
                if url1[field] == url2[field]:
                    continue
                if isinstance(url1[field], self._string_types):
                    raise NotImplementedError
                # differing list fields are not handled yet
                continue

            # the field is missing from at least one side; look it up lazily
            # so an absent key on the other side cannot raise KeyError
            if field in url1:
                value = url1[field]
            elif field in url2:
                value = url2[field]
            else:
                continue  # absent from both: nothing to report

            if isinstance(value, self._string_types):
                return {field: value}
            if value:
                return {field: value[0]}
            # an empty list carries nothing to report; try the next field
        return None

    def match(self, url):
        """Return the registered URLs that ``url`` matches.

        ``url`` may omit the scheme (``example.com/foo``); in that case the
        scheme is not used for matching.  For every non-empty field of
        ``url``, a registered URL stays a candidate only if its string field
        is equal, or its list field starts with the list from ``url``.

        Returns a set of the matching registered URL strings, or an empty
        list as soon as no candidate is left (kept for backward
        compatibility).
        """
        if '://' not in url:
            # give a bogus scheme for urlparse. boo!
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        candidates = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue
            exact = isinstance(value, self._string_types)
            width = len(value)
            survivors = set()
            for key in candidates:
                stored = self.urls[key].get(field)
                if not stored:
                    continue
                if exact:
                    keep = stored == value
                else:
                    # prefix comparison; a shorter stored list can never
                    # equal ``value`` after slicing
                    keep = stored[:width] == value
                if keep:
                    survivors.add(key)
            candidates = survivors
            if not candidates:
                return []
        return candidates
101 | |
if __name__ == '__main__':
    # Small demonstration: register a few URLs, then show which of them
    # each query matches.
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
    matcher.add('http://www.example.com/foo/blah')
    matcher.add('https://www.example.com/foo/')
    matcher.add('https://www.example.net/foo/')
    queries = (
        'example.com/foo/bar',
        'http://example.com/foo',
        'example.com',
        'example',
    )
    for query in queries:
        # print(...) with a single argument behaves the same on Python 2
        # and Python 3; the bare print statement is a SyntaxError on 3
        print(matcher.match(query))