Mercurial > hg > urlmatch
annotate urlmatch.py @ 10:b02420253bfd default tip
add recomposition and a test for it
author | Jeff Hammel <jhammel@mozilla.com> |
---|---|
date | Tue, 28 Jun 2011 18:39:18 -0700 |
parents | ef0553c4bbcd |
children |
rev | line source |
---|---|
4 | 1 #!/usr/bin/env python |
2 | |
try:
    import urlparse  # Python 2
except ImportError:
    # Python 3 moved the same functions into urllib.parse
    import urllib.parse as urlparse
4 | |
class UrlMatcher(object):
    """Match URLs against a stored collection by domain, scheme and path.

    Every URL is broken down by :meth:`decompose` into a dict with the keys
    ``domain`` (list of labels, registered domain first), ``scheme`` (string)
    and ``path`` (list of segments).  Matching and diffing walk those fields
    in the order given by ``self.order``.
    """

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere
    # so the isinstance checks below work on either interpreter.
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__(self, *urls):
        """Create a matcher and register each of ``urls`` via :meth:`add`."""
        # fields are compared in this order by diff() and match()
        self.order = ('domain', 'scheme', 'path')
        # maps each registered URL string to its decomposed dict
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Split ``url`` into a dict of comparable parts.

        Returns a dict with:

        * ``domain``: the network location split on ``.``; when there is more
          than one label the last two come first, followed by the remaining
          labels in reverse order (``www.example.com`` ->
          ``['example', 'com', 'www']``).  A single label is kept as a
          one-element list.
        * ``path``: the path segments without leading/trailing slashes; an
          empty path gives ``[]``.
        * ``scheme``: the scheme string as reported by ``urlsplit``.

        Query and fragment are ignored.
        """
        scheme, netloc, path, _query, _fragment = urlparse.urlsplit(url)

        # domain: put the last two labels first (assumed to be the registered
        # domain and its TLD), then the subdomains from the innermost outwards
        labels = netloc.split('.')
        if len(labels) > 1:
            labels = labels[-2:] + list(reversed(labels[:-2]))

        # path: '' and '/' both mean "no segments"
        trimmed = path.strip('/')
        segments = trimmed.split('/') if trimmed else []

        return {'domain': labels, 'path': segments, 'scheme': scheme}

    @classmethod
    def recompose(cls, url):
        """Rebuild a URL string from a dict produced by :meth:`decompose`.

        ``url`` must contain a non-empty ``domain`` list; ``scheme`` and
        ``path`` are optional.  A present ``path`` always contributes a
        ``/`` after the host, even when it has no segments.

        Raises ``AssertionError`` when ``domain`` is missing or empty.
        """
        # must have a domain
        assert 'domain' in url, "decomposed url has no 'domain' key"
        assert url['domain'], "decomposed url has an empty domain"

        # reconstruct the host; a single label is used verbatim, otherwise
        # undo the reordering done by decompose()
        domain = url['domain']
        if len(domain) == 1:
            host = domain[0]
        else:
            host = '.'.join(list(reversed(domain[2:])) + list(domain[:2]))

        # add the scheme; an empty scheme would otherwise yield '://host'
        result = host
        if url.get('scheme'):
            result = url['scheme'] + '://' + result

        # add the path
        if 'path' in url:
            result += '/' + '/'.join(url['path'])

        return result

    def add(self, url):
        """Register ``url`` (a string) unless it is already known."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Report the first field present in only one of two URLs.

        ``url1`` and ``url2`` may be URL strings or already-decomposed dicts.
        Fields are visited in ``self.order``:

        * present in both and equal, or absent from both: skipped;
        * present in only one: returns ``{field: (left, right)}`` for string
          fields, or ``{field: [(left_first, right_first)]}`` for list
          fields, where the missing (or empty) side is ``None``;
        * present in both but different: not implemented yet.

        Returns ``None`` when no difference is found.

        Raises ``NotImplementedError`` when a field differs on both sides.
        """
        # decompose the urls if necessary
        if isinstance(url1, self._string_types):
            url1 = self.decompose(url1)
        if isinstance(url2, self._string_types):
            url2 = self.decompose(url2)

        for field in self.order:
            in_first = field in url1
            in_second = field in url2
            if not in_first and not in_second:
                continue
            if in_first and in_second:
                if url1[field] == url2[field]:
                    continue
                # TODO: finish -- comparing differing values is still a stub
                raise NotImplementedError(
                    "diff of differing %r values is not implemented" % field)

            # the field exists on exactly one side
            left = url1.get(field)
            right = url2.get(field)
            if (isinstance(left, self._string_types)
                    or isinstance(right, self._string_types)):
                return {field: (left, right)}
            # list-valued field: report only the leading element; an empty
            # list has none, so it is reported as None like a missing field
            left = left[0] if left else None
            right = right[0] if right else None
            return {field: [(left, right)]}

        return None

    def match(self, url):
        """Return the registered URLs that ``url`` is a prefix-match for.

        ``url`` may omit the scheme, in which case schemes are not compared.
        For each non-empty field of ``url`` a registered URL survives only if
        its own field is non-empty and, for strings, equal to the wanted
        value, or, for lists, starts with the wanted items.

        Returns a set of the matching URL strings, or ``[]`` as soon as no
        candidate is left after filtering on a field.
        """
        if '://' in url:
            wanted = self.decompose(url)
        else:
            # urlsplit only recognises the host after a scheme, so supply a
            # throwaway one and drop it again
            wanted = self.decompose('bogus://' + url)
            wanted.pop('scheme')

        candidates = set(self.urls)
        for field in self.order:
            value = wanted.get(field)
            if not value:
                # don't match trivial fields
                continue
            is_string = isinstance(value, self._string_types)
            # iterate over a snapshot because candidates shrinks in the loop
            for key in list(candidates):
                stored = self.urls[key].get(field)
                if not stored:
                    matched = False
                elif is_string:
                    matched = (stored == value)
                else:
                    # a shorter stored list can never equal the wanted prefix
                    matched = (stored[:len(value)] == value)
                if not matched:
                    candidates.discard(key)
            if not candidates:
                return []
        return candidates
135 | |
if __name__ == '__main__':
    # Small demonstration: register a few URLs, then show which of them each
    # query matches.  print() is called with a single argument so the output
    # is the same on Python 2 and Python 3.
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
    matcher.add('http://www.example.com/foo/blah')
    matcher.add('https://www.example.com/foo/')
    matcher.add('https://www.example.net/foo/')
    print(matcher.match('example.com/foo/bar'))
    print(matcher.match('http://example.com/foo'))
    print(matcher.match('example.com'))
    print(matcher.match('example'))