Mercurial > hg > urlmatch
annotate urlmatch.py @ 4:23be092e6099
make this executable
author | Jeff Hammel <jhammel@mozilla.com> |
---|---|
date | Fri, 17 Jun 2011 10:40:51 -0700 |
parents | 00266c7a7c3c |
children | 0cd69fa6751c |
rev | line source |
---|---|
4 | 1 #!/usr/bin/env python |
2 | |
0 | 3 import urlparse |
4 | |
class UrlMatcher(object):
    """Match full or partial URLs against a registry of known URLs.

    Each registered URL is decomposed into comparable fields (see
    ``decompose``).  A query matches a registered URL when every
    non-empty field of the query is a prefix of (or, for the scheme,
    equal to) the corresponding field of the registered URL.
    """

    def __init__(self, *urls):
        """Create a matcher and register each of ``urls`` via ``add``."""
        # fields are compared in this order; each one narrows the candidates
        self.order = ('domain', 'scheme', 'path')
        # registered URL string -> its decomposed dict
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Split ``url`` into the fields used for matching.

        Returns a dict with the keys:

        * ``'domain'`` -- list of host labels.  For hosts with more than
          one label the last two labels come first, followed by the
          remaining labels from right to left, e.g. ``www.example.com``
          becomes ``['example', 'com', 'www']``.  A single-label host is
          returned as a one-element list.
        * ``'path'`` -- list of path segments with leading/trailing
          slashes removed; ``[]`` for an empty path.
        * ``'scheme'`` -- the scheme string as reported by ``urlsplit``.

        NOTE: the network location is split on ``'.'`` verbatim, so a
        port or userinfo part stays attached to its label, and the
        "last two labels" rule assumes a single-label TLD.
        """
        scheme, netloc, path = urlparse.urlsplit(url)[:3]

        # domain: most significant labels first so prefixes compare sanely
        labels = netloc.split('.')
        if len(labels) > 1:
            labels = labels[-2:] + list(reversed(labels[:-2]))

        # path: ''.split('/') yields [''], which must count as "no path"
        segments = path.strip('/').split('/')
        if segments == ['']:
            segments = []

        # other components (query, fragment) could be added here
        return {'domain': labels, 'path': segments, 'scheme': scheme}

    def add(self, url):
        """Register ``url``; a URL that is already registered is left as is."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Compare two URLs (stub).

        ``url1`` and ``url2`` may each be a URL string or an already
        decomposed dict; strings are decomposed first.

        Raises ``NotImplementedError``: the comparison is not written yet.
        """
        # decompose the urls if necessary
        if isinstance(url1, basestring):
            url1 = self.decompose(url1)
        if isinstance(url2, basestring):
            url2 = self.decompose(url2)

        # TODO: finish
        raise NotImplementedError

    @staticmethod
    def _field_matches(value, candidate):
        """Return True if a registered field ``candidate`` satisfies ``value``.

        String fields (the scheme) must be equal; list fields (domain,
        path) match when ``value`` is a prefix of ``candidate``.  An
        empty or missing ``candidate`` never matches.
        """
        if not candidate:
            return False
        if isinstance(value, basestring):
            return value == candidate
        # a candidate shorter than value can never equal it after slicing
        return candidate[:len(value)] == value

    def match(self, url):
        """Return the registered URLs that ``url`` matches.

        ``url`` may omit the scheme (e.g. ``'example.com/foo'``), in
        which case the scheme is not used for matching.  Empty fields of
        the query are ignored.

        Returns a set of the matching registered URL strings, or an
        empty list when nothing matches.
        """
        if '://' not in url:
            # urlsplit needs a scheme to recognise the host, so supply a
            # placeholder and drop it again so it is not matched on
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        candidates = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue
            candidates = set(
                key for key in candidates
                if self._field_matches(value, self.urls[key].get(field))
            )
            if not candidates:
                return []
        return candidates
90 | |
if __name__ == '__main__':
    # demo: register a few URLs, then show what some partial URLs match
    matcher = UrlMatcher(
        'http://www.example.com/foo/bar/fleem',
        'http://www.example.com/foo/blah',
        'https://www.example.com/foo/',
        'https://www.example.net/foo/'
    )
    queries = (
        'example.com/foo/bar',
        'http://example.com/foo',
        'example.com',
        'example'
    )
    for query in queries:
        print(matcher.match(query))