Mercurial > hg > urlmatch
comparison urlmatch.py @ 3:00266c7a7c3c
since there is only one module, don't bother with the whole directory thing
author | Jeff Hammel <jhammel@mozilla.com> |
---|---|
date | Fri, 17 Jun 2011 10:39:37 -0700 |
parents | urlmatch/__init__.py@750dc780d3d8 |
children | 23be092e6099 |
comparison
equal
deleted
inserted
replaced
2:20dde2687cfb | 3:00266c7a7c3c |
---|---|
1 import urlparse | |
2 | |
class UrlMatcher(object):
    """Match a (possibly partial) URL against a collection of known URLs.

    Every registered URL is decomposed into comparable fields (see
    ``decompose``).  ``match`` then narrows the registered URLs field by
    field, in the order given by ``self.order``.
    """

    # String types accepted where a URL may be passed as text.  The
    # ``basestring`` builtin only exists on Python 2; fall back to ``str``
    # when it is not defined.
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__(self, *urls):
        """Create a matcher and register each of ``urls``."""
        match_order = ('domain', 'scheme', 'path')
        self.order = match_order  # fields are compared in this order
        self.urls = {}  # original URL string -> decomposed dict
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Break ``url`` into a dict with 'domain', 'path' and 'scheme' keys.

        - 'domain' is a list of host labels.  When the host has more than
          one label, the last two labels come first, followed by the
          remaining labels in reverse order, e.g. ``www.example.com``
          becomes ``['example', 'com', 'www']``.
        - 'path' is the list of path segments (empty for no path).
        - 'scheme' is the scheme string reported by ``urlsplit``.
        """

        # break it down
        (scheme, netloc, path, query, fragment) = urlparse.urlsplit(url)
        urldict = {}

        # domain
        labels = netloc.split('.')
        if len(labels) == 1:
            urldict['domain'] = labels
        else:
            # assume the last label is a TLD: keep the last two labels in
            # front so that prefix comparison starts from them
            urldict['domain'] = labels[-2:] + list(reversed(labels[:-2]))

        # path
        segments = path.strip('/').split('/')
        if segments == ['']:
            segments = []
        urldict['path'] = segments

        # scheme
        urldict['scheme'] = scheme

        # could do others

        return urldict

    def add(self, url):
        """Register ``url`` unless it is already known."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Compare two URLs (not implemented yet).

        Either argument may be a URL string or an already decomposed dict.

        Raises:
            NotImplementedError: always; the comparison is unfinished.
        """

        # decompose the urls if necessary; each argument is decomposed from
        # itself (previously an undefined name was passed, which raised
        # NameError for any string argument)
        if isinstance(url1, self._string_types):
            url1 = self.decompose(url1)
        if isinstance(url2, self._string_types):
            url2 = self.decompose(url2)

        # TODO: finish
        raise NotImplementedError

    def match(self, url):
        """Return the registered URLs that ``url`` matches.

        ``url`` may omit the scheme (e.g. ``example.com/foo``); in that case
        the scheme is not used for matching.  Fields of ``url`` that are
        empty are ignored.  A registered URL survives a list-valued field
        when the query's value is a prefix of its own value, and a string
        field when the two values are equal.

        Returns:
            A set of the matching URL strings, or an empty list as soon as
            one field eliminates every candidate.
        """
        if '://' not in url:
            # give a bogus scheme for urlparse. boo!
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        urls = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue
            length = len(value)
            is_text = isinstance(value, self._string_types)
            # iterate over a snapshot since candidates are discarded below
            for key in list(urls):
                compare_value = self.urls[key].get(field)
                if (not compare_value
                        or (is_text and value != compare_value)
                        # also rejects candidates shorter than the query
                        or compare_value[:length] != value):
                    urls.discard(key)
            if not urls:
                return []
        return urls
87 | |
if __name__ == '__main__':
    # Small demonstration: register a handful of URLs, then print what a
    # few progressively vaguer queries match.
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
    for extra in ('http://www.example.com/foo/blah',
                  'https://www.example.com/foo/',
                  'https://www.example.net/foo/'):
        matcher.add(extra)

    for query in ('example.com/foo/bar',
                  'http://example.com/foo',
                  'example.com',
                  'example'):
        # single-argument form prints the same under the print statement
        print(matcher.match(query))