comparison urlmatch.py @ 3:00266c7a7c3c

Since there is only one module, don't bother with the whole directory thing.
author Jeff Hammel <jhammel@mozilla.com>
date Fri, 17 Jun 2011 10:39:37 -0700
parents urlmatch/__init__.py@750dc780d3d8
children 23be092e6099
comparison
equal deleted inserted replaced
2:20dde2687cfb 3:00266c7a7c3c
1 import urlparse
2
class UrlMatcher(object):
    """Match a URL (or URL fragment) against a collection of registered URLs.

    Each registered URL is stored in ``self.urls`` keyed by its original
    string, with the value being the dictionary built by :meth:`decompose`.
    :meth:`match` compares a query against every registered URL field by
    field, in the order given by ``self.order``.
    """

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere
    # so the string checks below keep their original meaning on either
    # interpreter.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, *urls):
        """Create a matcher and register every URL passed in ``urls``."""
        # fields are compared in this order by match()
        self.order = ('domain', 'scheme', 'path')
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Split ``url`` into the pieces used for matching.

        Returns a dict with three keys:

        * ``'domain'`` -- list of host labels.  A single-label host is
          returned as-is; otherwise the last two labels come first (in their
          original order) followed by the remaining labels in reverse, e.g.
          ``www.example.com`` -> ``['example', 'com', 'www']``.
        * ``'path'`` -- list of path segments with leading/trailing slashes
          removed; an empty path gives ``[]``.
        * ``'scheme'`` -- the scheme string as reported by ``urlsplit``.
        """
        # break it down; query and fragment are not used for matching
        scheme, netloc, path = urlparse.urlsplit(url)[:3]

        # domain
        labels = netloc.split('.')
        if len(labels) == 1:
            domain = labels
        else:
            # assumes the final label is the TLD
            domain = labels[-2:] + list(reversed(labels[:-2]))

        # path
        segments = path.strip('/').split('/')
        if segments == ['']:
            segments = []

        # could do others
        return {'domain': domain, 'path': segments, 'scheme': scheme}

    def add(self, url):
        """Register ``url``; a URL that is already registered is left alone."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Compare two URLs (not implemented yet).

        Either argument may be a URL string or an already-decomposed value;
        strings are run through :meth:`decompose` first.

        Raises:
            NotImplementedError: always, the comparison itself is unfinished.
        """
        # decompose the urls if necessary (each argument on its own; the
        # previous code referenced an undefined name here)
        if isinstance(url1, self._string_types):
            url1 = self.decompose(url1)
        if isinstance(url2, self._string_types):
            url2 = self.decompose(url2)

        # TODO: finish
        raise NotImplementedError

    def _field_matches(self, value, stored):
        """Tell whether a registered field value ``stored`` satisfies ``value``.

        String fields must be equal; list fields match when ``stored``
        starts with ``value``.  An empty or missing ``stored`` never matches.
        """
        if not stored:
            return False
        if isinstance(value, self._string_types):
            return value == stored
        # a stored list shorter than ``value`` yields a shorter slice, so it
        # compares unequal without a separate length check
        return stored[:len(value)] == value

    def match(self, url):
        """Return the registered URLs that ``url`` matches.

        ``url`` may omit the scheme (e.g. ``'example.com/foo'``); in that
        case the scheme is not used for matching.  Fields of the query that
        are empty are skipped.

        Returns:
            A set of the matching registered URL strings, or an empty list
            ``[]`` when nothing matches.
        """
        if '://' not in url:
            # give a bogus scheme for urlparse. boo!
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        candidates = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue
            candidates = set(
                key for key in candidates
                if self._field_matches(value, self.urls[key].get(field))
            )
            if not candidates:
                return []
        return candidates
87
if __name__ == '__main__':
    # Demo: register a handful of URLs, then print what each query matches.
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
    for registered in ('http://www.example.com/foo/blah',
                       'https://www.example.com/foo/',
                       'https://www.example.net/foo/'):
        matcher.add(registered)

    for query in ('example.com/foo/bar',
                  'http://example.com/foo',
                  'example.com',
                  'example'):
        # single-argument print(...) prints the same text as the statement form
        print(matcher.match(query))