annotate urlmatch.py @ 3:00266c7a7c3c

since there is only one module, don't bother with the whole directory thing
author Jeff Hammel <jhammel@mozilla.com>
date Fri, 17 Jun 2011 10:39:37 -0700
parents urlmatch/__init__.py@750dc780d3d8
children 23be092e6099
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
2
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
class UrlMatcher(object):
    """Match a (possibly partial) URL against a collection of stored URLs.

    Each stored URL is decomposed once into comparable components (see
    ``decompose``).  ``match`` narrows the stored URLs field by field, in
    the order given by ``self.order``: the scheme must be equal, while the
    domain and path of a stored URL must start with those of the query.
    """

    # ``basestring`` exists only on Python 2; fall back to ``str`` elsewhere.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, *urls):
        """Create a matcher pre-populated with ``urls``."""
        # order in which the decomposed fields are compared by ``match``
        self.order = ('domain', 'scheme', 'path')
        # original URL string -> decomposed dict
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Break ``url`` into the components used for matching.

        Returns a dict with the keys:

        * ``'domain'``: list of host labels ordered as the last two labels
          followed by the remaining labels in reverse, e.g.
          ``www.example.com`` -> ``['example', 'com', 'www']``.  A host
          without a dot yields a one-element list.
        * ``'path'``: list of path segments with leading and trailing
          slashes stripped; ``[]`` for an empty path.
        * ``'scheme'``: the scheme string as reported by ``urlsplit``.
        """
        # query and fragment are parsed but not used for matching
        scheme, netloc, path = urlparse.urlsplit(url)[:3]

        # domain: assumes the last two labels are name + TLD; a single
        # label is passed through unchanged by the same slicing
        labels = netloc.split('.')
        domain = labels[-2:] + labels[:-2][::-1]

        # path: drop the empty segment produced by splitting an empty string
        segments = path.strip('/').split('/')
        if segments == ['']:
            segments = []

        # could do others
        return {'domain': domain, 'path': segments, 'scheme': scheme}

    def add(self, url):
        """Store ``url`` (keyed by its original string) unless already present."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Compare two URLs (unfinished stub).

        Either argument may be a URL string or an already decomposed dict;
        strings are decomposed first.

        Raises:
            NotImplementedError: always; the comparison itself is not written.
        """
        # decompose the urls if necessary
        if isinstance(url1, self._string_types):
            url1 = self.decompose(url1)
        if isinstance(url2, self._string_types):
            url2 = self.decompose(url2)

        # TODO: finish
        raise NotImplementedError

    def _field_matches(self, wanted, stored):
        """Return True if a stored field value satisfies the queried one."""
        if not stored:
            # the stored URL has nothing to compare against for this field
            return False
        if isinstance(wanted, self._string_types):
            # string fields (scheme) must be identical
            return wanted == stored
        # list fields (domain, path) match when the query is a prefix
        return stored[:len(wanted)] == wanted

    def match(self, url):
        """Return the stored URLs that ``url`` matches.

        ``url`` may omit the scheme, in which case the scheme is not
        compared.  Empty fields of the query are skipped rather than
        compared.

        Returns:
            A set of the original URL strings that match, or ``[]`` as soon
            as no candidate remains after comparing a field.
        """
        if '://' not in url:
            # give a bogus scheme for urlparse, then forget it again so the
            # scheme takes no part in the comparison
            query = self.decompose('bogus://' + url)
            query.pop('scheme')
        else:
            query = self.decompose(url)

        candidates = set(self.urls)
        for field in self.order:
            wanted = query.get(field)
            if not wanted:
                # don't match trivial fields
                continue
            candidates = set(
                key for key in candidates
                if self._field_matches(wanted, self.urls[key].get(field))
            )
            if not candidates:
                return []
        return candidates
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
87
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
if __name__ == '__main__':
    # Small demonstration: register a few URLs, then print what each
    # partial URL matches.
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
    matcher.add('http://www.example.com/foo/blah')
    matcher.add('https://www.example.com/foo/')
    matcher.add('https://www.example.net/foo/')
    for query in ('example.com/foo/bar',
                  'http://example.com/foo',
                  'example.com',
                  'example'):
        # print(...) with a single argument behaves the same on Python 2
        # and Python 3, unlike the print statement
        print(matcher.match(query))