annotate urlmatch.py @ 5:6b99523536ee

tests work
author Jeff Hammel <jhammel@mozilla.com>
date Fri, 17 Jun 2011 10:52:29 -0700
parents 23be092e6099
children 0cd69fa6751c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
23be092e6099 make this executable
Jeff Hammel <jhammel@mozilla.com>
parents: 3
diff changeset
1 #!/usr/bin/env python
23be092e6099 make this executable
Jeff Hammel <jhammel@mozilla.com>
parents: 3
diff changeset
2
0
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
4
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
try:
    _string_types = basestring  # noqa: F821 -- Python 2: str and unicode
except NameError:
    _string_types = str  # Python 3 has no basestring


class UrlMatcher(object):
    """Match URLs against a stored collection of URLs.

    Each stored URL is decomposed into a ``domain`` (list of labels with
    the registered name first), a ``scheme`` (string) and a ``path`` (list
    of segments).  A query URL matches a stored URL when, for every
    non-empty field of the query, the stored value equals it (scheme) or
    starts with it (domain and path).
    """

    def __init__(self, *urls):
        """Create a matcher pre-populated with ``urls``."""
        match_order = ('domain', 'scheme', 'path')
        # fields are compared in this order by match()
        self.order = match_order
        # maps each URL string to its decomposed dict
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Split ``url`` into a dict with 'domain', 'path' and 'scheme'.

        * ``domain`` -- the netloc split on '.', reordered so the last two
          labels come first followed by the remaining labels in reverse
          order (``www.example.com`` -> ``['example', 'com', 'www']``).
          A netloc without a dot is returned as a one-element list.
        * ``path`` -- the path split on '/', ignoring leading and trailing
          slashes; an empty path gives ``[]``.
        * ``scheme`` -- the scheme string as reported by ``urlsplit``.
        """
        # break it down; query and fragment are not used for matching
        scheme, netloc, path = urlparse.urlsplit(url)[:3]
        urldict = {}

        # domain
        labels = netloc.split('.')
        if len(labels) == 1:
            urldict['domain'] = labels
        else:
            # assume the last label is a TLD, so the last two labels form
            # the most significant part of the name
            urldict['domain'] = labels[-2:] + list(reversed(labels[:-2]))

        # path
        segments = path.strip('/').split('/')
        if segments == ['']:
            segments = []
        urldict['path'] = segments

        # scheme
        urldict['scheme'] = scheme

        # could do others

        return urldict

    def add(self, url):
        """Store ``url`` (decomposed) unless it is already present."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Compare two URLs (not implemented yet).

        Either argument may be a URL string or an already decomposed dict.

        Raises:
            NotImplementedError: always; the comparison itself is a stub.
        """
        # decompose the urls if necessary
        if isinstance(url1, _string_types):
            url1 = self.decompose(url1)
        if isinstance(url2, _string_types):
            url2 = self.decompose(url2)

        # TODO: finish
        raise NotImplementedError

    def match(self, url):
        """Return the stored URLs that ``url`` matches.

        ``url`` may omit the scheme (``example.com/foo``); in that case the
        scheme is not used for matching.  Empty fields of the query are
        skipped.

        Returns:
            A set of the matching stored URL strings, or an empty list when
            nothing matches (kept as a list for backward compatibility).
        """
        if '://' not in url:
            # give a bogus scheme for urlparse. boo!
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        candidates = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue
            length = len(value)
            # iterate over a snapshot since candidates shrinks in the loop
            for key in list(candidates):
                compare_value = self.urls[key].get(field)
                if not compare_value:
                    candidates.discard(key)
                    continue
                if isinstance(value, _string_types):
                    # string fields (scheme) must be equal
                    matched = compare_value == value
                else:
                    # list fields (domain, path) match on a prefix; a
                    # shorter stored value can never equal the slice
                    matched = compare_value[:length] == value
                if not matched:
                    candidates.discard(key)
            if not candidates:
                return []
        return candidates
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
90
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
if __name__ == '__main__':
    # small demonstration: store a few URLs and print what each query matches
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
    matcher.add('http://www.example.com/foo/blah')
    matcher.add('https://www.example.com/foo/')
    matcher.add('https://www.example.net/foo/')
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print(matcher.match('example.com/foo/bar'))
    print(matcher.match('http://example.com/foo'))
    print(matcher.match('example.com'))
    print(matcher.match('example'))