annotate urlmatch.py @ 6:0cd69fa6751c

add test for decomposition; stub diff, will have to move to 2-tuples to do this properly
author Jeff Hammel <jhammel@mozilla.com>
date Mon, 27 Jun 2011 07:16:46 -0700
parents 23be092e6099
children ef0553c4bbcd
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
23be092e6099 make this executable
Jeff Hammel <jhammel@mozilla.com>
parents: 3
diff changeset
1 #!/usr/bin/env python
23be092e6099 make this executable
Jeff Hammel <jhammel@mozilla.com>
parents: 3
diff changeset
2
0
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3: same API, new location
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
4
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
class UrlMatcher(object):
    """Match URLs against a registry of known URLs.

    Registered URLs live in ``self.urls``, keyed by the original URL string
    with the decomposed form (see `decompose`) as the value.  ``self.order``
    is the sequence of field names that `match` and `diff` walk, in order.
    """

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # string checks in `diff` and `match` also work on Python 3.
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__(self, *urls):
        """Create a matcher and register every URL given in *urls*."""
        self.order = ('domain', 'scheme', 'path')
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Split *url* into the fields used for matching.

        Returns a dict with three keys:

        - ``'domain'``: list of the dot-separated host labels.  With two or
          more labels, the last two come first (in their original order)
          followed by the remaining labels reversed, e.g.
          ``www.example.com`` -> ``['example', 'com', 'www']``.
        - ``'path'``: list of path segments, ``[]`` for an empty path.
        - ``'scheme'``: the scheme string reported by ``urlsplit``.

        The query and fragment are parsed but not included.
        """
        scheme, netloc, path, _query, _fragment = urlparse.urlsplit(url)

        # domain: the last two labels are assumed to be "name.tld" and are
        # kept together at the front; subdomains follow, innermost first.
        labels = netloc.split('.')
        if len(labels) == 1:
            domain = labels
        else:
            domain = labels[-2:] + labels[:-2][::-1]

        # path: ''.split('/') yields [''], which must count as "no path"
        segments = path.strip('/').split('/')
        if segments == ['']:
            segments = []

        # could do others (query, fragment, ...)
        return {'domain': domain, 'path': segments, 'scheme': scheme}

    def add(self, url):
        """Register *url*; a URL that is already registered is left alone."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Report the first field that only one of the two URLs has.

        *url1* and *url2* may each be a URL string or an already decomposed
        dict.  Fields are visited in ``self.order``:

        - present on both sides and equal: skipped;
        - present on both sides, different, and a string: raises
          ``NotImplementedError``;
        - present on both sides, different, and a list: skipped (comparison
          of list fields is not implemented yet);
        - present on one side only: returns ``{field: value}`` for a string
          value, or ``{field: value[0]}`` for a list value.

        Returns ``None`` when no field produced a result.

        NOTE(review): this method is an unfinished stub; the branch nesting
        was reconstructed from a listing that had lost its indentation --
        confirm against the repository.
        """
        # decompose the urls if necessary (the original passed an undefined
        # name ``url`` here, so any string argument raised NameError)
        if isinstance(url1, self._string_types):
            url1 = self.decompose(url1)
        if isinstance(url2, self._string_types):
            url2 = self.decompose(url2)

        # TODO: finish
        for field in self.order:
            if field in url1 and field in url2:
                if url1[field] == url2[field]:
                    continue
                if isinstance(url1[field], self._string_types):
                    raise NotImplementedError
            else:
                # Look the value up lazily: indexing url2 up front raised
                # KeyError whenever the field existed only in url1.
                if field in url1:
                    retval = url1[field]
                elif field in url2:
                    retval = url2[field]
                else:
                    continue
                if isinstance(retval, self._string_types):
                    return {field: retval}
                return {field: retval[0]}

    def match(self, url):
        """Return the registered URLs that *url* matches.

        *url* may omit the scheme (no ``://``); the scheme is then not used
        for matching.  For each field in ``self.order`` that is non-empty in
        *url*, a registered URL is dropped when it has no value for that
        field, when a string field differs, or when a list field does not
        start with the list from *url*.

        Returns a set of the matching URL strings, or ``[]`` as soon as no
        candidate is left.
        """
        if '://' not in url:
            # give a bogus scheme for urlparse so the host is parsed as the
            # netloc, then drop it again so it takes no part in matching
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        candidates = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue
            is_string = isinstance(value, self._string_types)
            # iterate over a snapshot: candidates shrinks inside the loop
            for key in list(candidates):
                known = self.urls[key].get(field)
                if not known:
                    candidates.discard(key)
                elif is_string:
                    if value != known:
                        candidates.discard(key)
                elif known[:len(value)] != value:
                    # prefix test; a shorter ``known`` can never equal value
                    candidates.discard(key)
            if not candidates:
                return []
        return candidates
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
101
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
if __name__ == '__main__':
    # Small demonstration: register a few URLs, then print the matches for
    # progressively less specific queries.
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
    matcher.add('http://www.example.com/foo/blah')
    matcher.add('https://www.example.com/foo/')
    matcher.add('https://www.example.net/foo/')
    # print(x) with a single argument produces the same output on Python 2
    # and is valid syntax on Python 3, unlike the print statement.
    print(matcher.match('example.com/foo/bar'))
    print(matcher.match('http://example.com/foo'))
    print(matcher.match('example.com'))
    print(matcher.match('example'))