annotate urlmatch.py @ 10:b02420253bfd default tip

add recomposition and a test for it
author Jeff Hammel <jhammel@mozilla.com>
date Tue, 28 Jun 2011 18:39:18 -0700
parents ef0553c4bbcd
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
23be092e6099 make this executable
Jeff Hammel <jhammel@mozilla.com>
parents: 3
diff changeset
1 #!/usr/bin/env python
23be092e6099 make this executable
Jeff Hammel <jhammel@mozilla.com>
parents: 3
diff changeset
2
0
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
3 import urlparse
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
4
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
class UrlMatcher(object):
    """Match URLs against a registry of known URLs.

    Every registered URL is stored in ``self.urls`` together with its
    decomposed form (see ``decompose``).  ``match`` compares the decomposed
    fields in the order given by ``self.order``.
    """

    # ``basestring`` is a Python 2 builtin; fall back to ``str`` so the
    # string checks below do not raise NameError where it is undefined.
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__(self, *urls):
        """Create a matcher and register each of ``urls`` via ``add``."""
        match_order = ('domain', 'scheme', 'path')
        self.order = match_order
        self.urls = {}
        for url in urls:
            self.add(url)

    def decompose(self, url):
        """Break ``url`` down into a dict of comparable parts.

        The returned dict has three keys:

        * ``domain`` -- the dot-separated labels of the netloc, reordered as
          ``[second-level, top-level] + subdomains from right to left``
          (``www.example.com`` becomes ``['example', 'com', 'www']``).  A
          netloc without a dot is kept as a one-element list.  The netloc is
          split as-is, so a port or credentials stay inside the labels.
        * ``path`` -- the path segments with leading/trailing slashes
          removed; an empty path gives ``[]``.
        * ``scheme`` -- the scheme string as reported by ``urlsplit``.

        Query string and fragment are discarded.
        """
        # break it down
        scheme, netloc, path, _query, _fragment = urlparse.urlsplit(url)

        # domain
        labels = netloc.split('.')
        if len(labels) == 1:
            domain = labels
        else:
            # assert a TLD: the last two labels lead, the rest is reversed
            domain = [labels[-2], labels[-1]] + list(reversed(labels[:-2]))

        # path
        segments = path.strip('/').split('/')
        if segments == ['']:
            segments = []

        # could do others
        return {'domain': domain, 'path': segments, 'scheme': scheme}

    @classmethod
    def recompose(cls, url):
        """Reconstruct a URL string from a decomposed ``url`` dict.

        ``url`` must contain a non-empty ``domain`` list in the layout
        produced by ``decompose``; otherwise an AssertionError is raised.
        A non-empty ``scheme`` is prepended as ``scheme://`` and, when a
        ``path`` key is present, ``/`` plus the joined segments is appended
        (so an empty path list yields a trailing slash).

        A single-label domain such as ``['localhost']`` keeps its scheme and
        path, so ``recompose(decompose('http://localhost/foo'))`` round-trips.
        """
        # must have a domain
        assert 'domain' in url, "decomposed url has no 'domain'"
        assert url['domain'], "decomposed url has an empty 'domain'"

        # reconstruct domain: subdomains are stored reversed after the
        # leading [second-level, top-level] pair
        domain = list(url['domain'])
        retval = '.'.join(list(reversed(domain[2:])) + domain[:2])

        # add the scheme (an empty scheme would otherwise produce '://host')
        if url.get('scheme'):
            retval = url['scheme'] + '://' + retval

        # add the path
        if 'path' in url:
            retval += '/' + '/'.join(url['path'])

        return retval

    def add(self, url):
        """Register ``url`` unless it is already known."""
        if url not in self.urls:
            self.urls[url] = self.decompose(url)

    def diff(self, url1, url2):
        """Report the first field in which two URLs differ (unfinished).

        ``url1`` and ``url2`` may each be a URL string or an already
        decomposed dict.  Fields are inspected in ``self.order``:

        * a field present on only one side is reported as
          ``{field: (value1, value2)}`` when either value is a string, and
          as ``{field: [(first1, first2)]}`` otherwise, where ``first*`` is
          the first list element (``None`` for a missing or empty side);
        * a field present on both sides with different values raises
          NotImplementedError;
        * when no difference is found, ``None`` is returned.
        """
        # decompose the urls if necessary
        if isinstance(url1, self._string_types):
            url1 = self.decompose(url1)
        if isinstance(url2, self._string_types):
            url2 = self.decompose(url2)

        # TODO: finish
        for field in self.order:
            in_first = field in url1
            in_second = field in url2
            if not in_first and not in_second:
                continue
            if in_first and in_second:
                if url1[field] == url2[field]:
                    continue
                # describing two differing values is not implemented yet
                raise NotImplementedError

            # the field exists on exactly one side
            value1 = url1.get(field)
            value2 = url2.get(field)
            if (isinstance(value1, self._string_types)
                    or isinstance(value2, self._string_types)):
                return {field: (value1, value2)}
            # report only the leading element; guard against empty lists
            if value1 is not None:
                value1 = value1[0] if value1 else None
            if value2 is not None:
                value2 = value2[0] if value2 else None
            return {field: [(value1, value2)]}

        return None

    def match(self, url):
        """Return the registered URLs that ``url`` is a prefix-style match for.

        ``url`` may omit the scheme, in which case the scheme is not
        compared.  For every non-empty field of the query (in ``self.order``)
        a registered URL is kept only if it has that field, the values are
        equal for string fields, and the query's list is a prefix of the
        registered list for list fields.

        Returns a set of the matching registered URL strings, or an empty
        list when nothing matches.
        """
        if '://' not in url:
            # give a bogus scheme for urlparse so the netloc is recognised,
            # then drop it again so the scheme is not compared
            urldict = self.decompose('bogus://' + url)
            urldict.pop('scheme')
        else:
            urldict = self.decompose(url)

        urls = set(self.urls)
        for field in self.order:
            value = urldict.get(field)
            if not value:
                # don't match trivial fields
                continue
            is_string = isinstance(value, self._string_types)
            # iterate over a snapshot because candidates are discarded
            for key in list(urls):
                compare_value = self.urls[key].get(field)
                if not compare_value:
                    urls.discard(key)
                elif is_string and value != compare_value:
                    urls.discard(key)
                elif compare_value[:len(value)] != value:
                    # also covers a registered value shorter than the query
                    urls.discard(key)
            if not urls:
                return []
        return urls
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
135
8bd0c3b2163e create urlmatch package
Jeff Hammel <jhammel@mozilla.com>
parents:
diff changeset
if __name__ == '__main__':
    # Small demonstration: register a handful of URLs, then print what
    # progressively less specific queries match.
    matcher = UrlMatcher(
        'http://www.example.com/foo/bar/fleem',
        'http://www.example.com/foo/blah',
        'https://www.example.com/foo/',
        'https://www.example.net/foo/',
    )
    queries = (
        'example.com/foo/bar',
        'http://example.com/foo',
        'example.com',
        'example',
    )
    for query in queries:
        print(matcher.match(query))