Mercurial > hg > urlmatch

diff urlmatch.py @ 3:00266c7a7c3c
since there is only one module, don't bother with the whole directory thing
author: Jeff Hammel <jhammel@mozilla.com>
date: Fri, 17 Jun 2011 10:39:37 -0700
parents: urlmatch/__init__.py@750dc780d3d8
children: 23be092e6099
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/urlmatch.py	Fri Jun 17 10:39:37 2011 -0700
@@ -0,0 +1,96 @@
+import urlparse
+
+class UrlMatcher(object):
+    """Match partial URLs against a collection of registered full URLs.
+
+    Every registered URL is decomposed into fields (see ``decompose``) and
+    stored in ``self.urls``, keyed by the original URL string.  ``match``
+    narrows the registered URLs one field at a time, following the field
+    order held in ``self.order``.
+    """
+
+    def __init__(self, *urls):
+        """Create a matcher and register each URL given in ``urls``."""
+        # fields are compared in this order by match()
+        self.order = ('domain', 'scheme', 'path')
+        # maps URL string -> decomposed dict
+        self.urls = {}
+        for url in urls:
+            self.add(url)
+
+    def decompose(self, url):
+        """Break ``url`` down into the fields used for matching.
+
+        Returns a dict with three keys:
+
+        - ``'domain'``: list of host labels.  With more than one label the
+          order is [second-to-last label, last label] followed by the
+          remaining labels from right to left, so that a shorter host is a
+          list prefix of a longer one.  A single-label host is kept as a
+          one-element list.
+        - ``'path'``: list of path segments with leading/trailing slashes
+          removed; an empty path gives ``[]``.
+        - ``'scheme'``: the scheme string as returned by ``urlsplit``.
+
+        For example ``'http://www.example.com/foo/bar'`` gives::
+
+            {'domain': ['example', 'com', 'www'],
+             'path': ['foo', 'bar'],
+             'scheme': 'http'}
+        """
+        # query and fragment are not used for matching
+        scheme, netloc, path = urlparse.urlsplit(url)[:3]
+        urldict = {}
+
+        # domain
+        labels = netloc.split('.')
+        if len(labels) == 1:
+            urldict['domain'] = labels
+        else:
+            # assume the last label is a (single-label) TLD
+            urldict['domain'] = labels[-2:] + list(reversed(labels[:-2]))
+
+        # path
+        segments = path.strip('/').split('/')
+        if segments == ['']:
+            # splitting an empty path yields [''], not []
+            segments = []
+        urldict['path'] = segments
+
+        # scheme
+        urldict['scheme'] = scheme
+
+        # could do others
+
+        return urldict
+
+    def add(self, url):
+        """Register ``url``; a URL that is already registered is left as is."""
+        if url not in self.urls:
+            self.urls[url] = self.decompose(url)
+
+    def diff(self, url1, url2):
+        """Compare two URLs (not implemented yet).
+
+        ``url1`` and ``url2`` may each be a URL string or an already
+        decomposed dict; strings are decomposed first.
+
+        Raises NotImplementedError in every case, since the comparison
+        itself has not been written.
+        """
+
+        # decompose the urls if necessary
+        if isinstance(url1, basestring):
+            url1 = self.decompose(url1)
+        if isinstance(url2, basestring):
+            url2 = self.decompose(url2)
+
+        # TODO: finish
+        raise NotImplementedError
+
+    @staticmethod
+    def _field_matches(value, compare_value):
+        """Tell whether a registered field value satisfies a query value.
+
+        A missing or empty registered value never matches.  String values
+        must be equal; list values match when ``value`` is a prefix of
+        ``compare_value``.
+        """
+        if not compare_value:
+            return False
+        if isinstance(value, basestring):
+            return value == compare_value
+        # a slice shorter than ``value`` can never compare equal, so this
+        # also rejects registered values that are too short
+        return compare_value[:len(value)] == value
+
+    def match(self, url):
+        """Return the registered URLs that ``url`` matches.
+
+        ``url`` may omit the scheme (e.g. ``'example.com/foo'``); in that
+        case the scheme is not used for matching.  Fields of ``url`` that
+        are empty are skipped.
+
+        Returns a set of registered URL strings.  When a compared field
+        leaves no candidates, an empty list (not an empty set) is returned.
+        """
+        if '://' not in url:
+            # give a bogus scheme for urlparse. boo!
+            urldict = self.decompose('bogus://' + url)
+            urldict.pop('scheme')
+        else:
+            urldict = self.decompose(url)
+
+        urls = set(self.urls.keys())
+        for field in self.order:
+            value = urldict.get(field)
+            if not value:
+                # don't match trivial fields
+                continue
+            urls = set(key for key in urls
+                       if self._field_matches(value,
+                                              self.urls[key].get(field)))
+            if not urls:
+                return []
+        return urls
+
+if __name__ == '__main__':
+    # small demo: register a few URLs, then print what matches for
+    # progressively less specific queries
+    registered = (
+        'http://www.example.com/foo/bar/fleem',
+        'http://www.example.com/foo/blah',
+        'https://www.example.com/foo/',
+        'https://www.example.net/foo/',
+    )
+    matcher = UrlMatcher(*registered)
+    for query in ('example.com/foo/bar',
+                  'http://example.com/foo',
+                  'example.com',
+                  'example'):
+        print matcher.match(query)
author	Jeff Hammel <jhammel@mozilla.com>
date	Fri, 17 Jun 2011 10:39:37 -0700
parents	urlmatch/__init__.py@750dc780d3d8
children	23be092e6099