changeset 0:8bd0c3b2163e

create urlmatch package
author Jeff Hammel <jhammel@mozilla.com>
date Mon, 13 Jun 2011 21:25:06 -0700
parents
children 750dc780d3d8
files setup.py urlmatch.txt urlmatch/__init__.py
diffstat 3 files changed, 132 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/setup.py	Mon Jun 13 21:25:06 2011 -0700
@@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+import sys, os
+
+version = '0.0'
+
+setup(name='urlmatch',
+      version=version,
+      description="match urls systematically",
+      long_description="""\
+""",
+      classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
+      keywords='url',
+      author='Jeff Hammel',
+      author_email='jhammel@mozilla.com',
+      url='http://k0s.org/mozilla/hg/urlmatch',
+      license='MPL',
+      packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
+      include_package_data=True,
+      zip_safe=False,
+      install_requires=[
+          # -*- Extra requirements: -*-
+      ],
+      entry_points="""
+      # -*- Entry points: -*-
+      """,
+      )
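
The long_description in the setup() call above is left empty. A common pattern, shown here only as a hypothetical variant and not part of this changeset, is to read it from the urlmatch.txt file that sits next to setup.py:

    # Hypothetical: fill the empty long_description from urlmatch.txt,
    # which ships alongside setup.py in this changeset.
    import os

    here = os.path.dirname(os.path.abspath(__file__))
    try:
        long_description = open(os.path.join(here, 'urlmatch.txt')).read()
    except IOError:
        long_description = ''

The setup() call would then pass long_description=long_description instead of the empty triple-quoted string.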
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/urlmatch.txt	Mon Jun 13 21:25:06 2011 -0700
@@ -0,0 +1,21 @@
+urlmatch
+========
+
+Test making a basic tree::
+
+    >>> urlmatcher = UrlMatcher()
+    >>> urls = ['http://example.com/foo/bar.html',
+    ...         'http://example.com/foo/baz.html',
+    ...         'http://example.com/foo/fleem.html']
+    >>> urlmatcher.add(*urls)
+    >>> urlmatcher.tree()
+    {'http://example.com/foo/': ['bar.html', 'baz.html', 'fleem.html']}
+    
+Now a more complex tree::
+
+    >>> urlmatcher = UrlMatcher()
+    >>> urlmatcher.add(*['http://example.com/index.html',
+    ...                  'https://example.com/',
+    ...                  'http://github.com/k0s'])
+    >>> urlmatcher.tree()
+    {'example.com': [
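
The doctests above exercise a tree() method that urlmatch/__init__.py (below) does not define in this changeset, and the second example is left unfinished. The sketch that follows is not part of the commit; it only illustrates the kind of grouping the first example seems to expect, assuming tree() is meant to key the stored URLs by their longest shared prefix. The helper name url_tree and the grouping rule are assumptions:

    import os

    def url_tree(urls):
        # Hypothetical helper mirroring the first doctest: group URLs under
        # their longest common prefix, trimmed back to a '/' boundary.
        prefix = os.path.commonprefix(list(urls))
        prefix = prefix[:prefix.rfind('/') + 1]
        return {prefix: sorted(url[len(prefix):] for url in urls)}

    urls = ['http://example.com/foo/bar.html',
            'http://example.com/foo/baz.html',
            'http://example.com/foo/fleem.html']
    print url_tree(urls)
    # {'http://example.com/foo/': ['bar.html', 'baz.html', 'fleem.html']}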
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/urlmatch/__init__.py	Mon Jun 13 21:25:06 2011 -0700
@@ -0,0 +1,85 @@
+import urlparse
+
+class UrlMatcher(object):
+
+    def __init__(self, *urls):
+        # match fields in order, most significant first
+        self.order = ('domain', 'scheme', 'path')
+        self.urls = {}
+        for url in urls:
+            self.add(url)
+
+    def decompose(self, url):
+
+        # break it down
+        (scheme, netloc, path, query, fragment) = urlparse.urlsplit(url)
+        urldict = {}
+
+        # domain
+        netloc = netloc.split('.')
+        if len(netloc) == 1:
+            urldict['domain'] = netloc
+        else:
+            # assume a TLD: registered domain and TLD first, then subdomains reversed
+            urldict['domain'] = [netloc[-2], netloc[-1]] + list(reversed(netloc[0:-2]))
+
+        # path
+        path = path.strip('/').split('/')
+        if path == ['']:
+            path = []
+        urldict['path'] = path
+
+        # scheme
+        urldict['scheme'] = scheme
+
+        # query and fragment could be decomposed similarly
+
+        return urldict
+
+    def add(self, *urls):
+        for url in urls:
+            self.urls.setdefault(url, self.decompose(url))
+
+    def match(self, url):
+        if '://' not in url:
+            # give a bogus scheme for urlparse. boo!
+            urldict = self.decompose('bogus://' + url)
+            urldict.pop('scheme')
+        else:
+            urldict = self.decompose(url)
+
+        order = self.order
+        urls = set(self.urls.keys())
+        for field in order:
+            value = urldict.get(field)
+            if not value:
+                # don't match trivial fields
+                continue
+            length = len(value)
+            # prune candidates that do not match on this field
+            for key in list(urls):
+                compare_value = self.urls[key].get(field)
+                if not compare_value:
+                    urls.discard(key)
+                    continue
+                if isinstance(value, basestring) and value != compare_value:
+                    urls.discard(key)
+                    continue
+                if len(compare_value) < length:
+                    urls.discard(key)
+                    continue
+                if compare_value[:len(value)] != value:
+                    urls.discard(key)
+            if not urls:
+                return []
+        return urls
+
+if __name__ == '__main__':
+    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
+    matcher.add('http://www.example.com/foo/blah')
+    matcher.add('https://www.example.com/foo/')
+    matcher.add('https://www.example.net/foo/')
+    print matcher.match('example.com/foo/bar')
+    print matcher.match('http://example.com/foo')
+    print matcher.match('example.com')
+    print matcher.match('example')
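
For reference, a short usage sketch (not part of the changeset) of how match() narrows the stored URLs: decompose() records the domain as the registered domain label, then the TLD, then any subdomains in reverse order, and the path as its segments; match() keeps only the candidates whose values for each requested field start with the query's values, skipping fields the query lacks (such as a missing scheme). The expected results are noted as comments and assume the package is importable as urlmatch under Python 2:

    from urlmatch import UrlMatcher

    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem',
                         'http://www.example.com/foo/blah',
                         'https://www.example.net/foo/')

    # No scheme in the query, so only domain and path are compared;
    # only the URL whose path starts with ['foo', 'bar'] survives.
    print matcher.match('example.com/foo/bar')
    # set(['http://www.example.com/foo/bar/fleem'])

    # A bare domain skips the path check and keeps both example.com URLs
    # (set ordering is arbitrary).
    print matcher.match('example.com')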