Mercurial > hg > config
view python/url.py @ 925:a92db57f62f8 default tip
add lxml
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Mon, 20 Jan 2025 09:20:00 -0800 |
parents | f011ec45b8e8 |
children |
line wrap: on
line source
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
url manipulation
"""

import argparse
import os
import shutil
import subprocess
import sys
import tempfile

try:
    # Python 3 locations, bound to the historical Python 2 module names
    # so the rest of the module (and any external user of these names)
    # keeps working unchanged
    import urllib.parse as urlparse
    import urllib.request as urllib2
except ImportError:
    # Python 2
    import urlparse
    import urllib2

__all__ = ['load', 'main']

try:
    string = (str, unicode)
except NameError:
    # Python 3 has no separate `unicode` type
    string = (str,)

# prefix that marks a local path written as a URL
_FILE_PREFIX = 'file://'


def ensure_dir(directory):
    """
    ensure `directory` is a directory, creating it (and any missing
    parents) if it does not exist; returns `directory`
    """
    if os.path.exists(directory):
        assert os.path.isdir(directory), \
            "Not a directory: {}".format(directory)
        return directory
    os.makedirs(directory)
    return directory


def isURL(url):
    """is `url` a URL (contains `://`) as opposed to a plain path?"""
    return '://' in url


def _local_path(url):
    """strip a leading `file://` from `url`, giving a filesystem path"""
    if url.startswith(_FILE_PREFIX):
        return url[len(_FILE_PREFIX):]
    return url


def read_s3(url):
    """
    returns the contents of the s3 `url`, fetched with the external
    `s3cmd` program; raises `subprocess.CalledProcessError` if it fails
    """
    # download into a private scratch directory rather than to a
    # `tempfile.mktemp()` name: mktemp only returns a name, which another
    # process may claim before s3cmd creates the file
    workdir = tempfile.mkdtemp()
    try:
        name = os.path.join(workdir, 'contents')
        subprocess.check_output(['s3cmd', 'get', url, name])
        with open(name) as f:
            return f.read()
    finally:
        # always clean up, whether or not the download succeeded
        shutil.rmtree(workdir, ignore_errors=True)


def read_http(url):
    """returns the body of the http(s) `url`"""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # do not leak the connection
        response.close()


def read_file(url):
    """returns the contents of a local file given as a path or `file://` URL"""
    with open(_local_path(url)) as f:
        return f.read()


# scheme -> function(url) returning the contents
loaders = {'s3': read_s3,
           'http': read_http,
           'https': read_http,
           'file': read_file
}


def scheme(url):
    """returns the scheme of `url`; plain paths are treated as `file`"""
    if isURL(url):
        parsed = urlparse.urlsplit(url)
        return parsed.scheme
    return 'file'


def parent(url):
    """
    returns the parent of `url`: everything before the last `/` for a URL,
    the absolute containing directory for a plain path
    """
    if isURL(url):
        return url.rsplit('/', 1)[0]
    else:
        # file
        return os.path.abspath(os.path.dirname(url))


def basename(url):
    """returns the last path component of `url`"""
    if isURL(url):
        return url.rsplit('/', 1)[-1]
    else:
        # file
        return os.path.basename(url)


def loader(url):
    """
    returns the function that reads `url`;
    raises `KeyError` for a scheme without a loader
    """
    return loaders[scheme(url)]


def load(url):
    """returns the contents of a URL"""
    return loader(url)(url)


def get_file(src, dest):
    """copy the local file `src` (path or `file://` URL) to `dest`"""
    # `scheme()` routes `file://` URLs here, so strip the prefix just as
    # `read_file` does; `shutil.copy2` only understands filesystem paths
    shutil.copy2(_local_path(src), dest)


def get_s3(src, dest):
    """fetch the s3 URL `src` to the local path `dest` with `s3cmd`"""
    subprocess.check_output(['s3cmd', 'get', src, dest])


def default_getter(src, dest):
    """
    fetch `src` through its loader and write the contents to the local
    path `dest`, creating the containing directory if needed;
    raises `ValueError` if `dest` is a URL
    """
    if isURL(dest):
        raise ValueError(
            "destination must be a local path, not a URL: {}".format(dest))
    dirname = parent(dest)
    ensure_dir(dirname)
    data = load(src)
    # on Python 3, http(s) loaders return bytes, which need a binary file;
    # `str` data keeps the original text mode
    mode = 'w' if isinstance(data, str) else 'wb'
    with open(dest, mode) as f:
        f.write(data)


# scheme -> function(src, dest); other schemes fall back to `default_getter`
getters = {'file': get_file,
           's3': get_s3
}


def get(src, dest):
    """get a thing to a local file"""
    if os.path.isdir(dest):
        # keep the source's file name when given a directory
        dest = os.path.join(dest, basename(src))
    return getters.get(scheme(src), default_getter)(src, dest)


def rel(base, path):
    """
    relative path to base
    otherwise, return None
    """
    if path.startswith(base):
        return path[len(base):]
    return None


def main(args=sys.argv[1:]):
    """CLI"""

    # parse command line
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('url', help='URL to read')
    parser.add_argument('-o', '--output', dest='output',
                        help="get to this location")
    options = parser.parse_args(args)

    if options.output:
        # copy src to this location
        get(options.url, options.output)
        sys.exit()

    # read location
    contents = load(options.url)

    # output
    raw = getattr(sys.stdout, 'buffer', None)
    if bytes is not str and isinstance(contents, bytes) and raw is not None:
        # Python 3 bytes: emit the raw data instead of its b'...' repr
        raw.write(contents + b'\n')
        raw.flush()
    else:
        print(contents)


if __name__ == '__main__':
    main()