Mercurial > hg > config
annotate python/url.py @ 895:8d3df8c0c730
wordstream is a requirement
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Fri, 13 Aug 2021 15:16:21 -0700 |
parents | f011ec45b8e8 |
children |
rev | line source |
---|---|
754
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
3 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
4 """ |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
5 url manipulation |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
6 """ |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
7 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
8 import argparse |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
9 import os |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
10 import shutil |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
11 import subprocess |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
12 import sys |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
13 import tempfile |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
14 import urlparse |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
15 import urllib2 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
16 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# names made available by a star-import of this module
__all__ = ['load', 'main']

# tuple of string types, suitable for `isinstance` checks (Python 2 names)
string = (str, unicode)
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
19 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def ensure_dir(directory):
    """
    ensure `directory` is a directory, creating it (and any missing
    parents) if it does not exist; returns `directory` unchanged

    Raises AssertionError if the path exists but is not a directory.
    """
    if os.path.exists(directory):
        # raise explicitly: a bare `assert` is stripped under `python -O`,
        # which would silently skip this check
        if not os.path.isdir(directory):
            raise AssertionError("Not a directory: %r" % (directory,))
        return directory
    try:
        os.makedirs(directory)
    except OSError:
        # the directory may have been created by someone else between the
        # existence check and `makedirs`; only propagate a real failure
        if not os.path.isdir(directory):
            raise
    return directory
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
27 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def isURL(url):
    """whether `url` contains a '://' scheme separator"""
    separator = '://'
    return separator in url
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
30 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def read_s3(url):
    """
    return the contents of the S3 object at `url`

    The object is downloaded with `s3cmd get` to a temporary location,
    read back, and the temporary location is removed afterwards.
    Raises subprocess.CalledProcessError if s3cmd exits non-zero.
    """
    # download into a private temporary directory instead of a
    # `tempfile.mktemp()` name: that name is only a guess at an unused
    # path and another process can claim it before s3cmd writes to it
    directory = tempfile.mkdtemp()
    try:
        name = os.path.join(directory, 'download')
        subprocess.check_output(['s3cmd', 'get', url, name])
        with open(name) as f:
            return f.read()
    finally:
        # best-effort cleanup on both the success and the failure path
        shutil.rmtree(directory, ignore_errors=True)
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
42 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def read_http(url):
    """return the body of the response obtained by opening `url`"""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # release the connection instead of leaving it to the
        # garbage collector
        response.close()
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
45 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def read_file(url):
    """
    return the contents of a local file

    `url` is either a plain path or a path prefixed with 'file://'.
    """
    # named `prefix` so the module-level `scheme` function is not shadowed
    prefix = 'file://'
    if url.startswith(prefix):
        url = url[len(prefix):]
    # the context manager closes the handle even if reading fails
    with open(url) as f:
        return f.read()
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
51 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# dispatch table: URL scheme -> function returning the contents of a URL
loaders = dict(s3=read_s3,
               http=read_http,
               https=read_http,
               file=read_file)
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
57 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def scheme(url):
    """
    return the scheme of `url`;
    anything without a '://' separator is treated as a local 'file'
    """
    if '://' not in url:
        return 'file'
    return urlparse.urlsplit(url).scheme
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
63 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def parent(url):
    """
    return the parent location of `url`:
    everything before the last '/' for a URL,
    or the absolute directory name for a local path
    """
    if '://' not in url:
        # local file
        directory = os.path.dirname(url)
        return os.path.abspath(directory)
    return url.rpartition('/')[0]
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
70 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def basename(url):
    """
    return the final component of `url`:
    the text after the last '/' for a URL,
    or `os.path.basename` for a local path
    """
    if '://' not in url:
        # local file
        return os.path.basename(url)
    return url.rpartition('/')[2]
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
77 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def loader(url):
    """return the function in `loaders` registered for the scheme of `url`"""
    kind = scheme(url)
    return loaders[kind]
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
80 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def load(url):
    """returns the contents of a URL, read with the loader for its scheme"""
    reader = loader(url)
    return reader(url)
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
84 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def get_file(src, dest):
    """
    copy the local file `src` to `dest`

    `src` is either a plain path or a path prefixed with 'file://'.
    """
    # `get` dispatches here for 'file://...' sources as well as bare
    # paths, so drop the prefix the same way `read_file` does
    prefix = 'file://'
    if src.startswith(prefix):
        src = src[len(prefix):]
    shutil.copy2(src, dest)
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
87 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def get_s3(src, dest):
    """download the S3 object `src` to the local path `dest` with s3cmd"""
    command = ['s3cmd', 'get', src, dest]
    subprocess.check_output(command)
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
90 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def default_getter(src, dest):
    """
    fetch `src` with `load` and write its contents to the local file `dest`

    This is the fallback used by `get` for schemes without an entry in
    `getters`. The parent directory of `dest` is created if needed.
    Raises ValueError if `dest` looks like a URL rather than a local path.
    """
    # `isURL` is this module's helper; `os.path` has no such function
    if isURL(dest):
        raise ValueError("Destination must be a local path: %r" % (dest,))
    ensure_dir(parent(dest))
    # fetch before opening `dest` so a failed download does not leave
    # an empty file behind
    contents = load(src)
    # binary mode: write the loaded data without newline translation
    with open(dest, 'wb') as f:
        f.write(contents)
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
97 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# dispatch table: URL scheme -> function copying a source to a local path;
# schemes not listed here fall back to `default_getter` in `get`
getters = dict(file=get_file,
               s3=get_s3)
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
101 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def get(src, dest):
    """
    get a thing to a local file;
    if `dest` is an existing directory, the file is placed inside it
    under the basename of `src`
    """
    target = dest
    if os.path.isdir(target):
        target = os.path.join(target, basename(src))
    fetch = getters.get(scheme(src), default_getter)
    return fetch(src, target)
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
107 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def rel(base, path):
    """
    return the remainder of `path` after the leading `base`
    if `path` starts with `base`; otherwise, return None
    """
    if not path.startswith(base):
        return None
    offset = len(base)
    return path[offset:]
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
116 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def main(args=sys.argv[1:]):
    """CLI: print the contents of a URL, or save them with -o/--output"""

    # command line interface
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument('url', help='URL to read')
    cli.add_argument('-o', '--output', dest='output',
                     help="get to this location")
    parsed = cli.parse_args(args)

    destination = parsed.output
    if destination:
        # copy the source to the requested location and stop
        get(parsed.url, destination)
        sys.exit()

    # no destination given: read the location and print what was read
    contents = load(parsed.url)
    print(contents)
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
137 |
f011ec45b8e8
add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# run the command line interface when executed as a script
if __name__ == '__main__':
    main()