annotate python/url.py @ 925:a92db57f62f8 default tip

add lxml
author Jeff Hammel <k0scist@gmail.com>
date Mon, 20 Jan 2025 09:20:00 -0800
parents f011ec45b8e8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
754
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2 # -*- coding: utf-8 -*-
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4 """
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 url manipulation
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6 """
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 import argparse
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
9 import os
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
10 import shutil
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
11 import subprocess
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
12 import sys
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
13 import tempfile
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
14 import urlparse
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
15 import urllib2
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
16
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# names exported by `from <this module> import *`
__all__ = ['load', 'main']
# tuple of the Python 2 text types (byte strings and unicode strings);
# NOTE(review): not referenced by any code visible in this file -- presumably
# intended for isinstance() checks; confirm before removing
string = (str, unicode)
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
19
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def ensure_dir(directory):
    """
    make sure `directory` exists as a directory and return it

    If the path does not exist it is created (with any missing parents)
    via `os.makedirs`.  If it exists but is not a directory, the assertion
    fails with an AssertionError.
    """
    if not os.path.exists(directory):
        # nothing there yet: create the whole chain
        os.makedirs(directory)
        return directory

    # something already lives at this path; it must be a directory
    assert os.path.isdir(directory)
    return directory
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
27
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def isURL(url):
    """return whether `url` contains a '://' scheme separator"""
    separator = '://'
    return separator in url
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
30
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def read_s3(url):
    """
    return the contents of the s3 object at `url`

    The object is downloaded with the external `s3cmd get` command into a
    file inside a freshly created private temporary directory, read back,
    and the directory is removed afterwards whether or not the download
    succeeded.

    Raises whatever `subprocess.check_output` raises, e.g.
    `subprocess.CalledProcessError` if `s3cmd` exits non-zero.
    """
    # tempfile.mktemp() only invents a name, so another process could create
    # that path first; mkdtemp() atomically creates a directory only we own,
    # and the download target inside it does not exist yet.
    directory = tempfile.mkdtemp()
    try:
        name = os.path.join(directory, 'download')
        subprocess.check_output(['s3cmd', 'get', url, name])
        with open(name) as f:
            return f.read()
    finally:
        # single cleanup point for both the success and failure paths
        shutil.rmtree(directory, ignore_errors=True)
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
42
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def read_http(url):
    """
    return the body fetched from `url` via `urllib2.urlopen`

    The response object is always closed, even if reading fails.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # urllib2 responses are not context managers on Python 2,
        # so close explicitly instead of leaking the connection
        response.close()
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
45
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def read_file(url):
    """
    return the contents of a local file

    `url` may be a plain filesystem path or a path prefixed with 'file://';
    the prefix, if present, is stripped before opening.
    """
    # named `prefix` so the module-level scheme() function is not shadowed
    prefix = 'file://'
    if url.startswith(prefix):
        url = url[len(prefix):]
    # context manager closes the handle promptly instead of leaking it
    with open(url) as f:
        return f.read()
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
51
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# dispatch table: URL scheme -> function that returns the contents of such a URL
loaders = {
    's3': read_s3,
    'http': read_http,
    'https': read_http,
    'file': read_file,
}
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
57
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def scheme(url):
    """
    return the scheme of `url`

    Anything containing '://' is parsed with `urlparse.urlsplit` and its
    scheme component is returned; everything else is treated as a local
    path and reported as 'file'.
    """
    if '://' not in url:
        return 'file'
    return urlparse.urlsplit(url).scheme
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
63
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def parent(url):
    """
    return the parent location of `url`

    For anything containing '://' this is the text before the last '/';
    for a local path it is the absolute path of its directory.
    """
    if '://' not in url:
        # local file
        directory = os.path.dirname(url)
        return os.path.abspath(directory)
    head, _, _ = url.rpartition('/')
    return head
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
70
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def basename(url):
    """
    return the final component of `url`

    For anything containing '://' this is the text after the last '/';
    for a local path it is `os.path.basename`.
    """
    if '://' not in url:
        # local file
        return os.path.basename(url)
    _, _, tail = url.rpartition('/')
    return tail
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
77
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def loader(url):
    """
    return the function registered in `loaders` for the scheme of `url`

    Raises KeyError if the scheme has no entry in `loaders`.
    """
    kind = scheme(url)
    return loaders[kind]
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
80
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def load(url):
    """returns the contents of a URL, read by the loader matching its scheme"""
    reader = loader(url)
    return reader(url)
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
84
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def get_file(src, dest):
    """
    copy the local file `src` to `dest` with `shutil.copy2`

    `src` may be a plain path or carry a 'file://' prefix; get() routes
    both forms here (scheme 'file'), so the prefix is stripped the same
    way read_file() does before handing the path to the filesystem.
    """
    prefix = 'file://'
    if src.startswith(prefix):
        src = src[len(prefix):]
    shutil.copy2(src, dest)
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
87
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def get_s3(src, dest):
    """download `src` to `dest` by running the external `s3cmd get` command"""
    command = ['s3cmd', 'get', src, dest]
    subprocess.check_output(command)
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
90
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def default_getter(src, dest):
    """
    fetch `src` with load() and write the result to the local file `dest`

    Used by get() for any scheme without a dedicated entry in `getters`.
    The directory that will contain `dest` is created if needed.

    Raises ValueError if `dest` looks like a URL rather than a local path.
    """
    # the original called os.path.isURL, which does not exist; the check
    # lives in this module's isURL()
    if isURL(dest):
        raise ValueError("destination must be a local path, not a URL: %r" % (dest,))
    ensure_dir(parent(dest))
    # load before opening so a failed fetch does not leave an empty or
    # truncated destination file behind (the original also referenced an
    # undefined name `url` here instead of `src`)
    contents = load(src)
    # binary mode: write the fetched data verbatim, no newline translation
    with open(dest, 'wb') as f:
        f.write(contents)
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
97
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# dispatch table: URL scheme -> function(src, dest) that fetches src to a local
# file; get() falls back to default_getter for schemes not listed here
getters = {
    'file': get_file,
    's3': get_s3,
}
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
101
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def get(src, dest):
    """
    get a thing to a local file

    If `dest` is an existing directory, the file is placed inside it under
    the basename of `src`.  The work is delegated to the entry in `getters`
    for the scheme of `src`, or to default_getter when there is none; that
    function's return value is returned.
    """
    if os.path.isdir(dest):
        filename = basename(src)
        dest = os.path.join(dest, filename)
    getter = getters.get(scheme(src), default_getter)
    return getter(src, dest)
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
107
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def rel(base, path):
    """
    return the part of `path` that follows the prefix `base`;
    if `path` does not start with `base`, return None
    """
    if not path.startswith(base):
        return None
    prefix_length = len(base)
    return path[prefix_length:]
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
116
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def main(args=sys.argv[1:]):
    """
    CLI: print the contents of a URL, or with -o/--output fetch it to a
    location and exit
    """

    # build the command line interface and parse `args`
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('url', help='URL to read')
    parser.add_argument('-o', '--output', dest='output',
                        help="get to this location")
    options = parser.parse_args(args)

    if not options.output:
        # no destination given: read the location and print it
        contents = load(options.url)
        print (contents)
        return

    # copy src to the requested location, then terminate
    get(options.url, options.output)
    sys.exit()
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
137
f011ec45b8e8 add example load type interface
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# run the command line interface only when executed as a script
if __name__ == '__main__':
    main()