annotate scripts/html2json.py @ 15:8e13c2f6c2d7

py3
author Jeff Hammel <k0scist@gmail.com>
date Tue, 24 Nov 2020 13:13:36 -0800
parents b0942f44413f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
#!/usr/bin/env python

"""
script to convert HTML microformat files to JSON:

<div class="project">
<h1><a href="${URL}">${PROJECT}</a></h1>
<p class="description">${DESCRIPTION}</p>

<!-- fields (lists) -->
<ul class="author"><li>${AUTHOR}</li></ul>
<ul class="usage"><li>${USAGE}</li></ul>
</div>
"""

### imports

import os

try:
    import json
except ImportError:
    import simplejson as json

from optparse import OptionParser

### constants

# list-valued fields read from <ul class="..."> children of a project div
FIELDS = ('author', 'usage', 'language', 'type')

# message shown when lxml cannot be imported
LXML_HELP = """You need lxml to run this script. Try running
`easy_install lxml`
It will work if you're lucky"""


### helpers

def load_etree():
    """
    Import and return `lxml.etree`.

    The import is deferred to here (rather than done at module level) so
    that the parsing helpers below can be imported without lxml installed.

    Raises ImportError, with installation advice, if lxml is missing.
    """
    try:
        from lxml import etree
    except ImportError:
        raise ImportError(LXML_HELP)
    return etree


def _squeeze(text):
    """Collapse each run of whitespace in `text` to one space; None -> ''."""
    return ' '.join((text or '').split())


def find_projects(document):
    """
    Return the list of <div class="project"> elements in `document`.

    `document` is a parsed element tree (anything offering `findall` and
    `getroot`).  When the descendant search finds nothing, the root element
    itself is accepted if it is a <div> whose class list contains 'project'.
    Returns an empty list when no project is present.
    """
    elements = document.findall(".//div[@class='project']")
    if elements:
        return elements
    root = document.getroot()
    if root.tag == 'div' and 'project' in root.attrib.get('class', '').split():
        return [root]
    return []


def parse_project(element):
    """
    Convert one project element into a JSON-serializable dict.

    Keys produced:
    - 'name': whitespace-normalized text of the <h1> (or of its <a>, if any)
    - 'url': the <a> href; only present when the heading holds a link that
      carries an href attribute
    - 'description': whitespace-normalized text of <p class="description">;
      only present when that paragraph exists
    - one key per entry of FIELDS whose <ul class="..."> has <li> items,
      mapped to the list of the items' text

    Raises ValueError if the element has no <h1> heading.
    """
    header = element.find('.//h1')
    if header is None:
        # previously this surfaced as an AttributeError on None
        raise ValueError('project <div> has no <h1> header')

    project = {}
    link = header.find('a')
    if link is None:
        name = header.text
    else:
        name = link.text
        href = link.attrib.get('href')
        if href is not None:
            project['url'] = href
    # .text is None for an empty element; _squeeze maps that to ''
    project['name'] = _squeeze(name)

    description = element.find("p[@class='description']")
    if description is not None:
        project['description'] = _squeeze(description.text)

    for field in FIELDS:
        listing = element.find("ul[@class='%s']" % field)
        if listing is None:
            continue
        values = [item.text for item in listing.findall('li')]
        if values:
            # an empty <ul> contributes no key at all
            project[field] = values
    return project


### entry point

def main(args=None):
    """
    Command-line entry point: print one JSON object per project in a file.

    `args` is the argument list to parse; None means sys.argv[1:].
    Exits through the option parser on usage errors, a missing file, or a
    document without any usable project.
    """
    # fail early, before any argument handling, if lxml is unavailable
    etree = load_etree()

    ### parse command line
    usage = '%prog file'
    parser = OptionParser(usage=usage, description=__doc__)
    parser.add_option('--pprint', dest='pprint',
                      action='store_true', default=False,
                      help="pretty-print the json")
    options, args = parser.parse_args(args)

    if len(args) != 1:
        parser.print_help()
        parser.exit()
    filename = args[0]
    if not os.path.exists(filename):
        # explicit check: an assert would vanish under `python -O`
        parser.error("%s not found" % filename)

    ### parse the file
    document = etree.parse(filename)
    elements = find_projects(document)
    if not elements:
        parser.error('No <div class="project"> found')

    ### print the projects
    indent = 2 if options.pprint else None
    for element in elements:
        try:
            project = parse_project(element)
        except ValueError as error:
            parser.error(str(error))
        # print(...) with a single argument works on Python 2 and 3 alike
        print(json.dumps(project, indent=indent))


if __name__ == '__main__':
    main()