Mercurial > hg > toolbox
annotate scripts/html2json.py @ 17:cabe97535057
py3
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Tue, 24 Nov 2020 13:15:33 -0800 |
parents | b0942f44413f |
children |
rev | line source |
---|---|
0
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
2 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
3 """ |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
4 script to convert HTML microformat files to JSON: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
5 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
6 <div class="project"> |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
7 <h1><a href="${URL}">${PROJECT}</a></h1> |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
8 <p class="description">${DESCRIPTION}</p> |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
9 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
10 <!-- fields (lists) --> |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
11 <ul class="author"><li>${AUTHOR}</li></ul> |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
12 <ul class="usage"><li>${USAGE}</li></ul> |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
13 </div> |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
14 """ |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
15 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
16 ### imports |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
17 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
18 import os |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
19 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
20 try: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
21 from lxml import etree |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
22 except ImportError: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
23 raise ImportError("""You need lxml to run this script. Try running |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
24 `easy_install lxml` |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
25 It will work if you're lucky""") |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
26 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
27 try: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
28 import json |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
29 except ImportError: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
30 import simplejson as json |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
31 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
32 ### parse command line |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
33 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
34 from optparse import OptionParser |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
35 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
36 usage = '%prog file' |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
37 parser = OptionParser(usage=usage, description=__doc__) |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
38 parser.add_option('--pprint', dest='pprint', |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
39 action='store_true', default=False, |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
40 help="pretty-print the json") |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
41 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
42 options, args = parser.parse_args() |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
43 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
44 if not len(args) == 1: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
45 parser.print_help() |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
46 parser.exit() |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
47 filename = args[0] |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
48 assert os.path.exists(filename), "%s not found" % filename |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
49 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
50 ### parse the file |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
51 document = etree.parse(filename) |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
52 elements = document.findall(".//div[@class='project']") |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
53 if not elements: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
54 root = document.getroot() |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
55 if root.tag == 'div' and 'project' in root.attrib.get('class', '').split(): |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
56 elements = [root] |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
57 if not elements: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
58 parser.error('No <div class="project"> found') |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
59 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
60 # print the projects |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
61 for element in elements: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
62 project = {} |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
63 header = element.find('.//h1') |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
64 link = header.find('a') |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
65 if link is not None: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
66 project['name'] = link.text |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
67 project['url'] = link.attrib['href'] |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
68 else: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
69 project['name'] = header.text |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
70 project['name'] = ' '.join(project['name'].strip().split()) |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
71 description = element.find("p[@class='description']") |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
72 if description is not None: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
73 project['description'] = description.text or '' |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
74 project['description'] = ' '.join(project['description'].strip().split()) |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
75 for field in ('author', 'usage', 'language', 'type'): |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
76 e = element.find("ul[@class='%s']" % field) |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
77 if e is not None: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
78 values = e.findall('li') |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
79 for value in values: |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
80 project.setdefault(field, []).append(value.text) |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
81 indent = options.pprint and 2 or None |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
82 print json.dumps(project, indent=indent) |