Mercurial > hg > toolbox
comparison scripts/html2json.py @ 0:b0942f44413f
import from git://github.com/mozilla/toolbox.git
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 11 May 2014 09:15:35 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b0942f44413f |
---|---|
#!/usr/bin/env python

"""
script to convert HTML microformat files to JSON:

<div class="project">
<h1><a href="${URL}">${PROJECT}</a></h1>
<p class="description">${DESCRIPTION}</p>

<!-- fields (lists) -->
<ul class="author"><li>${AUTHOR}</li></ul>
<ul class="usage"><li>${USAGE}</li></ul>
</div>
"""

### imports

import os

try:
    import json
except ImportError:
    import simplejson as json

from optparse import OptionParser

### constants

# <ul class="..."> lists that are copied into the JSON output
FIELDS = ('author', 'usage', 'language', 'type')

# message shown when the lxml dependency is missing
LXML_REQUIRED = """You need lxml to run this script. Try running
`easy_install lxml`
It will work if you're lucky"""


### helpers

def normalize_whitespace(text):
    """Return `text` with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is dropped; `None` (an element
    without text) is treated as the empty string.
    """
    return ' '.join((text or '').split())


def find_projects(document):
    """Return the list of <div class="project"> elements in `document`.

    `document` is an ElementTree-style object (`findall` and `getroot`).
    A descendant search never yields the root element itself, so when
    nothing is found the root is accepted if it is a <div> whose class
    attribute contains the `project` token.  The result may be empty.
    """
    elements = document.findall(".//div[@class='project']")
    if not elements:
        root = document.getroot()
        classes = root.attrib.get('class', '').split()
        if root.tag == 'div' and 'project' in classes:
            elements = [root]
    return elements


def project_to_dict(element):
    """Convert one project element into a JSON-serializable dict.

    Keys:
    - name: whitespace-normalized text of the <h1> (or of its <a> child)
    - url: the link's href; only present when the <h1> wraps an <a href>
    - description: whitespace-normalized text of <p class="description">,
      only present when that paragraph exists
    - one list per entry of FIELDS whose <ul> holds at least one <li>

    Raises ValueError when the element has no <h1>.
    """
    header = element.find('.//h1')
    if header is None:
        raise ValueError('No <h1> found in <div class="project">')

    project = {}
    link = header.find('a')
    if link is None:
        project['name'] = normalize_whitespace(header.text)
    else:
        project['name'] = normalize_whitespace(link.text)
        url = link.attrib.get('href')
        if url is not None:
            # an anchor without href simply carries no URL
            project['url'] = url

    description = element.find("p[@class='description']")
    if description is not None:
        project['description'] = normalize_whitespace(description.text)

    for field in FIELDS:
        listing = element.find("ul[@class='%s']" % field)
        if listing is None:
            continue
        # list item text is emitted as-is (not whitespace-normalized)
        items = [item.text for item in listing.findall('li')]
        if items:
            project[field] = items
    return project


### command line entry point

def main(args=None):
    """Parse the command line and print one JSON document per project.

    `args` is the list of command line arguments; None means
    `sys.argv[1:]`.  Usage problems (missing file, no project found,
    project without a header) are reported through the option parser,
    which prints the message and exits with status 2.
    """
    # fail early with a helpful message when lxml is unavailable
    try:
        from lxml import etree
    except ImportError:
        raise ImportError(LXML_REQUIRED)

    # parse command line
    usage = '%prog file'
    parser = OptionParser(usage=usage, description=__doc__)
    parser.add_option('--pprint', dest='pprint',
                      action='store_true', default=False,
                      help="pretty-print the json")
    options, positional = parser.parse_args(args)
    if len(positional) != 1:
        parser.print_help()
        parser.exit()
    filename = positional[0]
    if not os.path.exists(filename):
        # a real check rather than `assert`, which vanishes under -O
        parser.error("%s not found" % filename)

    # parse the file
    document = etree.parse(filename)
    elements = find_projects(document)
    if not elements:
        parser.error('No <div class="project"> found')

    # print the projects
    indent = 2 if options.pprint else None
    for element in elements:
        try:
            project = project_to_dict(element)
        except ValueError as error:
            parser.error(str(error))
        # single-argument call form behaves the same on Python 2 and 3
        print(json.dumps(project, indent=indent))


if __name__ == '__main__':
    main()