comparison scripts/html2json.py @ 0:b0942f44413f

import from git://github.com/mozilla/toolbox.git
author Jeff Hammel <k0scist@gmail.com>
date Sun, 11 May 2014 09:15:35 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b0942f44413f
#!/usr/bin/env python

"""
script to convert HTML microformat files to JSON:

<div class="project">
<h1><a href="${URL}">${PROJECT}</a></h1>
<p class="description">${DESCRIPTION}</p>

<!-- fields (lists) -->
<ul class="author"><li>${AUTHOR}</li></ul>
<ul class="usage"><li>${USAGE}</li></ul>
</div>
"""

# NOTE: the module docstring above doubles as the --help description
# (it is passed to OptionParser), so keep it user-facing.

### imports

import os

try:
    import json
except ImportError:
    import simplejson as json

from optparse import OptionParser

### constants

# <ul class="..."> lists that are copied into the output as JSON arrays
FIELDS = ('author', 'usage', 'language', 'type')


### helpers

def load_etree():
    """
    Return the `lxml.etree` module.

    The import is deferred until a file is actually parsed so that
    `--help` and usage errors work even when lxml is not installed.
    Raises ImportError with installation advice if lxml is missing.
    """
    try:
        from lxml import etree
    except ImportError:
        raise ImportError("""You need lxml to run this script. Try running
`easy_install lxml`
It will work if you're lucky""")
    return etree


def collapse_whitespace(text):
    """
    Return `text` with leading/trailing whitespace removed and every
    internal run of whitespace replaced by a single space.

    `None` (an element without direct text) is treated as ''.
    """
    return ' '.join((text or '').split())


def find_projects(document):
    """
    Return the list of <div class="project"> elements in `document`.

    `document` is a parsed element tree (anything offering `findall`
    and `getroot`).  Descendant divs are searched first; if there are
    none, the root element itself is used when it is a div whose class
    attribute contains the token "project".  Returns an empty list
    when nothing matches.
    """
    elements = document.findall(".//div[@class='project']")
    if not elements:
        # './/div' only matches descendants, so a document that *is*
        # a single project div has to be checked separately
        root = document.getroot()
        classes = root.attrib.get('class', '').split()
        if root.tag == 'div' and 'project' in classes:
            elements = [root]
    return elements


def parse_project(element):
    """
    Convert one <div class="project"> element to a dictionary.

    Keys of the result:
    - name: whitespace-normalized text of the <h1> (or of the <a>
      directly inside it, when present)
    - url: href of that <a>; only set if the link has an href
    - description: whitespace-normalized text of a direct child
      <p class="description">; only set if that element exists
    - author, usage, language, type: list of the <li> texts of the
      matching direct child <ul class="...">; only set if the list
      has at least one <li>

    Raises ValueError if the element contains no <h1>.
    """
    project = {}

    header = element.find('.//h1')
    if header is None:
        raise ValueError('<div class="project"> has no <h1> heading')
    link = header.find('a')
    if link is not None:
        name = link.text
        href = link.attrib.get('href')
        if href is not None:
            project['url'] = href
    else:
        name = header.text
    project['name'] = collapse_whitespace(name)

    description = element.find("p[@class='description']")
    if description is not None:
        project['description'] = collapse_whitespace(description.text)

    for field in FIELDS:
        listing = element.find("ul[@class='%s']" % field)
        if listing is not None:
            for item in listing.findall('li'):
                # item text is passed through untouched
                project.setdefault(field, []).append(item.text)

    return project


### command line entry point

def main(args=None):
    """
    Parse the command line, read the HTML file and print one JSON
    object per project found.

    `args` is the list of command line arguments; None means
    `sys.argv[1:]`.
    """

    ### parse command line
    usage = '%prog file'
    parser = OptionParser(usage=usage, description=__doc__)
    parser.add_option('--pprint', dest='pprint',
                      action='store_true', default=False,
                      help="pretty-print the json")
    options, args = parser.parse_args(args)

    if len(args) != 1:
        parser.print_help()
        parser.exit()
    filename = args[0]
    if not os.path.exists(filename):
        # report as a usage error; an assert would vanish under -O
        parser.error("%s not found" % filename)

    ### parse the file
    etree = load_etree()
    document = etree.parse(filename)
    elements = find_projects(document)
    if not elements:
        parser.error('No <div class="project"> found')

    ### print the projects
    indent = 2 if options.pprint else None
    for element in elements:
        project = parse_project(element)
        # parenthesized single argument: valid on Python 2 and 3
        print(json.dumps(project, indent=indent))


if __name__ == '__main__':
    main()