Mercurial > hg > toolbox
comparison scripts/html2json.py @ 0:b0942f44413f
import from git://github.com/mozilla/toolbox.git
| author | Jeff Hammel <k0scist@gmail.com> |
|---|---|
| date | Sun, 11 May 2014 09:15:35 -0700 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b0942f44413f |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 """ | |
| 4 script to convert HTML microformat files to JSON: | |
| 5 | |
| 6 <div class="project"> | |
| 7 <h1><a href="${URL}">${PROJECT}</a></h1> | |
| 8 <p class="description">${DESCRIPTION}</p> | |
| 9 | |
| 10 <!-- fields (lists) --> | |
| 11 <ul class="author"><li>${AUTHOR}</li></ul> | |
| 12 <ul class="usage"><li>${USAGE}</li></ul> | |
| 13 </div> | |
| 14 """ | |
| 15 | |
| 16 ### imports | |
| 17 | |
| 18 import os | |
| 19 | |
| 20 try: | |
| 21 from lxml import etree | |
| 22 except ImportError: | |
| 23 raise ImportError("""You need lxml to run this script. Try running | |
| 24 `easy_install lxml` | |
| 25 It will work if you're lucky""") | |
| 26 | |
| 27 try: | |
| 28 import json | |
| 29 except ImportError: | |
| 30 import simplejson as json | |
| 31 | |
### parse command line

from optparse import OptionParser

usage = '%prog file'

# microformat fields stored as <ul class="FIELD"><li>VALUE</li>...</ul>
LIST_FIELDS = ('author', 'usage', 'language', 'type')


def collapse_whitespace(text):
    """Return `text` stripped, with each run of whitespace reduced to one space.

    `None` (an element with no leading text) is treated as the empty string
    so that callers never have to special-case empty elements.
    """
    return ' '.join((text or '').split())


def find_projects(document):
    """Return the list of <div class="project"> elements in a parsed document.

    If the descendant search finds nothing, the root element itself is
    accepted when it is a <div> whose class list contains "project".
    An empty list means the document holds no project at all.
    """
    elements = document.findall(".//div[@class='project']")
    if not elements:
        root = document.getroot()
        if root.tag == 'div' and 'project' in root.attrib.get('class', '').split():
            elements = [root]
    return elements


def project_from_element(element):
    """Convert one project element into a JSON-serializable dict.

    The dict always has a 'name'; 'url', 'description' and the list fields
    in LIST_FIELDS are present only when the markup provides them.
    Returns None when the element has no <h1>, since a project cannot be
    named without one.
    """
    header = element.find('.//h1')
    if header is None:
        return None

    # the name is the link text when the <h1> wraps a link, else the <h1> text
    link = header.find('a')
    if link is None:
        project = {'name': collapse_whitespace(header.text)}
    else:
        project = {'name': collapse_whitespace(link.text)}
        # an <a> without href still names the project; it just has no url
        if 'href' in link.attrib:
            project['url'] = link.attrib['href']

    description = element.find("p[@class='description']")
    if description is not None:
        project['description'] = collapse_whitespace(description.text)

    for field in LIST_FIELDS:
        container = element.find("ul[@class='%s']" % field)
        if container is not None:
            values = [item.text for item in container.findall('li')]
            # a <ul> with no <li> children contributes no key at all
            if values:
                project[field] = values
    return project


def main(args=None):
    """Command-line entry point: print one JSON object per project in a file.

    `args` is the argument list without the program name; None means
    use the process command line. Usage errors (missing file, no project
    <div>, project without <h1>) are reported through `parser.error`,
    which exits with status 2.
    """
    parser = OptionParser(usage=usage, description=__doc__)
    parser.add_option('--pprint', dest='pprint',
                      action='store_true', default=False,
                      help="pretty-print the json")

    options, args = parser.parse_args(args)

    if len(args) != 1:
        parser.print_help()
        parser.exit()
    filename = args[0]
    # not an assert: assertions are stripped when python runs with -O
    if not os.path.exists(filename):
        parser.error("%s not found" % filename)

    ### parse the file
    document = etree.parse(filename)
    elements = find_projects(document)
    if not elements:
        parser.error('No <div class="project"> found')

    # print the projects
    indent = 2 if options.pprint else None
    for element in elements:
        project = project_from_element(element)
        if project is None:
            parser.error('<div class="project"> has no <h1> to name the project')
        # single-argument call form works as both statement and function
        print(json.dumps(project, indent=indent))


if __name__ == '__main__':
    main()
