Mercurial > hg > toolbox
view scripts/html2json.py @ 0:b0942f44413f
import from git://github.com/mozilla/toolbox.git
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 11 May 2014 09:15:35 -0700 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python """ script to convert HTML microformat files to JSON: <div class="project"> <h1><a href="${URL}">${PROJECT}</a></h1> <p class="description">${DESCRIPTION}</p> <!-- fields (lists) --> <ul class="author"><li>${AUTHOR}</li></ul> <ul class="usage"><li>${USAGE}</li></ul> </div> """ ### imports import os try: from lxml import etree except ImportError: raise ImportError("""You need lxml to run this script. Try running `easy_install lxml` It will work if you're lucky""") try: import json except ImportError: import simplejson as json ### parse command line from optparse import OptionParser usage = '%prog file' parser = OptionParser(usage=usage, description=__doc__) parser.add_option('--pprint', dest='pprint', action='store_true', default=False, help="pretty-print the json") options, args = parser.parse_args() if not len(args) == 1: parser.print_help() parser.exit() filename = args[0] assert os.path.exists(filename), "%s not found" % filename ### parse teh file document = etree.parse(filename) elements = document.findall(".//div[@class='project']") if not elements: root = document.getroot() if root.tag == 'div' and 'project' in root.attrib.get('class', '').split(): elements = [root] if not elements: parser.error('No <div class="project"> found') # print teh projects for element in elements: project = {} header = element.find('.//h1') link = header.find('a') if link is not None: project['name'] = link.text project['url'] = link.attrib['href'] else: project['name'] = header.text project['name'] = ' '.join(project['name'].strip().split()) description = element.find("p[@class='description']") if description is not None: project['description'] = description.text or '' project['description'] = ' '.join(project['description'].strip().split()) for field in ('author', 'usage', 'language', 'type'): e = element.find("ul[@class='%s']" % field) if e is not None: values = e.findall('li') for value in values: project.setdefault(field, []).append(value.text) indent = options.pprint and 2 or None print json.dumps(project, indent=indent)