toolbox: scripts/html2json.py comparison

comparison scripts/html2json.py @ 0:b0942f44413f

import from git://github.com/mozilla/toolbox.git

author	Jeff Hammel <k0scist@gmail.com>
date	Sun, 11 May 2014 09:15:35 -0700
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:b0942f44413f
#!/usr/bin/env python
"""
script to convert HTML microformat files to JSON:
<div class="project">
<h1><a href="${URL}">${PROJECT}</a></h1>
<p class="description">${DESCRIPTION}</p>
<!-- fields (lists) -->
<ul class="author"><li>${AUTHOR}</li></ul>
<ul class="usage"><li>${USAGE}</li></ul>
</div>
"""
# NOTE: the module docstring above is passed to OptionParser as the
# description shown by --help, so its text is part of the program's output.

### imports
import os
from optparse import OptionParser

try:
    from lxml import etree
except ImportError:
    # Reported by main() with the full installation hint; deferring the
    # failure keeps the helper functions importable without lxml.
    etree = None

try:
    import json
except ImportError:
    import simplejson as json

# Message raised by main() when lxml could not be imported.
LXML_MISSING = """You need lxml to run this script. Try running
`easy_install lxml`
It will work if you're lucky"""

# <ul class="..."> lists whose <li> items are collected into JSON arrays.
LIST_FIELDS = ('author', 'usage', 'language', 'type')


def normalize_whitespace(text):
    """Return *text* with every whitespace run collapsed to a single space.

    Leading and trailing whitespace is dropped.  ``None`` (what ``.text``
    yields for an element with no leading text) is treated as ``''``.
    """
    return ' '.join((text or '').split())


def find_projects(document):
    """Return the list of ``<div class="project">`` elements in *document*.

    *document* is a parsed element tree (anything offering ``findall`` and
    ``getroot``).  When the search finds nothing, the root element itself is
    returned (as a one-item list) if it is a ``div`` whose ``class``
    attribute contains ``project``.  An empty list means nothing was found.
    """
    elements = document.findall(".//div[@class='project']")
    if elements:
        return elements
    root = document.getroot()
    classes = root.attrib.get('class', '').split()
    if root.tag == 'div' and 'project' in classes:
        return [root]
    return []


def project_from_element(element):
    """Build the JSON-ready dict for one project element.

    Keys produced:

    * ``name`` -- text of the ``<a>`` inside the first ``<h1>``, or of the
      ``<h1>`` itself when it has no link; whitespace-normalized.
    * ``url`` -- the link's ``href``; only present when the ``<h1>``
      contains an ``<a>`` that carries an ``href`` attribute.
    * ``description`` -- whitespace-normalized text of
      ``<p class="description">``; only present when that element exists.
    * one key per entry of ``LIST_FIELDS`` -- list of the ``.text`` of each
      ``<li>`` in the matching ``<ul>``; only present when the list has at
      least one item.  Item text is stored as-is (it may be ``None``).

    Raises ``ValueError`` when the element has no ``<h1>``.
    """
    header = element.find('.//h1')
    if header is None:
        raise ValueError('<div class="project"> has no <h1> header')

    link = header.find('a')
    if link is not None:
        name, href = link.text, link.attrib.get('href')
    else:
        name, href = header.text, None

    project = {'name': normalize_whitespace(name)}
    if href is not None:
        project['url'] = href

    description = element.find("p[@class='description']")
    if description is not None:
        project['description'] = normalize_whitespace(description.text)

    for field in LIST_FIELDS:
        listing = element.find("ul[@class='%s']" % field)
        if listing is None:
            continue
        values = [item.text for item in listing.findall('li')]
        if values:
            project[field] = values
    return project


def main(args=None):
    """Command-line entry point: print one JSON object per project.

    *args* is the argument list handed to ``OptionParser.parse_args``
    (``None`` makes optparse read ``sys.argv[1:]``).

    Raises ``ImportError`` when lxml is not installed.  Usage problems
    (missing file, no project found, project without a header) are reported
    through ``parser.error``, which exits the process.
    """
    if etree is None:
        raise ImportError(LXML_MISSING)

    ### parse command line
    usage = '%prog file'
    parser = OptionParser(usage=usage, description=__doc__)
    parser.add_option('--pprint', dest='pprint',
                      action='store_true', default=False,
                      help="pretty-print the json")
    options, args = parser.parse_args(args)
    if len(args) != 1:
        parser.print_help()
        parser.exit()
    filename = args[0]
    # Explicit check rather than `assert`, which is stripped under -O.
    if not os.path.exists(filename):
        parser.error("%s not found" % filename)

    ### parse the file
    document = etree.parse(filename)
    elements = find_projects(document)
    if not elements:
        parser.error('No <div class="project"> found')

    ### print the projects
    indent = 2 if options.pprint else None
    for element in elements:
        try:
            project = project_from_element(element)
        except ValueError as error:
            parser.error(str(error))
        # A single parenthesized argument prints identically whether
        # `print` is a statement (Python 2) or a function (Python 3).
        print(json.dumps(project, indent=indent))


if __name__ == '__main__':
    main()

Mercurial > hg > toolbox

comparison scripts/html2json.py @ 0:b0942f44413f