diff scripts/html2json.py @ 0:b0942f44413f

import from git://github.com/mozilla/toolbox.git
author Jeff Hammel <k0scist@gmail.com>
date Sun, 11 May 2014 09:15:35 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/html2json.py	Sun May 11 09:15:35 2014 -0700
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+"""
+script to convert HTML microformat files to JSON:
+
+<div class="project">
+<h1><a href="${URL}">${PROJECT}</a></h1>
+<p class="description">${DESCRIPTION}</p>
+
+<!-- fields (lists) -->
+<ul class="author"><li>${AUTHOR}</li></ul>
+<ul class="usage"><li>${USAGE}</li></ul>
+</div>
+"""
+
+### imports
+
+import os
+
+try:
+    from lxml import etree
+except ImportError:
+    raise ImportError("""You need lxml to run this script. Try running
+    `easy_install lxml`
+    It will work if you're lucky""")
+
+try:
+    import json
+except ImportError:
+    import simplejson as json
+
+### parse command line
+
+from optparse import OptionParser
+
+# exactly one positional argument is expected: the HTML file to convert
+usage = '%prog file'
+parser = OptionParser(usage=usage, description=__doc__)
+parser.add_option('--pprint', dest='pprint', action='store_true',
+                  default=False, help="pretty-print the json")
+options, args = parser.parse_args()
+
+if len(args) != 1:  # wrong argument count: show the help text, exit status 0
+    parser.print_help()
+    parser.exit()
+filename = args[0]
+if not os.path.exists(filename):  # not `assert`: it is stripped under -O
+    parser.error("%s not found" % filename)
+
+### parse the file
+# NOTE: etree.parse uses lxml's XML parser, so the input must be well-formed
+document = etree.parse(filename)
+# collect every project <div>, the root included; the class attribute is
+# matched per token so that e.g. class="project featured" also qualifies
+elements = [div for div in document.getroot().iter('div')
+            if 'project' in div.get('class', '').split()]
+if not elements:
+    parser.error('No <div class="project"> found')
+
+# print the projects as JSON, one object per <div class="project">
+indent = 2 if options.pprint else None  # loop-invariant, so computed once
+for element in elements:
+    header = element.find('.//h1')
+    if header is None:  # fail with a clear message, not an AttributeError
+        parser.error('<div class="project"> has no <h1> to name it')
+    project = {}
+    link = header.find('a')
+    named = header if link is None else link  # the link text wins if present
+    if link is not None and link.get('href') is not None:
+        project['url'] = link.get('href')
+    # .text is None for an element without leading text; split() also strips
+    project['name'] = ' '.join((named.text or '').split())
+    description = element.find("p[@class='description']")
+    if description is not None:
+        project['description'] = ' '.join((description.text or '').split())
+    for field in ('author', 'usage', 'language', 'type'):
+        ul = element.find("ul[@class='%s']" % field)
+        # an empty <li> has .text None and is emitted as JSON null
+        values = [] if ul is None else [li.text for li in ul.findall('li')]
+        if values:  # fields without any <li> are left out of the output
+            project[field] = values
+    print(json.dumps(project, indent=indent))  # call form: Python 2 and 3