#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Provenance (from the HG changeset patch this file was carried in):
#   User:    Jeff Hammel
#   Date:    1457382124 28800
#   Node ID: cb1b91c6bcebe615093e1733229d4af921b6d2c9
#   Parent:  489204193cd7c19132044bb859c4910f757bf36f
#   Summary: example program for HTML -> text conversion using only HTMLParser
#   Path:    python/html2text.py

"""
convert HTML to text using only HTMLParser
"""

# imports
import argparse
import sys

try:
    # Python 3 location of the parser
    from html.parser import HTMLParser
except ImportError:
    # Python 2 fallback (the module the script was originally written for)
    from HTMLParser import HTMLParser


class HTML2Text(HTMLParser):
    """Collect the visible text found inside the ``<body>`` of a document.

    Feed markup with ``feed()``, call ``close()``, then use ``str(parser)``
    to get the collected text, one stripped text chunk per line.

    Attributes set by ``__init__``:
      in_body -- True while the parser is between ``<body>`` and ``</body>``
      text    -- list of non-empty, whitespace-stripped text chunks
    """

    # elements whose character data is code/styling rather than readable text
    IGNORED_TAGS = ('script', 'style')

    def __init__(self):
        # explicit base-class call (rather than super()) keeps this working
        # on both the Python 2 and Python 3 parser classes
        HTMLParser.__init__(self)
        self.in_body = False
        self.in_ignored = False
        self.text = []

    def handle_starttag(self, tag, attrs):
        """Track entry into ``<body>`` and into ignored elements."""
        if tag == 'body':
            self.in_body = True
        elif tag in self.IGNORED_TAGS:
            self.in_ignored = True

    def handle_endtag(self, tag):
        """Track exit from ``<body>`` and from ignored elements."""
        if tag == 'body':
            # without this reset, anything after </body> would be collected
            self.in_body = False
        elif tag in self.IGNORED_TAGS:
            self.in_ignored = False

    def handle_data(self, data):
        """Store *data* if it is non-blank body text outside ignored tags."""
        if not self.in_body or self.in_ignored:
            return
        data = data.strip()
        if data:
            self.text.append(data)

    def __str__(self):
        """Return the collected text chunks joined by newlines."""
        return '\n'.join(self.text)


def main(args=sys.argv[1:]):
    """Command-line entry point: read HTML, print its body text.

    args -- list of command-line arguments (without the program name);
            a single optional positional names the input file, and stdin
            is read when it is absent.
    """

    # parse command line
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', nargs='?',
                        type=argparse.FileType('r'), default=sys.stdin,
                        help='input file, or read from stdin if omitted')
    options = parser.parse_args(args)

    # read the HTML, releasing the file handle afterwards; stdin is left
    # open because this script does not own it
    try:
        html = options.input.read()
    finally:
        if options.input is not sys.stdin:
            options.input.close()

    # parse HTML
    html_parser = HTML2Text()
    html_parser.feed(html)
    html_parser.close()

    # output it
    print(html_parser)


if __name__ == '__main__':
    main()