Mercurial > hg > config
changeset 770:cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Mon, 07 Mar 2016 12:22:04 -0800 |
parents | 489204193cd7 |
children | c1f314c518c9 |
files | python/html2text.py |
diffstat | 1 files changed, 52 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
convert HTML to text using only HTMLParser
"""

# imports
import argparse
import sys

try:
    # Python 3 location of the parser
    from html.parser import HTMLParser
except ImportError:
    # Python 2 fallback
    from HTMLParser import HTMLParser


class HTML2Text(HTMLParser):
    """
    Collect the text found inside the <body> of an HTML document.

    Feed markup with `feed()`, finish with `close()`, then call `str()`
    on the instance to get the collected text, one chunk per line.
    Text seen before the opening <body> tag is ignored, as is the
    content of <script> and <style> elements.
    """

    # elements whose content is code or styling rather than readable text
    skipped_tags = ('script', 'style')

    def __init__(self):
        # explicit base-class call (not super()) so this also works when
        # HTMLParser is an old-style class, as it is under Python 2
        HTMLParser.__init__(self)
        self.in_body = False  # becomes True once <body> has been opened
        self.text = []        # stripped, non-empty text chunks in order
        self._skip_depth = 0  # > 0 while inside a <script>/<style> element

    def handle_starttag(self, tag, attrs):
        """Note entry into <body> and into elements that must be skipped."""
        if tag == 'body':
            self.in_body = True
        elif tag in self.skipped_tags:
            self._skip_depth += 1

    def handle_endtag(self, tag):
        """Note exit from a skipped element; stray end tags are ignored."""
        if tag in self.skipped_tags and self._skip_depth:
            self._skip_depth -= 1

    def handle_data(self, data):
        """Store `data`, stripped, if it is non-blank body text."""
        if not self.in_body or self._skip_depth:
            return
        data = data.strip()
        if data:
            self.text.append(data)

    def __str__(self):
        """Return the collected text chunks joined by newlines."""
        return '\n'.join(self.text)


def main(args=sys.argv[1:]):
    """
    Command line entry point: read HTML and print its body text.

    `args` is the list of command line arguments (without the program
    name); the single optional argument is the path of the input file,
    and stdin is read when it is not given.
    """

    # parse command line
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', nargs='?',
                        type=argparse.FileType('r'), default=sys.stdin,
                        help='input file, or read from stdin if omitted')
    options = parser.parse_args(args)

    # read the input, closing the file we opened (but never stdin)
    try:
        html = options.input.read()
    finally:
        if options.input is not sys.stdin:
            options.input.close()

    # parse HTML
    html_parser = HTML2Text()
    html_parser.feed(html)
    html_parser.close()

    # output it
    print(html_parser)


if __name__ == '__main__':
    main()