comparison python/html2text.py @ 770:cb1b91c6bceb

example program for HTML -> text conversion using only HTMLParser
author Jeff Hammel <k0scist@gmail.com>
date Mon, 07 Mar 2016 12:22:04 -0800
parents
children
comparison
equal deleted inserted replaced
769:489204193cd7 770:cb1b91c6bceb
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 """
5 convert HTML to text using only HTMLParser
6 """
7
8 # imports
9 import argparse
10 import sys
11 from HTMLParser import HTMLParser
12
class HTML2Text(HTMLParser):
    """Collect the readable text found inside an HTML document's <body>.

    Feed markup with ``feed()``, call ``close()``, then use ``str(parser)``
    to get the collected text, one chunk of character data per line.

    Only data seen after a ``<body>`` start tag is kept, so markup that
    has no ``<body>`` tag yields an empty string.  The contents of
    ``<script>`` and ``<style>`` elements are skipped: the parser hands
    them to ``handle_data`` verbatim, but they are code, not text.
    """

    # elements whose character data is code or styling, not readable text
    ignored_tags = ('script', 'style')

    def __init__(self):
        HTMLParser.__init__(self)
        self.in_body = False     # becomes True once <body> has been seen
        self.text = []           # stripped, non-empty chunks of body text
        self._ignored_depth = 0  # > 0 while inside a <script>/<style>

    def handle_starttag(self, tag, attrs):
        """Note entry into <body> and into elements whose data is ignored."""
        if tag == 'body':
            self.in_body = True
        elif tag in self.ignored_tags:
            self._ignored_depth += 1

    def handle_endtag(self, tag):
        """Note exit from an element whose data is ignored."""
        # guard against a stray end tag driving the counter negative
        if tag in self.ignored_tags and self._ignored_depth > 0:
            self._ignored_depth -= 1

    def handle_data(self, data):
        """Keep stripped, non-empty character data that is visible body text."""
        if not self.in_body or self._ignored_depth:
            return
        data = data.strip()
        if data:
            self.text.append(data)

    def __str__(self):
        """Return the collected text chunks joined by newlines."""
        return '\n'.join(self.text)
32
def main(args=sys.argv[1:]):
    """Command-line entry point: read HTML and print the text of its body.

    ``args`` is the list of command-line arguments, without the program
    name.  The HTML is read from the optional ``input`` file argument, or
    from standard input when it is not given.  The text extracted by
    ``HTML2Text`` is printed to standard output.
    """

    # parse command line
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', nargs='?',
                        type=argparse.FileType('r'), default=sys.stdin,
                        help='input file, or read from stdin if omitted')
    options = parser.parse_args(args)

    # read the whole document, then release the file argparse opened;
    # stdin is left open since this function does not own it
    try:
        html = options.input.read()
    finally:
        if options.input is not sys.stdin:
            options.input.close()

    # parse HTML
    html_parser = HTML2Text()
    html_parser.feed(html)
    html_parser.close()

    # output it
    print(html_parser)
50
# run the converter only when executed as a script, not when imported
if __name__ == '__main__':
    main()