Mercurial > hg > config
comparison python/html2text.py @ 770:cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Mon, 07 Mar 2016 12:22:04 -0800 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
769:489204193cd7 | 770:cb1b91c6bceb |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 """ | |
5 convert HTML to text using only HTMLParser | |
6 """ | |
7 | |
8 # imports | |
9 import argparse | |
10 import sys | |
11 from HTMLParser import HTMLParser | |
12 | |
class HTML2Text(HTMLParser):
    """
    Collect the text found inside the <body> of an HTML document.

    Feed markup with `feed()`; `str(instance)` returns the collected
    text fragments, each stripped of surrounding whitespace, joined by
    newlines.  Data seen before the opening <body> tag and the contents
    of <script> and <style> elements are not collected.
    """

    # elements whose content is code or styling rather than readable text
    ignored_tags = ('script', 'style')

    def __init__(self):
        # explicit base-class call, kept as in the original
        HTMLParser.__init__(self)
        self.in_body = False  # set once the opening <body> tag is seen
        self.text = []        # collected, stripped text fragments
        self.ignoring = None  # name of the ignored element we are inside

    def handle_starttag(self, tag, attrs):
        """Note entry into <body> and into ignored elements."""
        if tag == 'body':
            self.in_body = True
        elif tag in self.ignored_tags and self.ignoring is None:
            self.ignoring = tag

    def handle_endtag(self, tag):
        """Stop ignoring data once the ignored element is closed."""
        if tag == self.ignoring:
            self.ignoring = None

    def handle_data(self, data):
        """Store non-blank text that is in the body and not ignored."""
        if not self.in_body or self.ignoring is not None:
            return
        data = data.strip()
        if data:
            self.text.append(data)

    def __str__(self):
        """Return the collected fragments, one per line."""
        return '\n'.join(self.text)
32 | |
def main(args=sys.argv[1:]):
    """
    Command-line entry point: read HTML and print its body text.

    args -- command-line arguments to parse; defaults to the arguments
            the process was started with.  The optional positional
            argument names the input file; without it, stdin is read.
    """

    # parse command line
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', nargs='?',
                        type=argparse.FileType('r'), default=sys.stdin,
                        help='input file, or read from stdin if omitted')
    options = parser.parse_args(args)

    # read the input; close a file opened by argparse, but leave stdin open
    try:
        html = options.input.read()
    finally:
        if options.input is not sys.stdin:
            options.input.close()

    # parse HTML
    html_parser = HTML2Text()
    html_parser.feed(html)
    html_parser.close()

    # output it
    print(html_parser)


if __name__ == '__main__':
    main()