Mercurial > hg > config
annotate python/html2text.py @ 895:8d3df8c0c730
wordstream is a requirement
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Fri, 13 Aug 2021 15:16:21 -0700 |
parents | cb1b91c6bceb |
children |
rev | line source |
---|---|
770
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
3 |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
4 """ |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
5 convert HTML to text using only HTMLParser |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
6 """ |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
7 |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
8 # imports |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
9 import argparse |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
10 import sys |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
11 from HTMLParser import HTMLParser |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
12 |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
class HTML2Text(HTMLParser):
    """
    Collect the readable text found inside the <body> of an HTML document.

    Feed markup with `feed()`, then call `str()` on the parser: each
    non-empty run of character data (stripped of surrounding whitespace)
    becomes one line of the result.  Data seen before the opening <body>
    tag is ignored, as is the content of <script> and <style> elements.
    """

    # elements whose character data is code or styling, not readable text
    _skip_tags = ('script', 'style')

    def __init__(self):
        # explicit base-class call rather than super(): the Python 2
        # HTMLParser this file imports is an old-style class
        HTMLParser.__init__(self)
        self.in_body = False  # becomes True once a <body> start tag is seen
        self._skip_depth = 0  # number of currently open script/style elements
        self.text = []        # collected text chunks, in document order

    def handle_starttag(self, tag, attrs):
        """Note entry into <body> and into elements whose data is skipped."""
        if tag == 'body':
            self.in_body = True
        elif tag in self._skip_tags:
            self._skip_depth += 1

    def handle_endtag(self, tag):
        """Leave a skipped element; a stray end tag never goes below zero."""
        if tag in self._skip_tags and self._skip_depth:
            self._skip_depth -= 1

    def handle_data(self, data):
        """Record stripped, non-empty body text outside script/style."""
        if not self.in_body or self._skip_depth:
            return
        data = data.strip()
        if data:
            self.text.append(data)

    def __str__(self):
        """Return the collected chunks joined by newlines."""
        return '\n'.join(self.text)
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
32 |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def main(args=None):
    """
    Command-line entry point: read HTML and print the text of its <body>.

    args -- list of command-line arguments, without the program name.
            When None (the default) argparse uses the current
            `sys.argv[1:]`, looked up at call time rather than frozen
            when this module was imported.
    """

    # parse command line
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', nargs='?',
                        type=argparse.FileType('r'), default=sys.stdin,
                        help='input file, or read from stdin if ommitted')
    options = parser.parse_args(args)

    # read the HTML; close a file argparse opened for us, but leave
    # stdin alone since this function does not own it
    try:
        html = options.input.read()
    finally:
        if options.input is not sys.stdin:
            options.input.close()

    # parse HTML
    html_parser = HTML2Text()
    html_parser.feed(html)
    html_parser.close()

    # output it
    print(html_parser)
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
50 |
cb1b91c6bceb
example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# run the converter only when executed as a script, not when imported
if __name__ == '__main__':
    main()