annotate python/html2text.py @ 916:82763d37ab09

add web2 convenience script
author Jeff Hammel <k0scist@gmail.com>
date Thu, 30 May 2024 18:56:59 -0700
parents cb1b91c6bceb
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
770
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2 # -*- coding: utf-8 -*-
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4 """
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 convert HTML to text using only HTMLParser
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6 """
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 # imports
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
9 import argparse
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
10 import sys
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
11 from HTMLParser import HTMLParser
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
12
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
13 class HTML2Text(HTMLParser):
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
14
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
15 def __init__(self):
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
16 HTMLParser.__init__(self)
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
17 self.in_body = False
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
18 self.text = []
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
19
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
20 def handle_starttag(self, tag, attrs):
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
21 if tag == 'body':
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
22 self.in_body = True
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
23
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
24 def handle_data(self, data):
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
25 if self.in_body:
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
26 data = data.strip()
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
27 if data:
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
28 self.text.append(data)
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
29
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
30 def __str__(self):
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
31 return '\n'.join(self.text)
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
32
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
33 def main(args=sys.argv[1:]):
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
34
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
35 # parse command line
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
36 parser = argparse.ArgumentParser(description=__doc__)
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
37 parser.add_argument('input', nargs='?',
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
38 type=argparse.FileType('r'), default=sys.stdin,
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
39 help='input file, or read from stdin if omitted')
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
40 options = parser.parse_args(args)
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
41
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
42 # parse HTML
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
43 html = options.input.read()
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
44 html_parser = HTML2Text()
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
45 html_parser.feed(html)
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
46 html_parser.close()
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
47
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
48 # output it
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
49 print (html_parser)
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
50
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
51 if __name__ == '__main__':
cb1b91c6bceb example program for HTML -> text conversion using only HTMLParser
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
52 main()