45
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 split paragraphs, sentences, etc
|
|
5 """
|
|
6
|
|
7 # imports
|
|
8 import argparse
|
46
|
9 import re
|
|
10 import string
|
45
|
11 import sys
|
|
12
|
46
|
13
|
|
def findall(sub, _string):
    """Return the positions of every occurrence of `sub` in `_string`.

    Positions are collected in ascending order.  The search resumes one
    position after each hit, so overlapping occurrences are all reported.
    An empty list is returned when `sub` does not occur at all.
    """

    positions = []
    start = 0
    try:
        while True:
            hit = _string.index(sub, start)
            positions.append(hit)
            # step just past this hit so the next lookup moves forward
            start = hit + 1
    except ValueError:
        # `index` raises once nothing further can be found
        return positions
|
|
26
|
|
27
|
45
|
def split_paragraphs(text):
    """Split `text` into paragraphs separated by blank lines.

    Each line is stripped of surrounding whitespace; consecutive non-blank
    lines are joined with a single space to form one paragraph.  A line that
    is empty (or whitespace only) ends the current paragraph.

    Parameters:
        text: the string to split.

    Returns:
        A list of paragraph strings, in order.  Runs of blank lines never
        produce empty paragraphs, and empty input yields an empty list.
    """

    paragraphs = []
    current = []  # stripped lines of the paragraph being accumulated
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            current.append(stripped)
        elif current:
            # blank line: close the paragraph collected so far
            paragraphs.append(' '.join(current))
            current = []
    if current:
        # the final paragraph has no blank line after it
        paragraphs.append(' '.join(current))
    return paragraphs
|
|
36
|
|
def main(args=sys.argv[1:]):
    """CLI: read text from a file or stdin and locate ending punctuation.

    Parameters:
        args: list of command line arguments to parse.  The default is the
            slice of `sys.argv` taken when this function is defined.
    """

    # parse command line arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    options = parser.parse_args(args)

    # read the whole input, then release the handle (stdin is left open)
    try:
        text = options.file.read()
    finally:
        if options.file is not sys.stdin:
            options.file.close()

    # preprocess text: collapse every run of whitespace into one space
    # (`split()` with no argument already discards leading/trailing whitespace)
    text = ' '.join(text.split())

    # NOTE(review): paragraph splitting is not wired in.  It would have to run
    # before the whitespace collapse above, which removes all line breaks.
    # paragraphs = split_paragraphs(text)

    ends = '.?!'

    # find all ending punctuation; findall's signature is (sub, _string),
    # so the punctuation mark goes first and the text second
    indices = {end: findall(end, text) for end in ends}
    # NOTE(review): `indices` is not consumed yet -- sentence splitting
    # appears to be unfinished.
|
|
54
|
45
|
55
|
|
# run the command line interface only when executed as a script
if __name__ == "__main__":
    main()
|