45
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 split paragraphs, sentences, etc
|
|
5 """
|
|
6
|
|
7 # imports
|
|
8 import argparse
|
46
|
9 import re
|
|
10 import string
|
45
|
11 import sys
|
|
12
|
46
|
13
|
50
|
def findall(_string, sub):
    """Find all non-overlapping occurrences of `sub` in `_string`.

    The scan runs left to right; after each match the search resumes just
    past the end of that match, so overlapping matches are not reported
    (e.g. 'aa' in 'aaaa' is found at 0 and 2 only).

    Args:
        _string: the string to search.
        sub: the substring to look for.

    Returns:
        A list with the starting index of every match, in ascending order;
        empty when there is no match. An empty `sub` matches at every
        position from 0 through len(_string), mirroring `str.count`.
    """

    # Always advance by at least one character: with an empty `sub` the
    # match has zero length and the search would otherwise never move on.
    step = max(len(sub), 1)

    retval = []
    index = _string.find(sub)
    while index != -1:
        retval.append(index)
        index = _string.find(sub, index + step)
    return retval
|
|
26
|
50
|
def findindices(_string, values):
    """Locate every occurrence of each of `values` in `_string`.

    Args:
        _string: the string to search.
        values: an iterable of substrings to look for; a plain string is
            treated as an iterable of its single characters.

    Returns:
        An ordered list of 2-tuples `(index, value)`, sorted by index
        (ties broken by value). Matches of a given value do not overlap
        each other. The list is empty when nothing is found.
    """

    pairs = []
    for value in values:
        # escape so punctuation such as '.' or '?' is matched literally
        pattern = re.escape(value)
        pairs.extend((match.start(), value)
                     for match in re.finditer(pattern, _string))
    return sorted(pairs)
|
|
32
|
|
def split_sentences(text, ends='.?!'):
    """Split a text into sentences.

    NOTE: this is a placeholder without an implementation; whatever is
    passed in, the call returns None.

    Args:
        text: the text to split.
        ends: presumably the characters that terminate a sentence
            (defaults to '.', '?' and '!') -- TODO confirm once this
            function is implemented.
    """
    return None
|
46
|
35
|
45
|
def split_paragraphs(text):
    """Split a text into paragraphs separated by blank lines.

    Each line is stripped of surrounding whitespace; the lines of one
    paragraph are joined with single spaces. Lines that are empty (or
    contain only whitespace) act as paragraph separators, and any number
    of consecutive separators counts as one.

    Args:
        text: the text to split.

    Returns:
        A list of paragraph strings in order of appearance; an empty list
        when `text` contains no non-blank line.
    """

    paragraphs = []
    current = []  # stripped lines of the paragraph being collected
    for line in text.strip().splitlines():
        line = line.strip()
        if line:
            current.append(line)
        elif current:
            # blank line: close the paragraph collected so far
            paragraphs.append(' '.join(current))
            current = []
    if current:
        paragraphs.append(' '.join(current))
    return paragraphs
|
|
44
|
|
def main(args=None):
    """CLI: read a text and locate its sentence-ending punctuation.

    Args:
        args: list of command-line arguments, without the program name.
            When None (the default), argparse reads `sys.argv[1:]` at call
            time rather than at import time.
    """

    # parse command line arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    options = parser.parse_args(args)

    # read the input; close a file that argparse opened, but leave stdin alone
    try:
        raw = options.file.read()
    finally:
        if options.file is not sys.stdin:
            options.file.close()

    # preprocess text: collapse every run of whitespace (newlines included)
    # into a single space
    # TODO: split_paragraphs() relies on blank lines, so it would have to
    # run on the raw text, before this normalization.
    text = ' '.join(raw.split())

    ends = '.?!'

    # find all ending punctuation
    # NOTE: the result is not used or printed yet.
    indices = {end: findall(text, end) for end in ends}
|
|
62
|
45
|
63
|
|
if __name__ == '__main__':
    # run the CLI only when executed as a script, not when imported
    main()
|