annotate textshaper/split.py @ 50:1284c99a94fa

stubbing
author Jeff Hammel <k0scist@gmail.com>
date Sat, 16 May 2015 21:02:07 -0700
parents 03ce88daa98d
children c3b69728f291
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
45
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3 """
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4 split paragraphs, sentences, etc
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 """
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7 # imports
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 import argparse
46
7e63ca061b6c start findall function
Jeff Hammel <k0scist@gmail.com>
parents: 45
diff changeset
9 import re
7e63ca061b6c start findall function
Jeff Hammel <k0scist@gmail.com>
parents: 45
diff changeset
10 import string
45
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
11 import sys
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
12
46
7e63ca061b6c start findall function
Jeff Hammel <k0scist@gmail.com>
parents: 45
diff changeset
13
50
1284c99a94fa stubbing
Jeff Hammel <k0scist@gmail.com>
parents: 48
diff changeset
def findall(_string, sub):
    """
    find all non-overlapping occurrences of `sub` in `_string`

    Returns a list with the start index of every match, in ascending
    order; the list is empty when `sub` does not occur.  The search
    resumes just past the end of each match, so matches never overlap
    (e.g. 'aa' is found at [0, 2] in 'aaaa').

    An empty `sub` matches at every position, 0 through len(_string).
    """

    retval = []
    # always advance by at least one character: an empty `sub` matches
    # without consuming anything, so stepping by len(sub) == 0 would
    # find the same position forever
    step = len(sub) or 1
    index = _string.find(sub)
    while index != -1:
        retval.append(index)
        index = _string.find(sub, index + step)
    return retval
7e63ca061b6c start findall function
Jeff Hammel <k0scist@gmail.com>
parents: 45
diff changeset
26
50
1284c99a94fa stubbing
Jeff Hammel <k0scist@gmail.com>
parents: 48
diff changeset
def findindices(_string, values):
    """
    returns ordered list of 2-tuples:
    (index, value)

    `values` is an iterable of substrings to look for in `_string`
    (a plain string such as '.?!' works too: each character is then
    one value).  Every non-overlapping occurrence of every value is
    reported once, sorted by index (ties are ordered by value).
    Empty values are skipped, since they would match everywhere.
    """

    found = set()  # a set, so duplicate entries in `values` report once
    for value in values:
        if not value:
            continue
        index = _string.find(value)
        while index != -1:
            found.add((index, value))
            # resume after this match so occurrences do not overlap
            index = _string.find(value, index + len(value))
    return sorted(found)
1284c99a94fa stubbing
Jeff Hammel <k0scist@gmail.com>
parents: 48
diff changeset
32
1284c99a94fa stubbing
Jeff Hammel <k0scist@gmail.com>
parents: 48
diff changeset
def split_sentences(text, ends='.?!'):
    """
    split a text into sentences

    NOTE: this is a stub -- there is no body beyond this docstring, so
    the function currently returns None whatever its arguments are.
    `ends` is presumably the set of sentence-terminating characters
    (TODO: confirm once the function is implemented).
    """
46
7e63ca061b6c start findall function
Jeff Hammel <k0scist@gmail.com>
parents: 45
diff changeset
35
45
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def split_paragraphs(text):
    """
    split a text into paragraphs

    Paragraphs are runs of non-blank lines separated by one or more
    blank (empty or whitespace-only) lines.  Each line is stripped and
    the lines of a paragraph are joined with single spaces.

    Returns a list of paragraph strings; the list is empty when `text`
    contains no non-blank line.
    """

    paragraphs = []
    current = []  # stripped lines of the paragraph being collected
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if line:
            current.append(line)
        elif current:
            # a blank line ends the paragraph; further blank lines are
            # ignored rather than producing empty paragraphs
            paragraphs.append(' '.join(current))
            current = []
    if current:
        paragraphs.append(' '.join(current))
    return paragraphs
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
44
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def main(args=sys.argv[1:]):
    """
    CLI: read text from a file (or stdin) and locate sentence endings

    `args` is the list of command line arguments, without the program
    name.

    Work in progress: the indices of the ending punctuation are computed
    but nothing is done with them yet, so the command prints nothing.
    """

    # parse command line arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    options = parser.parse_args(args)

    # preprocess text
    try:
        text = options.file.read().strip()
    finally:
        # argparse.FileType leaves the handle open; release it once the
        # text is read, but never close the process's own stdin
        if options.file is not sys.stdin:
            options.file.close()
    # collapse every run of whitespace (newlines included) to one space
    text = ' '.join(text.split())
    # paragraphs = split_paragraphs(text)

    ends = '.?!'

    # find all ending punctuation:
    # maps each ending character to the indices where it occurs in `text`
    # TODO: `indices` is not used yet
    indices = {end: findall(text, end) for end in ends}
03ce88daa98d start test
Jeff Hammel <k0scist@gmail.com>
parents: 46
diff changeset
62
45
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
63
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
# run the CLI only when this file is executed as a script, not on import
if __name__ == '__main__':
    main()