annotate textshaper/split.py @ 55:4e2190495d50

this basically works
author Jeff Hammel <k0scist@gmail.com>
date Sun, 17 May 2015 17:14:47 -0700
parents 1d755747e67a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
45
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3 """
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4 split paragraphs, sentences, etc
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 """
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7 # imports
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 import argparse
54
1d755747e67a almost there
Jeff Hammel <k0scist@gmail.com>
parents: 52
diff changeset
9 import csv
46
7e63ca061b6c start findall function
Jeff Hammel <k0scist@gmail.com>
parents: 45
diff changeset
10 import string
45
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
11 import sys
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
12
46
7e63ca061b6c start findall function
Jeff Hammel <k0scist@gmail.com>
parents: 45
diff changeset
13
50
1284c99a94fa stubbing
Jeff Hammel <k0scist@gmail.com>
parents: 48
diff changeset
def findall(_string, sub):
    """
    find all non-overlapping occurrences of `sub` in `_string`

    _string -- the text to search
    sub -- the substring to look for

    returns a list of the starting index of each match, in ascending
    order; an empty `sub` matches nothing and yields an empty list
    """

    # an empty substring "matches" at every offset without ever advancing
    # the search position, which would otherwise loop forever
    if not sub:
        return []

    retval = []
    index = _string.find(sub)
    while index != -1:
        retval.append(index)
        # resume after this match so that matches never overlap
        index = _string.find(sub, index + len(sub))
    return retval
7e63ca061b6c start findall function
Jeff Hammel <k0scist@gmail.com>
parents: 45
diff changeset
26
51
c3b69728f291 finding indices now works
Jeff Hammel <k0scist@gmail.com>
parents: 50
diff changeset
def indices(text, values):
    """
    locate every occurrence of each of `values` within `text`

    returns ordered list of 2-tuples:
    (index, value)
    sorted by ascending index
    """
    # map each distinct value to the positions `findall` reports for it
    occurrences = {value: findall(text, value) for value in values}
    pairs = [(position, value)
             for value, positions in occurrences.items()
             for position in positions]
    # order by position only; the sort is stable for equal positions
    pairs.sort(key=lambda pair: pair[0])
    return pairs
50
1284c99a94fa stubbing
Jeff Hammel <k0scist@gmail.com>
parents: 48
diff changeset
37
1284c99a94fa stubbing
Jeff Hammel <k0scist@gmail.com>
parents: 48
diff changeset
def split_sentences(text, ends='.?!'):
    """
    split a text into sentences

    text -- the text to split
    ends -- the sentence terminators; iterated over, so a string is
            treated as a collection of single characters

    returns a list of stripped, non-empty sentences, each keeping the
    terminator that closed it; any text left after the last terminator
    is returned as a final sentence
    """

    text = text.strip()
    sentences = []

    begin = 0
    for index, value in indices(text, ends):
        # str.strip() returns a new string, so the result must be kept;
        # stripping before the emptiness check filters whitespace-only pieces
        sentence = (text[begin:index] + value).strip()
        begin = index + len(value)
        if sentence:
            sentences.append(sentence)

    # add the trailing bits, if they exist
    sentence = text[begin:].strip()
    if sentence:
        sentences.append(sentence)
    return sentences
1d755747e67a almost there
Jeff Hammel <k0scist@gmail.com>
parents: 52
diff changeset
60
52
8d8c1ac0e8e1 add a test text and wire some things up
Jeff Hammel <k0scist@gmail.com>
parents: 51
diff changeset
61
45
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def split_paragraphs(text):
    """
    split a text into paragraphs

    Paragraphs are runs of non-blank lines separated by one or more
    blank (empty or whitespace-only) lines.  The lines of a paragraph
    are stripped and joined with single spaces.

    returns a list of paragraph strings; an empty list for blank text
    """

    paragraphs = []
    current = []  # stripped lines of the paragraph being accumulated
    for line in text.splitlines():
        line = line.strip()
        if line:
            current.append(line)
        elif current:
            # a blank line closes the paragraph in progress
            paragraphs.append(' '.join(current))
            current = []
    # flush the final paragraph when the text does not end on a blank line
    if current:
        paragraphs.append(' '.join(current))
    return paragraphs
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
70
54
1d755747e67a almost there
Jeff Hammel <k0scist@gmail.com>
parents: 52
diff changeset
def words(text):
    """
    return the words in a sentence

    A word is a whitespace-delimited token containing at least one
    ASCII letter; tokens made up only of digits and/or punctuation
    are dropped.  Tokens are returned unmodified, in their original order.
    """
    # `string.ascii_letters` exists on both Python 2 and 3 and, unlike the
    # Python 2-only `string.letters`, does not vary with the locale
    letters = frozenset(string.ascii_letters)
    return [token for token in text.split()
            if not letters.isdisjoint(token)]
54
1d755747e67a almost there
Jeff Hammel <k0scist@gmail.com>
parents: 52
diff changeset
76
45
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
def main(args=sys.argv[1:]):
    """
    CLI: split a text into sentences

    args -- command line arguments to parse; the default is the value
            of `sys.argv[1:]` at the time this function was defined

    Reads the text from the given file (or stdin) and writes one
    sentence per line to the output.  With --number and/or --count the
    output is CSV instead, one row per sentence:
    [number,] [word count,] sentence
    """

    # parse command line arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('-n', '--number', dest='number',
                        action='store_true', default=False,
                        help="number the sentences (CSV)")
    parser.add_argument('-c', '--count', dest='count',
                        action='store_true', default=False,
                        help="count the words in each sentence (CSV)")
    parser.add_argument('-o', '--output', dest='output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="file to output to, or stdout by default")
    options = parser.parse_args(args)

    # preprocess text: collapse every run of whitespace (newlines
    # included) into a single space
    text = ' '.join(options.file.read().split())

    # find all sentences
    ends = '.?!'
    sentences = split_sentences(text, ends)

    # display: plain text unless a CSV column was requested
    if not (options.number or options.count):
        for sentence in sentences:
            options.output.write(sentence + '\n')
        return

    # each requested column is prepended in a fixed order, so --number
    # and --count may be used alone or together
    writer = csv.writer(options.output)
    for index, sentence in enumerate(sentences, 1):
        row = []
        if options.number:
            row.append(index)
        if options.count:
            row.append(len(words(sentence)))
        row.append(sentence)
        writer.writerow(row)
45
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
118
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
119 if __name__ == '__main__':
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
120 main()