Mercurial > hg > TextShaper
annotate textshaper/split.py @ 57:a2bbe406f570 default tip
which is no longer maintained; roll our mediocre own
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Mon, 20 Feb 2017 10:40:34 -0800 |
parents | 4e2190495d50 |
children |
rev | line source |
---|---|
45 | 1 #!/usr/bin/env python |
2 | |
3 """ | |
4 split paragraphs, sentences, etc | |
5 """ | |
6 | |
7 # imports | |
8 import argparse | |
54 | 9 import csv |
46 | 10 import string |
45 | 11 import sys |
12 | |
46 | 13 |
def findall(_string, sub):
    """
    find all non-overlapping occurrences of `sub` in `_string`

    _string -- the string to search
    sub -- the substring to look for

    Returns a list of the start indices of each match, in ascending
    order.  The search resumes after the end of each match, so matches
    never overlap.  An empty `sub` yields an empty list.
    """

    retval = []
    if not sub:
        # an empty substring matches everywhere without ever advancing
        # the search position, which would otherwise loop forever
        return retval

    index = _string.find(sub)
    while index != -1:
        retval.append(index)
        # continue after this match so that matches do not overlap
        index = _string.find(sub, index + len(sub))
    return retval
26 | |
def indices(text, values):
    """
    locate every occurrence of each of `values` in `text`

    Returns a list of 2-tuples, (index, value), ordered by index,
    where `index` is a position at which `value` was found by
    `findall`.
    """
    # map each distinct value to the positions where it occurs
    locations = {value: findall(text, value) for value in values}

    pairs = []
    for value, positions in locations.items():
        pairs.extend((position, value) for position in positions)

    # order by position only; the sort is stable for equal positions
    pairs.sort(key=lambda pair: pair[0])
    return pairs
50 | 37 |
def split_sentences(text, ends='.?!'):
    """
    split a text into sentences

    text -- the text to split; surrounding whitespace is ignored
    ends -- the sentence terminators; iterated over, so a string
            contributes each of its characters as a terminator

    Returns a list of sentences with surrounding whitespace removed.
    Each sentence keeps its terminator, and a run of directly adjacent
    terminators (e.g. "..." or "?!") stays attached to the sentence it
    ends instead of producing punctuation-only sentences.  Any text
    following the last terminator is returned as a final item.
    """

    text = text.strip()
    sentences = []

    begin = 0  # start of the text not yet assigned to a sentence
    for index, value in indices(text, ends):
        if index < begin:
            # this match starts inside text that was already consumed
            # (only possible when terminators overlap one another)
            continue
        if index == begin and sentences:
            # the terminator directly follows the previous one:
            # it belongs to the sentence that was just closed
            sentences[-1] += value
        else:
            sentence = (text[begin:index] + value).strip()
            if sentence:
                sentences.append(sentence)
        begin = index + len(value)

    # add the trailing bits, if they exist
    trailing = text[begin:].strip()
    if trailing:
        sentences.append(trailing)
    return sentences
60 | |
52
8d8c1ac0e8e1
add a test text and wire some things up
Jeff Hammel <k0scist@gmail.com>
parents:
51
diff
changeset
|
61 |
def split_paragraphs(text):
    """
    split a text into paragraphs

    Paragraphs are separated by one or more blank (whitespace-only)
    lines.  The lines within a paragraph are stripped and joined with
    a single space.

    Returns a list of paragraph strings; the list is empty when `text`
    contains nothing but whitespace.
    """

    paragraphs = []
    current = []  # stripped lines of the paragraph being collected
    for line in text.strip().splitlines():
        line = line.strip()
        if line:
            current.append(line)
        elif current:
            # a blank line closes the current paragraph; further
            # consecutive blank lines are ignored
            paragraphs.append(' '.join(current))
            current = []
    if current:
        paragraphs.append(' '.join(current))
    return paragraphs
70 | |
def words(text):
    """
    return the words in a sentence

    A word is a whitespace-delimited token that contains at least one
    ASCII letter; tokens made up solely of digits and/or punctuation
    are dropped.  Punctuation attached to a word is kept.
    """
    # `string.ascii_letters` exists on both Python 2 and Python 3,
    # unlike the locale-dependent, Python 2 only `string.letters`
    letters = set(string.ascii_letters)
    return [token for token in text.strip().split()
            if letters.intersection(token)]
54 | 76 |
def main(args=None):
    """
    CLI: split the text of a file (or stdin) into sentences

    args -- list of command line arguments; defaults to `sys.argv[1:]`
            as seen at call time

    Writes one sentence per line, or CSV rows when `--number` and/or
    `--count` is given.  The CSV columns are, in order: the sentence
    number (with `--number`), the word count (with `--count`) and the
    sentence itself.
    """

    if args is None:
        # resolved at call time rather than once at import time
        args = sys.argv[1:]

    # parse command line arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('-n', '--number', dest='number',
                        action='store_true', default=False,
                        help="number the sentences (CSV)")
    parser.add_argument('-c', '--count', dest='count',
                        action='store_true', default=False,
                        help="count the words in each sentence (CSV)")
    parser.add_argument('-o', '--output', dest='output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="file to output to, or stdout by default")
    options = parser.parse_args(args)

    try:
        # preprocess text: collapse every run of whitespace
        # (including newlines) into a single space
        try:
            text = options.file.read().strip()
        finally:
            if options.file is not sys.stdin:
                options.file.close()
        text = ' '.join(text.split())

        # find all sentences
        ends = '.?!'
        sentences = split_sentences(text, ends)

        # display
        if options.number or options.count:
            writer = csv.writer(options.output)
            for index, sentence in enumerate(sentences, 1):
                row = []
                if options.number:
                    row.append(index)
                if options.count:
                    row.append(len(words(sentence)))
                row.append(sentence)
                writer.writerow(row)
        else:
            for sentence in sentences:
                options.output.write(sentence + '\n')
    finally:
        # argparse opened the output file; make sure it is flushed and
        # closed, but leave the process-wide stdout alone
        if options.output is not sys.stdout:
            options.output.close()
45 | 118 |
# run the command line interface when executed as a script
if __name__ == '__main__':
    main()