Mercurial > hg > TextShaper
annotate textshaper/split.py @ 52:8d8c1ac0e8e1
add a test text and wire some things up
| author | Jeff Hammel <k0scist@gmail.com> |
|---|---|
| date | Sun, 17 May 2015 08:48:56 -0700 |
| parents | c3b69728f291 |
| children | 1d755747e67a |
| rev | line source |
|---|---|
| 45 | 1 #!/usr/bin/env python |
| 2 | |
| 3 """ | |
| 4 split paragraphs, sentences, etc | |
| 5 """ | |
| 6 | |
| 7 # imports | |
| 8 import argparse | |
| 46 | 9 import re |
| 10 import string | |
| 45 | 11 import sys |
| 12 | |
| 46 | 13 |
def findall(_string, sub):
    """
    find all non-overlapping occurrences of `sub` in `_string`

    returns a list of the starting indices, in ascending order;
    the list is empty when `sub` does not occur at all.
    An empty `sub` matches at every position (0 through len(_string)),
    which agrees with how `str.count` treats the empty string.
    """

    retval = []
    # always advance by at least one character: with an empty `sub`,
    # `str.index` keeps matching at the same position and the loop
    # would otherwise never terminate
    step = max(len(sub), 1)
    index = 0
    while True:
        try:
            index = _string.index(sub, index)
        except ValueError:
            # no further occurrences
            return retval
        retval.append(index)
        index += step
| 26 | |
def indices(text, values):
    """
    locate every occurrence in `text` of each item of `values`

    returns a list of (index, value) 2-tuples ordered by index
    """
    # one findall() pass per distinct value
    positions_by_value = {needle: findall(text, needle) for needle in values}
    pairs = []
    for needle, positions in positions_by_value.items():
        for position in positions:
            pairs.append((position, needle))
    # list.sort is stable, so entries sharing an index keep their
    # relative order
    pairs.sort(key=lambda pair: pair[0])
    return pairs
| 50 | 37 |
def split_sentences(text, ends='.?!'):
    """
    split a text into sentences

    `text` is the string to split.
    `ends` is an iterable of sentence terminators (by default the
    characters '.', '?' and '!').

    returns a list of sentences, each stripped of surrounding whitespace
    and including its terminator(s).  A run of consecutive terminators
    ("?!", "...") stays attached to the sentence it closes, and trailing
    text that has no terminator is returned as a final sentence.
    """

    text = text.strip()
    sentences = []

    begin = 0
    for index, value in indices(text, ends):
        if index < begin:
            # overlaps a terminator that was already consumed
            continue
        body = text[begin:index].strip()
        # resume *after* the terminator so that it does not also
        # lead the next sentence
        begin = index + len(value)
        if body:
            sentences.append(body + value)
        elif sentences:
            # nothing but whitespace since the previous terminator:
            # this one belongs to the same sentence
            sentences[-1] += value
        else:
            # the text opens with a terminator
            sentences.append(value)

    # keep whatever follows the last terminator
    remainder = text[begin:].strip()
    if remainder:
        sentences.append(remainder)
    return sentences
|
8d8c1ac0e8e1
add a test text and wire some things up
Jeff Hammel <k0scist@gmail.com>
parents:
51
diff
changeset
|
54 |
def split_paragraphs(text):
    """
    split a text into paragraphs

    A paragraph is a run of consecutive non-blank lines; paragraphs are
    separated by one or more blank (empty or whitespace-only) lines.
    Each line is stripped and the lines of a paragraph are joined with
    a single space.

    returns a list of paragraph strings (empty when `text` holds no
    non-blank line)
    """

    paragraphs = []
    current = []  # stripped lines of the paragraph being collected
    for line in text.splitlines():
        line = line.strip()
        if line:
            current.append(line)
        elif current:
            # a blank line closes the paragraph in progress; further
            # blank lines in the same run are ignored
            paragraphs.append(' '.join(current))
            current = []
    if current:
        # the text did not end with a blank line
        paragraphs.append(' '.join(current))
    return paragraphs
| 63 | |
def main(args=None):
    """
    CLI: read text from a file (or stdin) and print its sentences,
    one per line

    `args` is the list of command line arguments; when None (the
    default) argparse reads `sys.argv[1:]` at call time rather than
    using a copy frozen when the module was imported.
    """

    # parse command line arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('file', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin)
    options = parser.parse_args(args)

    # read the input and release the file handle promptly;
    # stdin is left open since it was not opened here
    try:
        text = options.file.read()
    finally:
        if options.file is not sys.stdin:
            options.file.close()

    # preprocess text: collapse every run of whitespace (newlines
    # included) into a single space
    text = ' '.join(text.split())

    # find all sentences
    ends = '.?!'
    sentences = split_sentences(text, ends)

    # display
    for sentence in sentences:
        print(sentence)
| 45 | 84 |
# run the command line interface when executed as a script
if __name__ == "__main__":
    main()
