Mercurial > hg > TextShaper
comparison textshaper/split.py @ 52:8d8c1ac0e8e1
add a test text and wire some things up
| author | Jeff Hammel <k0scist@gmail.com> |
|---|---|
| date | Sun, 17 May 2015 08:48:56 -0700 |
| parents | c3b69728f291 |
| children | 1d755747e67a |
comparison
legend: equal, deleted, inserted, replaced
| 51:c3b69728f291 | 52:8d8c1ac0e8e1 |
|---|---|
| 36 return sorted(indices, key=lambda x: x[0]) | 36 return sorted(indices, key=lambda x: x[0]) |
| 37 | 37 |
| 38 def split_sentences(text, ends='.?!'): | 38 def split_sentences(text, ends='.?!'): |
| 39 """split a text into sentences""" | 39 """split a text into sentences""" |
| 40 | 40 |
| | 41 text = text.strip() |
| | 42 sentences = [] |
| | 43 _indices = indices(text, ends) |
| | 44 |
| | 45 begin = 0 |
| | 46 for index, value in _indices: |
| | 47 sentence = text[begin:index] |
| | 48 sentence += value |
| | 49 sentence.strip() |
| | 50 begin = index |
| | 51 if sentence: |
| | 52 sentences.append(sentence) |
| | 53 import pdb; pdb.set_trace() |
| | 54 |
| 41 def split_paragraphs(text): | 55 def split_paragraphs(text): |
| 42 | 56 |
| 43 lines = [line.strip() for line in text.strip().splitlines()] | 57 lines = [line.strip() for line in text.strip().splitlines()] |
| 44 lines = [line if line else '\n' | 58 lines = [line if line else '\n' |
| 45 for line in lines] | 59 for line in lines] |
| 58 # preprocess text | 72 # preprocess text |
| 59 text = options.file.read().strip() | 73 text = options.file.read().strip() |
| 60 text = ' '.join(text.split()) | 74 text = ' '.join(text.split()) |
| 61 # paragraphs = split_paragraphs(text) | 75 # paragraphs = split_paragraphs(text) |
| 62 | 76 |
| | 77 # find all sentences |
| 63 ends = '.?!' | 78 ends = '.?!' |
| | 79 sentences = split_sentences(text, ends) |
| 64 | 80 |
| 65 # find all ending punctuation | 81 # display |
| 66 | 82 for sentence in sentences: |
| 67 | 83 print (sentence) |
| 68 | 84 |
| 69 if __name__ == '__main__': | 85 if __name__ == '__main__': |
| 70 main() | 86 main() |
