Mercurial > hg > TextShaper
comparison textshaper/split.py @ 46:7e63ca061b6c
start findall function
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sat, 16 May 2015 18:53:53 -0700 |
parents | ccbdc00d4f0a |
children | 03ce88daa98d |
comparison
equal
deleted
inserted
replaced
45:ccbdc00d4f0a | 46:7e63ca061b6c |
---|---|
4 split paragraphs, sentences, etc | 4 split paragraphs, sentences, etc |
5 """ | 5 """ |
6 | 6 |
7 # imports | 7 # imports |
8 import argparse | 8 import argparse |
9 import re | |
10 import string | |
9 import sys | 11 import sys |
12 | |
13 | |
def findall(sub, _string):
    """Find all occurrences of `sub` in `_string`.

    Returns a list of the starting indices, in ascending order, at which
    `sub` is found. Overlapping occurrences are included, because the
    search resumes one position after each hit rather than after the end
    of the match. An empty list is returned when there is no occurrence.

    `_string` only needs an `index(sub, start)` method that raises
    `ValueError` when nothing is found.
    """

    retval = []
    start = 0
    while True:
        # keep the `try` limited to the one call that can raise ValueError
        try:
            found = _string.index(sub, start)
        except ValueError:
            # no further occurrence at or after `start`: search is complete
            return retval
        retval.append(found)
        # step a single position forward so overlapping matches are found
        start = found + 1
26 | |
10 | 27 |
11 def split_paragraphs(text): | 28 def split_paragraphs(text): |
12 | 29 |
13 lines = [line.strip() for line in text.strip().splitlines()] | 30 lines = [line.strip() for line in text.strip().splitlines()] |
14 lines = [line if line else '\n' | 31 lines = [line if line else '\n' |
23 # parse command line arguments | 40 # parse command line arguments |
24 parser = argparse.ArgumentParser(description=__doc__) | 41 parser = argparse.ArgumentParser(description=__doc__) |
25 parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) | 42 parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) |
26 options = parser.parse_args(args) | 43 options = parser.parse_args(args) |
27 | 44 |
45 # preprocess text | |
28 text = options.file.read().strip() | 46 text = options.file.read().strip() |
29 text = ' '.join(text.split()) | 47 text = ' '.join(text.split()) |
30 # paragraphs = split_paragraphs(text) | 48 # paragraphs = split_paragraphs(text) |
31 | 49 |
32 punctuation = ('.',) | 50 ends = '.?!' |
33 | 51 |
52 for end in ends: | |
34 # for paragraph in paragraphs: | 53 # for paragraph in paragraphs: |
35 # print (paragraph) | 54 # print (paragraph) |
36 | 55 |
37 if __name__ == '__main__': | 56 if __name__ == '__main__': |
38 main() | 57 main() |