Mercurial > hg > TextShaper
view textshaper/split.py @ 50:1284c99a94fa
stubbing
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sat, 16 May 2015 21:02:07 -0700 |
parents | 03ce88daa98d |
children | c3b69728f291 |
line wrap: on
line source
#!/usr/bin/env python

"""
split paragraphs, sentences, etc
"""

# imports
import argparse
import re
import string
import sys


def findall(_string, sub):
    """
    find all occurrences of `sub` in `_string`

    Scans left to right and returns the ascending list of start indices
    of the non-overlapping occurrences of `sub` (so searching 'aaaa' for
    'aa' gives [0, 2]).  Returns an empty list when there is no match.

    Raises ValueError if `sub` is empty: an empty substring matches at
    every position without ever advancing the search, so there is no
    meaningful finite answer.
    """
    if not sub:
        raise ValueError("cannot search for an empty substring")

    retval = []
    step = len(sub)
    index = _string.find(sub)
    while index != -1:
        retval.append(index)
        # resume after this match so occurrences never overlap
        index = _string.find(sub, index + step)
    return retval


def findindices(_string, values):
    """
    returns ordered list of 2-tuples:
    (index, value)

    `values` is an iterable of non-empty substrings (a plain string works
    as an iterable of single characters).  Every occurrence of every
    value in `_string` is reported, and the result is sorted by index.

    Raises ValueError if any value is empty (see `findall`).
    """
    retval = []
    for value in values:
        retval.extend((index, value) for index in findall(_string, value))
    retval.sort()
    return retval


def split_sentences(text, ends='.?!'):
    """
    split a text into sentences

    A sentence ends at a character contained in `ends`.  A run of
    consecutive ending characters ('...', '?!') is kept together and
    closes a single sentence.  Each sentence is returned with its ending
    punctuation and without surrounding whitespace; blank pieces are
    dropped.  Trailing text with no ending punctuation is returned as a
    final sentence.

    This is a purely character-based heuristic: abbreviations and
    decimal numbers ('e.g.', '3.14') are split at their periods.
    """
    sentences = []
    start = 0
    length = len(text)
    for index, char in enumerate(text):
        if char not in ends:
            continue
        # in the middle of a run such as '...': wait for its last mark
        if index + 1 < length and text[index + 1] in ends:
            continue
        sentence = text[start:index + 1].strip()
        if sentence:
            sentences.append(sentence)
        start = index + 1

    # whatever follows the last ending mark
    remainder = text[start:].strip()
    if remainder:
        sentences.append(remainder)
    return sentences


def split_paragraphs(text):
    """
    split a text into paragraphs

    Paragraphs are separated by one or more blank (whitespace-only)
    lines.  The lines of a paragraph are stripped and joined with single
    spaces.  Returns a list of paragraph strings; empty or
    whitespace-only input gives an empty list.
    """
    paragraphs = []
    current = []
    for line in text.strip().splitlines():
        line = line.strip()
        if line:
            current.append(line)
        elif current:
            # blank line closes the paragraph collected so far
            paragraphs.append(' '.join(current))
            current = []
    if current:
        paragraphs.append(' '.join(current))
    return paragraphs


def main(args=sys.argv[1:]):
    """CLI: read text from a file (or stdin) and print one sentence per line"""

    # parse command line arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    options = parser.parse_args(args)

    # read the input, closing the file unless it is the process' stdin
    try:
        text = options.file.read()
    finally:
        if options.file is not sys.stdin:
            options.file.close()

    # preprocess text: collapse all whitespace (including newlines)
    text = ' '.join(text.split())

    # split on ending punctuation and output the sentences
    ends = '.?!'
    for sentence in split_sentences(text, ends):
        print(sentence)


if __name__ == '__main__':
    main()