Mercurial > hg > TextShaper
view textshaper/split.py @ 55:4e2190495d50
this basically works
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 17 May 2015 17:14:47 -0700 |
parents | 1d755747e67a |
children |
line wrap: on
line source
#!/usr/bin/env python

"""
split paragraphs, sentences, etc
"""

# imports
import argparse
import csv
import string
import sys

# characters that terminate a sentence unless the caller says otherwise
SENTENCE_ENDS = '.?!'


def findall(_string, sub):
    """
    find all non-overlapping occurrences of `sub` in `_string`

    Scanning resumes immediately after each match, so
    `findall('aaaa', 'aa')` is `[0, 2]`.

    :param _string: text to search
    :param sub: substring to look for
    :returns: list of start offsets in ascending order; empty when
              `sub` is empty or does not occur
    """
    if not sub:
        # an empty substring "matches" everywhere and would never let the
        # scan advance; treat it as having no occurrences
        return []

    found = []
    start = _string.find(sub)
    while start != -1:
        found.append(start)
        start = _string.find(sub, start + len(sub))
    return found


def indices(text, values):
    """
    locate every occurrence of each of `values` in `text`

    :param text: text to search
    :param values: iterable of substrings (a plain string is treated as
                   an iterable of single characters)
    :returns: list of 2-tuples `(index, value)` ordered by index; values
              matching at the same index keep the order in which they
              were first given
    """
    seen = set()
    found = []
    for value in values:
        if value in seen:
            # a repeated value would only duplicate its matches
            continue
        seen.add(value)
        found.extend((position, value) for position in findall(text, value))

    # list.sort is stable, so ties retain the order of `values`
    found.sort(key=lambda item: item[0])
    return found


def split_sentences(text, ends=SENTENCE_ENDS):
    """
    split a text into sentences

    Each sentence keeps its terminator.  A terminator that directly
    follows another one ("...", "?!") is attached to the sentence before
    it rather than reported as a sentence of its own.  Text after the
    last terminator is returned as a final, unterminated sentence.

    :param text: text to split
    :param ends: iterable of sentence terminators
    :returns: list of sentences with surrounding whitespace removed
    """
    text = text.strip()
    sentences = []
    begin = 0
    for index, value in indices(text, ends):
        if index < begin:
            # this match lies inside a terminator that was already
            # consumed (only possible with overlapping multi-character
            # terminators)
            continue

        body = text[begin:index]
        begin = index + len(value)
        if not body and sentences:
            # terminator adjacent to the previous one: same sentence
            sentences[-1] += value
        else:
            sentences.append((body + value).strip())

    # add the trailing bits, if they exist
    trailing = text[begin:].strip()
    if trailing:
        sentences.append(trailing)
    return sentences


def split_paragraphs(text):
    """
    split a text into paragraphs

    Paragraphs are runs of non-blank lines separated by one or more blank
    (empty or whitespace-only) lines.  The lines of a paragraph are
    stripped and joined with single spaces.

    :param text: text to split
    :returns: list of paragraph strings; empty when `text` has no
              non-blank lines
    """
    paragraphs = []
    current = []
    for line in text.strip().splitlines():
        line = line.strip()
        if line:
            current.append(line)
        elif current:
            # blank line closes the paragraph being collected
            paragraphs.append(' '.join(current))
            current = []
    if current:
        paragraphs.append(' '.join(current))
    return paragraphs


def words(text):
    """
    return the words in a sentence

    The text is split on whitespace; a chunk counts as a word when it
    contains at least one alphabetic character, so bare numbers and
    stand-alone punctuation are dropped.  Punctuation attached to a word
    is kept.

    :param text: sentence to split
    :returns: list of words in their original order
    """
    return [word for word in text.split()
            if any(char.isalpha() for char in word)]


def main(args=None):
    """
    CLI: read a text and write its sentences, one per line or as CSV

    With `--number` and/or `--count` the output is CSV with the columns
    `[number,] [word count,] sentence`; otherwise each sentence is written
    on its own line.

    :param args: command line arguments without the program name;
                 defaults to `sys.argv[1:]` at call time
    """
    if args is None:
        args = sys.argv[1:]

    # parse command line arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('file', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="file to read from, or stdin by default")
    parser.add_argument('-n', '--number', dest='number',
                        action='store_true', default=False,
                        help="number the sentences (CSV)")
    parser.add_argument('-c', '--count', dest='count',
                        action='store_true', default=False,
                        help="count the words in each sentence (CSV)")
    parser.add_argument('-o', '--output', dest='output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="file to output to, or stdout by default")
    options = parser.parse_args(args)

    try:
        # preprocess text: collapse every run of whitespace (newlines
        # included) to a single space
        text = ' '.join(options.file.read().split())

        # find all sentences
        sentences = split_sentences(text, SENTENCE_ENDS)

        # display
        if options.number or options.count:
            writer = csv.writer(options.output)
            for index, sentence in enumerate(sentences, 1):
                row = []
                if options.number:
                    row.append(index)
                if options.count:
                    row.append(len(words(sentence)))
                row.append(sentence)
                writer.writerow(row)
        else:
            for sentence in sentences:
                options.output.write(sentence + '\n')
    finally:
        # argparse.FileType opened these; close (and thereby flush) them,
        # but leave the process-wide standard streams alone
        if options.file is not sys.stdin:
            options.file.close()
        if options.output is not sys.stdout:
            options.output.close()


if __name__ == '__main__':
    main()