view textshaper/split.py @ 56:4576ccc3be76

python 3.5
author Jeff Hammel <k0scist@gmail.com>
date Sun, 19 Feb 2017 17:53:30 -0800
parents 4e2190495d50
children
line wrap: on
line source

#!/usr/bin/env python

"""
split paragraphs, sentences, etc
"""

# imports
import argparse
import csv
import string
import sys


def findall(_string, sub):
    """
    find all non-overlapping occurrences of `sub` in `_string`

    _string -- the text to search
    sub -- the substring to look for

    returns the list of start indices, in ascending order;
    an empty `sub` matches at every position, including the end
    of `_string`
    """

    retval = []
    # always advance by at least one position: an empty `sub` matches
    # without consuming anything and would otherwise loop forever
    step = len(sub) or 1
    index = 0
    while True:
        try:
            index = _string.index(sub, index)
        except ValueError:
            # no further occurrence
            return retval
        retval.append(index)
        index += step

def indices(text, values):
    """
    locate every occurrence of each item of `values` in `text`

    returns a list of 2-tuples (index, value), sorted by index
    """
    # map each searched value to the positions where it was found
    found = {}
    for needle in values:
        found[needle] = findall(text, needle)

    # flatten to (position, value) pairs
    pairs = [(position, needle)
             for needle, positions in found.items()
             for position in positions]

    # order by position only
    pairs.sort(key=lambda pair: pair[0])
    return pairs

def split_sentences(text, ends='.?!'):
    """
    split a text into sentences

    text -- the text to split; surrounding whitespace is ignored
    ends -- the sentence terminators: each character of a string,
            or each item of a sequence of strings

    returns a list of stripped, non-empty sentences, each including
    its terminator.  A terminator that directly follows another one
    (e.g. "?!" or "...") is attached to the preceding sentence rather
    than reported as a sentence of its own.  Text after the last
    terminator is returned as a final item.
    """

    text = text.strip()
    sentences = []

    begin = 0
    for index, value in indices(text, ends):
        if index < begin:
            # this match overlaps a terminator that was already consumed
            continue
        body = text[begin:index]
        begin = index + len(value)
        if not body and sentences:
            # run of adjacent terminators: extend the previous sentence
            sentences[-1] += value
            continue
        sentence = (body + value).strip()
        if sentence:
            sentences.append(sentence)

    # add the trailing bits, if they exist
    tail = text[begin:].strip()
    if tail:
        sentences.append(tail)
    return sentences


def split_paragraphs(text):
    """
    split a text into paragraphs

    Paragraphs are runs of non-blank lines separated by one or more
    blank (empty or whitespace-only) lines.

    text -- the text to split

    returns a list of paragraphs; the stripped lines of each paragraph
    are joined with a single space.  Empty paragraphs are never
    returned, so text without content gives an empty list.
    """

    paragraphs = []
    current = []  # stripped lines of the paragraph being collected
    for line in text.strip().splitlines():
        line = line.strip()
        if line:
            current.append(line)
        elif current:
            # a blank line closes the paragraph in progress;
            # further blank lines in the same run are ignored
            paragraphs.append(' '.join(current))
            current = []
    if current:
        # the last paragraph has no blank line after it
        paragraphs.append(' '.join(current))
    return paragraphs

def words(text):
    """
    return the alphanumeric words in a sentence

    text -- the sentence to examine

    returns the whitespace-separated tokens of `text` that contain at
    least one letter or digit, in their original order; tokens made up
    purely of punctuation are dropped
    """
    # `str.isalnum` is used instead of `string.letters`, which does not
    # exist on python 3
    return [token for token in text.strip().split()
            if any(char.isalnum() for char in token)]

def main(args=sys.argv[1:]):
    """
    CLI: read a text and write it out one sentence per line

    args -- command line arguments, not including the program name

    With `--number` and/or `--count` the output is CSV: the optional
    sentence number (starting at 1), then the optional word count,
    then the sentence.  Otherwise each sentence is written on its own
    line.
    """

    # parse command line arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('-n', '--number', dest='number',
                        action='store_true', default=False,
                        help="number the sentences (CSV)")
    parser.add_argument('-c', '--count', dest='count',
                        action='store_true', default=False,
                        help="count the words in each sentence (CSV)")
    parser.add_argument('-o', '--output', dest='output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="file to output to, or stdout by default")
    options = parser.parse_args(args)

    try:
        # preprocess text
        try:
            text = options.file.read().strip()
        finally:
            # argparse opened the input; close it unless it is stdin
            if options.file is not sys.stdin:
                options.file.close()
        # collapse all runs of whitespace (including newlines) to one
        # space; note that this discards paragraph boundaries
        text = ' '.join(text.split())

        # find all sentences
        ends = '.?!'
        sentences = split_sentences(text, ends)

        # display
        if options.number or options.count:
            writer = csv.writer(options.output)
            for index, sentence in enumerate(sentences, 1):
                row = []
                if options.number:
                    row.append(index)
                if options.count:
                    row.append(len(words(sentence)))
                row.append(sentence)
                writer.writerow(row)
        else:
            for sentence in sentences:
                options.output.write(sentence + '\n')
    finally:
        # flush and close the output unless it is stdout
        if options.output is not sys.stdout:
            options.output.close()

# run the CLI only when executed as a script, not when imported
if __name__ == '__main__':
    main()