comparison textshaper/split.py @ 54:1d755747e67a

almost there
author Jeff Hammel <k0scist@gmail.com>
date Sun, 17 May 2015 09:11:30 -0700
parents 8d8c1ac0e8e1
children 4e2190495d50
comparison
equal deleted inserted replaced
53:3691ffa84a3a 54:1d755747e67a
4 split paragraphs, sentences, etc 4 split paragraphs, sentences, etc
5 """ 5 """
6 6
7 # imports 7 # imports
8 import argparse 8 import argparse
9 import re 9 import csv
10 import string 10 import string
11 import sys 11 import sys
12 12
13 13
14 def findall(_string, sub): 14 def findall(_string, sub):
45 begin = 0 45 begin = 0
46 for index, value in _indices: 46 for index, value in _indices:
47 sentence = text[begin:index] 47 sentence = text[begin:index]
48 sentence += value 48 sentence += value
49 sentence.strip() 49 sentence.strip()
50 begin = index 50 begin = index + len(value)
51 if sentence: 51 if sentence:
52 sentences.append(sentence) 52 sentences.append(sentence)
53 import pdb; pdb.set_trace() 53 # add the trailing bits, if they exist
54 sentence = text[begin:].strip()
55 if sentence:
56 sentences.append(sentence)
57 # shouldn't need to do this
58 sentences = [sentence.strip() for sentence in sentences]
59 return sentences
60
54 61
55 def split_paragraphs(text): 62 def split_paragraphs(text):
56 63
57 lines = [line.strip() for line in text.strip().splitlines()] 64 lines = [line.strip() for line in text.strip().splitlines()]
58 lines = [line if line else '\n' 65 lines = [line if line else '\n'
59 for line in lines] 66 for line in lines]
60 text = ' '.join(lines).strip() 67 text = ' '.join(lines).strip()
61 paragraphs = [' '.join(p) for p in text.split('\n')] 68 paragraphs = [' '.join(p) for p in text.split('\n')]
62 return paragraphs 69 return paragraphs
63 70
71 def words(text):
72 """return the alphanumeric words in a sentence"""
73 words = text.strip().split()
74 return [word for word in words]
75
64 def main(args=sys.argv[1:]): 76 def main(args=sys.argv[1:]):
65 """CLI""" 77 """CLI"""
66 78
67 # parse command line arguments 79 # parse command line arguments
68 parser = argparse.ArgumentParser(description=__doc__) 80 parser = argparse.ArgumentParser(description=__doc__)
69 parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) 81 parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
82 parser.add_argument('-n', '--number', dest='number',
83 action='store_true', default=False,
84 help="number the sentences (CSV)")
85 parser.add_argument('-o', '--output', dest='output',
86 type=argparse.FileType('w'), default=sys.stdout,
87 help="file to output to, or stdout by default")
70 options = parser.parse_args(args) 88 options = parser.parse_args(args)
71 89
72 # preprocess text 90 # preprocess text
73 text = options.file.read().strip() 91 text = options.file.read().strip()
74 text = ' '.join(text.split()) 92 text = ' '.join(text.split())
77 # find all sentences 95 # find all sentences
78 ends = '.?!' 96 ends = '.?!'
79 sentences = split_sentences(text, ends) 97 sentences = split_sentences(text, ends)
80 98
81 # display 99 # display
82 for sentence in sentences: 100 if options.number:
83 print (sentence) 101 writer = csv.writer(options.output)
102 else:
103 for sentence in sentences:
104 options.output.write(sentence + '\n')
84 105
85 if __name__ == '__main__': 106 if __name__ == '__main__':
86 main() 107 main()