Mercurial > hg > TextShaper
comparison textshaper/split.py @ 54:1d755747e67a
almost there
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 17 May 2015 09:11:30 -0700 |
parents | 8d8c1ac0e8e1 |
children | 4e2190495d50 |
comparison
equal
deleted
inserted
replaced
53:3691ffa84a3a | 54:1d755747e67a |
---|---|
4 split paragraphs, sentences, etc | 4 split paragraphs, sentences, etc |
5 """ | 5 """ |
6 | 6 |
7 # imports | 7 # imports |
8 import argparse | 8 import argparse |
9 import re | 9 import csv |
10 import string | 10 import string |
11 import sys | 11 import sys |
12 | 12 |
13 | 13 |
14 def findall(_string, sub): | 14 def findall(_string, sub): |
45 begin = 0 | 45 begin = 0 |
46 for index, value in _indices: | 46 for index, value in _indices: |
47 sentence = text[begin:index] | 47 sentence = text[begin:index] |
48 sentence += value | 48 sentence += value |
49 sentence.strip() | 49 sentence.strip() |
50 begin = index | 50 begin = index + len(value) |
51 if sentence: | 51 if sentence: |
52 sentences.append(sentence) | 52 sentences.append(sentence) |
53 import pdb; pdb.set_trace() | 53 # add the trailing bits, if they exist |
54 sentence = text[begin:].strip() | |
55 if sentence: | |
56 sentences.append(sentence) | |
57 # shouldn't need to do this | |
58 sentences = [sentence.strip() for sentence in sentences] | |
59 return sentences | |
60 | |
54 | 61 |
55 def split_paragraphs(text): | 62 def split_paragraphs(text): |
56 | 63 |
57 lines = [line.strip() for line in text.strip().splitlines()] | 64 lines = [line.strip() for line in text.strip().splitlines()] |
58 lines = [line if line else '\n' | 65 lines = [line if line else '\n' |
59 for line in lines] | 66 for line in lines] |
60 text = ' '.join(lines).strip() | 67 text = ' '.join(lines).strip() |
61 paragraphs = [' '.join(p) for p in text.split('\n')] | 68 paragraphs = [' '.join(p) for p in text.split('\n')] |
62 return paragraphs | 69 return paragraphs |
63 | 70 |
71 def words(text): | |
72 """return the alphanumeric words in a sentence""" | |
73 words = text.strip().split() | |
74 return [word for word in words] | |
75 | |
64 def main(args=sys.argv[1:]): | 76 def main(args=sys.argv[1:]): |
65 """CLI""" | 77 """CLI""" |
66 | 78 |
67 # parse command line arguments | 79 # parse command line arguments |
68 parser = argparse.ArgumentParser(description=__doc__) | 80 parser = argparse.ArgumentParser(description=__doc__) |
69 parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) | 81 parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) |
82 parser.add_argument('-n', '--number', dest='number', | |
83 action='store_true', default=False, | |
84 help="number the sentences (CSV)") | |
85 parser.add_argument('-o', '--output', dest='output', | |
86 type=argparse.FileType('w'), default=sys.stdout, | |
87 help="file to output to, or stdout by default") | |
70 options = parser.parse_args(args) | 88 options = parser.parse_args(args) |
71 | 89 |
72 # preprocess text | 90 # preprocess text |
73 text = options.file.read().strip() | 91 text = options.file.read().strip() |
74 text = ' '.join(text.split()) | 92 text = ' '.join(text.split()) |
77 # find all sentences | 95 # find all sentences |
78 ends = '.?!' | 96 ends = '.?!' |
79 sentences = split_sentences(text, ends) | 97 sentences = split_sentences(text, ends) |
80 | 98 |
81 # display | 99 # display |
82 for sentence in sentences: | 100 if options.number: |
83 print (sentence) | 101 writer = csv.writer(options.output) |
102 else: | |
103 for sentence in sentences: | |
104 options.output.write(sentence + '\n') | |
84 | 105 |
85 if __name__ == '__main__': | 106 if __name__ == '__main__': |
86 main() | 107 main() |