changeset 54:1d755747e67a

almost there
author Jeff Hammel <k0scist@gmail.com>
date Sun, 17 May 2015 09:11:30 -0700
parents 3691ffa84a3a
children 4e2190495d50
files textshaper/split.py
diffstat 1 files changed, 26 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/textshaper/split.py	Sun May 17 08:53:11 2015 -0700
+++ b/textshaper/split.py	Sun May 17 09:11:30 2015 -0700
@@ -6,7 +6,7 @@
 
 # imports
 import argparse
-import re
+import csv
 import string
 import sys
 
@@ -47,10 +47,17 @@
         sentence = text[begin:index]
         sentence += value
         sentence.strip()
-        begin = index
+        begin = index + len(value)
         if sentence:
             sentences.append(sentence)
-    import pdb; pdb.set_trace()
+    # add the trailing bits, if they exist
+    sentence = text[begin:].strip()
+    if sentence:
+        sentences.append(sentence)
+    # shouldn't need to do this
+    sentences = [sentence.strip() for sentence in sentences]
+    return sentences
+
 
 def split_paragraphs(text):
 
@@ -61,12 +68,23 @@
     paragraphs = [' '.join(p) for p in text.split('\n')]
     return paragraphs
 
+def words(text):
+    """return the alphanumeric words in a sentence"""
+    words = text.strip().split()
+    return [word for word in words]
+
 def main(args=sys.argv[1:]):
     """CLI"""
 
     # parse command line arguments
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
+    parser.add_argument('-n', '--number', dest='number',
+                        action='store_true', default=False,
+                        help="number the sentences (CSV)")
+    parser.add_argument('-o', '--output', dest='output',
+                        type=argparse.FileType('w'), default=sys.stdout,
+                        help="file to output to, or stdout by default")
     options = parser.parse_args(args)
 
     # preprocess text
@@ -79,8 +97,11 @@
     sentences = split_sentences(text, ends)
 
     # display
-    for sentence in sentences:
-        print (sentence)
+    if options.number:
+        writer = csv.writer(options.output)
+    else:
+        for sentence in sentences:
+            options.output.write(sentence + '\n')
 
 if __name__ == '__main__':
     main()