preparer.py
'Tokenize input text for the Recognizer.'

__author__ = 'Nick Montfort'
__copyright__ = 'Copyright 2011 Nick Montfort'
__license__ = 'ISC'
__version__ = '0.5.0.0'
__status__ = 'Development'

import sys
import re
try:
    import readline
except ImportError:
    pass

import input_model


def prepare(separator, prompt='', in_stream=sys.stdin, out_stream=sys.stdout):
    """Read a string from the input stream and return it tokenized.

    Andrew Plotkin fixed this so that up arrow fetches the previous command."""
    if (hasattr(in_stream, 'isatty') and in_stream.isatty()):
        input_string = raw_input(prompt)
    else:
        out_stream.write(prompt)
        input_string = in_stream.readline()
        if input_string == '':
            # Empty string indicates end of the input file.
            # (A blank input line would look like '\n'.)
            raise EOFError()
        out_stream.write(input_string)
    return tokenize(input_string, separator)
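
# Illustrative call (the separator list is an assumption, not from this file):
#     prepare([',', 'then'], prompt='> ')
# reads one line -- via raw_input() with the prompt when attached to a terminal,
# otherwise via in_stream.readline(), echoing the line and raising EOFError at
# end of input -- and returns the result of tokenize() on that line.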


def tokenize(input_string, separator):
    'Returns tokenized and slightly reformatted text.'
    input_string = re.sub('\s*$', '', input_string)
    new_text = input_string
    # Set punctuation off with spaces so each mark becomes its own token.
    new_text = re.sub(' *([\.\?\!\&\(\)\-\;\:\,]) *', r' \1 ', new_text)
    # Trim leading/trailing whitespace and collapse internal runs of spaces.
    new_text = re.sub('^[ \t]+', '', new_text)
    new_text = re.sub('[ \t\n]+$', '', new_text)
    new_text = re.sub('[ \t]+', ' ', new_text)
    # Detach apostrophes from the preceding word ("don't" becomes "don 't").
    new_text = re.sub(" *' *", " '", new_text)
    tokens = new_text.lower().split()
    # Remove separator tokens from the start and end of the command.
    while len(tokens) > 0 and tokens[0] in separator:
        tokens.pop(0)
    while len(tokens) > 0 and tokens[-1] in separator:
        tokens.pop()
    user_input = input_model.RichInput(input_string, tokens)
    return user_input
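
# Example (the separator list is illustrative, not taken from this project):
#     tokenize(', take the lamp, then', [',', 'then'])
# returns a RichInput whose tokens are ['take', 'the', 'lamp']: punctuation is
# split into separate tokens, and separator tokens are stripped only from the
# ends of the command, never from the middle of it.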


if __name__ == "__main__":
    # Minimal smoke test: an empty separator list (placeholder argument) and the
    # default prompt; reads one line from standard input and prints its tokens.
    TEST_INPUT = prepare([])
    print TEST_INPUT.tokens
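
# Running the module directly (assuming input_model.py from the same package is
# importable), for example under Python 2:
#     echo "take the lamp" | python preparer.py
# should print the token list ['take', 'the', 'lamp'].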