-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenyze.py
executable file
·240 lines (193 loc) · 5.96 KB
/
tokenyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#!/usr/bin/env python
"""tokenyze is a Python tokenizer
It uses generators to do a look-ahead tokenizing of an input string.
Tokens are defined as names or strings, and can be nested using brackets.
Names are made up of sequential non-whitespace characters.
Brackets are special single letter tokens.
Strings are delimited by either single or double quotes.
Backslashes can escape these characters.
Example:
The text
"fr33(the p1zza c@t)n0w_",
will result in the following (generated) token list:
['fr33', '(', 'the', 'p1zza', 'c@t', ')', 'n0w_']
Implementation:
The code uses a generator (getchars) to deliver characters from the text
to the gettokens consumer. The consumer will pass on responsibility
for parsing the text to either a whitespace consumer (eatwhitespace)
or a token consumer (eattoken), which will in turn defer to a name
consumer (eatname) or a string consumer (eatstring).
The gettokens consumer itself is a generator, which will yield each
found token in turn until there are no more tokens left.
Usage:
$ python
>>> import tokenyze
>>> for token in tokenyze.gettokens("fr33(the p1zza c@t)n0w_"):
... print token
...
fr33
(
the
p1zza
c@t
)
n0w_
>>>
"""
# sys is imported unconditionally: LOG may be switched on after import
# (e.g. ``tokenyze.LOG = True``), and log() needs sys.stderr from then on.
import sys

# Debug switch. When true, log() traces the tokenizer's progress on stderr.
LOG = False


def log(*args, **kwargs):
    """For internal debug use

    Writes the str() of every positional argument to stderr, back to back
    (no separator), followed by a single newline. Does nothing while the
    module-level LOG flag is false. Keyword arguments are accepted but
    ignored.
    """
    if LOG:
        sys.stderr.write("".join([str(arg) for arg in args]))
        sys.stderr.write("\n")
def getchars(text):
    """character iterator

    Generator that hands out the items of *text* one at a time, in order,
    and finishes when *text* is exhausted.
    """
    source = iter(text)
    while True:
        try:
            character = next(source)
        except StopIteration:
            # no characters left: end this generator as well
            return
        yield character
def eatwhitespace(nextchar, TEXT):
    """whitespace eater

    This eats whitespace, and returns the next non-whitespace character.

    nextchar -- the current look-ahead character. None means "no look-ahead
                yet"; the empty string is treated like whitespace (it is a
                substring of every string), so both make the function fetch
                a fresh character from TEXT.
    TEXT     -- the character iterator to pull further characters from.

    Returns the first non-whitespace character. A backslash is merged with
    the character that follows it, so an escaped character is returned as a
    two-character string (backslash + character) and is never considered
    whitespace.

    Raises StopIteration when TEXT runs out before a non-whitespace
    character is found; the caller uses this as the end-of-text signal.
    """
    log("-eat whitespace")
    while True:
        if nextchar is not None:
            if nextchar == "\\":
                # escape: glue the backslash to the character it protects.
                # The builtin next() works on Python 2.6+ and Python 3,
                # unlike the Python-2-only iterator method .next().
                nextchar = nextchar + next(TEXT)
            if nextchar not in " \t\n":
                log("-found non-whitespace:", nextchar)
                return nextchar
        nextchar = next(TEXT)
def eatstring(nextchar, TEXT):
    """string eater

    This eats a string, until the starting quote of the string is found
    again. Strings are returned with their enclosing quotes!

    nextchar -- the opening quote character (single or double quote).
    TEXT     -- the character iterator to pull further characters from;
                expected to yield one character at a time.

    Returns a (string, None) tuple: the second element is always None
    because the closing quote is consumed, so there is no look-ahead
    character to hand back.

    A backslash escapes the character after it: the backslash is dropped
    and the character is kept literally (an escaped quote does not close
    the string). If the text ends inside the string, what was collected so
    far is returned. This includes the case of a backslash as the very last
    character, which used to leak StopIteration and lose the string.
    """
    log("-eat string")
    # start the string with the new character, this will be a quote
    quote = nextchar
    pieces = [quote]
    while True:
        try:
            nextchar = next(TEXT)
            escaped = nextchar == "\\"
            if escaped:
                # take the escaped character as-is, without the backslash
                nextchar = next(TEXT)
        except StopIteration:
            # if we run out of characters in the text, return the string
            string = "".join(pieces)
            log("-stop string: ", string, None)
            return string, None
        pieces.append(nextchar)
        if not escaped and nextchar == quote:
            # if we see the same quote as the opening quote, the string
            # is complete.
            string = "".join(pieces)
            log("-return string: ", string, None)
            return string, None
def eatname(specialchars, nextchar, TEXT):
    """name eater

    This eats a name, until either a non-name character
    is seen, or another token (brackets) starts.

    specialchars -- extra characters that, like brackets, are tokens of
                    their own and terminate a name.
    nextchar     -- the look-ahead character the name starts with; may be a
                    two-character escape (backslash + character), in which
                    case only the escaped character becomes part of the name.
    TEXT         -- the character iterator to pull further characters from.

    Returns a (token, lookahead) tuple:
      * (nextchar, '') when nextchar itself is a bracket or special
        character: that character is the whole token;
      * (name, char) when the name was ended by whitespace, a bracket, a
        quote or a special character; char is that terminating character;
      * (name, None) when the text ran out. This includes a dangling
        backslash at the very end of the text, which used to leak
        StopIteration and lose the name.
    """
    log("-eat name")
    if nextchar in "()" + specialchars:
        # hard return on bracket, and return the next character as an empty string.
        return nextchar, ''
    # characters that end a name; built once instead of on every iteration
    terminators = " \t\n()'\"" + specialchars
    # start the new name with the new character ([-1] drops an escaping backslash)
    pieces = [nextchar[-1]]
    while True:
        try:
            nextchar = next(TEXT)
        except StopIteration:
            # if we run out of characters in the text, return the name
            name = "".join(pieces)
            log("-stop name: ", name, None)
            return name, None
        log("-name char", nextchar)
        if nextchar == "\\":
            try:
                # escape: glue the backslash to the character it protects so
                # that the terminator check below does not match it
                nextchar = nextchar + next(TEXT)
            except StopIteration:
                # dangling backslash at the end of the text: nothing to escape
                name = "".join(pieces)
                log("-stop name: ", name, None)
                return name, None
        if nextchar in terminators:
            name = "".join(pieces)
            log("-return name: ", name, nextchar)
            return name, nextchar
        pieces.append(nextchar[-1])
def eattoken(specialchars, nextchar, TEXT):
    """token eater

    Looks at the look-ahead character to pick the consumer for the next
    token: a single or double quote starts a string (eatstring), anything
    else is handled as a name (eatname). The chosen consumer's result is
    returned unchanged.
    """
    log("-eat token")
    opens_string = nextchar == "'" or nextchar == '"'
    if not opens_string:
        return eatname(specialchars, nextchar, TEXT)
    return eatstring(nextchar, TEXT)
def gettokens(text, specialchars = ''):
    """gettokens is a generator that splits a text into tokens

    text         -- the text to tokenize.
    specialchars -- optional extra characters that are, like the brackets
                    "(" and ")", single-character tokens of their own.

    Yields the tokens one by one, in the order they appear in the text, and
    finishes when the text is exhausted. An empty or all-whitespace text
    yields nothing.
    """
    log("-make D generator")
    # create a generator that will provide us with one character at a time
    TEXT = getchars(text)
    # no look-ahead yet: eatwhitespace fetches the first character itself
    nextchar = None
    while True:
        try:
            # eat all following white space. Return the first non-whitespace character
            nextchar = eatwhitespace(nextchar, TEXT)
            # feed the look-ahead character to the token eater
            token, nextchar = eattoken(specialchars, nextchar, TEXT)
        except StopIteration:
            # The consumers signal "end of text" with StopIteration. It must
            # be caught here: since PEP 479 (Python 3.7) a StopIteration
            # escaping a generator body is turned into a RuntimeError.
            return
        # we get the token and the next non-token character back
        log("-yield token", token)
        yield token
##### some unit tests #################################################
if __name__ == "__main__":
    # trace the tokenizer on stderr while the self-tests run
    LOG = True
    import sys
    import pdb
    import traceback

    def test(string, expected):
        """Tokenize string with ":" as extra special character and compare
        the resulting token list against expected.

        Raises AssertionError (also under ``python -O``) showing both lists
        when they differ.
        """
        log(string)
        # add ":" as a specialchar
        tokens = list(gettokens(string, ":"))
        log(tokens)
        if tokens != expected:
            raise AssertionError("expected %r, got %r" % (expected, tokens))

    try:
        # simple test
        test(
            "hello world!",
            ['hello', 'world!']
        )
        # a bit more complex, we have brackets breaking up strings of text
        test(
            "fr33(the p1zza c@t)n0w_",
            ['fr33', '(', 'the', 'p1zza', 'c@t', ')', 'n0w_']
        )
        # string should be able to contain extra specialchars
        test(
            "fr33(the 'p1:zza' c@t)n0w_",
            ['fr33', '(', 'the', "'p1:zza'", 'c@t', ')', 'n0w_']
        )
        # check for extra specialchars
        test(
            "fr3:3(the p1zza c@t)n0w_",
            ['fr3', ':', '3', '(', 'the', 'p1zza', 'c@t', ')', 'n0w_']
        )
        # test escaped characters
        test(
            "eat(the\tchopper\n\\)boppers\ttoday",
            ['eat', '(', 'the', 'chopper', ')boppers', 'today']
        )
        # random nonsense
        test(
            "hello $ %_!\nwo('orld (r)\" '\"mole\")bl\tah(foo)(bar)\\(wack",
            ['hello', '$', '%_!', 'wo', '(', '\'orld (r)" \'', '"mole"', ')', 'bl', 'ah', '(', 'foo', ')', '(', 'bar', ')', '(wack']
        )
        # more random nonsense
        test(
            "hdcs43(operator*'230 lkcank 23245'\"dckjnkasd(sdclk)\"csdkk(kjk kjnkjn)(caskdjkj)( sdcklkldc)\")\"",
            ['hdcs43', '(', 'operator*', "'230 lkcank 23245'", '"dckjnkasd(sdclk)"', 'csdkk', '(', 'kjk', 'kjnkjn', ')', '(', 'caskdjkj', ')', '(', 'sdcklkldc', ')', '")"']
        )
    except Exception:
        # Only genuine failures drop into the debugger; Ctrl-C and
        # SystemExit are allowed to terminate the script normally.
        traceback.print_exc()
        pdb.post_mortem()