-
Notifications
You must be signed in to change notification settings - Fork 0
/
scanner.py
111 lines (99 loc) · 3.9 KB
/
scanner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import re
from DFA.DFA import get_final_token, get_token_type, get_next_state
from DFA.states import STATE_SCANNER
from Tools.regex import RE
class scanner:
def __init__(self, s: str) -> None:
self.s = s
self.slen = len(self.s)
self.this_char_idx = 0
self.start_token = 0
self.raw_token_type = ""
self.this_state = STATE_SCANNER.START
self.errors = []
self.tokens = []
self.lexemes = [
"if",
"else",
"void",
"int",
"repeat",
"break",
"until",
"return",
"endif",
]
self.line = 1
self.addition_str = "\n " # has to begin with \n and be longer than 1
self.s += self.addition_str
self.empty_return = (None, None, None)
def get_next_token(
self,
):
# End of the file
if self.slen <= self.this_char_idx:
return self.return2compiler(self.empty_return)
while self.this_char_idx < len(self.s):
if self.this_state == STATE_SCANNER.START:
self.start_token = self.this_char_idx
self.this_char = self.s[self.this_char_idx]
self.next_state, self.log, self.next_char_idx = get_next_state(
self.this_state, self.this_char, self.this_char_idx
)
# print_letter_state(self.this_char_idx,self.this_state,
# self.this_char, self.next_state, self.log)
if (
self.next_state == STATE_SCANNER.ENDBACK
or self.next_state == STATE_SCANNER.END
or self.next_state == STATE_SCANNER.ERROR
) and self.log:
self.raw_token_type = self.log
to_return = self.empty_return
if (
self.this_state == STATE_SCANNER.END
or self.this_state == STATE_SCANNER.ERROR
):
self.end_token = self.this_char_idx
self.token = self.s[self.start_token : self.end_token]
self.token_type = get_token_type(self.raw_token_type, self.token)
if len(re.findall(RE.MY_NEWLINE, self.token)) > 0:
self.line += len(re.findall(RE.MY_NEWLINE, self.token))
if re.match(RE.MY_WHITESPACE, self.token):
pass
else:
# print("line {: >5} {: >20} {: >20} {: >10} {: >10}".format(
# self.line, self.token, self.token_type, self.start_token, self.end_token))
to_return = (self.line, self.token_type, self.token)
if self.this_state == STATE_SCANNER.ERROR:
# print(self.token)
self.errors.append((self.line, self.token, self.token_type))
to_return = self.empty_return
if self.token_type == "COMMENT":
to_return = self.empty_return
self.this_char_idx = self.next_char_idx
self.this_state = self.next_state
if to_return[0] != None:
return self.return2compiler(to_return)
(
self.line,
self.token,
self.token_type,
self.start_token,
self.end_token,
) = get_final_token(
self.this_state,
self.s,
self.addition_str,
self.start_token,
self.line,
self.token,
)
if self.line != None:
self.errors.append((self.line, self.token, self.token_type))
return self.return2compiler(self.empty_return)
def return2compiler(self, to_return):
if to_return[0] != None:
self.tokens.append(to_return)
if to_return[1] == "ID" and to_return[2] not in self.lexemes:
self.lexemes.append(to_return[2])
return to_return