from utils import Set
import pprint
import re
from color import Color
from tokenize import Tokenizer
from token import Token
from symbol import Symbol
from mapping import Mapping
from production import Production
pp = pprint.PrettyPrinter()
class Autohighlight:
"""The autohighlight class encapsulates all the state of the
highlighter at any given moment"""
def __init__(self, file):
self.GlobalSymbolDict = {} # Symbol table
        self.ColorDefinitions = self.getPredefinedColors() # Color definition table - maps color names (as strings) to Color objects
        self.OrderedColorMappings = [] # A list of color Mapping objects in textual order (contents of the third section of the input)
self.tokenizer = None # The token generator
        # Behave nicely when given a filename instead of a stream
        if isinstance(file, str):
            self.tokenizer = Tokenizer(open(file))
        else:
            self.tokenizer = Tokenizer(file)
def getPredefinedColors(self):
"""Produce hash of Color objects representing the predefined colors"""
        colorNames = ['Comment', 'Constant', 'String', 'VariableName',
                      'FunctionName', 'Keyword', 'Type', 'None', 'Error']
colors = {}
for colorName in colorNames:
colors[colorName]=Color(Token(None,None,colorName),True)
return colors
def promote_productions(self):
"""Convert all the elements of products from tokens into
symbols, meanwhile checking that all of the elements are
existing symbols. This is name analysis in action: because
symbol names have Algol scoping inside the concrete grammar
portion of the input file, we wait until the whose shebang is
parsed before attempting to promote tokens into symbols."""
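        # A sketch of the promotion (the symbol names here are illustrative,
        # not taken from any sample input): a rule such as
        #     expr : expr '+' term .
        # leaves its right-hand side stored as raw Tokens; this pass rewrites
        # each element in place, so expr and term become their entries in
        # GlobalSymbolDict while the quoted literal '+' gets a fresh literal
        # Symbol keyed by its quoted spelling.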
for sym in self.GlobalSymbolDict.values():
for production in sym.productions:
elements = production.elements
if len(elements) > 0: # An empty production has no tokens to promote
firstToken = elements[0]
for i in range(0, len(elements)):
if re.compile("^'").match(elements[i].text): # If the element is a literal, no name analysis needs to be done
elements[i] = Symbol(elements[i])
elements[i].is_lit = True
elements[i].regex = Set(re.escape(elements[i].defining_token.text[1:-1]))
self.GlobalSymbolDict[elements[i].defining_token.text]=elements[i]
else: # Do name analysis: check if the symbol is used without being defined.
try:
elements[i] = self.GlobalSymbolDict[elements[i].text]
except KeyError, e:
raise Exception("Production for %s beginning at %d,%d: %s is not a symbol." % \
(sym.defining_token.text, firstToken.line, firstToken.col, elements[i].text))
def parse_lexical_symbols(self):
"""Given that the token generator is at the beginning of the
lexical symbol specifications, read a series of lexical symbol
specifications, doing name and basic type analysis on the fly."""
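        # The shape this loop accepts (inferred from the checks below; the
        # name and pattern are illustrative only):
        #     identifier : $[A-Za-z_][A-Za-z0-9_]* .
        # i.e. a symbol name, ':', a $-prefixed regular expression, and a
        # terminating '.', all inside the section's braces.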
stack = []
self.tokenizer.next().must_be('{')
for token in self.tokenizer:
stack += [ token ]
if token.text == ".":
stack[0].assert_symbol_name()
stack[1].must_be(':')
stack[2].must_match('^\\$', "regular expression")
## Name analysis
if stack[0].text in self.GlobalSymbolDict:
originalDef = self.GlobalSymbolDict[stack[0].text].defining_token
raise Exception("Symbol %s redefined at %d,%d. Originally at %d,%d" % (stack[0].text, stack[0].line, stack[0].col, \
originalDef.line, originalDef.col))
s = Symbol(stack[0])
s.is_gla = True
s.regex = Set(stack[2].text[1:])
self.GlobalSymbolDict[stack[0].text] = s
stack = []
elif token.text == "{":
raise Exception("Unexpected %s" % token)
elif token.text == "}":
if len(stack) > 1: raise Exception("Unfinished lexical specification beginning with %s" % stack[0])
return
else: pass
def parse_cst(self):
"""Given that the token generator is positioned at the start
of the concrete grammar, read rules. After this routine
completes, each symbol in the GlobalSymbolDict has a set of
productions that contain Tokens, not symbols. Conversion from
tokens to symbols happens in promote_productions."""
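        # The shape this loop accepts (inferred from the checks below; the
        # symbol names are illustrative only):
        #     expr : expr '+' term .
        # i.e. a left-hand side, ':', zero or more symbol names or quoted
        # literals, and a terminating '.', all inside the section's braces.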
stack = []
self.tokenizer.next().must_be('{')
for token in self.tokenizer:
stack += [ token ] # Build a stack to process
if token.text == ".":
# We've got a rule to process. Start by determining correct syntax.
stack[1].must_be(':')
## Name analysis
stack[0].assert_symbol_name()
production_elements = stack[2:-1]
for element in production_elements:
element.assert_symbol_name()
if stack[0].text in self.GlobalSymbolDict: # Redefined lexical sym or add a new production?
existingSymbol = self.GlobalSymbolDict[stack[0].text]
if existingSymbol.is_gla:
raise Exception("Lexical Symbol %s redefined at %d,%d. Originally at %d,%d" % \
(stack[0].text, stack[0].line, stack[0].col, \
existingSymbol.defining_token.line, existingSymbol.defining_token.col))
existingSymbol.productions += [Production(existingSymbol,production_elements)]
else: # Brand new symbol occurrence
s = Symbol(stack[0])
s.is_gla = False
s.productions = [Production(s,production_elements)]
self.GlobalSymbolDict[stack[0].text] = s
stack = []
elif token.text == "{":
raise Exception("Unexpected %s" % token)
elif token.text == "}":
if len(stack) > 1: raise Exception("Unfinished lexical specification beginning with %s" % stack[0])
#pp = pprint.PrettyPrinter()
#pp.pprint(self.GlobalSymbolDict)
return
else: pass
def mkColor(self, name):
"""Control is transferred here in order to read a color
specification from the token generator. Does basic name
analysis to determine if valid font attributes are used."""
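        # The body this loop accepts (inferred from the checks below; the
        # attribute values are illustrative only):
        #     color : blue ; font-weight : bold ;
        # i.e. attribute, ':', value, ';', repeated until the closing '}'.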
known_attrs = [ 'font-family', 'font-style', 'font-weight', 'font-size', 'text-decoration', 'color', 'background-color' ]
stack = []
color = Color(name)
for token in self.tokenizer:
if token.text == ";":
                stack[0].assert_symbol_name()
if stack[0].text not in known_attrs: raise Exception("%d:%d: Unknown color attribute %s" % (stack[0].line, stack[0].col, stack[0].text))
stack[1].must_be(":")
stack[2].must_match("^\w", "%d:%d: Expected a color attribute value instead of %s" % (stack[2].line, stack[2].col, stack[2].text))
color.attrs[stack[0].text] = stack[2].text
stack = []
elif token.text == "}":
return color
else:
stack += [token]
raise Exception("%d:%d: End-of-file reached while scanning color %s defined here." % (name.line, name.col, name.text))
def parse_color(self):
"""Given that the token generator is at the beginning of the
coloring section, read color definitions and coloring requests"""
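        # The two forms this section accepts (inferred from the branches
        # below; the names are illustrative only):
        #     KeywordStyle { color : blue ; }   defines a new color
        #     KeywordStyle : 'if' 'else' .      maps grammar symbols to it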
begin = self.tokenizer.next()
begin.must_be('{')
for name in self.tokenizer:
if name.text == '}': return
name.must_match("^[A-Za-z]", "%d:%d: Expected a color name, got %s instead." % (name.line, name.col, name.text))
midpunct = self.tokenizer.next()
if midpunct.text == "{":
color = self.mkColor(name)
                if name.text in self.ColorDefinitions:
                    raise Exception("%d:%d: Color %s has already been defined." % (name.line, name.col, name.text))
self.ColorDefinitions[name.text] = color
elif midpunct.text == ':':
stack = []
for token in self.tokenizer:
if token.text == ".":
self.OrderedColorMappings += [Mapping(name,stack)]
break
elif token.text == "}": raise Exception("%d:%d: Color section ended while defining mapping for color %s" % (name.line, name.col, name.text))
try:
stack += [ self.GlobalSymbolDict[token.text] ]
                    except KeyError:
raise Exception("%d:%d: Literal %s does not occur in the grammar" % (token.line, token.col, token.text))
            elif midpunct.text == '}': raise Exception("%d:%d: Coloring section ended unexpectedly here." % (midpunct.line, midpunct.col))
else: raise Exception("%d:%d: Expected : or {, not %s" % (midpunct.line, midpunct.col, midpunct.text))
raise Exception("%d:%d: Unexpected end-of-file while scanning color definition section beginning here." % (begin.line, begin.col))
def check_color_scoping(self):
"""Since color names have algol scoping, name analysis must be
done after all the color definitions have been read."""
for mapping in self.OrderedColorMappings:
if mapping.token.text not in self.ColorDefinitions:
raise Exception("%d:%d Color %s is never defined" % (mapping.token.line, mapping.token.col, mapping.token.text))
def check_for_multiple_roots(self):
"""Determine whether the cst has multiple roots."""
roots = self.get_roots()
if len(roots)!=1:
raise Exception("Found multiple roots: %s"%roots)
def get_roots(self):
"""Get the roots of the grammar as a list"""
roots = []
for symbol in self.GlobalSymbolDict.values():
if symbol.isRoot():
roots += [symbol]
return roots
def parse(self):
"""Do all the parsing and basic name/type analysis, reserving
the hard colorability stuff for output routines."""
self.parse_lexical_symbols()
self.parse_cst()
self.promote_productions()
self.parse_color()
self.check_color_scoping()
for sym in self.GlobalSymbolDict.values():
sym.GlobalSymbolDict = self.GlobalSymbolDict
self.create_root_symbols()
self.check_for_multiple_roots()
def create_root_symbols(self):
"""Insert magical symbols above the root of the grammar in
order to match the beginning and end of the sample."""
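        # The net effect (from the construction below) is a synthetic
        # topmost production
        #     R00t.Symbol : %^ <original root> %$
        # where %^ and %$ are literal markers for the beginning and end of
        # the sample being colored.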
RootSymbol = Symbol(Token(None,None,'R00t.Symbol'))
RootSymbol.GlobalSymbolDict=self.GlobalSymbolDict
StartDocSymbol = Symbol(Token(None,None,'%^'))
StartDocSymbol.regex = Set('%^')
StartDocSymbol.is_lit = True
StartDocSymbol.GlobalSymbolDict=self.GlobalSymbolDict
EndDocSymbol = Symbol(Token(None,None,'%$'))
EndDocSymbol.regex = Set('%$')
EndDocSymbol.is_lit = True
EndDocSymbol.GlobalSymbolDict=self.GlobalSymbolDict
RootSymbol.productions = [Production(RootSymbol,[StartDocSymbol]+self.get_roots()+[EndDocSymbol])]
self.GlobalSymbolDict['R00t.Symbol'] = RootSymbol #XXX this is a nasty hack
self.GlobalSymbolDict['%^']=StartDocSymbol
self.GlobalSymbolDict['%$']=EndDocSymbol
def output(self,outputter):
"""This function takes an outputter object and feeds it the
user's coloring requests."""
for colorName,color in self.ColorDefinitions.iteritems():
if color.predefined: continue
outputter.appendColorDefinition(color)
for mapping in self.OrderedColorMappings:
colorName = mapping.token.text
for symbol in mapping.mappings:
                if symbol.get_terminal_equivalent_regexes() is None:
raise Exception("Symbol %s is not colorable because it is not terminal equivalent" % \
symbol.defining_token.text)
print "Generating rules to color %s as %s" % (symbol.defining_token.text, colorName)
color = self.ColorDefinitions[colorName]
                # Literal and nonterminal symbols are currently colored the
                # same way: by the contexts in which they appear.
                outputter.appendMapping(color, symbol.get_contexts())
return outputter.getBuffer()
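

if __name__ == '__main__':
    import sys

    # A minimal driver sketch, not part of the original module. It assumes an
    # input file is named on the command line; StubOutputter exists only to
    # demonstrate the interface that output() relies on (appendColorDefinition,
    # appendMapping, getBuffer). A real outputter would emit editor-specific
    # highlighting rules instead.
    class StubOutputter:
        def __init__(self):
            self.chunks = []

        def appendColorDefinition(self, color):
            self.chunks.append("define %s" % color)

        def appendMapping(self, color, contexts):
            self.chunks.append("map %s -> %s" % (color, contexts))

        def getBuffer(self):
            return "\n".join(self.chunks)

    ah = Autohighlight(sys.argv[1])
    ah.parse()
    print ah.output(StubOutputter())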