-
Notifications
You must be signed in to change notification settings - Fork 4
/
parser.cpp
430 lines (362 loc) · 15.4 KB
/
parser.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
//--------------------------------------------------------------------------------------
// File: parser.cpp
//
// Parsing and combining the instructions.
//
// Copyright (c) Oberoi Security Solutions. All rights reserved.
// Licensed under the Apache 2.0 License.
//--------------------------------------------------------------------------------------
#include "parser.h"
#include "registers.h"
set<string> g_allRegisters;
extern const char* ALL_REGISTERS[];
// Load all the registers extracted from Ghidra into a set
// When parsing the instructions this is how we will tell the difference
// between an instruction mnemonic versus a register
// additionalRegisters is a list of additional registers specified by the user
// at the command line
int initRegisters(vector<string>& additionalRegisters)
{
set<string>::iterator it;
for(unsigned int i = 0; i < sizeof(ALL_REGISTERS)/sizeof(ALL_REGISTERS[0]); i++)
{
const char* reg = ALL_REGISTERS[i];
// BUGBUG: bad alloc exception?
g_allRegisters.insert(reg);
}
for(auto additionalRegister : additionalRegisters)
{
g_allRegisters.insert(additionalRegister);
}
return 0;
}
// returns true if the passed in string is a register
// This is determined seeing if it's in the g_allRegisters set
bool isRegister(string& str)
{
set<string>::iterator it;
it = g_allRegisters.find(str);
if(it == g_allRegisters.end())
{
return false;
}
return true;
}
// returns true if the passed in string is an opcode
// We determine a string is an opcode if it is a hex string beginning with 0x
bool isOpcode(string& str)
{
bool result;
boost::regex expr{"0[xX][0-9a-fA-F]+"};
result = boost::regex_match(str, expr);
return result;
}
// returns true if the passed in string is an integer
bool isInteger(string &str)
{
bool result;
boost::regex expr{"[0-9]+"};
result = boost::regex_match(str, expr);
return result;
}
// an immediate is a hex string or decimal string
bool isImmediate(string& str)
{
if(isOpcode(str) || isInteger(str))
{
return true;
}
return false;
}
// tokenizes the input instructions and appends them to the allInstructions set
int parseInstructions(PARSED_DATA& parsedData)
{
unsigned int lineNum = 0;
int result = 0;
std::string line;
// open the input file for parsing
boost::filesystem::path infile{parsedData.inputFilename};
boost::filesystem::ifstream ifs{infile};
if(!ifs)
{
cout << "[-] Failed to open input file!!" << endl;
return -1;
}
//
// parse the input file line by line
//
while (std::getline(ifs, line))
{
vector<string> lineSplit;
++lineNum;
Instruction* currInstruction = new Instruction();
if(currInstruction == NULL)
{
cout << "[-] Error line " << lineNum << ": Failed to allocate!!" << endl;
return -1;
}
// We want to split these fillers from register values
// The simplest way I could come up was to do this but it's slow...
// BUGBUG: improve performance here
boost::replace_all(line, ",", " , ");
boost::replace_all(line, "@", " @ ");
boost::replace_all(line, "(", " ( ");
boost::replace_all(line, ")", " ) ");
boost::replace_all(line, "+", " + ");
boost::replace_all(line, "-", " - ");
boost::replace_all(line, "#", " # ");
boost::trim(line);
// split the line into components
boost::split(lineSplit, line, boost::algorithm::is_space(), boost::token_compress_on);
// our combining algorithm needs to be rewritten to support more than 26 tokens
// for the time being bail
if(lineSplit.size() > MAX_TOKENS)
{
cout << "[-] Error line " << lineNum << ": Line has more than MAX_TOKENS!!" << endl;
delete currInstruction;
return -1;
}
// tokenize each line component and add it to the Instruction
for (unsigned int i = 0; i < lineSplit.size(); i++)
{
std::set<string>::iterator it;
if(i == 0)
{
unsigned int opcodeBitLength = 0;
// the first element on the line must be the opcode
result = isOpcode(lineSplit[i]);
if(result != true)
{
cout << "[-] Error line " << lineNum << ": First field is not an hex opcode!!" << endl;
cout << "[-] Got: " << lineSplit[i];
delete currInstruction;
return -1;
}
currInstruction->setOpcode(lineSplit[i]);
// we need to keep track of the maximum bit length for the combining stage
opcodeBitLength = currInstruction->getOpcode().length();
if(opcodeBitLength > parsedData.maxOpcodeBits)
{
//cout << "Updating bit length from " << parsedData.maxOpcodeBits << " to " << opcodeBitLength << endl;
parsedData.maxOpcodeBits = opcodeBitLength;
}
}
else
{
InstructionComponentType currType;
// all remaining elements on the line are components of the instruction
if(isRegister(lineSplit[i]))
{
currType = TYPE_REGISTER;
parsedData.registers.insert(lineSplit[i]);
}
else if(isImmediate(lineSplit[i]))
{
currType = TYPE_IMMEDIATE;
}
else
{
currType = TYPE_INSTRUCTION;
}
currInstruction->addComponent(currType, lineSplit[i]);
}
} // for (int i = 0; i < lineSplit.size(); i++)
// sanity check the instruction
result = currInstruction->validateInstruction();
if(result != true)
{
cout << "[-] Error line " << lineNum << ": Instruction is invalid!!" << endl;
delete currInstruction;
return -1;
}
// check for duplicate instructions before inserting
if(parsedData.allInstructions.find(currInstruction->getOpcode()) != parsedData.allInstructions.end())
{
cout << "[-] Error line " << lineNum << ": Found duplicate opcode!!" << endl;
delete currInstruction;
return -1;
}
// everything is good, insert instruction into our set
parsedData.allInstructions.insert({{currInstruction->getOpcode(), currInstruction}});
} // while (std::getline(ifs, line))
// copy the instructions into the combined instructions set
// we need to save the original allInstructions to recreate the registers lists
// when we print out the instructions
parsedData.combinedInstructions = parsedData.allInstructions;
ifs.close();
return 0;
}
// Attempts to combine instructions into one. To combine two instructions into one:
// -- the opcodes must bit one bit apart
// -- the instructions must be identical (COMBINE_DUPLICATE)
// -- the instructions must be identical except for an immediate field (COMBINE_IMMEDIATE)
// -- the instructions must be identical except for a register field (COMBINE_REGISTER)
//
// When we find two instructions to combine we must:
// -- remove the first instruction from combinedInstructions set
// -- remove the second instruction from the combinedInstruction set
// -- change the shared bit to another character such as:
// ---- '*' for duplicates
// ---- lowercase letter for immediates
// ---- uppercase letter for registers
// -- create a new combined instruction and add it to the combinedInstructions set
//
// Because we are inserting and deleting while iterating through the set we need be careful
// with our iterators
//
void combineInstructions(PARSED_DATA& parsedData, COMBINE_TYPE combineType)
{
map<string, Instruction*> tempCombinedInstructions; // because we can't insert into a set while iterating over it
// we temporarily store combined instructions here
// worst case we must run this algorithm once for every bit in the opcode
// we have a short-circuit exit if execute a loop without combining any instructions
for(unsigned int k = 0; k < parsedData.maxOpcodeBits; k++)
{
cout << " [*] Pass: " << k << " Instructions: " << parsedData.combinedInstructions.size() << endl;
// loop through all of the instructionsone by one
map<string, Instruction*>:: iterator currItr = parsedData.combinedInstructions.begin();
while (currItr != parsedData.combinedInstructions.end())
{
bool didCombine = false;
// loop through each bit of the current instruction
string curBitString = currItr->first;
for(unsigned int i = 0; i < curBitString.length() && didCombine == false; i++)
{
map<string, Instruction*>:: iterator tempItr;
string tempBitString;
bool isEqual = false;
char replacementChar = '*';
int differencePosition = 0;
if(curBitString[i] != '0')
{
continue; // we only increment a single bit
}
//
// Check if there is an another instruction where curBitString[i] == '1'
//
// current bit position is 0, increment it to a 1 and see if another string is there
tempBitString = curBitString;
tempBitString[i] = '1';
tempItr = parsedData.combinedInstructions.find(tempBitString);
if(tempItr == parsedData.combinedInstructions.end())
{
// didn't find an adjacent instruction
continue;
}
//
// We have a candidate adjacent instruction, check if they are combinable
//
switch(combineType)
{
case COMBINE_DUPLICATES:
isEqual = currItr->second->areInstructionComonentsEqual(tempItr->second);
replacementChar = '*';
break;
case COMBINE_IMMEDIATES:
isEqual = currItr->second->areInstructionComonentsEqualExceptImmediate(tempItr->second, &differencePosition);
if(isEqual == false)
{
isEqual = currItr->second->areInstructionComonentsEqualExceptNegativeSign(tempItr->second, &differencePosition);
}
replacementChar = currItr->second->getComponentLetterFromPosition(TYPE_IMMEDIATE, differencePosition);
break;
case COMBINE_REGISTERS:
isEqual = currItr->second->areInstructionComonentsEqualExceptRegister(tempItr->second, &differencePosition);
replacementChar = currItr->second->getComponentLetterFromPosition(TYPE_REGISTER, differencePosition);
break;
default:
// BUGBUG: handle errors gracefully
cout << "[-] Invalid combine type specified!!" << endl;
return;
}
if(isEqual)
{
// instructions are equal, combine them
Instruction* combinedInstruction = currItr->second;
// remove the two existing instructions
parsedData.combinedInstructions.erase(tempBitString);
// this removes the current instruction AND increments our iterator
currItr = parsedData.combinedInstructions.erase(currItr);
// insert the combined instruction into the tempCombinedInstructions set
// it's safe to delete but not insert into a set while iterating through it
tempBitString[i] = replacementChar;
combinedInstruction->setOpcodeBitString(tempBitString);
combinedInstruction->setCombined(true);
tempCombinedInstructions.insert({{tempBitString, combinedInstruction}});
// we deleted the current instruction, abort the loop
didCombine = true;
break;
}
} // for(int i = 0; i < curBitString.length() && didCombine == false; i++)
if(didCombine == false)
{
// we didn't combine an instruction, increment the iterator manually
currItr++;
}
} //while (currItr != parsedData.combinedInstructions.end())
// short-circuit exit if we didn't combine any instructions during this loop
if(tempCombinedInstructions.size() == 0)
{
//cout << " [*] No instructions combined during pass. Short-circuiting" << endl;
return;
}
// we deleted instructions, now merge back in the combined instructions
for(map<string, Instruction*>:: iterator currItr = tempCombinedInstructions.begin();
currItr != tempCombinedInstructions.end();
currItr++)
{
parsedData.combinedInstructions.insert({{currItr->first, currItr->second }});
}
tempCombinedInstructions.clear();
} // for(int k = 0; k < parsedData.maxOpcodeBits; k++)
return;
}
// walks through all instructions that have combined registers and figures out the register list
// and register variable name and appends them to registerVariables. Once registerVariables is filled out
// attachVariables is filled out.
void computeAttachVariables(PARSED_DATA& parsedData)
{
std::set<Instruction*>::iterator it;
// iterate through all combined instructions and update registerVariables
for (auto& x: parsedData.combinedInstructions)
{
x.second->computeAttachVariables(parsedData.allInstructions, parsedData.registerVariables);
}
for (auto& y: parsedData.registerVariables)
{
// y.second = string consisting all delimited by space
// y.first = register variable name
parsedData.attachVariables[y.second].insert(y.first);
}
return;
}
// walks through all instructions that have combined registers and figures out the register list
// and register variable name and appends them to registerVariables. Once registerVariables is filled out
// attachVariables is filled out.
void computeTokenInstructions(PARSED_DATA& parsedData)
{
std::set<Instruction*>::iterator it;
// iterate through all combined instructions. getOpcodeOutputString() will append new tokens
// to the tokenInstructions set
for (auto& x: parsedData.combinedInstructions)
{
x.second->getOpcodeOutputString(parsedData.tokenInstructions);
}
return;
}
// clears the parser data structure
void clearParserData(PARSED_DATA& parsedData)
{
for (auto& x: parsedData.allInstructions)
{
// free the Instructions we allocated
delete x.second;
}
parsedData.allInstructions.clear();
// no other data structures allocated Instructions
parsedData.combinedInstructions.clear();
parsedData.registers.clear();
parsedData.registerVariables.clear();
parsedData.attachVariables.clear();
parsedData.tokenInstructions.clear();
}