-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_obfuscation.py
142 lines (126 loc) · 5.88 KB
/
run_obfuscation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
from tqdm import tqdm
import ast
import re
# One row per obfuscation: (name, instruction text sent to the LLM).
# A prompt of None marks an obfuscation for which the main loop makes no
# model call (it only queries the model when the prompt is not None).
_OBFUSCATION_PROMPTS = (
    ('None', None),
    ('Undocument', None),
    ('Rename', None),
    (
        'Refactor',
        'Without renaming any existing variables/functions/classes, alter '
        'and/or refactor this code without changing its overall behavior. '
        'For example, this can be done by reordering certain statements that '
        'do not depend on one another, or wrapping reused statements in '
        'functions, for example. Only output the full block of code above '
        'with your modifications, without any additional outputs.',
    ),
    (
        'Dead Code',
        'Without modifying any existing lines of code, add blocks of dead code '
        'that are never executed or have no effect on this program\'s behavior. '
        'For example, this can include unused variables, functions, or control '
        'structures. Try to hide them by giving them names that make them seem '
        'like legitimate live code. Only output the full block of code above '
        'with your modifications, without any additional outputs.',
    ),
)

# Parallel lists (same length, same order) derived from the table above.
obfuscations = [label for label, _ in _OBFUSCATION_PROMPTS]
prompts = [instruction for _, instruction in _OBFUSCATION_PROMPTS]
class Transform(ast.NodeTransformer):
    """Rename identifiers in a parsed tree to opaque names ``_0``, ``_1``, ...

    Every distinct name met in an ``arg``, ``Name``, ``ClassDef``,
    ``FunctionDef`` or ``AsyncFunctionDef`` node is replaced by ``'_'``
    followed by its order of first appearance.  The original -> replacement
    mapping is accumulated in ``self.map`` so a caller can apply the same
    renaming to other text.

    Names that are Python builtins (``len``, ``print``, ``range``,
    ``__name__``, ...) are left untouched: renaming them would make the
    output reference undefined names and change the program's behavior.

    Limitations: only the node types listed above are rewritten.  Identifiers
    stored as plain strings elsewhere in the tree (attribute names, keyword
    argument names at call sites, import aliases, ``except ... as`` targets,
    ``global``/``nonlocal`` lists) are not renamed.
    """

    def __init__(self):
        super().__init__()
        # original identifier -> obfuscated identifier, in first-seen order
        self.map = dict()

    def sub(self, name):
        """Return the replacement for ``name``, allocating one on first sight.

        Builtin names are returned unchanged and are never added to
        ``self.map`` (unless the caller pre-populated an entry for them).
        """
        # Local import: the module does not import ``builtins`` at top level.
        import builtins

        if name in self.map:
            return self.map[name]
        if name in vars(builtins):
            return name
        eman = '_' + str(len(self.map))
        self.map[name] = eman
        return eman

    def _rename_definition(self, node):
        """Rename a class/function definition, then rewrite its children."""
        # Allocate the definition's own name before descending so numbering
        # follows source order (definition first, then its arguments/body).
        node.name = self.sub(node.name)
        self.generic_visit(node)
        return node

    def visit_arg(self, node):
        """Rename a function parameter (its annotation is visited as well)."""
        node.arg = self.sub(node.arg)
        self.generic_visit(node)
        return node

    def visit_Name(self, node):
        """Rename a variable reference or assignment target."""
        node.id = self.sub(node.id)
        self.generic_visit(node)
        return node

    def visit_ClassDef(self, node):
        """Rename a class definition."""
        return self._rename_definition(node)

    def visit_FunctionDef(self, node):
        """Rename a function definition."""
        return self._rename_definition(node)

    def visit_AsyncFunctionDef(self, node):
        """Rename an ``async def`` definition."""
        return self._rename_definition(node)
def substitute(s, d):
    """Replace every whole word of ``s`` that is a key of ``d`` with its value.

    A "word" is a maximal run of ``\\w`` characters, so a key only matches
    when it stands alone (``foo`` does not match inside ``foobar``).

    The replacement is done in a single pass over ``s``: text produced by one
    replacement is never re-examined, so a mapping whose values are also keys
    (e.g. ``{'x': '_0', '_0': '_1'}``) is applied correctly.  Values are
    inserted literally; backslashes in them are not treated as regex
    replacement escapes.

    Args:
        s: The text to rewrite.
        d: Mapping from word to replacement string.

    Returns:
        The rewritten text; ``s`` unchanged when no word is a key of ``d``.
    """
    if not d:
        return s
    # A function replacement returns its result verbatim (no template escapes),
    # and re.sub never rescans text it has already replaced.
    return re.sub(r'\w+', lambda match: d.get(match.group(), match.group()), s)
def trim(s):
    """Keep only the lines of ``s`` that look like Python definitions or bodies.

    A line survives when it starts with a tab, a space, ``'def '`` or
    ``'class '``.  Every other line is dropped, which includes blank lines
    and any unindented text that is not a ``def``/``class`` header.

    Args:
        s: Multi-line text, split on ``'\\n'``.

    Returns:
        The surviving lines joined with ``'\\n'`` (empty string if none).
    """
    kept_prefixes = ('\t', ' ', 'def ', 'class ')
    return '\n'.join(
        line for line in s.split('\n') if line.startswith(kept_prefixes)
    )
def remove_docstrings(code):
    """Return ``code`` re-emitted by ``ast.unparse`` without def/class docstrings.

    The leading string-literal statement of every function, async function
    and class body is removed.  A module-level docstring is left in place.
    Because the result comes from ``ast.unparse``, comments and the original
    formatting are not preserved.

    Args:
        code: Python source text.

    Returns:
        The unparsed source with docstrings stripped.

    Raises:
        SyntaxError: If ``code`` cannot be parsed.
    """
    parsed = ast.parse(code)
    for node in ast.walk(parsed):
        if not isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
            continue
        if not node.body:
            continue
        first = node.body[0]
        # ast.Str is deprecated (removed in Python 3.14); string literals are
        # ast.Constant nodes whose value is a str.
        is_docstring = (
            isinstance(first, ast.Expr)
            and isinstance(first.value, ast.Constant)
            and isinstance(first.value.value, str)
        )
        if not is_docstring:
            continue
        # A body that held only a docstring must not become empty, otherwise
        # the unparsed definition would be syntactically invalid.
        node.body = node.body[1:] or [ast.Pass()]
    return ast.unparse(parsed)


# Script entry point: read generation_results.json, produce one result per
# (datapoint, obfuscation) pair, and write them all to obfuscation_results.json.
if __name__ == '__main__':
    assert len(obfuscations) == len(prompts)
    with open('./generation_results.json', 'r') as f:
        data = json.load(f)
    print(data, end='\n\n', flush=True)

    model_name = 'deepseek-ai/deepseek-coder-33b-instruct'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).cuda()

    results = []
    # LLM outputs keyed by (code, obfuscation), so a snippet that appears in
    # several datapoints is sent to the model only once per obfuscation.
    cache = dict()
    for datapoint in tqdm(data):
        code = datapoint['code']
        category = datapoint['category']
        question = datapoint['question']
        answer = datapoint['answer']
        for obfuscation, prompt in zip(obfuscations, prompts):
            # Defaults: code, question and answer pass through unchanged.
            obfuscated = code
            q = question
            a = answer
            if obfuscation == 'Undocument':
                # Based on https://gist.github.com/phpdude/1ae6f19de213d66286c8183e9e3b9ec1
                obfuscated = remove_docstrings(code)
            if obfuscation == 'Rename':
                # ast.unparse also drops comments from the code.
                transform = Transform()
                obfuscated = ast.unparse(transform.visit(ast.parse(code)))
                # Apply the same renaming to the question and answer text.
                q = substitute(q, transform.map)
                a = substitute(a, transform.map)
            if prompt is not None:
                key = (code, obfuscation)
                if key in cache:
                    obfuscated = cache[key]
                else:
                    message = [{'role': 'user', 'content': '\n\n'.join([code, prompt])}]
                    inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").cuda()
                    outputs = model.generate(inputs, max_new_tokens=512, do_sample=True, top_k=50, top_p=0.95, eos_token_id=tokenizer.eos_token_id)
                    # Decode only the newly generated tokens, not the prompt.
                    obfuscated = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
                    obfuscated = trim(obfuscated)
                    cache[key] = obfuscated
            result = {
                'original_code': code,
                'obfuscated_code': obfuscated,
                'obfuscation': obfuscation,
                'category': category,
                'question': q,
                'answer': a
            }
            print(result, flush=True)
            results.append(result)
    with open('./obfuscation_results.json', 'w') as f:
        json.dump(results, f)
    print('Done!', flush=True)