# run_primary_obfuscation.py
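# Applies LLM-based obfuscations to each code sample in ./generation_results.json and writes
# the original plus obfuscated variants to ./primary_obfuscation_results.json. Obfuscations are
# produced by prompting deepseek-coder-33b-instruct; the 'Refactor and Dead Code' variant is
# assembled in the main loop from the cached 'Refactor' output.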
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
from tqdm import tqdm
import ast
import re

# obfuscations = ['None', 'Undocument', 'Rename', 'Refactor', 'Dead Code']
# prompts = [
#     None,
#     None,
#     None,
#     'Without renaming any existing variables/functions/classes, alter and/or refactor this code without changing its overall behavior. For example, this can be done by reordering certain statements that do not depend on one another, or wrapping reused statements in functions, for example. Only output the full block of code above with your modifications, without any additional outputs.',
#     'Without modifying any existing lines of code, add blocks of dead code that are never executed or have no effect on this program\'s behavior. For example, this can include unused variables, functions, or control structures. Try to hide them by giving them names that make them seem like legitimate live code. Only output the full block of code above with your modifications, without any additional outputs.'
# ]
obfuscations = ['None', 'Refactor', 'Dead Code', 'Refactor and Dead Code']
prompts = [
    None,
    'Without renaming any existing variables/functions/classes, alter and/or refactor this code without changing its overall behavior. For example, this can be done by reordering certain statements that do not depend on one another, or wrapping reused statements in functions, for example. Only output the full block of code above with your modifications, without any additional outputs.',
    'Without modifying any existing lines of code, add blocks of dead code that are never executed or have no effect on this program\'s behavior. For example, this can include unused variables, functions, or control structures. Try to hide them by giving them names that make them seem like legitimate live code. Only output the full block of code above with your modifications, without any additional outputs.',
    None
]
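# obfuscations and prompts are index-aligned: 'None' needs no prompt, and the combined
# 'Refactor and Dead Code' variant has no prompt of its own because it is assembled in the
# main loop from the cached 'Refactor' output.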

# class Transform(ast.NodeTransformer):
#     def __init__(self):
#         super().__init__()
#         self.map = dict()
#     def sub(self, name):
#         if name in self.map.keys():
#             eman = self.map[name]
#         else:
#             eman = '_' + str(len(self.map))
#             self.map[name] = eman
#         return eman
#     def visit_arg(self, node):
#         name = node.arg
#         eman = self.sub(name)
#         self.generic_visit(node)
#         return ast.arg(**{**node.__dict__, 'arg': eman})
#     def visit_Name(self, node):
#         name = node.id
#         eman = self.sub(name)
#         self.generic_visit(node)
#         return ast.Name(**{**node.__dict__, 'id': eman})
#     def visit_ClassDef(self, node):
#         name = node.name
#         eman = self.sub(name)
#         self.generic_visit(node)
#         return ast.ClassDef(**{**node.__dict__, 'name': eman})
#     def visit_FunctionDef(self, node):
#         name = node.name
#         eman = self.sub(name)
#         self.generic_visit(node)
#         return ast.FunctionDef(**{**node.__dict__, 'name': eman})
#     def visit_AsyncFunctionDef(self, node):
#         name = node.name
#         eman = self.sub(name)
#         self.generic_visit(node)
#         return ast.AsyncFunctionDef(**{**node.__dict__, 'name': eman})
# def substitute(s, d):
#     s_split = re.split(r'\W+', s)
#     for ss in s_split:
#         if ss in d.keys():
#             s = re.sub(rf'\b{ss}\b', d[ss], s)
#     return s
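# The commented-out Transform / substitute helpers above implemented the earlier 'Rename'
# obfuscation (consistently renaming identifiers in the code and in the question/answer text);
# they are presumably kept for reference and are unused in the current run.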

# Keep only the lines of the model's reply that look like code: indented lines and top-level
# def/class lines survive; chat preamble, markdown fences, and other unindented lines are dropped.
def trim(s):
    lines = s.split('\n')
    lines = list(filter(lambda l: l.startswith('\t') or l.startswith(' ') or l.startswith('def ') or l.startswith('class '), lines))
    return '\n'.join(lines)

seed = 11797
model_name = 'deepseek-ai/deepseek-coder-33b-instruct'

if __name__ == '__main__':
    assert len(obfuscations) == len(prompts)
    torch.manual_seed(seed)
    with open('./generation_results.json', 'r') as f:
        data = json.load(f)
    print(data, end='\n\n', flush=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).cuda()
    results = []
    # Cache keyed by (code, obfuscation) so each snippet is obfuscated at most once per variant
    # and the 'Refactor' output can be reused when building 'Refactor and Dead Code'.
    cache = dict()
    for datapoint in tqdm(data):
        code = datapoint['code']
        category = datapoint['category']
        question = datapoint['question']
        answer = datapoint['answer']
        for obfuscation, prompt in zip(obfuscations, prompts):
            # obfuscated = code
            # q = question
            # a = answer
            # if obfuscation == 'Undocument':
            #     # obfuscated = ast.unparse(ast.parse(code))
            #     # https://gist.github.com/phpdude/1ae6f19de213d66286c8183e9e3b9ec1
            #     parsed = ast.parse(code)
            #     for node in ast.walk(parsed):
            #         if not isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
            #             continue
            #         if not len(node.body):
            #             continue
            #         if not isinstance(node.body[0], ast.Expr):
            #             continue
            #         if not hasattr(node.body[0], 'value') or not isinstance(node.body[0].value, ast.Str):
            #             continue
            #         node.body = node.body[1:]
            #     obfuscated = ast.unparse(parsed)
            #     # obfuscated = re.sub(r'""".*"""', '\n', obfuscated)
            #     # obfuscated = re.sub(r'\'\'\'.*\'\'\'', '\n', obfuscated)
            # if obfuscation == 'Rename':
            #     # also uncomments
            #     transform = Transform()
            #     obfuscated = ast.unparse(transform.visit(ast.parse(code)))
            #     q = substitute(q, transform.map)
            #     a = substitute(a, transform.map)
            # if prompt is not None:
            if obfuscation == 'None':
                obfuscated = code
            elif (code, obfuscation) in cache.keys():
                obfuscated = cache[(code, obfuscation)]
            else:
                if obfuscation == 'Refactor and Dead Code':
                    assert (code, 'Refactor') in cache.keys()
                    obfuscated = cache[(code, 'Refactor')]
                    # Layer the dead-code prompt on top of the already refactored code.
                    prompt = prompts[2]
                    message = []
                    message.append({'role': 'user', 'content': '\n\n'.join([obfuscated, prompt])})
                    inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").cuda()
                    outputs = model.generate(inputs, max_new_tokens=1024, do_sample=True, top_k=50, top_p=0.95, eos_token_id=tokenizer.eos_token_id)
                    # Decode only the newly generated tokens, then strip any non-code chatter.
                    obfuscated = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
                    message.append({'role': 'assistant', 'content': obfuscated})
                    obfuscated = trim(obfuscated)
                    cache[(code, obfuscation)] = obfuscated
                else:
                    assert prompt is not None
                    message = []
                    message.append({'role': 'user', 'content': '\n\n'.join([code, prompt])})
                    inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").cuda()
                    outputs = model.generate(inputs, max_new_tokens=1024, do_sample=True, top_k=50, top_p=0.95, eos_token_id=tokenizer.eos_token_id)
                    obfuscated = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
                    message.append({'role': 'assistant', 'content': obfuscated})
                    obfuscated = trim(obfuscated)
                    cache[(code, obfuscation)] = obfuscated
            result = {
                'original_code': code,
                'obfuscated_code': obfuscated,
                'obfuscation': obfuscation,
                'category': category,
                'question': question,
                'answer': answer
            }
            print(result, flush=True)
            results.append(result)
    with open('./primary_obfuscation_results.json', 'w') as f:
        json.dump(results, f)
    print('Done!', flush=True)
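
# Example invocation (assumes a CUDA GPU with enough memory for the 33B model in bfloat16
# and that ./generation_results.json is present in the working directory):
#   python run_primary_obfuscation.py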