-
Notifications
You must be signed in to change notification settings - Fork 7
/
plot_hist.py
194 lines (172 loc) · 8.04 KB
/
plot_hist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import numpy
from matplotlib import pyplot
from matplotlib.backends.backend_pdf import PdfPages
import os, sys, re
from collections import defaultdict
import argparse
import common, dot
from nltk.stem.porter import *
"""Read the program similarity result files and plot histograms"""
def parse_result_file(result_file, top_n=5, proj_index=6):
    """Parse a program-similarity result file.

    Expected file format::

        path_to_dotA:
        path_to_similar_dot1 , score
        ...
        path_to_similar_dot5 , score
        path_to_dotB:
        ...

    A line ending in ``:`` opens the section of a query dot file. Every
    following line whose first `` , ``-separated field ends in ``dot`` is one
    of its matches. All other lines are ignored.

    Scores are grouped per query dot, so a section with fewer than ``top_n``
    matches is averaged over the matches it has and can no longer leak its
    scores into the next section.

    Args:
        result_file: Path of the result file to read.
        top_n: Positive number of leading matches per query dot that
            contribute to its average score.
        proj_index: Index of the project name among the ``os.path.sep``
            separated components of a match path. The default of 6 is a
            quick hack tied to the directory layout of the result files.

    Returns:
        A tuple ``(dot_score_lst, dot_sim_result, match_count)``:
        ``dot_score_lst`` is a list of ``(query_dot, average_score)`` pairs in
        file order (query dots without any match are omitted);
        ``dot_sim_result`` maps each query dot to its list of
        ``(match_path, score_string)`` pairs; ``match_count`` maps a project
        name to the number of query dots whose first match lies in it.

    Raises:
        ValueError: If a match line appears before any query-dot header, or
            if a score cannot be converted to ``float``.
        IndexError: If a match line has no score field or its path has too
            few components for ``proj_index``.
    """
    dot_sim_result = {}
    match_count = defaultdict(int)
    groups = []  # (query_dot, [float scores]) in file order
    current_dot = None
    scores = []

    with open(result_file, "r") as fi:
        for line in fi:
            line = line.rstrip("\n")
            if line.endswith(":"):
                # New query dot: start a fresh score group for it.
                current_dot = line[:-1]
                dot_sim_result[current_dot] = []
                scores = []
                groups.append((current_dot, scores))
                continue

            fields = line.split(" , ")
            if not fields[0].endswith("dot"):
                continue
            if current_dot is None:
                raise ValueError(
                    "match line found before any query dot header in "
                    + str(result_file)
                )
            if not scores:
                # Only the first (best) match of a query dot is counted.
                match_count[fields[0].split(os.path.sep)[proj_index]] += 1
            scores.append(float(fields[1]))
            dot_sim_result[current_dot].append((fields[0], fields[1]))

    dot_score_lst = []
    for query_dot, group_scores in groups:
        top_scores = group_scores[:top_n]
        if top_scores:
            dot_score_lst.append((query_dot, sum(top_scores) / len(top_scores)))
    return (dot_score_lst, dot_sim_result, match_count)
def show_improvement(proj, dot_score_lst_nc, dot_score_lst_c, dot_sim_res_nc, dot_sim_res_c, dot_method_map, topk):
    """Print and return the score change between unclustered and clustered results.

    Args:
        proj: Project name, used only in the printed report.
        dot_score_lst_nc: List of ``(dot, average_score)`` pairs obtained
            without clustering.
        dot_score_lst_c: The same list obtained with clustering. It must have
            the same length and the same dots in the same order.
        dot_sim_res_nc: Maps a dot to its ``(similar_dot, score_string)``
            pairs without clustering.
        dot_sim_res_c: The same mapping with clustering.
        dot_method_map: Maps a dot path to the method name printed for it.
        topk: Number of most-improved methods to list. If fewer methods are
            available, all of them are listed; values <= 0 list none.

    Returns:
        A tuple ``(c_total, nc_total, method_num, largest_impr)``: the summed
        scores with and without clustering, the number of methods, and the
        largest single-method improvement (never below 0.0).

    Raises:
        AssertionError: If the two score lists differ in length or do not
            refer to the same dots position by position.
        KeyError: If a listed dot is missing from ``dot_method_map`` or from
            one of the similarity-result mappings.
    """
    assert len(dot_score_lst_nc)==len(dot_score_lst_c), "Should have the same number of methods with or without clustering."
    method_num = len(dot_score_lst_nc)
    nc_total = 0.0
    c_total = 0.0
    largest_impr = 0.0
    impr_lst = []
    for entry_nc, entry_c in zip(dot_score_lst_nc, dot_score_lst_c):
        assert entry_nc[0]==entry_c[0], "Should be comparing the same dot."
        nc_total += entry_nc[1]
        c_total += entry_c[1]
        impr = entry_c[1] - entry_nc[1]
        if impr > largest_impr:
            largest_impr = impr
        impr_lst.append((entry_nc[0], impr))
    impr_lst.sort(key=lambda x: x[1], reverse=True)

    total_impr = c_total - nc_total
    # The ratios are undefined without methods or with a zero baseline;
    # report NaN instead of crashing with ZeroDivisionError.
    undefined = float("nan")
    avg_impr = total_impr/method_num if method_num else undefined
    percent_impr = total_impr*100/nc_total if nc_total else undefined

    print("\n***************************\n")
    print("{0}:".format(proj))
    print("Average score improvement per method: {0}.".format(avg_impr))
    print("Percentage score improvement: {0}.".format(percent_impr))
    print("Largest score improvement for a single method: {0}.\n".format(largest_impr))
    print("The top {0} most improved methods are:\n".format(topk))
    # Slicing caps the listing at the number of available methods; max()
    # keeps a negative topk from turning into an "all but the last" slice.
    for dot_name, impr in impr_lst[:max(topk, 0)]:
        print(dot_method_map[dot_name]+" : average similarity score improved by " + str(impr))
        print("Before clustering:")
        for similar_dot, similar_score in dot_sim_res_nc[dot_name]:
            print(dot_method_map[similar_dot]+" , "+similar_score)
        print("After clustering:")
        for similar_dot, similar_score in dot_sim_res_c[dot_name]:
            print(dot_method_map[similar_dot]+" , "+similar_score)
        print("\n")
    print("\n***************************\n")
    return (c_total, nc_total, method_num, largest_impr)
def plot_hist(x, xlabel, y, ylabel, fig_file, title=""):
    """Plot two score series as a side-by-side histogram and save it as a PDF.

    Args:
        x: First series of similarity scores.
        xlabel: Legend label for ``x``.
        y: Second series of similarity scores. It is stacked with ``x`` into
            one two-column array, so both series must have the same length.
        ylabel: Legend label for ``y``.
        fig_file: Output path without extension; ``".pdf"`` is appended.
        title: Optional figure title.
    """
    # NOTE(review): the bin edges span [0, 4] while the x-axis is clipped to
    # [0, 1] below; confirm that this bin width is intended.
    bins = numpy.linspace(0.0, 4.0, 100)
    data = numpy.vstack([x, y]).T
    fig = pyplot.figure()
    try:
        pyplot.hist(data, bins, alpha=0.7, color=["white", "black"], hatch="//", label=[xlabel, ylabel])
        pyplot.legend(loc="upper right")
        pyplot.title(title)
        pyplot.ylabel("number of program segments")
        pyplot.xlabel("similarity score")
        pyplot.xlim(0.0, 1.0)
        # The context manager finalizes the PDF even if saving fails.
        with PdfPages(fig_file+".pdf") as pp:
            pyplot.savefig(pp, format='pdf')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this, each call leaks one figure.
        pyplot.close(fig)
def get_dot_method_map(proj_lst):
    """Build a lookup from dot-file path to method name for the given projects.

    For every project and each of its dot output directories, the method
    file is read line by line. Each line is split on tabs: the first field
    is taken as the method name and the second as the dot file, whose path
    is resolved through ``dot.get_dot_path``.

    Args:
        proj_lst: Iterable of project names.

    Returns:
        A dict mapping each resolved dot path to its method name.
    """
    path_to_method = {}
    for project in proj_lst:
        for out_dir in dot.dot_dirs(project):
            methods_path = dot.get_method_path(project, out_dir)
            with open(methods_path, "r") as methods:
                for record in methods:
                    fields = record.rstrip().split("\t")
                    name = fields[0]
                    dot_file = fields[1]
                    path_to_method[dot.get_dot_path(project, out_dir, dot_file)] = name
    return path_to_method
def main():
    """Compare similarity results with and without clustering for all projects.

    For every project in ``common.LIMITED_PROJECT_LIST`` this reads
    ``<proj>_result.txt`` from both result folders, prints the per-project
    improvement report and the match counts after clustering, and writes a
    histogram PDF into the figure folder. Finally it writes one histogram
    over all projects and prints the aggregated improvement figures.

    Command-line options:
        -nc/--nocluster: result folder without relabeling (required).
        -c/--cluster: result folder with relabeling (required).
        -s/--strategy: strategy name, used for labels and file names (required).
        -f/--fig: figure folder; defaults to ``<strategy>_hist``.
        -k/--topk: number of most-improved methods to list; defaults to 10.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-nc", "--nocluster", required=True, type=str, help="path to the result folder without relabeling")
    parser.add_argument("-c", "--cluster", required=True, type=str, help="path to the result folder with relabeling")
    parser.add_argument("-f", "--fig", type=str, help="path to the figure folder")
    parser.add_argument("-s", "--strategy", required=True, type=str, help="name of the strategy")
    parser.add_argument("-k", "--topk", type=int, help="top k most improved methods")
    args = parser.parse_args()

    # An empty strategy or figure path falls back to the default name.
    strategy = args.strategy or "strategy"
    fig_dir = args.fig or strategy+"_hist"
    # Compare against None so that an explicit "-k 0" is honoured instead of
    # being silently replaced by the default.
    topk = 10 if args.topk is None else args.topk

    proj_lst = common.LIMITED_PROJECT_LIST
    common.mkdir(fig_dir)
    dot_method_map = get_dot_method_map(proj_lst)

    all_score_lst_nc = []
    all_score_lst_c = []
    all_c_total = 0.0
    all_nc_total = 0.0
    all_largest_impr = 0.0
    all_method_num = 0

    for proj in proj_lst:
        proj_result_file_name = proj + "_result.txt"
        (dot_lst_nc, dot_res_nc, _) = parse_result_file(os.path.join(args.nocluster, proj_result_file_name))
        (dot_lst_c, dot_res_c, match_count_c) = parse_result_file(os.path.join(args.cluster, proj_result_file_name))
        score_lst_nc = [entry[1] for entry in dot_lst_nc]
        score_lst_c = [entry[1] for entry in dot_lst_c]
        (c_total, nc_total, method_num, largest_impr) = show_improvement(proj, dot_lst_nc, dot_lst_c, dot_res_nc, dot_res_c, dot_method_map, topk)

        print("\n***************************\n")
        print("{0} after clustering:".format(proj))
        for match, matched_num in match_count_c.items():
            print("Number of matched methods in {0}: {1}".format(match, matched_num))
        print("\n***************************\n")

        all_c_total += c_total
        all_nc_total += nc_total
        all_method_num += method_num
        all_largest_impr = max(all_largest_impr, largest_impr)
        all_score_lst_nc += score_lst_nc
        all_score_lst_c += score_lst_c
        plot_hist(score_lst_nc, "w/o clustering", score_lst_c, strategy, os.path.join(fig_dir, proj), proj+" : "+strategy)
        print("\n")

    # The ratios are undefined when nothing was parsed or the baseline total
    # is zero; report NaN instead of crashing with ZeroDivisionError.
    undefined = float("nan")
    all_total_impr = all_c_total - all_nc_total
    all_avg_impr = all_total_impr/all_method_num if all_method_num else undefined
    all_percent_impr = all_total_impr*100/all_nc_total if all_nc_total else undefined

    plot_hist(all_score_lst_nc, "w/o clustering", all_score_lst_c, strategy, os.path.join(fig_dir, strategy), "all : "+strategy)
    print("Average score improvement across projects: {0}".format(all_avg_impr))
    print("Percentage score improvement across projects: {0}".format(all_percent_impr))
    print("Largest score improvement for a single method: {0}".format(all_largest_impr))
# Run the comparison only when this file is executed as a script, so that
# importing the module for its helpers has no side effects.
if __name__ == "__main__":
    main()