-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsynthetic_accuracy_easy_2_paddle.py
214 lines (147 loc) · 9.69 KB
/
synthetic_accuracy_easy_2_paddle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# %%
'''
Collect all accuracy results in this script. They should cover:
1. Two versions of accuracy - possibly with a separate dictionary that keeps the number of valid and exact matches
2. Synthetic data and real Japanese data, reported separately
Paddle to Paddle accuracy - two versions
Paddle to GCV accuracy - with both the previous dict and the current dict
Easy to Easy accuracy - waiting for the last one
GCV to GCV accuracy - zht and Japanese
Paddle to Easy accuracy - running
Paddle to Paddle, Easy to Easy, etc. can also be added to this script.
'''
import pandas as pd
import os
import json
from tqdm import tqdm
# Let's do a new folder... on the old dicts to synth results first
# This is no use
#file_name = ['df_full_matched_small_fuzzychinese.csv','df_full_matched_small_homo.csv','df_full_matched_small_lev.csv','df_full_matched_small_simstring.csv']
# Easy_2_Paddle_ZHT_FOLDER_old_dict = '/mnt/data01/yxm/homoglyphic_matching/multi_lang/zht_gcv_paddle_top10'
### for easy to paddle, only run on expanded dict
Easy_2_Paddle_ZHT_FOLDER_new_expanded_dict = '/mnt/data01/yxm/homoglyphic_matching/multi_lang/c_j_k_match_top_10_zht_paddle_easy'
# This is the not good results
Easy_2_Paddle_ZHS_FOLDER_new_expanded_dict = '/mnt/data01/yxm/homoglyphic_matching/multi_lang/c_j_k_match_top_10_zhs_easy_paddle_80000_expand_final'
## need changes since zhs is truncated now!
Easy_2_Paddle_JA_FOLDER = '/mnt/data01/yxm/homoglyphic_matching/multi_lang/c_j_k_match_top_10_ja_paddle_easy'
Easy_2_Paddle_KO_FOLDER = '/mnt/data01/yxm/homoglyphic_matching/multi_lang/c_j_k_match_top_10_ko_paddle_easy'
save_output = './aggregate_results_easy_2_paddle_0519_report'
os.makedirs(save_output,exist_ok=True)
'''
Exclude the exact matches first and use only the valid error df.
Start with the Paddle to GCV series, which lives in the gcv folder.
To calculate several versions of accuracy, keep the full df, the valid df, and the error df used for matching, so that every count can be derived.
df_full, df_valid
'''
# same_matched function
def same_matched(a,b):
for ele in a:
if ele == b: # If any of the ele in a equals b, return 1, after the iter, if nothing returns, just return 0
return 1
return 0
# Let's define a function for calculate accuracy
def cal_acc(df_matched_small, method_name,stored_accuracy, aux=None):
if aux==None:
accuracy_name = 'accuracy'
prefix = f"{method_name}_matched"
elif method_name=="sim":
assert aux!=None
accuracy_name = f'accuracy_{aux}'
prefix = f"{method_name}_{aux}_nearest"
method_name = f"{method_name}_{aux}"
elif method_name=="fuzzychinese":
assert aux!=None
accuracy_name = f'accuracy_{aux}'
prefix = f"{method_name}_{aux}_word"
method_name = f"{method_name}_{aux}"
df_matched_small[accuracy_name]=df_matched_small.apply(lambda x:same_matched([x[f"{prefix}_1"]],x["truth"]),axis=1)#The accuracy do not need to be stored
accuracy=df_matched_small[accuracy_name].sum() # Store these into a json file
stored_accuracy[method_name]["top 1"] = int(accuracy)
df_matched_small[accuracy_name]=df_matched_small.apply(lambda x:same_matched([x[f"{prefix}_1"],x[f"{prefix}_2"]],x["truth"]),axis=1)#The accuracy do not need to be stored
accuracy=df_matched_small[accuracy_name].sum() # Store these into a json file
stored_accuracy[method_name]["top 2"] = int(accuracy)
df_matched_small[accuracy_name]=df_matched_small.apply(lambda x:same_matched([x[f"{prefix}_1"],x[f"{prefix}_2"],x[f"{prefix}_3"]],x["truth"]),axis=1)#The accuracy do not need to be stored
accuracy=df_matched_small[accuracy_name].sum() # Store these into a json file
stored_accuracy[method_name]["top 3"] = int(accuracy)
df_matched_small[accuracy_name]=df_matched_small.apply(lambda x:same_matched([x[f"{prefix}_1"],x[f"{prefix}_2"],x[f"{prefix}_3"],x[f"{prefix}_4"],x[f"{prefix}_5"]],x["truth"]),axis=1)#The accuracy do not need to be stored
accuracy=df_matched_small[accuracy_name].sum() # Store these into a json file
stored_accuracy[method_name]["top 5"] = int(accuracy)
df_matched_small[accuracy_name]=df_matched_small.apply(lambda x:same_matched([x[f"{prefix}_1"],x[f"{prefix}_2"],x[f"{prefix}_3"],x[f"{prefix}_4"],x[f"{prefix}_5"],x[f"{prefix}_6"],x[f"{prefix}_7"],x[f"{prefix}_8"],x[f"{prefix}_9"],x[f"{prefix}_10"]],x["truth"]),axis=1)#The accuracy do not need to be stored
accuracy=df_matched_small[accuracy_name].sum() # Store these into a json file
stored_accuracy[method_name]["top 10"] = int(accuracy)
return stored_accuracy
lang_2_folder = {'zht_paddle_easy':Easy_2_Paddle_ZHT_FOLDER_new_expanded_dict,'zhs_easy_paddle_80000':Easy_2_Paddle_ZHS_FOLDER_new_expanded_dict,'ko_paddle_easy':Easy_2_Paddle_KO_FOLDER,'ja_paddle_easy':Easy_2_Paddle_JA_FOLDER}
lang_2_total = {'zht_paddle_easy':76349,'zhs_easy_paddle_80000':50000,'ko_paddle_easy':70091,'ja_paddle_easy':128678}
for lang,folder in tqdm(lang_2_folder.items()):
stored_accuracy = {} # Initialize the stored_accuracy
for choice in ["homo","lev","sim_cos","sim_dice","sim_over","sim_jac","fuzzychinese_stroke","fuzzychinese_char"]:
stored_accuracy[choice] = {}
for method_name in ['homo','lev','sim','fuzzychinese']:
if method_name == 'sim':
file_name = "simstring"
else:
file_name = method_name
df_matched_small = pd.read_csv(os.path.join(folder,f'df_full_matched_small_{file_name}.csv'))
df_matched_small=df_matched_small.dropna(subset=['result'])
df_matched_small = df_matched_small[df_matched_small["result"]!=""]
df_matched_small=df_matched_small.dropna(subset=['truth'])
df_matched_small = df_matched_small[df_matched_small["truth"]!=""]
# No need to drop NA again, everything inside df_matched_small is already dropped - You still need to drop NA, but it is weird for levenshtein
print('after drop NA',len(df_matched_small))
if method_name =="homo" or method_name=="lev":
stored_accuracy = cal_acc(df_matched_small,method_name, stored_accuracy)
elif method_name=="sim":
for aux in ["cos","over","dice","jac"]:
stored_accuracy = cal_acc(df_matched_small,method_name, stored_accuracy,aux)
elif method_name=="fuzzychinese":
for aux in ["stroke","char"]:
stored_accuracy = cal_acc(df_matched_small,method_name, stored_accuracy,aux)
with open(os.path.join(save_output,f'accuracy_{lang}_small_count.json'),'w') as f:
json.dump(stored_accuracy,f,ensure_ascii=False)
print(stored_accuracy)
# No need get from there...
df_full = pd.read_csv(f'/mnt/data01/yxm/homo/multilang_results/{lang}/df_full.csv')
df_full = df_full.drop_duplicates(subset=['ground_truth'])
df_valid = df_full.dropna(subset=['result'])
# df_valid = pd.read_csv(f'/mnt/data01/yxm/homo/multilang_results_gcv/{lang}/df_valid.csv')
df_error = pd.read_csv(f'/mnt/data01/yxm/homo/multilang_results/{lang}/error_df.csv')
# You can have more functionalities for creating several versions of accuracy...
# We want to keep two versions of top 1 accuracy, let alone others for now
'''
Think About How to Store this Information for Confluence Report...
Maybe two tables - for the accuracy, let's have three columns? only keep top 1
accuracy for only error df, include exact match but no empty string, include everything include the empty string
save this to the save_output folder
'''
name = ["Paddle to GCV"]
## change the total_images here...
total_images = [lang_2_total[lang]]
total_valid_OCR = [len(df_valid)]
print('df_error before',len(df_error))
df_error = df_error.dropna(subset=['result'])
# Also need to drop where ground_truth is empty! Maybe that's also why lev doesn't match the full...
df_error = df_error.dropna(subset=['ground_truth'])
print('df_error_after',len(df_error))
total_error_df = [len(df_error)]
total_exact_match = [len(df_valid)-len(df_error)]
total_empty_string = [lang_2_total[lang]-len(df_valid)]
total_empty_string_perc = [(lang_2_total[lang]-len(df_valid))/lang_2_total[lang]]
total_exact_match_perc = [(len(df_valid)-len(df_error))/lang_2_total[lang]]
total_error_df_perc = [len(df_error)/lang_2_total[lang]]
df_stats_save = pd.DataFrame(list(zip(name,total_images,total_valid_OCR,total_error_df,total_exact_match, total_empty_string, total_empty_string_perc, total_exact_match_perc, total_error_df_perc)),columns=['name','Total #images','Total #Valid OCR','Total #Error df','Total #Exact Match', 'Total Empty String', 'total empty string perc','total exact match perc','total error df perc'])
df_stats_save.to_csv(os.path.join(save_output,f'data_stats_{lang}.csv'))
# Also store the exact count_from_json top 1
method_name_list = []
count_correct_match_list = []
accuracy_error_list = []
accuracy_valid_list = []
accuracy_full_list = []
for method_name in stored_accuracy:
method_name_list.append(method_name)
correct_match = stored_accuracy[method_name]["top 1"]
count_correct_match_list.append(correct_match)
accuracy_error_list.append(correct_match/len(df_error))
accuracy_valid_list.append((correct_match+len(df_valid)-len(df_error))/len(df_valid))
accuracy_full_list.append((correct_match+len(df_valid)-len(df_error))/total_images[0])
df_accuracy_save = pd.DataFrame(list(zip(method_name_list,count_correct_match_list,accuracy_error_list,accuracy_valid_list,accuracy_full_list)),columns=['method_name','count','error','valid','full'])
df_accuracy_save.to_csv(os.path.join(save_output,f"data_accuracy_{lang}.csv"))