result_data_processor.py
import pandas as pd
import os
import fnmatch
import json
import re
import numpy as np
import logging
logging.basicConfig(filename='error_log.log', level=logging.ERROR)

class ResultDataProcessor:

    def __init__(self, directory='results', pattern='results*.json'):
        self.directory = directory
        self.pattern = pattern
        self.data = self.process_data()
        self.ranked_data = self.rank_data()

    def _find_files(self, directory='results', pattern='results*.json'):
        matching_files = {}
        for root, dirs, files in os.walk(directory):
            for basename in files:
                if fnmatch.fnmatch(basename, pattern):
                    filename = os.path.join(root, basename)
                    matching_files[root] = filename
        # TODO decide on removing this since I am catching the error when processing the file
        matching_files = {key: value for key, value in matching_files.items() if 'gpt-j-6b' not in key}
        matching_files = list(matching_files.values())
        return matching_files
    def _read_and_transform_data(self, filename):
        with open(filename) as f:
            data = json.load(f)
        df = pd.DataFrame(data['results']).T
        return df
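    # The result files are assumed to follow the Open LLM Leaderboard layout
    # (inferred from the keys handled below, not from documentation):
    #   {"results": {"harness|hendrycksTest-abstract_algebra|5": {"acc": ...}, ...}}
    # so the transpose above yields one row per benchmark and one column per metric.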
    def _cleanup_dataframe(self, df, model_name):
        df = df.rename(columns={'acc': model_name})
        # raw strings avoid invalid-escape warnings on '\|' in newer Python
        df.index = (df.index.str.replace('hendrycksTest-', 'MMLU_', regex=True)
                            .str.replace(r'harness\|', '', regex=True)
                            .str.replace(r'\|5', '', regex=True))
        return df[[model_name]]
    def _extract_mc1(self, df, model_name):
        df = df.rename(columns={'mc1': model_name})
        # rename row harness|truthfulqa:mc|0 to harness|truthfulqa:mc1
        df.index = df.index.str.replace(r'mc\|0', 'mc1', regex=True)
        # return just the harness|truthfulqa:mc1 row
        df = df.loc[['harness|truthfulqa:mc1']]
        return df[[model_name]]

    def _extract_mc2(self, df, model_name):
        # rename row harness|truthfulqa:mc|0 to harness|truthfulqa:mc2
        df = df.rename(columns={'mc2': model_name})
        df.index = df.index.str.replace(r'mc\|0', 'mc2', regex=True)
        df = df.loc[['harness|truthfulqa:mc2']]
        return df[[model_name]]
    # remove extreme outliers from column harness|truthfulqa:mc1
    def _remove_mc1_outliers(self, df):
        mc1 = df['harness|truthfulqa:mc1']
        # Identify the outliers
        # outliers_condition = mc1 > mc1.quantile(.95)
        outliers_condition = mc1 == 1.0
        # Replace the outliers with NaN
        df.loc[outliers_condition, 'harness|truthfulqa:mc1'] = np.nan
        return df
    @staticmethod
    def _extract_parameters(model_name):
        """
        Extract the parameter count from a model name.
        Handles names with 'b'/'B' for billions and 'm'/'M' for millions.
        """
        # match a number followed by 'b' (billions) or 'm' (millions)
        pattern = re.compile(r'(\d+\.?\d*)([bBmM])')
        match = pattern.search(model_name)
        if match:
            num, magnitude = match.groups()
            num = float(num)
            # convert millions to billions
            if magnitude.lower() == 'm':
                num /= 1000
            return num
        # return NaN if no match
        return np.nan
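    # Illustrative inputs and outputs (hypothetical model names):
    #   'llama-2-13b'  -> 13.0   (billions pass through unchanged)
    #   'pythia-410m'  -> 0.41   (millions are converted to billions)
    #   'gpt-neox'     -> NaN    (no parameter count in the name)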
    def process_data(self):
        full_model_name_count = 0
        full_model_names = []
        dataframes = []
        organization_names = []
        for filename in self._find_files(self.directory, self.pattern):
            # try:
            raw_data = self._read_and_transform_data(filename)
            # expected path layout: results/<organization>/<model>/results*.json
            split_path = filename.split('/')
            model_name = split_path[2]
            organization_name = split_path[1]
            full_model_name = f'{organization_name}/{model_name}'
            full_model_name_count += 1
            # print count every 100 models
            if full_model_name_count % 100 == 0:
                print(full_model_name_count)
            cleaned_data = self._cleanup_dataframe(raw_data, model_name)
            # mc1 = self._extract_mc1(raw_data, full_model_name)
            # mc2 = self._extract_mc2(raw_data, full_model_name)
            # cleaned_data = pd.concat([cleaned_data, mc1])
            # cleaned_data = pd.concat([cleaned_data, mc2])
            organization_names.append(organization_name)
            full_model_names.append(full_model_name)
            dataframes.append(cleaned_data)
            # except Exception as e:
            #     # logging.error(f'Error processing {filename}')
            #     # logging.error(f'The error is: {e}')
            #     print(f'Error processing {filename}')
            #     print(f'The error is: {e}')
            #     continue
        data = pd.concat(dataframes, axis=1).transpose()
        # Add organization column
        # data['organization'] = organization_names
        print("full_model_names")
        print(len(full_model_names))
        print("organization_names")
        print(len(organization_names))
        data['full_model_name'] = full_model_names
        # Add Model Name and rearrange columns
        data['Model Name'] = data.index
        cols = data.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        data = data[cols]
        # Remove the 'Model Name' column
        data = data.drop(columns=['Model Name'])
        # Add average column
        data['MMLU_average'] = data.filter(regex='MMLU').mean(axis=1)
        # Reorder columns to move 'MMLU_average' to the third position
        cols = data.columns.tolist()
        cols = cols[:2] + cols[-1:] + cols[2:-1]
        data = data[cols]
        # Add parameter count column using the _extract_parameters method
        data['Parameters'] = data.index.to_series().apply(self._extract_parameters)
        # move the Parameters column to the front of the dataframe
        cols = data.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        print(cols)
        data = data[cols]
        # move full_model_name to the front
        new_columns = ['full_model_name'] + [col for col in data.columns if col != 'full_model_name']
        data = data.reindex(columns=new_columns)
        # # Reorder columns to move 'organization' to the second position
        # cols = data.columns.tolist()
        # cols = cols[-1:] + cols[:-1]
        # data = data[cols]
        # remove extreme outliers from column harness|truthfulqa:mc1
        # data = self._remove_mc1_outliers(data)
        data = self.manual_removal_of_models(data)
        # drop rows if MMLU_abstract_algebra is NaN
        data = data.dropna(subset=['MMLU_abstract_algebra'])
        # add a URL column that takes https://huggingface.co/ + full_model_name
        data['URL'] = 'https://huggingface.co/' + data['full_model_name']
        new_columns = ['URL'] + [col for col in data.columns if col != 'URL']
        data = data.reindex(columns=new_columns)
        # drop the drop|3, gsm8k, and winogrande columns
        data = data.drop(columns=['drop|3', 'gsm8k', 'winogrande'])
        # Drop specific columns
        data = data.drop(columns=['all', 'truthfulqa:mc|0'])
        # save to csv with the current date as part of the filename
        data.to_csv(f'processed_data_{pd.Timestamp.now().strftime("%Y-%m-%d")}.csv')
        return data
    def manual_removal_of_models(self, df):
        # remove models verified to be trained on evaluation data
        # load the list of models
        with open('contaminated_models.txt') as f:
            contaminated_models = f.read().splitlines()
        # remove the models from the dataframe
        df = df[~df.index.isin(contaminated_models)]
        return df
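    # contaminated_models.txt is assumed (from the splitlines/isin usage above)
    # to hold one model name per line, matching the dataframe index, e.g.:
    #   some-model-13b
    #   another-model-7b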
    def rank_data(self):
        # add a rank for each column to the dataframe
        # copy the data dataframe to avoid modifying the original dataframe
        rank_data = self.data.copy()
        for col in list(rank_data.columns):
            rank_data[col + "_rank"] = rank_data[col].rank(ascending=False, method='min')
        return rank_data
    def get_data(self, selected_models):
        return self.data[self.data.index.isin(selected_models)]
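
# Minimal usage sketch, assuming a local 'results/' tree of results*.json files
# and a contaminated_models.txt next to this script; the model names passed to
# get_data() are hypothetical.
if __name__ == '__main__':
    processor = ResultDataProcessor()
    # full processed table, one row per model
    print(processor.data.head())
    # scores for specific models (the index holds model directory names)
    print(processor.get_data(['llama-2-13b', 'pythia-410m']))
    # ranked view: every column gains a '<col>_rank' companion
    print(processor.ranked_data.head())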