fix null in win-rate calculation
weiweiy committed Nov 15, 2023
1 parent 7c5392f commit 15c764f
Showing 2 changed files with 58 additions and 16 deletions.
43 changes: 43 additions & 0 deletions neurIPS_eval_scripts/hide_dataset_name.py
@@ -0,0 +1,43 @@
import json
from collections import defaultdict
import argparse


def hide_name(ranked):
    metrics = defaultdict(list)
    for item in ranked:
        for k, v in item[1].items():
            metrics[k].append(v)

    names = list(metrics.keys())
    names = [x.split(' ')[0].strip() for x in names]
    name_dic = {x: f"dataset_{i}" for i, x in enumerate(set(names)) if x != "Score"}

    clean_ranked = []

    for (k, v) in ranked:
        nn = {}
        for kk, vv in v.items():
            t = kk.split(' ')
            a = t[0]
            p = t[1:]
            new_k = ' '.join([name_dic.get(a, 'Score')] + p)
            nn[new_k] = vv
        clean_ranked.append((k, nn))
    return clean_ranked


if __name__ == "__main__":

    try:
        parser = argparse.ArgumentParser(description="hide dataset name")
        parser.add_argument("--file", type=str, help='ranked submission file', required=True)
        parser.add_argument("--outfile", type=str, help="output file name", required=True)
        args = parser.parse_args()
        with open(args.file, 'r') as handle:
            ranked = json.load(handle)
        clean_ranked = hide_name(ranked=ranked)

        with open(args.outfile, 'w') as handle:
            json.dump(clean_ranked, handle, indent=4)
    except Exception as e:
        print(e)
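
For context, a minimal sketch of the transformation hide_name performs; the submission ids and metric keys below are made up, and the exact dataset_<i> numbering depends on Python's set iteration order:

ranked = [
    ("submission_a", {"mmlu Mean Win Rate": 0.81, "Score": 0.74}),
    ("submission_b", {"mmlu Mean Win Rate": 0.42, "Score": 0.51}),
]
hidden = hide_name(ranked)
# -> [("submission_a", {"dataset_0 Mean Win Rate": 0.81, "Score": 0.74}),
#     ("submission_b", {"dataset_0 Mean Win Rate": 0.42, "Score": 0.51})]

From the command line this would be invoked as something like python neurIPS_eval_scripts/hide_dataset_name.py --file ranked.json --outfile hidden.json (file names hypothetical).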
31 changes: 15 additions & 16 deletions neurIPS_eval_scripts/rank_submissions.py
@@ -35,7 +35,7 @@ def transpose_results(results):
    return transposed_results


def calc_win_rate(values, lower_is_better=False):
def calc_win_rate(values, lower_is_better=False, verbose=False):
    # This function calculates the win rate and allows entries in values to repeat, such as [1, 1, 1, 3, 4, 5, 1];
    # in this case the repeated values get the same win rate, which is 1/(n_repeats) + count(lower_rank)
    counts = Counter(values)
@@ -51,9 +51,13 @@ def calc_win_rate(values, lower_is_better=False):
                win_rate[i] += 1
            elif v == vv:
                win_rate[i] += 1.0 / counts[v]

    win_rate = [(k, v / len(values)) for k, v in win_rate.items()]
    if verbose:
        print([(i, v) for i, v in enumerate(values)])
        print(win_rate)
    win_rate = [x[1] for x in sorted(win_rate, key=lambda k: k[0])]
    if verbose:
        print(win_rate)
    return win_rate

# taken from https://github.com/Lightning-AI/llm-efficiency-challenge-eval/blob/main/agents/helm_postprocessing.py
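
As an aside, here is a self-contained sketch of the tie handling that the calc_win_rate comment above describes (not the repository function itself, since part of its comparison loop is collapsed in this view): each entry is credited with the number of entries it strictly beats plus 1/(number of entries sharing its value), normalised by the total count.

from collections import Counter

def tie_aware_win_rate(values, lower_is_better=False):
    # Credit for every entry strictly beaten, plus shared credit 1/n_repeats for ties,
    # normalised by the number of entries.
    counts = Counter(values)
    rates = []
    for v in values:
        if lower_is_better:
            beaten = sum(1 for other in values if other > v)
        else:
            beaten = sum(1 for other in values if other < v)
        rates.append((beaten + 1.0 / counts[v]) / len(values))
    return rates

print(tie_aware_win_rate([1, 1, 1, 3, 4, 5, 1]))
# -> [0.036, 0.036, 0.036, 0.714, 0.857, 1.0, 0.036] (approximately)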
@@ -84,20 +88,20 @@ def rank_results(data:dict, metrics_config:dict):
        win_rates_per_row = [[] for _ in submission_ids]
        metrics = [metric for _, metric, _ in metrics_config[scenario]]
        for metric in metrics:

            lower_is_better = lower_is_better_map[scenario][metric]
            default_value = 0.0 if not lower_is_better else 1000.0
            values = [(data[submission_id].get(scenario, {metric: default_value}).get(metric, 0.0), j) for j, submission_id in enumerate(submission_ids)]

            default_value = 0.0
            if lower_is_better:
                default_value = 1000.0

            values = [(data[submission_id].get(scenario, {metric: default_value}).get(metric, default_value), j) for j, submission_id in enumerate(submission_ids)]
            vv = [x[0] for x in values]
            win_rates = calc_win_rate(vv, lower_is_better=lower_is_better)
            for (win, (_, j)) in zip(win_rates, values):
            for (win, (val, j)) in zip(win_rates, values):
                win_rates_per_row[j].append(win)

        for submission_id, win_rates in zip(submission_ids, win_rates_per_row):
            if not win_rates:
                continue
            mean_win_rates[submission_id][scenario] = statistics.mean(win_rates)

        # mean_win_rates layout
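
To make the fix above concrete, a small sketch of the fallback pattern (submission ids and metric names are hypothetical): previously the inner .get fell back to 0.0, so a missing lower-is-better metric looked like the best possible score; the new code threads default_value through both lookups.

# Hypothetical data shapes, for illustration only.
data = {
    "submission_a": {"latency": {"seconds": 4.2}},
    "submission_b": {"latency": {}},   # metric missing for this submission
}
submission_ids = ["submission_a", "submission_b"]

lower_is_better = True
default_value = 0.0
if lower_is_better:
    default_value = 1000.0

# With .get(metric, 0.0) the missing latency would read as 0.0 seconds (best);
# with default_value it reads as 1000.0 and no longer wins unfairly.
values = [(data[s].get("latency", {"seconds": default_value}).get("seconds", default_value), j)
          for j, s in enumerate(submission_ids)]
print(values)  # [(4.2, 0), (1000.0, 1)]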
@@ -123,11 +127,6 @@ def rank_results(data:dict, metrics_config:dict):
                value = None
                if scenario in data[submission_id] and metric in data[submission_id][scenario]:
                    value = data[submission_id][scenario][metric]
                # temporary fix for populating lower is better entries with 0.0's;
                # this has been fixed in agents.py, but it's needed for older submissions;
                # we can remove once we move to flash helm
                if lower_is_better and value == 0.0:
                    value = None
                row[metric] = value
            row[f"{scenario} Mean Win Rate"] = mean_win_rates[submission_id][scenario]
            row[score_key] = scores[submission_id]
@@ -164,7 +163,7 @@ def rank_results(data:dict, metrics_config:dict):
    submission_results = transpose_results(submission_results)

    # METRICS = Hidden_eval_metrics
    # # submission_results = transpose_results(submission_results)


    name = f"{args.track}_{name}"
