fixed win-rate calc to allow same values without penalizing later entries, switch to scenario-based win-rate calculation
weiweiy committed Nov 14, 2023
1 parent eef38b6 commit 7c5392f
Showing 2 changed files with 159 additions and 14 deletions.
93 changes: 91 additions & 2 deletions neurIPS_eval_scripts/eval_metrics.py
@@ -31,6 +31,80 @@
}


Hidden_dataset_centric_eval_metrics = {
"CNN/DailyMail" :[
("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False),
("core_scenarios.json", "CNN/DailyMail - Stereotypes (race)", True),
("core_scenarios.json", "CNN/DailyMail - Stereotypes (gender)", True),
("core_scenarios.json", "CNN/DailyMail - Representation (race)", True),
("core_scenarios.json", "CNN/DailyMail - Representation (gender)", True),
],
"sam_sum" : [
("core_scenarios.json", "sam_sum - ROUGE-2", False),
("core_scenarios.json", "sam_sum - Stereotypes (race)", True),
("core_scenarios.json", "sam_sum - Stereotypes (gender)", True),
("core_scenarios.json", "sam_sum - Representation (race)", True),
("core_scenarios.json", "sam_sum - Representation (gender)", True),
],
"corr2cause":[
("core_scenarios.json", "corr2cause - EM", False),],

'MATH': [
("core_scenarios.json", "MATH (chain-of-thoughts) - Equivalent (chain of thought)", False), ],

"ethics" : [
("core_scenarios.json", "ethics_justice - EM", False),
("core_scenarios.json", "ethics_justice - EM (Robustness)", False),
("core_scenarios.json", "ethics_justice - EM (Fairness)", False),

("core_scenarios.json", "ethics_commonsense - EM", False),
("core_scenarios.json", "ethics_commonsense - EM (Robustness)", False),
("core_scenarios.json", "ethics_commonsense - EM (Fairness)", False),


("core_scenarios.json", "ethics_virtue - EM", False),
("core_scenarios.json", "ethics_virtue - EM (Robustness)", False),
("core_scenarios.json", "ethics_virtue - EM (Fairness)", False),

("core_scenarios.json", "ethics_deontology - EM", False),
("core_scenarios.json", "ethics_deontology - EM (Robustness)", False),
("core_scenarios.json", "ethics_deontology - EM (Fairness)", False),

("core_scenarios.json", "ethics_utilitarianism - EM", False),
("core_scenarios.json", "ethics_utilitarianism - EM (Robustness)", False),
("core_scenarios.json", "ethics_utilitarianism - EM (Fairness)", False),
],

# "ethics_justice" : [
# ("core_scenarios.json", "ethics_justice - EM", False),
# ("core_scenarios.json", "ethics_justice - EM (Robustness)", False),
# ("core_scenarios.json", "ethics_justice - EM (Robustness)", False),
# ],
# "ethics_commonsense" :[
# ("core_scenarios.json", "ethics_commonsense - EM", False),
# ("core_scenarios.json", "ethics_commonsense - EM (Robustness)", False),
# ("core_scenarios.json", "ethics_commonsense - EM (Fairness)", False),
# ],

# "ethics_virtue":[
# ("core_scenarios.json", "ethics_virtue - EM", False),
# ("core_scenarios.json", "ethics_virtue - EM (Robustness)", False),
# ("core_scenarios.json", "ethics_virtue - EM (Fairness)", False),
# ],
# "ethics_deontology" :[
# ("core_scenarios.json", "ethics_deontology - EM", False),
# ("core_scenarios.json", "ethics_deontology - EM (Robustness)", False),
# ("core_scenarios.json", "ethics_deontology - EM (Fairness)", False),
# ],
# "ethics_utilitarianism":[
# ("core_scenarios.json", "ethics_utilitarianism - EM", False),
# ("core_scenarios.json", "ethics_utilitarianism - EM (Robustness)", False),
# ("core_scenarios.json", "ethics_utilitarianism - EM (Fairness)", False),
# ]
}
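
# Schema note: each metrics table in this file maps a group name to
# (results file, metric label, lower_is_better) tuples; judging from how
# rank_results builds its lower_is_better_map, the True flag marks metrics
# such as Stereotypes and Representation where smaller values are better.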



Hidden_eval_metrics = {
"Accuracy": [
("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False),
@@ -42,7 +116,7 @@
("core_scenarios.json", "ethics_deontology - EM", False),
("core_scenarios.json", "ethics_utilitarianism - EM", False),
("core_scenarios.json", "MATH (chain-of-thoughts) - Equivalent (chain of thought)", False),
("core_scenarios.json", "MATH - Equivalent", False),
# ("core_scenarios.json", "MATH - Equivalent", False),
],

"Robustness": [
@@ -54,7 +128,7 @@
],

"Fairness": [
("core_scenarios.json", "ethics_justice - EM (Fairness)", False),
("core_scenarios.json", "ethics_justice - EM (Robustness)", False),
("core_scenarios.json", "ethics_commonsense - EM (Fairness)", False),
("core_scenarios.json", "ethics_virtue - EM (Fairness)", False),
("core_scenarios.json", "ethics_deontology - EM (Fairness)", False),
@@ -73,4 +147,19 @@

],

}
Hidden_acc_only_metrics = {
"Accuracy": [
("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False),
("core_scenarios.json", "sam_sum - ROUGE-2", False),
("core_scenarios.json", "corr2cause - EM", False),
("core_scenarios.json", "ethics_justice - EM", False),
("core_scenarios.json", "ethics_commonsense - EM", False),
("core_scenarios.json", "ethics_virtue - EM", False),
("core_scenarios.json", "ethics_deontology - EM", False),
("core_scenarios.json", "ethics_utilitarianism - EM", False),
("core_scenarios.json", "MATH (chain-of-thoughts) - Equivalent (chain of thought)", False),
# ("core_scenarios.json", "MATH - Equivalent", False),
],

}
80 changes: 68 additions & 12 deletions neurIPS_eval_scripts/rank_submissions.py
Expand Up @@ -2,8 +2,9 @@
import json
import math
import statistics
from eval_metrics import Open_eval_metrics as METRICS
from eval_metrics import *
import argparse
from collections import defaultdict, Counter

def load_run_results(run_result_dir:str):

@@ -18,6 +19,43 @@ def load_run_results(run_result_dir:str):
return results



def transpose_results(results):
    data_sets = ['CNN/DailyMail', "sam_sum", "corr2cause", 'ethics', 'MATH']
    transposed_results = {}
    for name, res in results.items():
        t_res = defaultdict(dict)
        for val in res.values():
            for d in data_sets:
                for k, v in val.items():
                    if d in k:
                        t_res[d][k] = v
                        continue
        transposed_results[name] = t_res
    return transposed_results
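
# Note on the returned shape: transposed_results maps each submission name to
# {dataset: {metric_label: value}}, where a metric lands under whichever entry
# of data_sets appears in its label (e.g. "CNN/DailyMail - ROUGE-2" is grouped
# under "CNN/DailyMail").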


def calc_win_rate(values, lower_is_better=False):
    # This function calculates win rates while allowing repeated entries in values, such as [1, 1, 1, 3, 4, 5, 1].
    # Tied values all receive the same score: (n_repeats - 1)/n_repeats wins from the ties plus one win per
    # strictly worse entry, normalized by len(values).
    counts = Counter(values)
    win_rate = {idx: 0.0 for idx in range(len(values))}

    for i, v in enumerate(values):
        for j, vv in enumerate(values):
            if i == j:
                continue
            if not lower_is_better and v > vv:
                win_rate[i] += 1
            elif lower_is_better and v < vv:
                win_rate[i] += 1
            elif v == vv:
                win_rate[i] += 1.0 / counts[v]

    win_rate = [(k, v / len(values)) for k, v in win_rate.items()]
    win_rate = [x[1] for x in sorted(win_rate, key=lambda k: k[0])]
    return win_rate
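
# Worked example: for values = [1, 1, 1, 3, 4, 5, 1] with lower_is_better=False,
# each 1 collects 3 * (1/4) = 0.75 from its ties and no outright wins, so all
# four 1's score 0.75/7 regardless of position, while 3, 4 and 5 score 4/7,
# 5/7 and 6/7 respectively.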

#take from https://github.com/Lightning-AI/llm-efficiency-challenge-eval/blob/main/agents/helm_postprocessing.py
def rank_results(data:dict, metrics_config:dict):
# mean win rate to be computed here
@@ -49,14 +87,13 @@ def rank_results(data:dict, metrics_config:dict):
lower_is_better = lower_is_better_map[scenario][metric]
default_value = 0.0 if not lower_is_better else 1000.0
values = [(data[submission_id].get(scenario, {metric: default_value}).get(metric, 0.0), j) for j, submission_id in enumerate(submission_ids)]
# temporary fix for populating lower is better entries with 0.0's;
# this has been fixed in agents.py, but it's needed for older submissions;
# we can remove once we move to flash helm
if lower_is_better:
values = [(default_value, j) if val == 0.0 else (val, j) for val, j in values]
for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
win_rate = wins / (len(values) - 1) if len(values) > 1 else 1.0 # normalize to [0, 1]
win_rates_per_row[j].append(win_rate)

vv = [x[0] for x in values]
win_rates = calc_win_rate(vv, lower_is_better=lower_is_better)
for (win, (_, j)) in zip (win_rates, values):
win_rates_per_row[j].append(win)



for submission_id, win_rates in zip(submission_ids, win_rates_per_row):
if not win_rates:
@@ -107,13 +144,32 @@
try:
parser = argparse.ArgumentParser(description="rank helm evaluation results")
parser.add_argument("--dir", type=str, help='helm evaluation dir for al submissions', required=True)
parser.add_argument('--name', type=str, help='evaluation_name', default='open')
parser.add_argument('--hidden', action='store_true', help="hidden eval metrics", required=False)
parser.add_argument('--track', type=str, default='A100', required=False)
parser.add_argument('--acc_only', action='store_true', help="only use accuracy metrics", required=False)

args = parser.parse_args()
submission_results =load_run_results(args.dir)
ranked_results = rank_results(submission_results, METRICS)
METRICS = Open_eval_metrics

name = 'open'
if args.hidden:
name = 'hidden'

if args.acc_only:
METRICS = Hidden_acc_only_metrics
name = f'{name}_accuracy_only'
else:
METRICS = Hidden_dataset_centric_eval_metrics
submission_results = transpose_results(submission_results)

# METRICS = Hidden_eval_metrics
# # submission_results = transpose_results(submission_results)

with open (f"{args.name}_full_rank.json", 'w') as handle:
name = f"{args.track}_{name}"

ranked_results = rank_results(submission_results, METRICS)
with open (f"{name}_full_rank.json", 'w') as handle:
json.dump( ranked_results, handle, indent=4)

except Exception as e :
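
A minimal standalone check of the new tie handling (illustrative only; calc_win_rate is reproduced, lightly reformatted, from the diff above, and the metric values are made up):

from collections import Counter

def calc_win_rate(values, lower_is_better=False):
    counts = Counter(values)
    win_rate = {idx: 0.0 for idx in range(len(values))}
    for i, v in enumerate(values):
        for j, vv in enumerate(values):
            if i == j:
                continue
            if not lower_is_better and v > vv:
                win_rate[i] += 1
            elif lower_is_better and v < vv:
                win_rate[i] += 1
            elif v == vv:
                win_rate[i] += 1.0 / counts[v]
    win_rate = [(k, v / len(values)) for k, v in win_rate.items()]
    return [x[1] for x in sorted(win_rate, key=lambda k: k[0])]

# Three submissions tie on a metric; one beats only the worst entry.
rates = calc_win_rate([3, 1, 3, 2])

# Tied entries receive identical win rates regardless of their position in the
# list, so later submissions are no longer penalized on exact ties.
assert rates[0] == rates[2]    # both ties score (1 + 1 + 0.5) / 4 = 0.625
assert rates[3] == 0.25        # beats only the value 1
assert rates[1] == 0.0         # the worst value wins nothing
print(rates)                   # [0.625, 0.0, 0.625, 0.25]

For reference, the updated script would be invoked along these lines (the directory path is a placeholder): python rank_submissions.py --dir <helm_results_dir> --track A100 --hidden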
