Skip to content

Commit

Permalink
schema, config and eval scripts to make hidden eval dataset work (#11)
Browse files Browse the repository at this point in the history
* schema, config and eval scripts to make hidden eval dataset work

* upgrade datasets version to 2.14.6 and generate 1000 and 2000 sparse config

* take out cnn from open eval, added 3k eval config

* change sam_sum to use summarization metrics

* re-generate sparse_run_spec

* update cause2corr to only do 1-shot examples
  • Loading branch information
weiweiy authored Nov 11, 2023
1 parent fd145d2 commit 05b5e50
Show file tree
Hide file tree
Showing 8 changed files with 445 additions and 30 deletions.
151 changes: 151 additions & 0 deletions build_secret_run_spec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# Run-spec catalogue for the hidden (closed) eval suite.
# Each entry is a dict with:
#   'scenario'    - coarse grouping key; the budget script splits the total
#                   example budget evenly across distinct scenario values,
#                   then evenly across the entries within each scenario.
#   'description' - HELM run-spec string ("<spec>:model=...,<params>").
#   'priority'    - NOTE(review): carried here but the conf writer emits a
#                   hard-coded "priority: 1" for every line; confirm intent.
entries = [

# Misc datasets
{'scenario': 'summarization', 'description': "sam_sum:model=neurips/local", 'priority': 1},
{'scenario': 'causation', 'description': "corr2cause:model=neurips/local,max_train_instances=1",'priority': 1},

## Ethics datasets
{'scenario': 'ethics', 'description': "ethics_justice:model=neurips/local", 'priority': 1},
{'scenario': 'ethics', 'description': "ethics_commonsense:model=neurips/local", 'priority': 1},
{'scenario': 'ethics', 'description': "ethics_virtue:model=neurips/local", 'priority': 1},
{'scenario': 'ethics', 'description': "ethics_deontology:model=neurips/local", 'priority': 1},
{'scenario': 'ethics', 'description': "ethics_utilitarianism:model=neurips/local", 'priority': 1},

## Math datasets
# Levels 1 and 5 are active; levels 2-4 are kept below, commented out.
{'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=1,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=1,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=1,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=1,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=1,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=1,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=1,use_official_examples=True", 'priority': 2},
#
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=2,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=2,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=2,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=2,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=2,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=2,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=2,use_official_examples=True", 'priority': 4},
#
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=3,use_official_examples=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=3,use_official_examples=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=3,use_official_examples=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=3,use_official_examples=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=3,use_official_examples=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=3,use_official_examples=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=3,use_official_examples=True", 'priority': 2},
#
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=4,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=4,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=4,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=4,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=4,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=4,use_official_examples=True", 'priority': 4},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=4,use_official_examples=True", 'priority': 4},

{'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=5,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=5,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=5,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=5,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=5,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=5,use_official_examples=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=5,use_official_examples=True", 'priority': 2},

# With chain-of-thought prompting:
{'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=1,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=1,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=1,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=1,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=1,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=1,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=1,use_chain_of_thought=True", 'priority' : 2},
#
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=3,use_chain_of_thought=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=3 ,use_chain_of_thought=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=3,use_chain_of_thought=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=3,use_chain_of_thought=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=3,use_chain_of_thought=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=3,use_chain_of_thought=True", 'priority': 2},
# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=3,use_chain_of_thought=True", 'priority' : 2},
#
{'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=5,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True", 'priority': 2},
{'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True", 'priority' : 2},

{'scenario':'cnn','description': "summarization_cnndm:model=neurips/local", 'priority': 1},

]



def generate_equal_sum_list(V, N):
    """Split the integer V into a list of N integers summing to V.

    The split is as even as possible: every element is V // N, and the
    first V % N elements are one larger to absorb the remainder.
    """
    base, extra = divmod(V, N)
    # `extra` elements of (base + 1) followed by (N - extra) elements of base.
    return [base + 1] * extra + [base] * (N - extra)

import argparse

import pandas as pd


def main():
    """Generate a closed-eval run-spec .conf file.

    Distributes ``--example_budget`` eval instances evenly: first across the
    distinct scenarios found in ``entries``, then across each scenario's
    sub-scenario entries, and writes one HELM run-spec line per entry.
    """
    parser = argparse.ArgumentParser(
        description='''
        This method automatically generates a configuration file for the neurips_llm_efficiency_challenge
        Calling it with: `python build_run_specs_full.py --example_budget=600` will produce a conf file
        with a total of 600 examples distributed evenly across scenarios as also defined here.
        ''',
    )
    parser.add_argument("--example_budget", required=True, type=int, help='# example to use')
    args = parser.parse_args()

    # Count sub-scenarios per scenario, then split the budget evenly across
    # scenarios (each scenario gets one slot of the top-level split).
    df = pd.DataFrame(entries)
    scenario_count_dict = df.value_counts('scenario').to_dict()
    n_scenarios = len(df.scenario.unique())
    max_eval_instances_per_scenario = generate_equal_sum_list(args.example_budget, n_scenarios)

    # Per scenario: split that scenario's share evenly across its entries.
    scenario_n_examples_dict = {}
    for scenario, n_subscenarios in scenario_count_dict.items():
        cur_budget = max_eval_instances_per_scenario.pop()
        scenario_n_examples_dict[scenario] = generate_equal_sum_list(cur_budget, n_subscenarios)

    # Attach a max_eval_instances count to every entry.
    for entry in entries:
        entry['max_eval_instances'] = scenario_n_examples_dict[entry['scenario']].pop()

    out_path = f'./run_specs_full_closed_eval_coarse_{args.example_budget}_budget.conf'
    with open(out_path, 'w') as f:
        f.write('entries: [\n')
        last_scenario = ''
        for entry in entries:
            cur_scenario = entry['scenario']
            if cur_scenario != last_scenario:
                # New scenario section: write a header comment and echo the
                # first entry for operator visibility.
                f.write(f'\n# {cur_scenario}\n')
                print(entry)
                last_scenario = cur_scenario
            # NOTE(review): the entry's own 'priority' value is ignored here;
            # every emitted line carries "priority: 1" (original behavior kept).
            f.write('{')
            f.write(f'description: "{entry["description"]}')
            f.write(f',max_eval_instances={entry["max_eval_instances"]}",priority: 1')
            f.write('}\n')
        f.write(']')

    # Bug fix: the original printed a filename ("run_secret_specs_full_...")
    # that was never written; report the path actually created.
    print(f'Saved {out_path}')


if __name__ == "__main__":
    main()
47 changes: 46 additions & 1 deletion neurIPS_eval_scripts/eval_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Open_eval_metrics = {
"Accuracy": [
("core_scenarios.json", "MMLU - EM", False),
("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False),
# ("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False),
("core_scenarios.json", "TruthfulQA - EM", False),
("targeted_evaluations.json", "BBQ - EM", False),
("core_scenarios.json", "GSM8K - EM", False),
Expand All @@ -21,11 +21,56 @@

],

# "Bias": [
# ("core_scenarios.json", "CNN/DailyMail - Stereotypes (race)", True),
# ("core_scenarios.json", "CNN/DailyMail - Stereotypes (gender)", True),
# ("core_scenarios.json", "CNN/DailyMail - Representation (race)", True),
# ("core_scenarios.json", "CNN/DailyMail - Representation (gender)", True),
# ],

}


# Metric table for scoring the hidden (closed) eval suite, keyed by category.
# Each tuple is (helm summary json filename, metric display name, flag).
# NOTE(review): the trailing bool is True only for the Bias rows — presumably
# it marks "lower is better" / inverted ranking; confirm against the consumer
# (process_helm.py / rank_submissions.py) before relying on it.
Hidden_eval_metrics = {
"Accuracy": [
("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False),
("core_scenarios.json", "sam_sum - ROUGE-2", False),
("core_scenarios.json", "corr2cause - EM", False),
("core_scenarios.json", "ethics_justice - EM", False),
("core_scenarios.json", "ethics_commonsense - EM", False),
("core_scenarios.json", "ethics_virtue - EM", False),
("core_scenarios.json", "ethics_deontology - EM", False),
("core_scenarios.json", "ethics_utilitarianism - EM", False),
("core_scenarios.json", "MATH (chain-of-thoughts) - Equivalent (chain of thought)", False),
("core_scenarios.json", "MATH - Equivalent", False),
],

"Robustness": [
("core_scenarios.json", "ethics_justice - EM (Robustness)", False),
("core_scenarios.json", "ethics_commonsense - EM (Robustness)", False),
("core_scenarios.json", "ethics_virtue - EM (Robustness)", False),
("core_scenarios.json", "ethics_deontology - EM (Robustness)", False),
("core_scenarios.json", "ethics_utilitarianism - EM (Robustness)", False),
],

"Fairness": [
("core_scenarios.json", "ethics_justice - EM (Fairness)", False),
("core_scenarios.json", "ethics_commonsense - EM (Fairness)", False),
("core_scenarios.json", "ethics_virtue - EM (Fairness)", False),
("core_scenarios.json", "ethics_deontology - EM (Fairness)", False),
("core_scenarios.json", "ethics_utilitarianism - EM (Fairness)", False),
],

"Bias": [
("core_scenarios.json", "sam_sum - Stereotypes (race)", True),
("core_scenarios.json", "sam_sum - Stereotypes (gender)", True),
("core_scenarios.json", "sam_sum - Representation (race)", True),
("core_scenarios.json", "sam_sum - Representation (gender)", True),
("core_scenarios.json", "CNN/DailyMail - Stereotypes (race)", True),
("core_scenarios.json", "CNN/DailyMail - Stereotypes (gender)", True),
("core_scenarios.json", "CNN/DailyMail - Representation (race)", True),
("core_scenarios.json", "CNN/DailyMail - Representation (gender)", True),

],

}
26 changes: 21 additions & 5 deletions neurIPS_eval_scripts/process_helm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import os
import sys
import argparse
from eval_metrics import Open_eval_metrics as METRICS
from eval_metrics import Open_eval_metrics as open_metrics
from eval_metrics import Hidden_eval_metrics as hidden_metrics

'''
parse results from helm-summerize under helm_output_dir/runs/submission_id
Expand All @@ -11,7 +12,7 @@
'''

#this is taken from https://github.com/Lightning-AI/llm-efficiency-challenge-eval/blob/main/agents/agents.py#L182
def process_helm_results(root_path:str, suite: str) -> dict:
def process_helm_results(root_path:str, suite: str, METRICS:dict = open_metrics) -> dict:
path = f"{root_path}/runs/{suite}/groups/"
output = {}

Expand Down Expand Up @@ -45,15 +46,30 @@ def process_helm_results(root_path:str, suite: str) -> dict:
parser = argparse.ArgumentParser(description="Parse helm-summerize results")
parser.add_argument("--dir", type=str, help='Helm Benchmark dir', required=True)
parser.add_argument('--idx', type=str, help='submission id', required=True)

parser.add_argument('--hidden', action='store_true', help="hidden eval metrics", required=False)
args = parser.parse_args()
run_results = process_helm_results(args.dir, args.idx)

use_metrics = open_metrics
if args.hidden:
use_metrics = hidden_metrics

run_results = process_helm_results(args.dir, args.idx, METRICS=use_metrics)
print(run_results)

results_dir = f"{args.dir}/submission_results"
os.makedirs(results_dir, exist_ok=True)
result_json = os.path.join(results_dir, f"{args.idx}.json")

out_name = f"{args.idx}.json"
if args.hidden:
out_name = f"{args.idx}_hidden.json"

result_json = os.path.join(results_dir, out_name)

print(result_json)

with open (result_json, 'w') as handle:
json.dump( run_results, handle)
json.dump( run_results, handle, indent=4)

except Exception as e :
print(e)
Expand Down
2 changes: 1 addition & 1 deletion neurIPS_eval_scripts/rank_submissions.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def rank_results(data:dict, metrics_config:dict):
ranked_results = rank_results(submission_results, METRICS)

with open (f"{args.name}_full_rank.json", 'w') as handle:
json.dump( ranked_results, handle)
json.dump( ranked_results, handle, indent=4)

except Exception as e :
print(e)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ cycler==0.11.0
cymem==2.0.6
Cython==0.29.32
dacite==1.6.0
datasets==2.5.2
datasets==2.14.6
dill==0.3.5.1
distlib==0.3.6
emoji==2.1.0
Expand Down
Loading

0 comments on commit 05b5e50

Please sign in to comment.