Skip to content

Commit

Permalink
Remove CNN/DailyMail from open eval; add 3000-budget closed eval config
Browse files Browse the repository at this point in the history
  • Loading branch information
weiweiy committed Nov 10, 2023
1 parent 3cbee85 commit 5ddfd7b
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 29 deletions.
15 changes: 8 additions & 7 deletions neurIPS_eval_scripts/eval_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Open_eval_metrics = {
"Accuracy": [
("core_scenarios.json", "MMLU - EM", False),
("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False),
# ("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False),
("core_scenarios.json", "TruthfulQA - EM", False),
("targeted_evaluations.json", "BBQ - EM", False),
("core_scenarios.json", "GSM8K - EM", False),
Expand All @@ -21,12 +21,12 @@

],

"Bias": [
("core_scenarios.json", "CNN/DailyMail - Stereotypes (race)", True),
("core_scenarios.json", "CNN/DailyMail - Stereotypes (gender)", True),
("core_scenarios.json", "CNN/DailyMail - Representation (race)", True),
("core_scenarios.json", "CNN/DailyMail - Representation (gender)", True),
],
# "Bias": [
# ("core_scenarios.json", "CNN/DailyMail - Stereotypes (race)", True),
# ("core_scenarios.json", "CNN/DailyMail - Stereotypes (gender)", True),
# ("core_scenarios.json", "CNN/DailyMail - Representation (race)", True),
# ("core_scenarios.json", "CNN/DailyMail - Representation (gender)", True),
# ],

}

Expand All @@ -41,6 +41,7 @@
("core_scenarios.json", "ethics_deontology - EM", False),
("core_scenarios.json", "ethics_utilitarianism - EM", False),
("core_scenarios.json", "MATH (chain-of-thoughts) - Equivalent (chain of thought)", False),
("core_scenarios.json", "MATH - Equivalent", False),
],

# "Robustness": [
Expand Down
2 changes: 1 addition & 1 deletion neurIPS_eval_scripts/rank_submissions.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def rank_results(data:dict, metrics_config:dict):
ranked_results = rank_results(submission_results, METRICS)

with open (f"{args.name}_full_rank.json", 'w') as handle:
json.dump( ranked_results, handle)
json.dump( ranked_results, handle, indent=4)

except Exception as e :
print(e)
35 changes: 14 additions & 21 deletions run_specs_full_closed_eval_coarse_2000_budget.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,18 @@ entries: [
{description: "ethics_utilitarianism:model=neurips/local,max_eval_instances=100",priority: 1}

# math
{description: "math:model=neurips/local,subject=number_theory,level=1,use_chain_of_thought=True,max_eval_instances=23",priority: 1}
{description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_chain_of_thought=True,max_eval_instances=23",priority: 1}
{description: "math:model=neurips/local,subject=algebra,level=1,use_chain_of_thought=True,max_eval_instances=23",priority: 1}
{description: "math:model=neurips/local,subject=prealgebra,level=1,use_chain_of_thought=True,max_eval_instances=23",priority: 1}
{description: "math:model=neurips/local,subject=geometry,level=1,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=precalculus,level=1,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=number_theory,level=3,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=intermediate_algebra,level=3 ,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=algebra,level=3,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=prealgebra,level=3,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=geometry,level=3,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=counting_and_probability,level=3,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=precalculus,level=3,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=number_theory,level=5,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True,max_eval_instances=24",priority: 1}
{description: "math:model=neurips/local,subject=number_theory,level=5,use_official_examples=True,max_eval_instances=35",priority: 1}
{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_official_examples=True,max_eval_instances=35",priority: 1}
{description: "math:model=neurips/local,subject=algebra,level=5,use_official_examples=True,max_eval_instances=35",priority: 1}
{description: "math:model=neurips/local,subject=prealgebra,level=5,use_official_examples=True,max_eval_instances=35",priority: 1}
{description: "math:model=neurips/local,subject=geometry,level=5,use_official_examples=True,max_eval_instances=36",priority: 1}
{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_official_examples=True,max_eval_instances=36",priority: 1}
{description: "math:model=neurips/local,subject=precalculus,level=5,use_official_examples=True,max_eval_instances=36",priority: 1}
{description: "math:model=neurips/local,subject=number_theory,level=5,use_chain_of_thought=True,max_eval_instances=36",priority: 1}
{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True,max_eval_instances=36",priority: 1}
{description: "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True,max_eval_instances=36",priority: 1}
{description: "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True,max_eval_instances=36",priority: 1}
{description: "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True,max_eval_instances=36",priority: 1}
{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True,max_eval_instances=36",priority: 1}
{description: "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True,max_eval_instances=36",priority: 1}
]
45 changes: 45 additions & 0 deletions run_specs_full_closed_eval_coarse_3000_budget.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Run-spec entries for the coarse closed evaluation.
# The max_eval_instances values below sum to 3000, split evenly across four
# groups of 750: summarization, causation, ethics (5 x 150) and math (28 entries).
# Every entry targets model=neurips/local and has priority 1.
# NOTE(review): format looks like a HELM run-specs conf -- confirm with the runner.
entries: [

# summarization
{description: "sam_sum:model=neurips/local,max_eval_instances=750",priority: 1}

# causation
{description: "corr2cause:model=neurips/local,max_eval_instances=750",priority: 1}

# ethics
# Five ethics scenarios at 150 instances each (750 total).
{description: "ethics_justice:model=neurips/local,max_eval_instances=150",priority: 1}
{description: "ethics_commonsense:model=neurips/local,max_eval_instances=150",priority: 1}
{description: "ethics_virtue:model=neurips/local,max_eval_instances=150",priority: 1}
{description: "ethics_deontology:model=neurips/local,max_eval_instances=150",priority: 1}
{description: "ethics_utilitarianism:model=neurips/local,max_eval_instances=150",priority: 1}

# math
# Grid of 7 subjects x levels {1, 5} x two settings
# (use_official_examples=True, then use_chain_of_thought=True) = 28 entries.
# The first six entries get 26 instances and the remaining 22 get 27,
# so the group totals 6*26 + 22*27 = 750.
# use_official_examples=True, level 1
{description: "math:model=neurips/local,subject=number_theory,level=1,use_official_examples=True,max_eval_instances=26",priority: 1}
{description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_official_examples=True,max_eval_instances=26",priority: 1}
{description: "math:model=neurips/local,subject=algebra,level=1,use_official_examples=True,max_eval_instances=26",priority: 1}
{description: "math:model=neurips/local,subject=prealgebra,level=1,use_official_examples=True,max_eval_instances=26",priority: 1}
{description: "math:model=neurips/local,subject=geometry,level=1,use_official_examples=True,max_eval_instances=26",priority: 1}
{description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_official_examples=True,max_eval_instances=26",priority: 1}
{description: "math:model=neurips/local,subject=precalculus,level=1,use_official_examples=True,max_eval_instances=27",priority: 1}
# use_official_examples=True, level 5
{description: "math:model=neurips/local,subject=number_theory,level=5,use_official_examples=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_official_examples=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=algebra,level=5,use_official_examples=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=prealgebra,level=5,use_official_examples=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=geometry,level=5,use_official_examples=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_official_examples=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=precalculus,level=5,use_official_examples=True,max_eval_instances=27",priority: 1}
# use_chain_of_thought=True, level 1
{description: "math:model=neurips/local,subject=number_theory,level=1,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=algebra,level=1,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=prealgebra,level=1,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=geometry,level=1,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=precalculus,level=1,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
# use_chain_of_thought=True, level 5
{description: "math:model=neurips/local,subject=number_theory,level=5,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
{description: "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True,max_eval_instances=27",priority: 1}
]

0 comments on commit 5ddfd7b

Please sign in to comment.