From 05b5e504cd70ed02bcb5eb7feb58b32000163b82 Mon Sep 17 00:00:00 2001 From: weiweiy Date: Fri, 10 Nov 2023 18:30:47 -0800 Subject: [PATCH] schema, config and eval scripts to make hidden eval dataset work (#11) * schema, config and eval scripts to make hidden eval dataset work * upgrade datasets version to 2.14.6 and generate 1000 and 2000 sparse config * take out cnn from open eval, added 3k eval config * change sam_sum to use summerization metrics * re-generate sparse_run_spec * update cause2corr to only do 1-shot examples --- build_secret_run_spec.py | 151 +++++++++++++++++ neurIPS_eval_scripts/eval_metrics.py | 47 ++++- neurIPS_eval_scripts/process_helm.py | 26 ++- neurIPS_eval_scripts/rank_submissions.py | 2 +- requirements.txt | 2 +- ...s_full_closed_eval_coarse_3000_budget.conf | 48 ++++++ src/helm/benchmark/run_specs.py | 39 +++-- src/helm/benchmark/static/schema.yaml | 160 +++++++++++++++++- 8 files changed, 445 insertions(+), 30 deletions(-) create mode 100644 build_secret_run_spec.py create mode 100644 run_specs_full_closed_eval_coarse_3000_budget.conf mode change 100644 => 100755 src/helm/benchmark/static/schema.yaml diff --git a/build_secret_run_spec.py b/build_secret_run_spec.py new file mode 100644 index 000000000..cc027a820 --- /dev/null +++ b/build_secret_run_spec.py @@ -0,0 +1,151 @@ +entries = [ + + # Misc datasets + {'scenario': 'summarization', 'description': "sam_sum:model=neurips/local", 'priority': 1}, + {'scenario': 'causation', 'description': "corr2cause:model=neurips/local,max_train_instances=1",'priority': 1}, + + ## Ethics datasets + {'scenario': 'ethics', 'description': "ethics_justice:model=neurips/local", 'priority': 1}, + {'scenario': 'ethics', 'description': "ethics_commonsense:model=neurips/local", 'priority': 1}, + {'scenario': 'ethics', 'description': "ethics_virtue:model=neurips/local", 'priority': 1}, + {'scenario': 'ethics', 'description': "ethics_deontology:model=neurips/local", 'priority': 1}, + {'scenario': 'ethics', 'description': "ethics_utilitarianism:model=neurips/local", 'priority': 1}, + + ## Math datasets + {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=1,use_official_examples=True", 'priority': 2}, +# +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': 
"math:model=neurips/local,subject=prealgebra,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=2,use_official_examples=True", 'priority': 4}, +# +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=3,use_official_examples=True", 'priority': 2}, +# +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=4,use_official_examples=True", 'priority': 4}, + + {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=5,use_official_examples=True", 'priority': 2}, + + # With chain-of-thought prompting: + {'scenario': 'math', 'description': 
"math:model=neurips/local,subject=number_theory,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=1,use_chain_of_thought=True", 'priority' : 2}, +# +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=3,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=3 ,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=3,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=3,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=3,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=3,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=3,use_chain_of_thought=True", 'priority' : 2}, +# + {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True", 'priority' : 2}, + + {'scenario':'cnn','description': "summarization_cnndm:model=neurips/local", 'priority': 1}, + +] + + + +def generate_equal_sum_list(V, N): + # Calculate the base value that will be repeated. + base_value = V // N + # Calculate the remainder for distribution. + remainder = V % N + + # Create the list with base_value repeated N times. + result = [base_value] * N + + # Distribute the remainder evenly among the elements. 
+ for i in range(remainder): + result[i] += 1 + + return result + +import pandas as pd +import argparse + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser( + description=''' + This method automatically generates a configuration file for the neurips_llm_efficiency_challenge + + Calling it with: `python build_run_specs_full.py --example_budget=600` will produce a conf file + with a total of 600 examples distributed evenly across scenarios as also defined here. + ''', + ) + parser.add_argument("--example_budget", required=True, type=int, help='# example to use') + args = parser.parse_args() + + # get a list of scenarios and n_examples + df = pd.DataFrame(entries) + scenario_count_dict = df.value_counts('scenario').to_dict() + n_scenarios = len(df.scenario.unique()) + max_eval_instances_per_scenario = generate_equal_sum_list(args.example_budget, n_scenarios) + + # get a dict of the amount of examples per + scenario_n_examples_dict = {} + for scenario, n_subscenarios in scenario_count_dict.items(): + cur_max_eval_instances_per_scenario = max_eval_instances_per_scenario.pop() + scenario_n_examples_dict[scenario] = generate_equal_sum_list(cur_max_eval_instances_per_scenario,n_subscenarios) + + for i in range(len(entries)): + cur_scenario = entries[i]['scenario'] + # print(f"added {v} to {entries[i]['max_eval_instances']}") + v = scenario_n_examples_dict[cur_scenario].pop() + entries[i]['max_eval_instances'] = v + + with open(f'./run_specs_full_closed_eval_coarse_{args.example_budget}_budget.conf','w') as f: + f.write('entries: [\n') + last_scenario = '' + for entry in entries: + cur_scenario = entry['scenario'] + if cur_scenario != last_scenario: + f.write(f'\n# {cur_scenario}\n') + print(entry) + last_scenario = cur_scenario + f.write('{') + f.write(f'description: """{entry["description"]}'.replace('"""','"')) + f.write(f',max_eval_instances={entry["max_eval_instances"]}""",priority: 1'.replace('"""','"')) + f.write('}\n') + f.write(']') + + print(f'Saved ./run_secret_specs_full_coarse_{args.example_budget}_budget.conf') diff --git a/neurIPS_eval_scripts/eval_metrics.py b/neurIPS_eval_scripts/eval_metrics.py index ceb0513fe..84b033379 100755 --- a/neurIPS_eval_scripts/eval_metrics.py +++ b/neurIPS_eval_scripts/eval_metrics.py @@ -3,7 +3,7 @@ Open_eval_metrics = { "Accuracy": [ ("core_scenarios.json", "MMLU - EM", False), - ("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False), + # ("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False), ("core_scenarios.json", "TruthfulQA - EM", False), ("targeted_evaluations.json", "BBQ - EM", False), ("core_scenarios.json", "GSM8K - EM", False), @@ -21,11 +21,56 @@ ], + # "Bias": [ + # ("core_scenarios.json", "CNN/DailyMail - Stereotypes (race)", True), + # ("core_scenarios.json", "CNN/DailyMail - Stereotypes (gender)", True), + # ("core_scenarios.json", "CNN/DailyMail - Representation (race)", True), + # ("core_scenarios.json", "CNN/DailyMail - Representation (gender)", True), + # ], + +} + + +Hidden_eval_metrics = { + "Accuracy": [ + ("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False), + ("core_scenarios.json", "sam_sum - ROUGE-2", False), + ("core_scenarios.json", "corr2cause - EM", False), + ("core_scenarios.json", "ethics_justice - EM", False), + ("core_scenarios.json", "ethics_commonsense - EM", False), + ("core_scenarios.json", "ethics_virtue - EM", False), + ("core_scenarios.json", "ethics_deontology - EM", False), + ("core_scenarios.json", "ethics_utilitarianism - EM", False), + ("core_scenarios.json", 
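
The per-entry counts that end up in run_specs_full_closed_eval_coarse_3000_budget.conf fall out of generate_equal_sum_list: the budget is first split evenly across the five scenarios in `entries`, and each scenario's share is then split across its sub-scenarios. A minimal sketch of that arithmetic for the 3000-example budget, not part of the patch itself; the dict name `subscenario_counts` is illustrative, and its counts mirror the uncommented entries above (1 summarization, 1 causation, 5 ethics, 28 math, 1 cnn):

def generate_equal_sum_list(V, N):
    # Same behaviour as the helper above: N near-equal integers summing to V.
    base_value, remainder = divmod(V, N)
    result = [base_value] * N
    for i in range(remainder):
        result[i] += 1
    return result

budget = 3000
subscenario_counts = {"summarization": 1, "causation": 1, "ethics": 5, "math": 28, "cnn": 1}

per_scenario = generate_equal_sum_list(budget, len(subscenario_counts))   # [600, 600, 600, 600, 600]
per_ethics = generate_equal_sum_list(600, subscenario_counts["ethics"])   # [120, 120, 120, 120, 120]
per_math = generate_equal_sum_list(600, subscenario_counts["math"])       # twelve 22s followed by sixteen 21s

assert sum(per_ethics) == 600 and sum(per_math) == 600
print(per_scenario, per_ethics, per_math)
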
"MATH (chain-of-thoughts) - Equivalent (chain of thought)", False), + ("core_scenarios.json", "MATH - Equivalent", False), + ], + + "Robustness": [ + ("core_scenarios.json", "ethics_justice - EM (Robustness)", False), + ("core_scenarios.json", "ethics_commonsense - EM (Robustness)", False), + ("core_scenarios.json", "ethics_virtue - EM (Robustness)", False), + ("core_scenarios.json", "ethics_deontology - EM (Robustness)", False), + ("core_scenarios.json", "ethics_utilitarianism - EM (Robustness)", False), + ], + + "Fairness": [ + ("core_scenarios.json", "ethics_justice - EM (Fairness)", False), + ("core_scenarios.json", "ethics_commonsense - EM (Fairness)", False), + ("core_scenarios.json", "ethics_virtue - EM (Fairness)", False), + ("core_scenarios.json", "ethics_deontology - EM (Fairness)", False), + ("core_scenarios.json", "ethics_utilitarianism - EM (Fairness)", False), + ], + "Bias": [ + ("core_scenarios.json", "sam_sum - Stereotypes (race)", True), + ("core_scenarios.json", "sam_sum - Stereotypes (gender)", True), + ("core_scenarios.json", "sam_sum - Representation (race)", True), + ("core_scenarios.json", "sam_sum - Representation (gender)", True), ("core_scenarios.json", "CNN/DailyMail - Stereotypes (race)", True), ("core_scenarios.json", "CNN/DailyMail - Stereotypes (gender)", True), ("core_scenarios.json", "CNN/DailyMail - Representation (race)", True), ("core_scenarios.json", "CNN/DailyMail - Representation (gender)", True), + ], } \ No newline at end of file diff --git a/neurIPS_eval_scripts/process_helm.py b/neurIPS_eval_scripts/process_helm.py index 6d4fbc002..4523c407f 100755 --- a/neurIPS_eval_scripts/process_helm.py +++ b/neurIPS_eval_scripts/process_helm.py @@ -2,7 +2,8 @@ import os import sys import argparse -from eval_metrics import Open_eval_metrics as METRICS +from eval_metrics import Open_eval_metrics as open_metrics +from eval_metrics import Hidden_eval_metrics as hidden_metrics ''' parse results from helm-summerize under helm_output_dir/runs/submission_id @@ -11,7 +12,7 @@ ''' #this is taken from https://github.com/Lightning-AI/llm-efficiency-challenge-eval/blob/main/agents/agents.py#L182 -def process_helm_results(root_path:str, suite: str) -> dict: +def process_helm_results(root_path:str, suite: str, METRICS:dict = open_metrics) -> dict: path = f"{root_path}/runs/{suite}/groups/" output = {} @@ -45,15 +46,30 @@ def process_helm_results(root_path:str, suite: str) -> dict: parser = argparse.ArgumentParser(description="Parse helm-summerize results") parser.add_argument("--dir", type=str, help='Helm Benchmark dir', required=True) parser.add_argument('--idx', type=str, help='submission id', required=True) + + parser.add_argument('--hidden', action='store_true', help="hidden eval metrics", required=False) args = parser.parse_args() - run_results = process_helm_results(args.dir, args.idx) + + use_metrics = open_metrics + if args.hidden: + use_metrics = hidden_metrics + + run_results = process_helm_results(args.dir, args.idx, METRICS=use_metrics) + print(run_results) results_dir = f"{args.dir}/submission_results" os.makedirs(results_dir, exist_ok=True) - result_json = os.path.join(results_dir, f"{args.idx}.json") + + out_name = f"{args.idx}.json" + if args.hidden: + out_name = f"{args.idx}_hidden.json" + + result_json = os.path.join(results_dir, out_name) + + print(result_json) with open (result_json, 'w') as handle: - json.dump( run_results, handle) + json.dump( run_results, handle, indent=4) except Exception as e : print(e) diff --git 
a/neurIPS_eval_scripts/rank_submissions.py b/neurIPS_eval_scripts/rank_submissions.py index a24a2460f..f94d3574f 100755 --- a/neurIPS_eval_scripts/rank_submissions.py +++ b/neurIPS_eval_scripts/rank_submissions.py @@ -114,7 +114,7 @@ def rank_results(data:dict, metrics_config:dict): ranked_results = rank_results(submission_results, METRICS) with open (f"{args.name}_full_rank.json", 'w') as handle: - json.dump( ranked_results, handle) + json.dump( ranked_results, handle, indent=4) except Exception as e : print(e) diff --git a/requirements.txt b/requirements.txt index 20a08d941..2bef7d27e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,7 @@ cycler==0.11.0 cymem==2.0.6 Cython==0.29.32 dacite==1.6.0 -datasets==2.5.2 +datasets==2.14.6 dill==0.3.5.1 distlib==0.3.6 emoji==2.1.0 diff --git a/run_specs_full_closed_eval_coarse_3000_budget.conf b/run_specs_full_closed_eval_coarse_3000_budget.conf new file mode 100644 index 000000000..5c89341fc --- /dev/null +++ b/run_specs_full_closed_eval_coarse_3000_budget.conf @@ -0,0 +1,48 @@ +entries: [ + +# summarization +{description: "sam_sum:model=neurips/local,max_eval_instances=600",priority: 1} + +# causation +{description: "corr2cause:model=neurips/local,max_train_instances=1,max_eval_instances=600",priority: 1} + +# ethics +{description: "ethics_justice:model=neurips/local,max_eval_instances=120",priority: 1} +{description: "ethics_commonsense:model=neurips/local,max_eval_instances=120",priority: 1} +{description: "ethics_virtue:model=neurips/local,max_eval_instances=120",priority: 1} +{description: "ethics_deontology:model=neurips/local,max_eval_instances=120",priority: 1} +{description: "ethics_utilitarianism:model=neurips/local,max_eval_instances=120",priority: 1} + +# math +{description: "math:model=neurips/local,subject=number_theory,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=algebra,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=prealgebra,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=geometry,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=precalculus,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=number_theory,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=algebra,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=prealgebra,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=geometry,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: 
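
The datasets pin moves from 2.5.2 to 2.14.6 (per the commit notes). A quick sanity-check sketch, assuming the new SamSumScenario reads the public `samsum` dataset from the Hugging Face Hub — the scenario module itself is not part of this diff, so the dataset id, split, and column names below are assumptions:

# Sanity check only; not part of the patch. Loading SAMSum may additionally
# require the py7zr package, which the samsum loader uses to unpack its archive.
import datasets

print(datasets.__version__)                      # expect 2.14.6 per requirements.txt
samsum = datasets.load_dataset("samsum")
example = samsum["validation"][0]
print(example["dialogue"][:80], "->", example["summary"])
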
"math:model=neurips/local,subject=precalculus,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=number_theory,level=1,use_chain_of_thought=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_chain_of_thought=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=algebra,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=prealgebra,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=geometry,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=precalculus,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=number_theory,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} + +# cnn +{description: "summarization_cnndm:model=neurips/local,max_eval_instances=600",priority: 1} +] \ No newline at end of file diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py index 87cc9d5ee..a37d93d34 100644 --- a/src/helm/benchmark/run_specs.py +++ b/src/helm/benchmark/run_specs.py @@ -1815,6 +1815,25 @@ def get_cnndm_summarization_spec(temperature: float = 0.3, device: str = "cpu") groups=["summarization_cnndm"], ) +@run_spec_function("sam_sum") +def get_sam_sum_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.sam_sum_scenario.SamSumScenario", args={}) + + adapter_spec = get_summarization_adapter_spec( + num_sents=1, + max_tokens=128, + temperature=0.3, + ) + + return RunSpec( + name="sam_sum", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs= get_summarization_metric_specs({"task": "sam_sum", "device": 'cpu'}) + + get_generative_harms_metric_specs(), + groups=["sam_sum"], + ) + @run_spec_function("empatheticdialogues") def get_empatheticdialogues_spec() -> RunSpec: @@ -2058,26 +2077,6 @@ def get_me_q_sum_spec() -> RunSpec: groups=["MeQSum"], ) - -@run_spec_function("sam_sum") -def get_sam_sum_spec() -> RunSpec: - scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.sam_sum_scenario.SamSumScenario", args={}) - - adapter_spec = get_summarization_adapter_spec( - num_sents=1, - max_tokens=128, - temperature=0.3, - ) - - return RunSpec( - name="sam_sum", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - 
metric_specs=get_open_ended_generation_metric_specs(), - groups=["sam_sum"], - ) - - @run_spec_function("med_dialog") def get_med_dialog_spec(subset: str) -> RunSpec: scenario_spec = ScenarioSpec( diff --git a/src/helm/benchmark/static/schema.yaml b/src/helm/benchmark/static/schema.yaml old mode 100644 new mode 100755 index de500088e..c4aa58cc3 --- a/src/helm/benchmark/static/schema.yaml +++ b/src/helm/benchmark/static/schema.yaml @@ -1793,6 +1793,14 @@ run_groups: - BIG-bench - gsm ## TODO: ADD new dataset here + - corr2cause + - ethics_justice + - ethics_commonsense + - ethics_virtue + - ethics_deontology + - ethics_utilitarianism + - math_chain_of_thought + - math_regular - name: information_retrieval display_name: Information retrieval @@ -1809,6 +1817,7 @@ run_groups: subgroups: - summarization_cnndm - summarization_xsum + - sam_sum - name: sentiment_analysis display_name: Sentiment analysis @@ -1893,6 +1902,7 @@ run_groups: - entity_data_imputation - entity_matching - BIG-bench + - corr2cause - name: harms display_name: Harms @@ -1906,6 +1916,11 @@ run_groups: - bbq - bold - real_toxicity_prompts + - ethics_justice + - ethics_commonsense + - ethics_virtue + - ethics_deontology + - ethics_utilitarianism - name: efficiency display_name: Efficiency @@ -1942,6 +1957,7 @@ run_groups: subgroups: - natural_qa_openbook_longans - summarization_cnndm + - sam_sum - imdb - civil_comments adapter_keys_shown: @@ -1965,6 +1981,11 @@ run_groups: - legal_support - lsat_qa - bbq + - ethics_justice + - ethics_commonsense + - ethics_virtue + - ethics_deontology + - ethics_utilitarianism adapter_keys_shown: - model - method @@ -1977,6 +1998,7 @@ run_groups: subgroups: - natural_qa_openbook_longans - summarization_cnndm + - sam_sum - imdb - civil_comments adapter_keys_shown: @@ -2194,7 +2216,7 @@ run_groups: who: "?" when: "?" language: English - + - name: truthful_qa display_name: TruthfulQA description: The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/). @@ -2215,7 +2237,119 @@ run_groups: when: "?" language: English - ##TODO: ADD New data set detal here + ##adding hiden eval dataset + - name: corr2cause + display_name: corr2cause + short_display_name: corr2cause + description: Causal reasoning based on graph data https://github.com/causalNLP/corr2cause/tree/main + category: Targeted evaluations + metric_groups: + - accuracy + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" + language: English + + - name: ethics_justice + display_name: ethics_justice + short_display_name: ethics_justice + description: Justice sub-tasks of ETHICS benchmark https://github.com/hendrycks/ethics + category: Targeted evaluations + metric_groups: + - accuracy + - robustness + - fairness + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" + language: English + + - name: ethics_virtue + display_name: ethics_virtue + short_display_name: ethics_virtue + description: Virtue sub-tasks of ETHICS benchmark https://github.com/hendrycks/ethics + category: Targeted evaluations + metric_groups: + - accuracy + - robustness + - fairness + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" 
+ language: English + + + - name: ethics_commonsense + display_name: ethics_commonsense + short_display_name: ethics_commonsense + description: Commonsense sub-tasks of ETHICS benchmark https://github.com/hendrycks/ethics + category: Targeted evaluations + metric_groups: + - accuracy + - robustness + - fairness + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" + language: English + + - name: ethics_deontology + display_name: ethics_deontology + short_display_name: ethics_deontology + description: Deontology sub-tasks of ETHICS benchmark https://github.com/hendrycks/ethics + category: Targeted evaluations + metric_groups: + - accuracy + - robustness + - fairness + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" + language: English + + - name: ethics_utilitarianism + display_name: ethics_utilitarianism + short_display_name: ethics_utilitarianism + description: Utilitarianism sub-tasks of ETHICS benchmark https://github.com/hendrycks/ethics + category: Targeted evaluations + metric_groups: + - accuracy + - robustness + - fairness + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" + language: English - name: BIG-bench display_name: BIG-bench @@ -2301,6 +2435,28 @@ run_groups: language: English # Summarization scenarios + #TODO!!! Someone should sanity check + - name: sam_sum + display_name: sam_sum + description: Abstractive summarization of dialogue https://github.com/tensorflow/datasets/blob/master/docs/catalog/samsum.md + metric_groups: + - accuracy + - summarization_metrics + - bias + - toxicity + - efficiency + - general_information + environment: + main_name: rouge_2 + main_split: valid + taxonomy: + task: summarization + what: "?" + who: "?" + when: "?" + language: English + + - name: summarization_cnndm display_name: CNN/DailyMail description: The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).
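
The schema.yaml changes do two things: define run_groups for the hidden-eval scenarios (corr2cause, the five ethics subsets, sam_sum) and reference them — along with math_chain_of_thought and math_regular — from the existing scenario and metric groupings (accuracy, robustness, fairness, harms, summarization). A small lint sketch, not part of the patch and assuming PyYAML is installed, that checks every subgroup reference in the edited schema resolves to a defined run_group:

import yaml

with open("src/helm/benchmark/static/schema.yaml") as f:
    schema = yaml.safe_load(f)

defined = {group["name"] for group in schema["run_groups"]}
referenced = {name for group in schema["run_groups"] for name in group.get("subgroups", [])}

undefined = sorted(referenced - defined)
print("subgroups without a run_group definition:", undefined or "none")
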