From 05b5e504cd70ed02bcb5eb7feb58b32000163b82 Mon Sep 17 00:00:00 2001 From: weiweiy Date: Fri, 10 Nov 2023 18:30:47 -0800 Subject: [PATCH] schema, config and eval scripts to make hidden eval dataset work (#11) * schema, config and eval scripts to make hidden eval dataset work * upgrade datasets version to 2.14.6 and generate 1000 and 2000 sparse config * take out cnn from open eval, added 3k eval config * change sam_sum to use summerization metrics * re-generate sparse_run_spec * update cause2corr to only do 1-shot examples --- build_secret_run_spec.py | 151 +++++++++++++++++ neurIPS_eval_scripts/eval_metrics.py | 47 ++++- neurIPS_eval_scripts/process_helm.py | 26 ++- neurIPS_eval_scripts/rank_submissions.py | 2 +- requirements.txt | 2 +- ...s_full_closed_eval_coarse_3000_budget.conf | 48 ++++++ src/helm/benchmark/run_specs.py | 39 +++-- src/helm/benchmark/static/schema.yaml | 160 +++++++++++++++++- 8 files changed, 445 insertions(+), 30 deletions(-) create mode 100644 build_secret_run_spec.py create mode 100644 run_specs_full_closed_eval_coarse_3000_budget.conf mode change 100644 => 100755 src/helm/benchmark/static/schema.yaml diff --git a/build_secret_run_spec.py b/build_secret_run_spec.py new file mode 100644 index 000000000..cc027a820 --- /dev/null +++ b/build_secret_run_spec.py @@ -0,0 +1,151 @@ +entries = [ + + # Misc datasets + {'scenario': 'summarization', 'description': "sam_sum:model=neurips/local", 'priority': 1}, + {'scenario': 'causation', 'description': "corr2cause:model=neurips/local,max_train_instances=1",'priority': 1}, + + ## Ethics datasets + {'scenario': 'ethics', 'description': "ethics_justice:model=neurips/local", 'priority': 1}, + {'scenario': 'ethics', 'description': "ethics_commonsense:model=neurips/local", 'priority': 1}, + {'scenario': 'ethics', 'description': "ethics_virtue:model=neurips/local", 'priority': 1}, + {'scenario': 'ethics', 'description': "ethics_deontology:model=neurips/local", 'priority': 1}, + {'scenario': 'ethics', 'description': "ethics_utilitarianism:model=neurips/local", 'priority': 1}, + + ## Math datasets + {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=1,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=1,use_official_examples=True", 'priority': 2}, +# +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': 
"math:model=neurips/local,subject=prealgebra,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=2,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=2,use_official_examples=True", 'priority': 4}, +# +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=3,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=3,use_official_examples=True", 'priority': 2}, +# +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=4,use_official_examples=True", 'priority': 4}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=4,use_official_examples=True", 'priority': 4}, + + {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=5,use_official_examples=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=5,use_official_examples=True", 'priority': 2}, + + # With chain-of-thought prompting: + {'scenario': 'math', 'description': 
"math:model=neurips/local,subject=number_theory,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=1,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=1,use_chain_of_thought=True", 'priority' : 2}, +# +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=3,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=3 ,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=3,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=3,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=3,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=3,use_chain_of_thought=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=3,use_chain_of_thought=True", 'priority' : 2}, +# + {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True", 'priority': 2}, + {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True", 'priority' : 2}, + + {'scenario':'cnn','description': "summarization_cnndm:model=neurips/local", 'priority': 1}, + +] + + + +def generate_equal_sum_list(V, N): + # Calculate the base value that will be repeated. + base_value = V // N + # Calculate the remainder for distribution. + remainder = V % N + + # Create the list with base_value repeated N times. + result = [base_value] * N + + # Distribute the remainder evenly among the elements. 
+ for i in range(remainder): + result[i] += 1 + + return result + +import pandas as pd +import argparse + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser( + description=''' + This method automatically generates a configuration file for the neurips_llm_efficiency_challenge + + Calling it with: `python build_run_specs_full.py --example_budget=600` will produce a conf file + with a total of 600 examples distributed evenly across scenarios as also defined here. + ''', + ) + parser.add_argument("--example_budget", required=True, type=int, help='# example to use') + args = parser.parse_args() + + # get a list of scenarios and n_examples + df = pd.DataFrame(entries) + scenario_count_dict = df.value_counts('scenario').to_dict() + n_scenarios = len(df.scenario.unique()) + max_eval_instances_per_scenario = generate_equal_sum_list(args.example_budget, n_scenarios) + + # get a dict of the amount of examples per + scenario_n_examples_dict = {} + for scenario, n_subscenarios in scenario_count_dict.items(): + cur_max_eval_instances_per_scenario = max_eval_instances_per_scenario.pop() + scenario_n_examples_dict[scenario] = generate_equal_sum_list(cur_max_eval_instances_per_scenario,n_subscenarios) + + for i in range(len(entries)): + cur_scenario = entries[i]['scenario'] + # print(f"added {v} to {entries[i]['max_eval_instances']}") + v = scenario_n_examples_dict[cur_scenario].pop() + entries[i]['max_eval_instances'] = v + + with open(f'./run_specs_full_closed_eval_coarse_{args.example_budget}_budget.conf','w') as f: + f.write('entries: [\n') + last_scenario = '' + for entry in entries: + cur_scenario = entry['scenario'] + if cur_scenario != last_scenario: + f.write(f'\n# {cur_scenario}\n') + print(entry) + last_scenario = cur_scenario + f.write('{') + f.write(f'description: """{entry["description"]}'.replace('"""','"')) + f.write(f',max_eval_instances={entry["max_eval_instances"]}""",priority: 1'.replace('"""','"')) + f.write('}\n') + f.write(']') + + print(f'Saved ./run_secret_specs_full_coarse_{args.example_budget}_budget.conf') diff --git a/neurIPS_eval_scripts/eval_metrics.py b/neurIPS_eval_scripts/eval_metrics.py index ceb0513fe..84b033379 100755 --- a/neurIPS_eval_scripts/eval_metrics.py +++ b/neurIPS_eval_scripts/eval_metrics.py @@ -3,7 +3,7 @@ Open_eval_metrics = { "Accuracy": [ ("core_scenarios.json", "MMLU - EM", False), - ("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False), + # ("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False), ("core_scenarios.json", "TruthfulQA - EM", False), ("targeted_evaluations.json", "BBQ - EM", False), ("core_scenarios.json", "GSM8K - EM", False), @@ -21,11 +21,56 @@ ], + # "Bias": [ + # ("core_scenarios.json", "CNN/DailyMail - Stereotypes (race)", True), + # ("core_scenarios.json", "CNN/DailyMail - Stereotypes (gender)", True), + # ("core_scenarios.json", "CNN/DailyMail - Representation (race)", True), + # ("core_scenarios.json", "CNN/DailyMail - Representation (gender)", True), + # ], + +} + + +Hidden_eval_metrics = { + "Accuracy": [ + ("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False), + ("core_scenarios.json", "sam_sum - ROUGE-2", False), + ("core_scenarios.json", "corr2cause - EM", False), + ("core_scenarios.json", "ethics_justice - EM", False), + ("core_scenarios.json", "ethics_commonsense - EM", False), + ("core_scenarios.json", "ethics_virtue - EM", False), + ("core_scenarios.json", "ethics_deontology - EM", False), + ("core_scenarios.json", "ethics_utilitarianism - EM", False), + ("core_scenarios.json", 
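
The per-entry counts that end up in run_specs_full_closed_eval_coarse_3000_budget.conf fall out of generate_equal_sum_list: the budget is first split evenly across the five scenarios in `entries`, and each scenario's share is then split across its sub-scenarios. A minimal sketch of that arithmetic for the 3000-example budget, not part of the patch itself; the dict name `subscenario_counts` is illustrative, and its counts mirror the uncommented entries above (1 summarization, 1 causation, 5 ethics, 28 math, 1 cnn):

def generate_equal_sum_list(V, N):
    # Same behaviour as the helper above: N near-equal integers summing to V.
    base_value, remainder = divmod(V, N)
    result = [base_value] * N
    for i in range(remainder):
        result[i] += 1
    return result

budget = 3000
subscenario_counts = {"summarization": 1, "causation": 1, "ethics": 5, "math": 28, "cnn": 1}

per_scenario = generate_equal_sum_list(budget, len(subscenario_counts))   # [600, 600, 600, 600, 600]
per_ethics = generate_equal_sum_list(600, subscenario_counts["ethics"])   # [120, 120, 120, 120, 120]
per_math = generate_equal_sum_list(600, subscenario_counts["math"])       # twelve 22s followed by sixteen 21s

assert sum(per_ethics) == 600 and sum(per_math) == 600
print(per_scenario, per_ethics, per_math)
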
"MATH (chain-of-thoughts) - Equivalent (chain of thought)", False), + ("core_scenarios.json", "MATH - Equivalent", False), + ], + + "Robustness": [ + ("core_scenarios.json", "ethics_justice - EM (Robustness)", False), + ("core_scenarios.json", "ethics_commonsense - EM (Robustness)", False), + ("core_scenarios.json", "ethics_virtue - EM (Robustness)", False), + ("core_scenarios.json", "ethics_deontology - EM (Robustness)", False), + ("core_scenarios.json", "ethics_utilitarianism - EM (Robustness)", False), + ], + + "Fairness": [ + ("core_scenarios.json", "ethics_justice - EM (Fairness)", False), + ("core_scenarios.json", "ethics_commonsense - EM (Fairness)", False), + ("core_scenarios.json", "ethics_virtue - EM (Fairness)", False), + ("core_scenarios.json", "ethics_deontology - EM (Fairness)", False), + ("core_scenarios.json", "ethics_utilitarianism - EM (Fairness)", False), + ], + "Bias": [ + ("core_scenarios.json", "sam_sum - Stereotypes (race)", True), + ("core_scenarios.json", "sam_sum - Stereotypes (gender)", True), + ("core_scenarios.json", "sam_sum - Representation (race)", True), + ("core_scenarios.json", "sam_sum - Representation (gender)", True), ("core_scenarios.json", "CNN/DailyMail - Stereotypes (race)", True), ("core_scenarios.json", "CNN/DailyMail - Stereotypes (gender)", True), ("core_scenarios.json", "CNN/DailyMail - Representation (race)", True), ("core_scenarios.json", "CNN/DailyMail - Representation (gender)", True), + ], } \ No newline at end of file diff --git a/neurIPS_eval_scripts/process_helm.py b/neurIPS_eval_scripts/process_helm.py index 6d4fbc002..4523c407f 100755 --- a/neurIPS_eval_scripts/process_helm.py +++ b/neurIPS_eval_scripts/process_helm.py @@ -2,7 +2,8 @@ import os import sys import argparse -from eval_metrics import Open_eval_metrics as METRICS +from eval_metrics import Open_eval_metrics as open_metrics +from eval_metrics import Hidden_eval_metrics as hidden_metrics ''' parse results from helm-summerize under helm_output_dir/runs/submission_id @@ -11,7 +12,7 @@ ''' #this is taken from https://github.com/Lightning-AI/llm-efficiency-challenge-eval/blob/main/agents/agents.py#L182 -def process_helm_results(root_path:str, suite: str) -> dict: +def process_helm_results(root_path:str, suite: str, METRICS:dict = open_metrics) -> dict: path = f"{root_path}/runs/{suite}/groups/" output = {} @@ -45,15 +46,30 @@ def process_helm_results(root_path:str, suite: str) -> dict: parser = argparse.ArgumentParser(description="Parse helm-summerize results") parser.add_argument("--dir", type=str, help='Helm Benchmark dir', required=True) parser.add_argument('--idx', type=str, help='submission id', required=True) + + parser.add_argument('--hidden', action='store_true', help="hidden eval metrics", required=False) args = parser.parse_args() - run_results = process_helm_results(args.dir, args.idx) + + use_metrics = open_metrics + if args.hidden: + use_metrics = hidden_metrics + + run_results = process_helm_results(args.dir, args.idx, METRICS=use_metrics) + print(run_results) results_dir = f"{args.dir}/submission_results" os.makedirs(results_dir, exist_ok=True) - result_json = os.path.join(results_dir, f"{args.idx}.json") + + out_name = f"{args.idx}.json" + if args.hidden: + out_name = f"{args.idx}_hidden.json" + + result_json = os.path.join(results_dir, out_name) + + print(result_json) with open (result_json, 'w') as handle: - json.dump( run_results, handle) + json.dump( run_results, handle, indent=4) except Exception as e : print(e) diff --git 
a/neurIPS_eval_scripts/rank_submissions.py b/neurIPS_eval_scripts/rank_submissions.py index a24a2460f..f94d3574f 100755 --- a/neurIPS_eval_scripts/rank_submissions.py +++ b/neurIPS_eval_scripts/rank_submissions.py @@ -114,7 +114,7 @@ def rank_results(data:dict, metrics_config:dict): ranked_results = rank_results(submission_results, METRICS) with open (f"{args.name}_full_rank.json", 'w') as handle: - json.dump( ranked_results, handle) + json.dump( ranked_results, handle, indent=4) except Exception as e : print(e) diff --git a/requirements.txt b/requirements.txt index 20a08d941..2bef7d27e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,7 @@ cycler==0.11.0 cymem==2.0.6 Cython==0.29.32 dacite==1.6.0 -datasets==2.5.2 +datasets==2.14.6 dill==0.3.5.1 distlib==0.3.6 emoji==2.1.0 diff --git a/run_specs_full_closed_eval_coarse_3000_budget.conf b/run_specs_full_closed_eval_coarse_3000_budget.conf new file mode 100644 index 000000000..5c89341fc --- /dev/null +++ b/run_specs_full_closed_eval_coarse_3000_budget.conf @@ -0,0 +1,48 @@ +entries: [ + +# summarization +{description: "sam_sum:model=neurips/local,max_eval_instances=600",priority: 1} + +# causation +{description: "corr2cause:model=neurips/local,max_train_instances=1,max_eval_instances=600",priority: 1} + +# ethics +{description: "ethics_justice:model=neurips/local,max_eval_instances=120",priority: 1} +{description: "ethics_commonsense:model=neurips/local,max_eval_instances=120",priority: 1} +{description: "ethics_virtue:model=neurips/local,max_eval_instances=120",priority: 1} +{description: "ethics_deontology:model=neurips/local,max_eval_instances=120",priority: 1} +{description: "ethics_utilitarianism:model=neurips/local,max_eval_instances=120",priority: 1} + +# math +{description: "math:model=neurips/local,subject=number_theory,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=algebra,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=prealgebra,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=geometry,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=precalculus,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=number_theory,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=algebra,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=prealgebra,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=geometry,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: 
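
The datasets pin moves from 2.5.2 to 2.14.6 (per the commit notes). A quick sanity-check sketch, assuming the new SamSumScenario reads the public `samsum` dataset from the Hugging Face Hub — the scenario module itself is not part of this diff, so the dataset id, split, and column names below are assumptions:

# Sanity check only; not part of the patch. Loading SAMSum may additionally
# require the py7zr package, which the samsum loader uses to unpack its archive.
import datasets

print(datasets.__version__)                      # expect 2.14.6 per requirements.txt
samsum = datasets.load_dataset("samsum")
example = samsum["validation"][0]
print(example["dialogue"][:80], "->", example["summary"])
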
"math:model=neurips/local,subject=precalculus,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=number_theory,level=1,use_chain_of_thought=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_chain_of_thought=True,max_eval_instances=21",priority: 1} +{description: "math:model=neurips/local,subject=algebra,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=prealgebra,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=geometry,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=precalculus,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=number_theory,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} +{description: "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} + +# cnn +{description: "summarization_cnndm:model=neurips/local,max_eval_instances=600",priority: 1} +] \ No newline at end of file diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py index 87cc9d5ee..a37d93d34 100644 --- a/src/helm/benchmark/run_specs.py +++ b/src/helm/benchmark/run_specs.py @@ -1815,6 +1815,25 @@ def get_cnndm_summarization_spec(temperature: float = 0.3, device: str = "cpu") groups=["summarization_cnndm"], ) +@run_spec_function("sam_sum") +def get_sam_sum_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.sam_sum_scenario.SamSumScenario", args={}) + + adapter_spec = get_summarization_adapter_spec( + num_sents=1, + max_tokens=128, + temperature=0.3, + ) + + return RunSpec( + name="sam_sum", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs= get_summarization_metric_specs({"task": "sam_sum", "device": 'cpu'}) + + get_generative_harms_metric_specs(), + groups=["sam_sum"], + ) + @run_spec_function("empatheticdialogues") def get_empatheticdialogues_spec() -> RunSpec: @@ -2058,26 +2077,6 @@ def get_me_q_sum_spec() -> RunSpec: groups=["MeQSum"], ) - -@run_spec_function("sam_sum") -def get_sam_sum_spec() -> RunSpec: - scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.sam_sum_scenario.SamSumScenario", args={}) - - adapter_spec = get_summarization_adapter_spec( - num_sents=1, - max_tokens=128, - temperature=0.3, - ) - - return RunSpec( - name="sam_sum", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - 
metric_specs=get_open_ended_generation_metric_specs(), - groups=["sam_sum"], - ) - - @run_spec_function("med_dialog") def get_med_dialog_spec(subset: str) -> RunSpec: scenario_spec = ScenarioSpec( diff --git a/src/helm/benchmark/static/schema.yaml b/src/helm/benchmark/static/schema.yaml old mode 100644 new mode 100755 index de500088e..c4aa58cc3 --- a/src/helm/benchmark/static/schema.yaml +++ b/src/helm/benchmark/static/schema.yaml @@ -1793,6 +1793,14 @@ run_groups: - BIG-bench - gsm ## TODO: ADD new dataset here + - corr2cause + - ethics_justice + - ethics_commonsense + - ethics_virtue + - ethics_deontology + - ethics_utilitarianism + - math_chain_of_thought + - math_regular - name: information_retrieval display_name: Information retrieval @@ -1809,6 +1817,7 @@ run_groups: subgroups: - summarization_cnndm - summarization_xsum + - sam_sum - name: sentiment_analysis display_name: Sentiment analysis @@ -1893,6 +1902,7 @@ run_groups: - entity_data_imputation - entity_matching - BIG-bench + - corr2cause - name: harms display_name: Harms @@ -1906,6 +1916,11 @@ run_groups: - bbq - bold - real_toxicity_prompts + - ethics_justice + - ethics_commonsense + - ethics_virtue + - ethics_deontology + - ethics_utilitarianism - name: efficiency display_name: Efficiency @@ -1942,6 +1957,7 @@ run_groups: subgroups: - natural_qa_openbook_longans - summarization_cnndm + - sam_sum - imdb - civil_comments adapter_keys_shown: @@ -1965,6 +1981,11 @@ run_groups: - legal_support - lsat_qa - bbq + - ethics_justice + - ethics_commonsense + - ethics_virtue + - ethics_deontology + - ethics_utilitarianism adapter_keys_shown: - model - method @@ -1977,6 +1998,7 @@ run_groups: subgroups: - natural_qa_openbook_longans - summarization_cnndm + - sam_sum - imdb - civil_comments adapter_keys_shown: @@ -2194,7 +2216,7 @@ run_groups: who: "?" when: "?" language: English - + - name: truthful_qa display_name: TruthfulQA description: The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/). @@ -2215,7 +2237,119 @@ run_groups: when: "?" language: English - ##TODO: ADD New data set detal here + ##adding hiden eval dataset + - name: corr2cause + display_name: corr2cause + short_display_name: corr2cause + description: Causal reasoning based on graph data https://github.com/causalNLP/corr2cause/tree/main + category: Targeted evaluations + metric_groups: + - accuracy + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" + language: English + + - name: ethics_justice + display_name: ethics_justice + short_display_name: ethics_justice + description: Justice sub-tasks of ETHICS benchmark https://github.com/hendrycks/ethics + category: Targeted evaluations + metric_groups: + - accuracy + - robustness + - fairness + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" + language: English + + - name: ethics_virtue + display_name: ethics_virtue + short_display_name: ethics_virtue + description: Virtue sub-tasks of ETHICS benchmark https://github.com/hendrycks/ethics + category: Targeted evaluations + metric_groups: + - accuracy + - robustness + - fairness + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" 
+ language: English + + + - name: ethics_commonsense + display_name: ethics_commonsense + short_display_name: ethics_commonsense + description: Commonsense sub-tasks of ETHICS benchmark https://github.com/hendrycks/ethics + category: Targeted evaluations + metric_groups: + - accuracy + - robustness + - fairness + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" + language: English + + - name: ethics_deontology + display_name: ethics_deontology + short_display_name: ethics_deontology + description: Deontology sub-tasks of ETHICS benchmark https://github.com/hendrycks/ethics + category: Targeted evaluations + metric_groups: + - accuracy + - robustness + - fairness + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" + language: English + + - name: ethics_utilitarianism + display_name: ethics_utilitarianism + short_display_name: ethics_utilitarianism + description: Utilitarianism sub-tasks of ETHICS benchmark https://github.com/hendrycks/ethics + category: Targeted evaluations + metric_groups: + - accuracy + - robustness + - fairness + environment: + main_name: exact_match + main_split: valid + taxonomy: + task: question answering + what: "?" + who: "?" + when: "?" + language: English - name: BIG-bench display_name: BIG-bench @@ -2301,6 +2435,28 @@ run_groups: language: English # Summarization scenarios + #TODO!!! Someone should sanity check + - name: sam_sum + display_name: sam_sum + description: Abstractive summarization of dialogue https://github.com/tensorflow/datasets/blob/master/docs/catalog/samsum.md + metric_groups: + - accuracy + - summarization_metrics + - bias + - toxicity + - efficiency + - general_information + environment: + main_name: rouge_2 + main_split: valid + taxonomy: + task: summarization + what: "?" + who: "?" + when: "?" + language: English + + - name: summarization_cnndm display_name: CNN/DailyMail description: The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).
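
The schema.yaml changes do two things: define run_groups for the hidden-eval scenarios (corr2cause, the five ethics subsets, sam_sum) and reference them — along with math_chain_of_thought and math_regular — from the existing scenario and metric groupings (accuracy, robustness, fairness, harms, summarization). A small lint sketch, not part of the patch and assuming PyYAML is installed, that checks every subgroup reference in the edited schema resolves to a defined run_group:

import yaml

with open("src/helm/benchmark/static/schema.yaml") as f:
    schema = yaml.safe_load(f)

defined = {group["name"] for group in schema["run_groups"]}
referenced = {name for group in schema["run_groups"] for name in group.get("subgroups", [])}

undefined = sorted(referenced - defined)
print("subgroups without a run_group definition:", undefined or "none")
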