diff --git a/private_run_specs.conf b/private_run_specs.conf
index 0ad6b947..f5ade0a3 100644
--- a/private_run_specs.conf
+++ b/private_run_specs.conf
@@ -1,3 +1,4 @@
+
 entries: [
 
   ## Real
@@ -82,6 +83,8 @@ entries: [
   {description: "math:model=text_code,subject=counting_and_probability,level=5,use_chain_of_thought=True", priority: 3}
   {description: "math:model=text_code,subject=precalculus,level=5,use_chain_of_thought=True", priority: 3}
-  {description: "sam_sum:model=neurips/local,max_train_instances=3", priority: 1}
-  {description: "ethics_utilitarianism:model=neurips/local", priority: 1}
+  {description: "sam_sum:model=neurips/local,max_train_instances=3", priority: 1}
+  {description: "ethics_utilitarianism:model=neurips/local", priority: 1}
+  {description: "corr2cause:model=neurips/local", priority: 1}
+
 ]
\ No newline at end of file
diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py
index b08d51bd..19cfb49d 100644
--- a/src/helm/benchmark/run_specs.py
+++ b/src/helm/benchmark/run_specs.py
@@ -1040,6 +1040,27 @@ def get_ethics_utilitarianism_spec(method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
     )
 
 
+@run_spec_function("corr2cause")
+def get_corr2cause_spec(method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.corr2cause_scenario.Corr2CauseScenario", args={})
+
+    prompt = """
+Given a scenario with a premise and a hypothesis, determine if the hypothesis can be inferred from the premise.
+"""
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method, max_tokens=1, instructions=prompt, input_noun="Scenario\n", output_noun="Answer"
+    )
+
+    return RunSpec(
+        name=f"corr2cause,method={method}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["corr2cause"],
+    )
+
+
 @run_spec_function("twitter_aae")
 def get_twitter_aae_spec(demographic: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
diff --git a/src/helm/benchmark/scenarios/corr2cause_scenario.py b/src/helm/benchmark/scenarios/corr2cause_scenario.py
new file mode 100644
index 00000000..b10d3d3b
--- /dev/null
+++ b/src/helm/benchmark/scenarios/corr2cause_scenario.py
@@ -0,0 +1,88 @@
+import csv
+import os
+import random
+from typing import List, Dict, Any
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, VALID_SPLIT, Input, Output
+
+# TODO: Should I just get rid of the train/test split?
+
+
+class Corr2CauseScenario(Scenario):
+    """Corr2Cause: decide whether a causal hypothesis can be inferred from a correlational premise (Yes/No)."""
+
+    name = "corr2cause"
+    description = "Can Large Language Models Infer Causation from Correlation?"
+    tags = ["classification"]
+    DATASET_FILE_NAME = "corr2cause.csv"
+    TRAIN_RATIO = 0.8  # 80% for training, 20% for validation
+    TRAIN_SPLIT = "train"
+    VALID_SPLIT = "valid"
+
+    def download_dataset(self, output_path: str):
+        """Downloads the Corr2Cause dataset if not already present."""
+        # Define the target path for the dataset
+        data_dir = os.path.join(output_path, "data")
+        dataset_path = os.path.join(data_dir, self.DATASET_FILE_NAME)
+
+        # Check if the dataset already exists
+        if os.path.exists(dataset_path):
+            print(f"The dataset '{self.DATASET_FILE_NAME}' already exists at '{dataset_path}'. Skipping download.")
Skipping download.") + return + + # Download the raw data + url = "https://gist.githubusercontent.com/msaroufim/2835e9a27490bb183de86c54c0614169/raw/4160842cd2574716355a5fe9134387a20baed9f8/corr2cause.csv" + ensure_directory_exists(data_dir) + ensure_file_downloaded(source_url=url, target_path=dataset_path) + + def load_dataset(self, output_path: str) -> List[Dict[str, Any]]: + file_path = os.path.join(output_path, "data", self.DATASET_FILE_NAME) + + data = [] + with open(file_path, encoding="utf-8") as f: + csv_reader = csv.reader(f) + next(csv_reader) + for label, question in csv_reader: + data_point = {"label": int(label), "input": question.strip()} + data.append(data_point) + random.seed(0) + random.shuffle(data) + return data + + def get_label(self, label: int) -> str: + return "No" if label == 0 else "Yes" + + def data_to_instance(self, data_point: Dict[str, Any], split: str, instance_id: str) -> Instance: + input_text = Input(text=data_point["input"]) + + # Create reference choices with "No" and "Yes" + choices = [ + Reference(output=Output(text="No"), tags=[]), + Reference(output=Output(text="Yes"), tags=[]) + ] + + # Assign the CORRECT_TAG to the correct choice + correct_label = self.get_label(data_point["label"]) + for choice in choices: + if choice.output.text == correct_label: + choice.tags.append(CORRECT_TAG) + + return Instance( + input=input_text, references=choices, split=split + ) + + def get_instances(self, output_path: str) -> List[Instance]: + """Returns the instances for this scenario.""" + self.download_dataset(output_path) + data = self.load_dataset(output_path) + # Split the data + split_k = int(len(data) * self.TRAIN_RATIO) + train_data = data[:split_k] + valid_data = data[split_k:] + + train_instances = [self.data_to_instance(dt, self.TRAIN_SPLIT, f"id{i}") for i, dt in enumerate(train_data)] + valid_instances = [ + self.data_to_instance(dt, self.VALID_SPLIT, f"id{i+len(train_data)}") for i, dt in enumerate(valid_data) + ] + + return train_instances + valid_instances