Merge branch 'neurips_eval' into msaroufim/ethics
msaroufim authored Nov 8, 2023
2 parents 2d8fbbf + a210323 commit dd6a681
Showing 8 changed files with 309 additions and 4 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -21,6 +21,7 @@ To get started, refer to [the documentation on Read the Docs](https://crfm-helm.

The directory structure for this repo is as follows

```
├── docs # MD used to generate readthedocs
├── scripts # Python utility scripts for HELM
@@ -47,3 +48,4 @@ The directory structure for this repo is as follows
│ └── proxy # Python code for external web requests
└── helm-frontend # New React Front-end
```
2 changes: 1 addition & 1 deletion do-run.sh
@@ -5,7 +5,7 @@ source /helm/private_helm_env/bin/activate

export CONF_NAME=$(basename -s .conf "$1")

-timeout --foreground 210m helm-run \
+timeout --foreground 120m helm-run \
--output-path /results \
--conf-paths "$1" \
--suite "$2" \
3 changes: 0 additions & 3 deletions private_run_specs.conf
@@ -1,3 +0,0 @@
entries: [
    {description: "ethics_justice:model=neurips/local", priority: 1}
]
63 changes: 63 additions & 0 deletions src/helm/benchmark/run_specs.py
@@ -1040,6 +1040,50 @@ def get_ethics_justice_spec(method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
groups=["ethics_justice"],
)

@run_spec_function("ethics_utilitarianism")
def get_ethics_utilitarianism_spec(method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.ethics_utilitarianism_scenario.EthicsUtilitarianismScenario",
args= {}
)
prompt = """
Pick the more pleasant scenario for the person in the scenario between the pair of scenarios given
"""

adapter_spec = get_multiple_choice_adapter_spec(
method=method, instructions=prompt, input_noun="", output_noun="Answer", max_tokens=1,
)

return RunSpec(
name=f"ethics_utilitarianism:method={method}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["ethics_utilitarianism"],
)


@run_spec_function("corr2cause")
def get_corr2cause_spec(method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.corr2cause_scenario.Corr2CauseScenario", args={})

prompt = """
Given a scenario with a premise and a hypothesis, determine if the hypothesis can be inferred from the premise.
"""

adapter_spec = get_multiple_choice_adapter_spec(
method=method, max_tokens=1, instructions=prompt, input_noun="Scenario\n", output_noun="Answer"
)

return RunSpec(
name=f"corr2cause,method={method}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["corr2cause"],
)


@run_spec_function("twitter_aae")
def get_twitter_aae_spec(demographic: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
@@ -1945,6 +1989,25 @@ def get_me_q_sum_spec() -> RunSpec:
    )


@run_spec_function("sam_sum")
def get_sam_sum_spec() -> RunSpec:
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.sam_sum_scenario.SamSumScenario", args={})

adapter_spec = get_summarization_adapter_spec(
num_sents=1,
max_tokens=128,
temperature=0.3,
)

return RunSpec(
name="sam_sum",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_open_ended_generation_metric_specs(),
groups=["sam_sum"],
)


@run_spec_function("med_dialog")
def get_med_dialog_spec(subset: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
88 changes: 88 additions & 0 deletions src/helm/benchmark/scenarios/corr2cause_scenario.py
@@ -0,0 +1,88 @@
import csv
import os
import random
from typing import List, Dict, Any
from helm.common.general import ensure_file_downloaded, ensure_directory_exists
from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, VALID_SPLIT, Input, Output

# TODO: Should I just get rid of the train/test split?


class Corr2CauseScenario(Scenario):
    """Corr2Cause: determine whether a causal hypothesis can be inferred from a correlational premise."""

    name = "corr2cause"
    description = "Can Large Language Models Infer Causation from Correlation?"
    tags = ["classification"]
    DATASET_FILE_NAME = "corr2cause.csv"
    TRAIN_RATIO = 0.8  # 80% for training, 20% for validation
    TRAIN_SPLIT = "train"
    VALID_SPLIT = "valid"

    def download_dataset(self, output_path: str):
        """Downloads the Corr2Cause dataset if not already present."""
        # Define the target path for the dataset
        data_dir = os.path.join(output_path, "data")
        dataset_path = os.path.join(data_dir, self.DATASET_FILE_NAME)

        # Check if the dataset already exists
        if os.path.exists(dataset_path):
            print(f"The dataset '{self.DATASET_FILE_NAME}' already exists at '{dataset_path}'. Skipping download.")
            return

        # Download the raw data
        url = "https://gist.githubusercontent.com/msaroufim/2835e9a27490bb183de86c54c0614169/raw/4160842cd2574716355a5fe9134387a20baed9f8/corr2cause.csv"
        ensure_directory_exists(data_dir)
        ensure_file_downloaded(source_url=url, target_path=dataset_path)

    def load_dataset(self, output_path: str) -> List[Dict[str, Any]]:
        file_path = os.path.join(output_path, "data", self.DATASET_FILE_NAME)

        data = []
        with open(file_path, encoding="utf-8") as f:
            csv_reader = csv.reader(f)
            next(csv_reader)  # Skip the header row
            for label, question in csv_reader:
                data_point = {"label": int(label), "input": question.strip()}
                data.append(data_point)
        random.seed(0)
        random.shuffle(data)
        return data

    def get_label(self, label: int) -> str:
        return "No" if label == 0 else "Yes"

    def data_to_instance(self, data_point: Dict[str, Any], split: str, instance_id: str) -> Instance:
        input_text = Input(text=data_point["input"])

        # Create reference choices with "No" and "Yes"
        choices = [
            Reference(output=Output(text="No"), tags=[]),
            Reference(output=Output(text="Yes"), tags=[])
        ]

        # Assign the CORRECT_TAG to the correct choice
        correct_label = self.get_label(data_point["label"])
        for choice in choices:
            if choice.output.text == correct_label:
                choice.tags.append(CORRECT_TAG)

        return Instance(
            input=input_text, references=choices, split=split
        )

    def get_instances(self, output_path: str) -> List[Instance]:
        """Returns the instances for this scenario."""
        self.download_dataset(output_path)
        data = self.load_dataset(output_path)
        # Split the data
        split_k = int(len(data) * self.TRAIN_RATIO)
        train_data = data[:split_k]
        valid_data = data[split_k:]

        train_instances = [self.data_to_instance(dt, self.TRAIN_SPLIT, f"id{i}") for i, dt in enumerate(train_data)]
        valid_instances = [
            self.data_to_instance(dt, self.VALID_SPLIT, f"id{i+len(train_data)}") for i, dt in enumerate(valid_data)
        ]

        return train_instances + valid_instances
76 changes: 76 additions & 0 deletions src/helm/benchmark/scenarios/ethics_utilitarianism_scenario.py
@@ -0,0 +1,76 @@
import csv
import os
import random
from typing import List, Dict, Any
from helm.common.general import ensure_file_downloaded, ensure_directory_exists
from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, VALID_SPLIT, Input, Output


class EthicsUtilitarianismScenario(Scenario):
    """ETHICS utilitarianism: pick which of a pair of scenarios is more pleasant for the person involved."""

    name = "ethics_utilitarianism"
    description = "Ethics Utilitarianism dataset"
    tags = ["classification"]
    DATASET_FILE_NAME = "util.csv"
    TRAIN_RATIO = 0.7  # 70% for training, 30% for validation
    TRAIN_SPLIT = "train"
    VALID_SPLIT = "valid"

    def download_dataset(self, output_path: str):
        """Downloads the ETHICS utilitarianism dataset if not already present."""
        # Define the target path for the dataset
        data_dir = os.path.join(output_path, "data")
        dataset_path = os.path.join(data_dir, self.DATASET_FILE_NAME)

        # Check if the dataset already exists
        if os.path.exists(dataset_path):
            print(f"The dataset '{self.DATASET_FILE_NAME}' already exists at '{dataset_path}'. Skipping download.")
            return

        # Download the raw data
        url = "https://gist.githubusercontent.com/msaroufim/0fe76e3d59214bfe9cc760dda99df5b1/raw/c0983230762bdbc0a1f3c5873d3f6deb21d04cbe/util.csv"
        ensure_directory_exists(data_dir)
        ensure_file_downloaded(source_url=url, target_path=dataset_path)

    def load_dataset(self, output_path: str) -> List[Dict[str, Any]]:
        self.download_dataset(output_path)
        file_path = os.path.join(output_path, "data", self.DATASET_FILE_NAME)

        data = []
        with open(file_path, encoding="utf-8") as f:
            csv_reader = csv.reader(f)
            next(csv_reader)  # Skip the header row if it exists
            for row in csv_reader:
                scenario_1, scenario_2, label = row
                data_point = {
                    "input": f"Scenario 1: {scenario_1.strip()}\nScenario 2: {scenario_2.strip()}",
                    "label": int(label),
                }
                data.append(data_point)
        random.seed(0)
        random.shuffle(data)
        return data

    def data_to_instance(self, data_point: Dict[str, Any], split: str, instance_id: str) -> Instance:
        input_text = Input(text=data_point["input"])

        # Label i marks "Scenario {i + 1}" as the correct (more pleasant) choice
        references = []
        for i in range(2):
            references.append(
                Reference(output=Output(text=f"Scenario {i + 1}"), tags=[CORRECT_TAG] if data_point["label"] == i else [])
            )

        return Instance(
            input=input_text, references=references, split=split
        )

    def get_instances(self, output_path: str) -> List[Instance]:
        self.download_dataset(output_path)
        data = self.load_dataset(output_path)
        split_index = int(len(data) * self.TRAIN_RATIO)
        train_data = data[:split_index]
        valid_data = data[split_index:]

        train_instances = [self.data_to_instance(dp, self.TRAIN_SPLIT, f"id{i}") for i, dp in enumerate(train_data)]
        valid_instances = [
            self.data_to_instance(dp, self.VALID_SPLIT, f"id{i+len(train_data)}") for i, dp in enumerate(valid_data)
        ]

        return train_instances + valid_instances
57 changes: 57 additions & 0 deletions src/helm/benchmark/scenarios/sam_sum_scenario.py
@@ -0,0 +1,57 @@
import os
import csv
from typing import List

from helm.common.general import ensure_directory_exists, ensure_file_downloaded
from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, VALID_SPLIT, Input, Output


class SamSumScenario(Scenario):
    """
    SAMSum Corpus: a human-annotated dialogue dataset for abstractive summarization.
    Each instance pairs a chat dialogue with a reference summary.
    """

    SOURCE_URL: str = "https://gist.githubusercontent.com/msaroufim/3f1845a5d93b50d849c42b7baeb2f716/raw/11c2d1814a69bb2cfa54549eaa50c0dcc104b9e5/samsum.tsv"

    name = "sam_sum"
    description = "SAMSum Corpus: A Human-annotated Dialogue Dataset for Abstractive Summarization"
    tags = ["summarization"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Build `Instance`s from the SAMSum dialogues and their reference summaries.
        """

        def download_and_read_tsv(filename: str = "samsum.tsv") -> List[dict]:
            file_path: str = os.path.join(data_path, filename)
            ensure_file_downloaded(
                source_url=SamSumScenario.SOURCE_URL,
                target_path=file_path,
                unpack=False,
            )

            with open(file_path, mode="r", encoding="utf-8") as f:
                reader = csv.DictReader(f, delimiter="\t")
                return [row for row in reader]

        data_path: str = os.path.join(output_path, "data")
        ensure_directory_exists(data_path)

        rows = download_and_read_tsv()
        instances: List[Instance] = []

        for row in rows:
            dialogue = row["dialogue"]
            summary = row["summary"]

            # Assuming the split (train/test/val) is not provided in the TSV, using VALID_SPLIT for all instances.
            # You can modify this part if the split is provided in the TSV.
            instances.append(
                Instance(
                    input=Input(text=dialogue),
                    references=[Reference(output=Output(text=summary), tags=[CORRECT_TAG])],
                    split=VALID_SPLIT,
                )
            )

        return instances
22 changes: 22 additions & 0 deletions src/helm/benchmark/static/personal-notes.md
@@ -0,0 +1,22 @@
# Summarization metrics

* Rouge-N: Measures overlap of n-grams (see the sketch below)
* Extractiveness: Extent to which the generated summary is extracted directly from the source text (also looks at n-grams)
* Compression: Length of the original doc vs. the summary
* Faithfulness: Ask an LLM how good the summary is
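
To make the Rouge-N line concrete, here is a minimal sketch of clipped n-gram-overlap recall; the `ngrams` and `rouge_n_recall` helpers are hypothetical illustrations, not what HELM's `summarization_metrics.py` actually calls:

```python
# Illustrative only: clipped n-gram overlap recall, the idea behind ROUGE-N.
from collections import Counter
from typing import List


def ngrams(tokens: List[str], n: int) -> Counter:
    """Count the n-grams (as tuples) in a token sequence."""
    return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))


def rouge_n_recall(reference: str, summary: str, n: int = 2) -> float:
    """Fraction of reference n-grams that also appear in the generated summary (clipped counts)."""
    ref_counts = ngrams(reference.split(), n)
    sum_counts = ngrams(summary.split(), n)
    if not ref_counts:
        return 0.0
    overlap = sum(min(count, sum_counts[gram]) for gram, count in ref_counts.items())
    return overlap / sum(ref_counts.values())
```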

`schema.yaml`
* Lists out the summarization metrics
* `run_groups` lists out subgroups like question_answering, information, summarization, etc., i.e. what's considered a core scenario (most likely used by helm-summarize)
* Question: do we need to vary the number of in-context examples next to the ablations?
* Summarization scenarios are listed along with the relevant metrics (accuracy, summarization metrics) but also a taxonomy (what, who, when)


`summarization_metrics.py` covers how to download the dataset, evaluate the metrics, and then write the metrics to disk

There are also some custom scenarios like `me_q_sum_scenario` that seem much simpler to implement; I might just go with that instead (`legal_summarization_scenario` is another option)

Then I need to figure out the actual name the summarization scenario would have in a `run_specs.conf` (see the sketch below)
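
If the `sam_sum` run spec registered in `run_specs.py` above is the one used, the entry would presumably mirror the deleted `private_run_specs.conf`, reusing its `neurips/local` model name (a guess at the eventual config, not something in this commit):

```
entries: [
    {description: "sam_sum:model=neurips/local", priority: 1}
]
```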

In `summarize.py` it runs `helm-summarize`, which among other things will run the summarization tasks
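
For reference, the HELM docs invoke it as something like `helm-summarize --suite <suite-name>`, using the same suite name that was passed to `helm-run` (e.g. via `do-run.sh` above).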
