From 65597125c7bd3b3a55b0405b2ae8f7445ff1c3e9 Mon Sep 17 00:00:00 2001
From: Mikhail Vyrodov <66316137+VyrodovMikhail@users.noreply.github.com>
Date: Sat, 15 Jun 2024 12:02:46 +0300
Subject: [PATCH] Add goat task support (#6)

* Add goat support

* Fix word generation metric

* Pre-commit fixes

* Review fixes

* Add matching tasks support

* Rename sootv to matching

* Change goat dataset path

* Fix incorrect dataset names and remove matching validation

* Add support for multiple choice literature tasks

* Rename goat tasks groups

* Fix word_in_set metric code

* Delete config for matching tasks and remove unused metrics

* Fix word_in_set metric and add support for more eos tokens

* Add new eos token
---
 lm_eval/api/metrics.py                        | 27 +++++++++++++++
 .../_default_multiple_choice_template_yaml    | 34 +++++++++++++++++++
 .../goat/_default_single_choice_template_yaml | 16 +++++++++
 .../_default_word_generation_template_yaml    | 26 ++++++++++++++
 lm_eval/tasks/goat/goat.yaml                  |  4 +++
 .../goat/goat_multiple_choice_literature.yaml |  6 ++++
 .../goat/goat_multiple_choice_sociology.yaml  |  6 ++++
 .../goat/goat_single_choice_sociology.yaml    |  6 ++++
 .../goat/goat_word_generation_literature.yaml |  6 ++++
 .../goat/goat_word_generation_sociology.yaml  |  6 ++++
 10 files changed, 137 insertions(+)
 create mode 100644 lm_eval/tasks/goat/_default_multiple_choice_template_yaml
 create mode 100644 lm_eval/tasks/goat/_default_single_choice_template_yaml
 create mode 100644 lm_eval/tasks/goat/_default_word_generation_template_yaml
 create mode 100644 lm_eval/tasks/goat/goat.yaml
 create mode 100644 lm_eval/tasks/goat/goat_multiple_choice_literature.yaml
 create mode 100644 lm_eval/tasks/goat/goat_multiple_choice_sociology.yaml
 create mode 100644 lm_eval/tasks/goat/goat_single_choice_sociology.yaml
 create mode 100644 lm_eval/tasks/goat/goat_word_generation_literature.yaml
 create mode 100644 lm_eval/tasks/goat/goat_word_generation_sociology.yaml
diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py
index acc70234b1..40da6c1e19 100644
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -317,6 +317,33 @@ def acc_all(items):
     return acc
 
 
+@register_metric(
+    metric="multi_choice_em_unordered",
+    higher_is_better=True,
+    output_type="generate_until",
+    aggregation="mean",
+)
+def multi_choice_em_unordered(items):
+    """Order-insensitive exact match of comma-separated choices in (gold, pred)."""
+    # Tokens are stripped and empty ones dropped, so "1, 2" or "1,2," equals "1,2".
+    gold, pred = items
+    gold_answers = {t.strip() for t in gold.split(",") if t.strip()}
+    return gold_answers == {t.strip() for t in pred.split(",") if t.strip()}
+
+
+@register_metric(
+    metric="word_in_set",
+    higher_is_better=True,
+    output_type="generate_until",
+    aggregation="mean",
+)
+def word_in_set(items):
+    """Return True if the prediction is one of the comma-separated gold answers."""
+    gold, pred_answer = items
+    # Strip each gold alternative too, so "a, b" accepts "b" as well as "a".
+    return pred_answer.strip() in {ans.strip() for ans in gold.split(",")}
+
+
 def acc_all_stderr(items):
     # Only count as correct if all answers are labeled correctly for each question
     question_scoring_dict = {}
diff --git a/lm_eval/tasks/goat/_default_multiple_choice_template_yaml b/lm_eval/tasks/goat/_default_multiple_choice_template_yaml
new file mode 100644
index 0000000000..18f909e501
--- /dev/null
+++ b/lm_eval/tasks/goat/_default_multiple_choice_template_yaml
@@ -0,0 +1,34 @@
+dataset_path: deepvk/goat
+output_type: generate_until
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+doc_to_text: "Вопрос: {{input.strip()}}\nОтвет должен состоять из последовательности цифр, написанных слитно.\n\nОтвет: "
+doc_to_target: "{{answer}}"
+metric_list:
+  - metric: multi_choice_em_unordered
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true  # NOTE(review): multi_choice_em_unordered(items) has no such parameter; confirm this is honored
+    ignore_punctuation: false  # NOTE(review): same concern as ignore_case; verify or drop
+generation_kwargs:
+  until:
+    - "\n\n"
+    - "Вопрос:"
+    - "</s>"
+    - "<|end_of_text|>"
+    - "<|im_end|>"
+  do_sample: false
+  temperature: 0.0
+  num_beams: 3
+repeats: 1
+num_fewshot: 5
+filter_list:
+  - name: "get-answer"
+    filter:
+      - function: "regex"
+        regex_pattern: "([0-9\\,]+)"
+      - function: "take_first"
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/goat/_default_single_choice_template_yaml b/lm_eval/tasks/goat/_default_single_choice_template_yaml
new file mode 100644
index 0000000000..4f2964b20b
--- /dev/null
+++ b/lm_eval/tasks/goat/_default_single_choice_template_yaml
@@ -0,0 +1,16 @@
+dataset_path: deepvk/goat
+test_split: test
+num_fewshot: 5
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+output_type: multiple_choice
+doc_to_text: "{{input.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nОтвет: "
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/goat/_default_word_generation_template_yaml b/lm_eval/tasks/goat/_default_word_generation_template_yaml
new file mode 100644
index 0000000000..1df9a525da
--- /dev/null
+++ b/lm_eval/tasks/goat/_default_word_generation_template_yaml
@@ -0,0 +1,26 @@
+dataset_path: deepvk/goat
+output_type: generate_until
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+doc_to_text: "Вопрос: {{input.strip()}}\nОтвет должен состоять из одного или нескольких слов, написанных слитно, без пробелов и без запятых.\n\nОтвет: "
+doc_to_target: "{{answer}}"
+metric_list:
+  - metric: word_in_set
+    aggregation: mean
+    higher_is_better: true
+generation_kwargs:
+  until:
+    - "\n\n"
+    - "Вопрос:"
+    - "</s>"
+    - "<|end_of_text|>"
+    - "<|im_end|>"
+  do_sample: false
+  temperature: 0.0
+  num_beams: 3
+repeats: 1
+num_fewshot: 5
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/goat/goat.yaml b/lm_eval/tasks/goat/goat.yaml
new file mode 100644
index 0000000000..3a37b891bd
--- /dev/null
+++ b/lm_eval/tasks/goat/goat.yaml
@@ -0,0 +1,4 @@
+group: goat
+task:
+- social_science
+- literature
diff --git a/lm_eval/tasks/goat/goat_multiple_choice_literature.yaml b/lm_eval/tasks/goat/goat_multiple_choice_literature.yaml
new file mode 100644
index 0000000000..e4d4978a1d
--- /dev/null
+++ b/lm_eval/tasks/goat/goat_multiple_choice_literature.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "literature_multiple_choice"
+"task": "literature_multiple_choice"
+"description": "The following are multiple choice questions (with answers) about literature.\n\n"
+"group": "literature"
+"group_alias": "literature"
+"include": "_default_multiple_choice_template_yaml"
diff --git a/lm_eval/tasks/goat/goat_multiple_choice_sociology.yaml b/lm_eval/tasks/goat/goat_multiple_choice_sociology.yaml
new file mode 100644
index 0000000000..0215517694
--- /dev/null
+++ b/lm_eval/tasks/goat/goat_multiple_choice_sociology.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "sociology_multiple_choice"
+"task": "sociology_multiple_choice"
+"description": "The following are multiple choice questions (with answers) about sociology.\n\n"
+"group": "social_science"
+"group_alias": "social_science"
+"include": "_default_multiple_choice_template_yaml"
diff --git a/lm_eval/tasks/goat/goat_single_choice_sociology.yaml b/lm_eval/tasks/goat/goat_single_choice_sociology.yaml
new file mode 100644
index 0000000000..183ea3f04d
--- /dev/null
+++ b/lm_eval/tasks/goat/goat_single_choice_sociology.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "sociology_single_choice"
+"task": "sociology_single_choice"
+"description": "The following are single choice questions (with answers) about sociology.\n\n"
+"group": "social_science"
+"group_alias": "social_science"
+"include": "_default_single_choice_template_yaml"
diff --git a/lm_eval/tasks/goat/goat_word_generation_literature.yaml b/lm_eval/tasks/goat/goat_word_generation_literature.yaml
new file mode 100644
index 0000000000..7a1075e416
--- /dev/null
+++ b/lm_eval/tasks/goat/goat_word_generation_literature.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "literature_word_generation"
+"task": "literature_word_generation"
+"description": "The following are word generation questions (with answers) about literature.\n\n"
+"group": "literature"
+"group_alias": "literature"
+"include": "_default_word_generation_template_yaml"
diff --git a/lm_eval/tasks/goat/goat_word_generation_sociology.yaml b/lm_eval/tasks/goat/goat_word_generation_sociology.yaml
new file mode 100644
index 0000000000..40d66a60c0
--- /dev/null
+++ b/lm_eval/tasks/goat/goat_word_generation_sociology.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "sociology_word_generation"
+"task": "sociology_word_generation"
+"description": "The following are word generation questions (with answers) about sociology.\n\n"
+"group": "social_science"
+"group_alias": "social_science"
+"include": "_default_word_generation_template_yaml"