From 65597125c7bd3b3a55b0405b2ae8f7445ff1c3e9 Mon Sep 17 00:00:00 2001
From: Mikhail Vyrodov <66316137+VyrodovMikhail@users.noreply.github.com>
Date: Sat, 15 Jun 2024 12:02:46 +0300
Subject: [PATCH] Add goat task support (#6)

* Add goat support

* Fix word generation metric

* Pre-commit fixes

* Review fixes

* Add matching tasks support

* Rename sootv to matching

* Change goat dataset path

* Fix incorrect dataset names and remove matching validation

* Add support for multiple choice literature tasks

* Rename goat tasks groups

* Fix word_in_set metric code

* Delete config for matching tasks and remove unused metrics

* Fix word_in_set metric and add support for more eos tokens

* Add new eos token
---
 lm_eval/api/metrics.py                        | 47 +++++++++++++++++++
 .../_default_multiple_choice_template_yaml    | 34 ++++++++++++++
 .../goat/_default_single_choice_template_yaml | 16 +++++++
 .../_default_word_generation_template_yaml    | 26 ++++++++++
 lm_eval/tasks/goat/goat.yaml                  |  4 ++
 .../goat/goat_multiple_choice_literature.yaml |  6 +++
 .../goat/goat_multiple_choice_sociology.yaml  |  6 +++
 .../goat/goat_single_choice_sociology.yaml    |  6 +++
 .../goat/goat_word_generation_literature.yaml |  6 +++
 .../goat/goat_word_generation_sociology.yaml  |  6 +++
 10 files changed, 157 insertions(+)
 create mode 100644 lm_eval/tasks/goat/_default_multiple_choice_template_yaml
 create mode 100644 lm_eval/tasks/goat/_default_single_choice_template_yaml
 create mode 100644 lm_eval/tasks/goat/_default_word_generation_template_yaml
 create mode 100644 lm_eval/tasks/goat/goat.yaml
 create mode 100644 lm_eval/tasks/goat/goat_multiple_choice_literature.yaml
 create mode 100644 lm_eval/tasks/goat/goat_multiple_choice_sociology.yaml
 create mode 100644 lm_eval/tasks/goat/goat_single_choice_sociology.yaml
 create mode 100644 lm_eval/tasks/goat/goat_word_generation_literature.yaml
 create mode 100644 lm_eval/tasks/goat/goat_word_generation_sociology.yaml

diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py
index acc70234b1..40da6c1e19 100644
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -317,6 +317,53 @@ def acc_all(items):
     return acc
 
 
+@register_metric(
+    metric="multi_choice_em_unordered",
+    higher_is_better=True,
+    output_type="generate_until",
+    aggregation="mean",
+)
+def multi_choice_em_unordered(items):
+    """Score one sample by order-insensitive exact match of its answers.
+
+    ``items`` is a ``(gold, pred)`` pair of comma-separated answer strings.
+    Whitespace around each answer and empty entries (for example from a
+    trailing comma) are ignored, so ``"1, 2"`` matches ``"2,1"``.
+
+    Returns ``True`` when both sides hold exactly the same set of answers.
+    """
+    gold, pred = items
+
+    # NOTE(review): a string without commas is treated as one answer, so
+    # "12" and "21" do not match; the goat prompt asks for digits written
+    # together -- confirm this is the intended dataset format.
+    gold_answers = {answer.strip() for answer in gold.split(",")} - {""}
+    pred_answers = {answer.strip() for answer in pred.split(",")} - {""}
+    return gold_answers == pred_answers
+
+
+@register_metric(
+    metric="word_in_set",
+    higher_is_better=True,
+    output_type="generate_until",
+    aggregation="mean",
+)
+def word_in_set(items):
+    """Score one sample by membership of the prediction in the gold answers.
+
+    ``items`` is a ``(gold, pred_answer)`` pair of strings; ``gold`` lists the
+    accepted answers separated by commas. Whitespace around the prediction and
+    around each accepted answer is ignored, the comparison is case-sensitive,
+    and an empty prediction never matches.
+
+    Returns ``True`` when the prediction equals one of the accepted answers.
+    """
+    gold, pred_answer = items
+    gold_answers = {answer.strip() for answer in gold.split(",")} - {""}
+
+    return pred_answer.strip() in gold_answers
+
+
 def acc_all_stderr(items):
     # Only count as correct if all answers are labeled correctly for each question
     question_scoring_dict = {}
diff --git a/lm_eval/tasks/goat/_default_multiple_choice_template_yaml b/lm_eval/tasks/goat/_default_multiple_choice_template_yaml
new file mode 100644
index 0000000000..18f909e501
--- /dev/null
+++ b/lm_eval/tasks/goat/_default_multiple_choice_template_yaml
@@ -0,0 +1,34 @@
+dataset_path: deepvk/goat
+output_type: generate_until
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+doc_to_text: "Вопрос: {{input.strip()}}\nОтвет должен состоять из последовательности цифр, написанных слитно.\n\nОтвет: "
+doc_to_target: "{{answer}}"
+metric_list:
+  - metric: multi_choice_em_unordered
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: false
+generation_kwargs:
+  until:
+    - "\n\n"
+    - "Вопрос:"
+    - "</s>"
+    - "<|end_of_text|>"
+    - "<|im_end|>"
+  do_sample: false
+  temperature: 0.0
+  num_beams: 3
+repeats: 1
+num_fewshot: 5
+filter_list:
+  - name: "get-answer"
+    filter:
+      - function: "regex"
+        regex_pattern: "([0-9\\,]+)"
+      - function: "take_first"
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/goat/_default_single_choice_template_yaml b/lm_eval/tasks/goat/_default_single_choice_template_yaml
new file mode 100644
index 0000000000..4f2964b20b
--- /dev/null
+++ b/lm_eval/tasks/goat/_default_single_choice_template_yaml
@@ -0,0 +1,16 @@
+dataset_path: deepvk/goat
+test_split: test
+num_fewshot: 5
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+output_type: multiple_choice
+doc_to_text: "{{input.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nОтвет: "
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/goat/_default_word_generation_template_yaml b/lm_eval/tasks/goat/_default_word_generation_template_yaml
new file mode 100644
index 0000000000..1df9a525da
--- /dev/null
+++ b/lm_eval/tasks/goat/_default_word_generation_template_yaml
@@ -0,0 +1,26 @@
+dataset_path: deepvk/goat
+output_type: generate_until
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+doc_to_text: "Вопрос: {{input.strip()}}\nОтвет должен состоять из одного или нескольких слов, написанных слитно, без пробелов и без запятых.\n\nОтвет: "
+doc_to_target: "{{answer}}"
+metric_list:
+  - metric: word_in_set
+    aggregation: mean
+    higher_is_better: true
+generation_kwargs:
+  until:
+    - "\n\n"
+    - "Вопрос:"
+    - "</s>"
+    - "<|end_of_text|>"
+    - "<|im_end|>"
+  do_sample: false
+  temperature: 0.0
+  num_beams: 3
+repeats: 1
+num_fewshot: 5
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/goat/goat.yaml b/lm_eval/tasks/goat/goat.yaml
new file mode 100644
index 0000000000..3a37b891bd
--- /dev/null
+++ b/lm_eval/tasks/goat/goat.yaml
@@ -0,0 +1,4 @@
+group: goat
+task:
+- social_science
+- literature
diff --git a/lm_eval/tasks/goat/goat_multiple_choice_literature.yaml b/lm_eval/tasks/goat/goat_multiple_choice_literature.yaml
new file mode 100644
index 0000000000..e4d4978a1d
--- /dev/null
+++ b/lm_eval/tasks/goat/goat_multiple_choice_literature.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "literature_multiple_choice"
+"task": "literature_multiple_choice"
+"description": "The following are multiple choice questions (with answers) about literature.\n\n"
+"group": "literature"
+"group_alias": "literature"
+"include": "_default_multiple_choice_template_yaml"
diff --git a/lm_eval/tasks/goat/goat_multiple_choice_sociology.yaml b/lm_eval/tasks/goat/goat_multiple_choice_sociology.yaml
new file mode 100644
index 0000000000..0215517694
--- /dev/null
+++ b/lm_eval/tasks/goat/goat_multiple_choice_sociology.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "sociology_multiple_choice"
+"task": "sociology_multiple_choice"
+"description": "The following are multiple choice questions (with answers) about sociology.\n\n"
+"group": "social_science"
+"group_alias": "social_science"
+"include": "_default_multiple_choice_template_yaml"
diff --git a/lm_eval/tasks/goat/goat_single_choice_sociology.yaml b/lm_eval/tasks/goat/goat_single_choice_sociology.yaml
new file mode 100644
index 0000000000..183ea3f04d
--- /dev/null
+++ b/lm_eval/tasks/goat/goat_single_choice_sociology.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "sociology_single_choice"
+"task": "sociology_single_choice"
+"description": "The following are single choice questions (with answers) about sociology.\n\n"
+"group": "social_science"
+"group_alias": "social_science"
+"include": "_default_single_choice_template_yaml"
diff --git a/lm_eval/tasks/goat/goat_word_generation_literature.yaml b/lm_eval/tasks/goat/goat_word_generation_literature.yaml
new file mode 100644
index 0000000000..7a1075e416
--- /dev/null
+++ b/lm_eval/tasks/goat/goat_word_generation_literature.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "literature_word_generation"
+"task": "literature_word_generation"
+"description": "The following are word generation questions (with answers) about literature.\n\n"
+"group": "literature"
+"group_alias": "literature"
+"include": "_default_word_generation_template_yaml"
diff --git a/lm_eval/tasks/goat/goat_word_generation_sociology.yaml b/lm_eval/tasks/goat/goat_word_generation_sociology.yaml
new file mode 100644
index 0000000000..40d66a60c0
--- /dev/null
+++ b/lm_eval/tasks/goat/goat_word_generation_sociology.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "sociology_word_generation"
+"task": "sociology_word_generation"
+"description": "The following are word generation questions (with answers) about sociology.\n\n"
+"group": "social_science"
+"group_alias": "social_science"
+"include": "_default_word_generation_template_yaml"