From e273edd1844b8b99d3a0a12d9dce5a3e67362aec Mon Sep 17 00:00:00 2001
From: Wenshansilvia
Date: Wed, 13 Nov 2024 15:35:51 +0800
Subject: [PATCH 1/9] add claimnum claimfaithfulness repetitiveness

---
 rageval/metrics/__init__.py                   |   6 +-
 .../_claim_faithfulness.py                    | 122 ++++++++++++++++++
 .../answer_informativeness/_claim_num.py      |  97 ++++++++++++++
 .../answer_informativeness/_repetitiveness.py | 101 +++++++++++++++
 rageval/metrics/base.py                       |  12 +-
 tests/units/test_claim_faithfulness.py        |  41 ++++++
 tests/units/test_claim_num.py                 |  38 ++++++
 tests/units/test_repetitiveness.py            |  32 +++++
 8 files changed, 444 insertions(+), 5 deletions(-)
 create mode 100644 tests/units/test_claim_faithfulness.py
 create mode 100644 tests/units/test_claim_num.py
 create mode 100644 tests/units/test_repetitiveness.py

diff --git a/rageval/metrics/__init__.py b/rageval/metrics/__init__.py
index cbba529..04df427 100644
--- a/rageval/metrics/__init__.py
+++ b/rageval/metrics/__init__.py
@@ -19,12 +19,12 @@
 from .answer_groundedness._answer_citation_precision import AnswerCitationPrecision
 from .answer_groundedness._answer_citation_recall import AnswerCitationRecall
 from .answer_groundedness._context_reject_rate import ContextRejectRate
-##from .answer_groundedness._claim_faithfulness import ClaimFaithfulness
+from .answer_groundedness._claim_faithfulness import ClaimFaithfulness
 
 # Metrics about the answer informativeness
-##from .answer_informative._claim_num import ClaimNum
+from .answer_informativeness._claim_num import ClaimNum
 from .answer_informativeness._text_length import TextLength
-##from .answer_informativeness._repetitiveness import Repetitiveness
+from .answer_informativeness._repetitiveness import Repetitiveness
 ##from .answer_informativeness._pairwise_accuracy import PairwiseAccuracy
 
 from .answer_informativeness._answer_distinct12 import AnswerDistinct
diff --git a/rageval/metrics/answer_groundedness/_claim_faithfulness.py b/rageval/metrics/answer_groundedness/_claim_faithfulness.py
index e69de29..06937a8 100644
--- a/rageval/metrics/answer_groundedness/_claim_faithfulness.py
+++ b/rageval/metrics/answer_groundedness/_claim_faithfulness.py
@@ -0,0 +1,122 @@
+from dataclasses import dataclass
+from typing import Optional, Iterable
+from refchecker.extractor import LLMExtractor
+from refchecker.checker import LLMChecker
+
+import evaluate
+import datasets
+import os
+from rageval.metrics import Metric, add_attribute
+import numpy as np
+
+
+_DESCRIPTION = """\
+ClaimFaithfulness is a metric that evaluates to what extent the answer follows the given evidence.
+
+It is calculated by first using the open-source tool RefChecker to extract claims from the generated text, and then using the same tool to check whether the evidence entails each claim. The final score is the fraction of claims entailed by the evidence, providing insight into how faithful the model's outputs are to the given evidence.
+"""
+
+_KWARGS_DESCRIPTION = """\
+Args:
+    name : str
+
+Optional Args:
+    None
+
+Functions:
+    _compute_one: Evaluating the faithfulness of claims generated.
+
+Examples:
+    >>> from datasets import Dataset
+    >>> import rageval as rl
+    >>> sample = {
+    ...     "answers": [
+    ...         "A",
+    ...         "C",
+    ...     ]
+    ... }
+    >>> dataset = Dataset.from_dict(sample)
+    >>> metric = ClaimFaithfulness(model = "openai//home/gomall/models/Qwen2-7B-Instruct",
+            api_base = "http://localhost:5000/v1",
+            api_key = "sk-123456789")
+    >>> metric.mtype
+    'answer_informativeness'
+"""
+
+@dataclass
+@add_attribute('mtype', 'answer_informativeness')
+@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class ClaimFaithfulness(Metric):
+    """Estimates the faithfulness of claims contained in answers."""
+
+    name = "claim_faithfulness"
+
+    ALIAS = ['claim_faithfulness']
+
+    def __init__(self, model: str = "openai//home/gomall/models/Qwen2-7B-Instruct",
+                 api_base: str = "http://localhost:5000/v1",
+                 api_key: str = "sk-123456789"):
+        """
+        Explicitly initialize ClaimFaithfulness.
+
+        Ensure all parent classes are initialized.
+        """
+        self.extractor = LLMExtractor(model=model, batch_size=8, api_base=api_base)
+        self.checker = LLMChecker(model=model, batch_size=8, api_base=api_base)
+        os.environ['OPENAI_API_KEY'] = api_key
+        super().__init__()
+        self.info = evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            citation="",
+            homepage="",
+            features=datasets.Features(
+                {
+                    "answers": datasets.Value("string"),
+                }
+            ),
+            codebase_urls=[],
+            reference_urls=[]
+        )
+
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
+
+    def _compute_one(
+        self,
+        answer: str,
+        question: str,
+        context: str,
+        *args: Optional[Iterable],
+    ) -> float:
+        """Evaluating the faithfulness of claims contained in answers."""
+        extraction_results = self.extractor.extract(
+            batch_responses=[answer],
+            batch_questions=[question],
+            max_new_tokens=1000
+        )
+        claims = [[c.content for c in res.claims] for res in extraction_results]
+        merge_psg = False
+        checking_results = self.checker.check(
+            batch_claims=claims,
+            batch_references=[context],
+            batch_questions=[question],
+            max_reference_segment_length=0,
+            merge_psg=merge_psg,
+            is_joint=True,
+            joint_check_num=5,
+            sagemaker_client=None,
+            sagemaker_params=None,
+            sagemaker_get_response_func=None,
+        )
+        def to_bool(checking_results):
+            if isinstance(checking_results, str):
+                return checking_results == "Entailment"
+            return np.array([to_bool(res) for res in checking_results])
+
+        retrieved2response = to_bool(checking_results)
+        faithful = np.max(retrieved2response, axis=2)
+        faithfulness_score = np.mean(faithful)
+
+        return faithfulness_score
\ No newline at end of file
diff --git a/rageval/metrics/answer_informativeness/_claim_num.py b/rageval/metrics/answer_informativeness/_claim_num.py
index e69de29..188465d 100644
--- a/rageval/metrics/answer_informativeness/_claim_num.py
+++ b/rageval/metrics/answer_informativeness/_claim_num.py
@@ -0,0 +1,97 @@
+from dataclasses import dataclass
+from typing import Optional, Iterable
+from refchecker.extractor import LLMExtractor
+import evaluate
+import datasets
+import os
+from rageval.metrics import Metric, add_attribute
+
+
+_DESCRIPTION = """\
+ClaimNum is a metric designed to evaluate the richness of claims generated by the model.
+
+It is calculated by first utilizing the open-source tool RefChecker to extract claims from the generated text. The ultimate measure is the total number of distinct claims identified, providing insight into the diversity of opinions presented in the model's outputs.
+"""
+
+_KWARGS_DESCRIPTION = """\
+Args:
+    name : str
+
+Optional Args:
+    None
+
+Functions:
+    _compute_one: Evaluating the richness of claims generated.
+
+Examples:
+    >>> from datasets import Dataset
+    >>> import rageval as rl
+    >>> sample = {
+    ...     "answers": [
+    ...         "A",
+    ...         "C",
+    ...     ]
+    ... }
+    >>> dataset = Dataset.from_dict(sample)
+    >>> metric = ClaimNum(model = "openai//home/gomall/models/Qwen2-7B-Instruct",
+            api_base = "http://localhost:5000/v1",
+            api_key = "sk-123456789")
+    >>> metric.mtype
+    'answer_informativeness'
+"""
+
+@dataclass
+@add_attribute('mtype', 'answer_informativeness')
+@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class ClaimNum(Metric):
+    """Estimates the richness of claims contained in answers."""
+
+    name = "claim_num"
+
+    ALIAS = ['claim_num']
+
+    def __init__(self, model: str = "openai//home/gomall/models/Qwen2-7B-Instruct",
+                 api_base: str = "http://localhost:5000/v1",
+                 api_key: str = "sk-123456789"):
+        """
+        Explicitly initialize ClaimNum.
+
+        Ensure all parent classes are initialized.
+        """
+        self.extractor = LLMExtractor(model=model, batch_size=8, api_base=api_base)
+        os.environ['OPENAI_API_KEY'] = api_key
+        super().__init__()
+        self.info = evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            citation="",
+            homepage="",
+            features=datasets.Features(
+                {
+                    "answers": datasets.Value("string"),
+                }
+            ),
+            codebase_urls=[],
+            reference_urls=[]
+        )
+
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
+
+    def _compute_one(
+        self,
+        answer: str,
+        question: str,
+        *args: Optional[Iterable],
+    ) -> float:
+        """Evaluating the richness of claims contained in answers."""
+        extraction_results = self.extractor.extract(
+            batch_responses=[answer],
+            batch_questions=[question],
+            max_new_tokens=1000
+        )
+        claims = [[c.content for c in res.claims] for res in extraction_results]
+        claim_num = len(claims[0])
+
+        return claim_num
\ No newline at end of file
diff --git a/rageval/metrics/answer_informativeness/_repetitiveness.py b/rageval/metrics/answer_informativeness/_repetitiveness.py
index e69de29..5a680d8 100644
--- a/rageval/metrics/answer_informativeness/_repetitiveness.py
+++ b/rageval/metrics/answer_informativeness/_repetitiveness.py
@@ -0,0 +1,101 @@
+from dataclasses import dataclass
+from typing import Optional, Iterable
+
+import evaluate
+import datasets
+import os
+from rageval.metrics import Metric, add_attribute
+import numpy as np
+from nltk.tokenize import sent_tokenize, word_tokenize
+from collections import Counter
+
+
+_DESCRIPTION = """\
+Repetitiveness is a metric that evaluates the repetitiveness/redundancy of the text generated by the model.
+
+It is calculated by splitting the answer into words and sentences, computing for each granularity the fraction of tokens that are repeated, and averaging the two ratios. The higher the score, the higher the redundancy.
+"""
+
+_KWARGS_DESCRIPTION = """\
+Args:
+    name : str
+
+Optional Args:
+    None
+
+Functions:
+    _compute_one: Evaluating the repetitiveness of answers generated.
+
+Examples:
+    >>> from datasets import Dataset
+    >>> import rageval as rl
+    >>> sample = {
+    ...     "answers": [
+    ...         "A",
+    ...         "C",
+    ...     ]
+    ... }
+    >>> dataset = Dataset.from_dict(sample)
+    >>> metric = Repetitiveness()
+    >>> metric.mtype
+    'answer_informativeness'
+"""
+
+@dataclass
+@add_attribute('mtype', 'answer_informativeness')
+@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Repetitiveness(Metric):
+    """Estimates the repetitiveness of answers."""
+
+    name = "repetitiveness"
+
+    ALIAS = ['repetitiveness']
+
+    def __init__(self):
+        """
+        Explicitly initialize Repetitiveness.
+
+        Ensure all parent classes are initialized.
+        """
+        super().__init__()
+        self.info = evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            citation="",
+            homepage="",
+            features=datasets.Features(
+                {
+                    "answers": datasets.Value("string"),
+                }
+            ),
+            codebase_urls=[],
+            reference_urls=[]
+        )
+
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
+
+    def _compute_one(
+        self,
+        answer: str,
+        *args: Optional[Iterable],
+    ) -> float:
+        """Evaluating the repetitiveness of the answer."""
+        def generate_splits(text):
+            words = word_tokenize(text)
+            sentences = sent_tokenize(text)
+            return words, sentences
+        def calculate_redundancy_one(text):
+            words, sentences = generate_splits(text)
+
+            redundancy_ratio = []
+            for gram in [words, sentences]:
+                counts = Counter(gram)
+                repeated_grams = sum(count for count in counts.values() if count > 1)
+                redundancy_ratio.append(repeated_grams / len(gram) if len(gram) > 0 else 0)
+
+            return np.mean(redundancy_ratio)
+        scores = []
+
+        return calculate_redundancy_one(answer)
diff --git a/rageval/metrics/base.py b/rageval/metrics/base.py
index 2232560..cf9225d 100644
--- a/rageval/metrics/base.py
+++ b/rageval/metrics/base.py
@@ -58,6 +58,7 @@ def compute(
         self,
         pred_answers: Optional[Iterable] = None,
         ref_answers: Optional[Iterable] = None,
+        contexts: Optional[Iterable] = None,
         batch_size: Optional[int] = None,
         *args: Optional[Iterable],
     ) -> Tuple[float, List[float]]:
@@ -67,7 +68,7 @@
 
         Return average scores of all inputs and a score list for each example.
         """
         self._validate_data(pred_answers, ref_answers, *args)
-        scores = self._compute_batch(pred_answers, ref_answers, *args)
+        scores = self._compute_batch(pred_answers, ref_answers, contexts, *args)
 
         return np.average(scores), scores
 
@@ -76,6 +77,7 @@ def _compute_one(
         self,
         pred_answer: Optional[Iterable] = None,
         ref_answer: Optional[Iterable] = None,
+        context: Optional[Iterable] = None,
         *args: Optional[Iterable]
     ) -> float:
         ...
# pragma: no cover @@ -84,11 +86,17 @@ def _compute_batch( self, pred_answers: Optional[Iterable] = None, ref_answers: Optional[Iterable] = None, + contexts: Optional[Iterable] = None, *args: Optional[Iterable] ) -> List[float]: """Compute the metric for a batch of predictions and references.""" scores = [] - if (pred_answers and ref_answers): # if both columns exist + if contexts: + for pred_answer, ref_answer, context in tqdm(zip(pred_answers, ref_answers, contexts), + desc=f"Computing {self.name}", + total=len(pred_answers)): + scores.append(self._compute_one(pred_answer, ref_answer, context)) + elif (pred_answers and ref_answers): # if both columns exist for pred_answer, ref_answer in tqdm(zip(pred_answers, ref_answers), desc=f"Computing {self.name}", total=len(pred_answers)): diff --git a/tests/units/test_claim_faithfulness.py b/tests/units/test_claim_faithfulness.py new file mode 100644 index 0000000..6ed6caa --- /dev/null +++ b/tests/units/test_claim_faithfulness.py @@ -0,0 +1,41 @@ +import pytest +from datasets import Dataset + +from rageval.metrics import ClaimFaithfulness +import rageval as rl + + +@pytest.fixture(scope='module') +def sample(): + test_case = { + "questions": [ + "习近平主席在何时何地会见了埃塞俄比亚总理海尔马里亚姆?", + "埃塞俄比亚希望与中国在哪些领域加强合作?" + ], + "answers": [ + "习近平主席在2017年5月12日于北京人民大会堂会见了埃塞俄比亚总理海尔马里亚姆。", + "埃塞俄比亚希望与中国在以下领域加强合作:\n1. **共建“一带一路”框架下合作**:埃塞俄比亚表示希望能够积极参与“一带一路”倡议,深化与中国在基础设施建设、产能合作、互联互通等领域的合作。\n2. **提高工业化水平和出口创汇能力**:埃塞俄比亚期待中国在推动其工业化进程以及提升出口创汇能力方面提供帮助和合作。\n3. **安全、有序、有效推进经贸合作**:希望与中方在贸易和投资合作领域取得进展,实现稳定、有序和高效的合作。" + ], + "contexts":[ + ["人民网北京5月12日电??(记者杜尚泽)国家主席习近平12日在人民大会堂会见来华出席“一带一路”国际合作高峰论坛的埃塞俄比亚总理海尔马里亚姆习近平指出,在双方共同努力下,中埃合作已经走在中非合作前列为体现双方关系的战略性和政治互信的高水平,我提议将中埃关系定位提升为全面战略合作伙伴关系双方要保持高层交往势头,扩大各领域交流合作,在涉及彼此核心利益和重大关切问题上相互理解和支持要集中资源优势,重点打造互联互通、产能等领域合作项目,安全、有序、有效推进两国经贸互利合作要加强青年、妇女、高校、智库、媒体、文化等人文领域交流合作,增进两国人民相互了解和友谊,中方欢迎两国扩大航空合作习近平强调,埃塞俄比亚是非洲有重要影响力的国家,也是海上丝绸之路的历史和自然延伸中方祝贺埃塞俄比亚加入亚洲基础设施投资银行,支持埃方继续发挥“一带一路”连接非洲的桥梁和纽带作用中方愿同埃方一道,以落实中非合作论坛约翰内斯堡峰会成果和推进“一带一路”建设为契机,全面深化两国各领域合作,促进合作共赢、共同发展海尔马里亚姆表示,埃中在经贸、投资、产能、基础设施建设等广泛领域合作取得长足进展埃方愿在落实中非合作论坛约翰内斯堡峰会成果框架内深化埃中合作“一带一路”倡议是有远见、具有全球意义的伟大倡议,将有助于推进互联互通、促进贸易投资合作埃方将积极参与“一带一路”建设合作王沪宁、栗战书、杨洁篪等参加会见《 人民日报 》( 2017年05月13日 01 版)","人民网北京5月12日电??(记者杜尚泽)国家主席习近平12日在人民大会堂会见来华出席“一带一路”国际合作高峰论坛的埃塞俄比亚总理海尔马里亚姆。习近平指出,在双方共同努力下,中埃合作已经走在中非合作前列。为体现双方关系的战略性和政治互信的高水平,我提议将中埃关系定位提升为全面战略合作伙伴关系。双方要保持高层交往势头,扩大各领域交流合作,在涉及彼此核心利益和重大关切问题上相互理解和支持。要集中资源优势,重点打造互联互通、产能等领域合作项目,安全、有序、有效推进两国经贸互利合作。要加强青年、妇女、高校、智库、媒体、文化等人文领域交流合作,增进两国人民相互了解和友谊,中方欢迎两国扩大航空合作。习近平强调,埃塞俄比亚是非洲有重要影响力的国家,也是海上丝绸之路的历史和自然延伸。中方祝贺埃塞俄比亚加入亚洲基础设施投资银行,支持埃方继续发挥“一带一路”连接非洲的桥梁和纽带作用。中方愿同埃方一道,以落实中非合作论坛约翰内斯堡峰会成果和推进“一带一路”建设为契机,全面深化两国各领域合作,促进合作共赢、共同发展。海尔马里亚姆表示,埃中在经贸、投资、产能、基础设施建设等广泛领域合作取得长足进展。埃方愿在落实中非合作论坛约翰内斯堡峰会成果框架内深化埃中合作。“一带一路”倡议是有远见、具有全球意义的伟大倡议,将有助于推进互联互通、促进贸易投资合作。埃方将积极参与“一带一路”建设合作。王沪宁、栗战书、杨洁篪等参加会见。《 人民日报 》( 2017年05月13日 01 版)"], + ["人民网北京5月12日电??(记者杜尚泽)国家主席习近平12日在人民大会堂会见来华出席“一带一路”国际合作高峰论坛的埃塞俄比亚总理海尔马里亚姆。习近平指出,在双方共同努力下,中埃合作已经走在中非合作前列。为体现双方关系的战略性和政治互信的高水平,我提议将中埃关系定位提升为全面战略合作伙伴关系。双方要保持高层交往势头,扩大各领域交流合作,在涉及彼此核心利益和重大关切问题上相互理解和支持。要集中资源优势,重点打造互联互通、产能等领域合作项目,安全、有序、有效推进两国经贸互利合作。要加强青年、妇女、高校、智库、媒体、文化等人文领域交流合作,增进两国人民相互了解和友谊,中方欢迎两国扩大航空合作。习近平强调,埃塞俄比亚是非洲有重要影响力的国家,也是海上丝绸之路的历史和自然延伸。中方祝贺埃塞俄比亚加入亚洲基础设施投资银行,支持埃方继续发挥“一带一路”连接非洲的桥梁和纽带作用。中方愿同埃方一道,以落实中非合作论坛约翰内斯堡峰会成果和推进“一带一路”建设为契机,全面深化两国各领域合作,促进合作共赢、共同发展。海尔马里亚姆表示,埃中在经贸、投资、产能、基础设施建设等广泛领域合作取得长足进展。埃方愿在落实中非合作论坛约翰内斯堡峰会成果框架内深化埃中合作。“一带一路”倡议是有远见、具有全球意义的伟大倡议,将有助于推进互联互通、促进贸易投资合作。埃方将积极参与“一带一路”建设合作。王沪宁、栗战书、杨洁篪等参加会见。《 人民日报 》( 2017年05月13日 01 
版)","人民网北京5月12日电??(记者杜尚泽)国家主席习近平12日在人民大会堂会见来华出席“一带一路”国际合作高峰论坛的埃塞俄比亚总理海尔马里亚姆。习近平指出,在双方共同努力下,中埃合作已经走在中非合作前列。为体现双方关系的战略性和政治互信的高水平,我提议将中埃关系定位提升为全面战略合作伙伴关系。双方要保持高层交往势头,扩大各领域交流合作,在涉及彼此核心利益和重大关切问题上相互理解和支持。要集中资源优势,重点打造互联互通、产能等领域合作项目,安全、有序、有效推进两国经贸互利合作。要加强青年、妇女、高校、智库、媒体、文化等人文领域交流合作,增进两国人民相互了解和友谊,中方欢迎两国扩大航空合作。习近平强调,埃塞俄比亚是非洲有重要影响力的国家,也是海上丝绸之路的历史和自然延伸。中方祝贺埃塞俄比亚加入亚洲基础设施投资银行,支持埃方继续发挥“一带一路”连接非洲的桥梁和纽带作用。中方愿同埃方一道,以落实中非合作论坛约翰内斯堡峰会成果和推进“一带一路”建设为契机,全面深化两国各领域合作,促进合作共赢、共同发展。海尔马里亚姆表示,埃中在经贸、投资、产能、基础设施建设等广泛领域合作取得长足进展。埃方愿在落实中非合作论坛约翰内斯堡峰会成果框架内深化埃中合作。“一带一路”倡议是有远见、具有全球意义的伟大倡议,将有助于推进互联互通、促进贸易投资合作。埃方将积极参与“一带一路”建设合作。王沪宁、栗战书、杨洁篪等参加会见。《 人民日报 》( 2017年05月13日 01 版)"] + ] + } + return test_case + + +@pytest.fixture(scope='module') +def testset(sample): + ds = Dataset.from_dict(sample) + return ds + + +@pytest.mark.skip +def test_case_on_text_length(testset): + metric = ClaimFaithfulness(model = "openai//home/gomall/models/Qwen2-7B-Instruct", + api_base = "http://project.gomall.ac.cn:30590/notebook/tensorboard/wangwenshan/1161/v1", + api_key = "sk-123456789") + assert metric.name == "claim_faithfulness" + score, results = metric.compute(testset["answers"], testset["questions"], testset["contexts"]) + print(score, results) + assert isinstance(score, float) diff --git a/tests/units/test_claim_num.py b/tests/units/test_claim_num.py new file mode 100644 index 0000000..270708a --- /dev/null +++ b/tests/units/test_claim_num.py @@ -0,0 +1,38 @@ +import pytest +from datasets import Dataset + +from rageval.metrics import ClaimNum +import rageval as rl + + +@pytest.fixture(scope='module') +def sample(): + test_case = { + "questions": [ + "习近平主席在何时何地会见了埃塞俄比亚总理海尔马里亚姆?", + "埃塞俄比亚希望与中国在哪些领域加强合作?" + ], + "answers": [ + "习近平主席在2017年5月12日于北京人民大会堂会见了埃塞俄比亚总理海尔马里亚姆。", + "埃塞俄比亚希望与中国在以下领域加强合作:\n1. **共建“一带一路”框架下合作**:埃塞俄比亚表示希望能够积极参与“一带一路”倡议,深化与中国在基础设施建设、产能合作、互联互通等领域的合作。\n2. **提高工业化水平和出口创汇能力**:埃塞俄比亚期待中国在推动其工业化进程以及提升出口创汇能力方面提供帮助和合作。\n3. **安全、有序、有效推进经贸合作**:希望与中方在贸易和投资合作领域取得进展,实现稳定、有序和高效的合作。" + ] + } + return test_case + + +@pytest.fixture(scope='module') +def testset(sample): + ds = Dataset.from_dict(sample) + return ds + + +@pytest.mark.skip +def test_case_on_text_length(testset): + metric = ClaimNum(model = "openai//home/gomall/models/Qwen2-7B-Instruct", + api_base = "http://project.gomall.ac.cn:30590/notebook/tensorboard/wangwenshan/1161/v1", + api_key = "sk-123456789") + assert metric.name == "claim_num" + score, results = metric.compute(testset["answers"], testset["questions"]) + print(score, results) # 5.5 [3,8] + assert isinstance(score, float) + diff --git a/tests/units/test_repetitiveness.py b/tests/units/test_repetitiveness.py new file mode 100644 index 0000000..427cae5 --- /dev/null +++ b/tests/units/test_repetitiveness.py @@ -0,0 +1,32 @@ +import pytest +from datasets import Dataset + +from rageval.metrics import Repetitiveness +import rageval as rl + + +#@pytest.fixture(scope='module') +def sample(): + test_case = { + "answers": [ + "习近平主席在2017年5月12日于北京人民大会堂会见了埃塞俄比亚总理海尔马里亚姆。", + "埃塞俄比亚希望与中国在以下领域加强合作:\n1. **共建“一带一路”框架下合作**:埃塞俄比亚表示希望能够积极参与“一带一路”倡议,深化与中国在基础设施建设、产能合作、互联互通等领域的合作。\n2. **提高工业化水平和出口创汇能力**:埃塞俄比亚期待中国在推动其工业化进程以及提升出口创汇能力方面提供帮助和合作。\n3. 
**安全、有序、有效推进经贸合作**:希望与中方在贸易和投资合作领域取得进展,实现稳定、有序和高效的合作。" + ] + } + return test_case + + +#@pytest.fixture(scope='module') +def testset(sample): + ds = Dataset.from_dict(sample) + return ds + + +#@pytest.mark.slow +def test_case_on_text_length(testset): + metric = Repetitiveness() + assert metric.name == "repetitiveness" + score, results = metric.compute(testset["answers"]) + assert round(score, 2) == 0.16 + +test_case_on_text_length(testset(sample())) From 31391162363f9bab45f41b379f00b6623c9df827 Mon Sep 17 00:00:00 2001 From: Wenshansilvia Date: Wed, 13 Nov 2024 19:19:23 +0800 Subject: [PATCH 2/9] add claimnum claimfaithfulness repetitiveness --- rageval/metrics/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rageval/metrics/base.py b/rageval/metrics/base.py index cf9225d..651c404 100644 --- a/rageval/metrics/base.py +++ b/rageval/metrics/base.py @@ -58,8 +58,8 @@ def compute( self, pred_answers: Optional[Iterable] = None, ref_answers: Optional[Iterable] = None, - contexts: Optional[Iterable] = None, batch_size: Optional[int] = None, + contexts: Optional[Iterable] = None, *args: Optional[Iterable], ) -> Tuple[float, List[float]]: """ From 6d0b12226f8232368647fb47f5348b7175544553 Mon Sep 17 00:00:00 2001 From: Wenshansilvia Date: Wed, 13 Nov 2024 19:21:17 +0800 Subject: [PATCH 3/9] add claimnum claimfaithfulness repetitiveness --- tests/units/test_claim_faithfulness.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/units/test_claim_faithfulness.py b/tests/units/test_claim_faithfulness.py index 6ed6caa..04c6284 100644 --- a/tests/units/test_claim_faithfulness.py +++ b/tests/units/test_claim_faithfulness.py @@ -36,6 +36,8 @@ def test_case_on_text_length(testset): api_base = "http://project.gomall.ac.cn:30590/notebook/tensorboard/wangwenshan/1161/v1", api_key = "sk-123456789") assert metric.name == "claim_faithfulness" - score, results = metric.compute(testset["answers"], testset["questions"], testset["contexts"]) + score, results = metric.compute(pred_answers = testset["answers"], + ref_answers = testset["questions"], + contexts = testset["contexts"]) print(score, results) assert isinstance(score, float) From 798f57c341969f7308586a8aaee4f2124541be12 Mon Sep 17 00:00:00 2001 From: Wenshansilvia Date: Fri, 15 Nov 2024 15:50:16 +0800 Subject: [PATCH 4/9] add claimnum claimfaithfulness repetitiveness --- rageval/metrics/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rageval/metrics/base.py b/rageval/metrics/base.py index 651c404..0fb1140 100644 --- a/rageval/metrics/base.py +++ b/rageval/metrics/base.py @@ -68,7 +68,10 @@ def compute( Return average scores of all inputs and a score list for each example. 
""" self._validate_data(pred_answers, ref_answers, *args) - scores = self._compute_batch(pred_answers, ref_answers, contexts, *args) + if contexts: + scores = self._compute_batch(pred_answers, ref_answers, contexts, *args) + else: + scores = self._compute_batch(pred_answers, ref_answers, *args) return np.average(scores), scores From 4a69e345327cecd3dbe2bd6363f55fb297f3331e Mon Sep 17 00:00:00 2001 From: Wenshansilvia Date: Fri, 15 Nov 2024 18:14:40 +0800 Subject: [PATCH 5/9] add claimnum claimfaithfulness repetitiveness --- .../metrics/answer_groundedness/_claim_faithfulness.py | 4 +--- rageval/metrics/answer_informativeness/_claim_num.py | 4 +--- tests/units/test_repetitiveness.py | 8 +++----- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/rageval/metrics/answer_groundedness/_claim_faithfulness.py b/rageval/metrics/answer_groundedness/_claim_faithfulness.py index 06937a8..2bdde4e 100644 --- a/rageval/metrics/answer_groundedness/_claim_faithfulness.py +++ b/rageval/metrics/answer_groundedness/_claim_faithfulness.py @@ -36,9 +36,7 @@ ... ] ... } >>> dataset = Dataset.from_dict(sample) - >>> metric = ClaimFaithfulness(model = "openai//home/gomall/models/Qwen2-7B-Instruct", - api_base = "http://localhost:5000/v1", - api_key = "sk-123456789") + >>> metric = ClaimFaithfulness(model = "openai//home/gomall/models/Qwen2-7B-Instruct", api_base = "http://project.gomall.ac.cn:30590/notebook/tensorboard/wangwenshan/1161/v1", api_key = "sk-123456789") >>> metric.mtype 'answer_informativeness' """ diff --git a/rageval/metrics/answer_informativeness/_claim_num.py b/rageval/metrics/answer_informativeness/_claim_num.py index 188465d..adc5c6f 100644 --- a/rageval/metrics/answer_informativeness/_claim_num.py +++ b/rageval/metrics/answer_informativeness/_claim_num.py @@ -33,9 +33,7 @@ ... ] ... 
}
     >>> dataset = Dataset.from_dict(sample)
-    >>> metric = ClaimNum(model = "openai//home/gomall/models/Qwen2-7B-Instruct",
-            api_base = "http://localhost:5000/v1",
-            api_key = "sk-123456789")
+    >>> metric = ClaimNum(model = "openai//home/gomall/models/Qwen2-7B-Instruct", api_base = "http://localhost:5000/v1", api_key = "sk-123456789")
     >>> metric.mtype
     'answer_informativeness'
 """
diff --git a/tests/units/test_repetitiveness.py b/tests/units/test_repetitiveness.py
index 427cae5..16173eb 100644
--- a/tests/units/test_repetitiveness.py
+++ b/tests/units/test_repetitiveness.py
@@ -5,7 +5,7 @@
 import rageval as rl
 
 
-#@pytest.fixture(scope='module')
+@pytest.fixture(scope='module')
 def sample():
     test_case = {
         "answers": [
@@ -16,17 +16,15 @@ def sample():
     return test_case
 
 
-#@pytest.fixture(scope='module')
+@pytest.fixture(scope='module')
 def testset(sample):
     ds = Dataset.from_dict(sample)
     return ds
 
 
-#@pytest.mark.slow
+@pytest.mark.slow
 def test_case_on_text_length(testset):
     metric = Repetitiveness()
     assert metric.name == "repetitiveness"
     score, results = metric.compute(testset["answers"])
     assert round(score, 2) == 0.16
-
-test_case_on_text_length(testset(sample()))

From 8382d707ca8ff33e0d065035dccf243f5a85fd7b Mon Sep 17 00:00:00 2001
From: Wenshansilvia
Date: Mon, 18 Nov 2024 11:04:13 +0800
Subject: [PATCH 6/9] add claimnum claimfaithfulness repetitiveness

---
 rageval/metrics/answer_groundedness/_claim_faithfulness.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rageval/metrics/answer_groundedness/_claim_faithfulness.py b/rageval/metrics/answer_groundedness/_claim_faithfulness.py
index 2bdde4e..7ea840b 100644
--- a/rageval/metrics/answer_groundedness/_claim_faithfulness.py
+++ b/rageval/metrics/answer_groundedness/_claim_faithfulness.py
@@ -36,7 +36,7 @@
     ...     ]
     ... }
     >>> dataset = Dataset.from_dict(sample)
-    >>> metric = ClaimFaithfulness(model = "openai//home/gomall/models/Qwen2-7B-Instruct", api_base = "http://project.gomall.ac.cn:30590/notebook/tensorboard/wangwenshan/1161/v1", api_key = "sk-123456789")
+    >>> metric = ClaimFaithfulness(model = "openai//home/gomall/models/Qwen2-7B-Instruct", api_base = "http://project.gomall.ac.cn:30590/notebook/tensorboard/wangwenshan/1161/v1", api_key = "sk-123456789")
     >>> metric.mtype
     'answer_informativeness'
 """

From 4afa71ce4da9c2cc7a01d28b26a1439e20c67887 Mon Sep 17 00:00:00 2001
From: Wenshansilvia
Date: Mon, 18 Nov 2024 11:30:34 +0800
Subject: [PATCH 7/9] add claimnum claimfaithfulness repetitiveness

---
 .../_claim_faithfulness.py                    | 33 ++++++++++---------
 .../answer_informativeness/_claim_num.py      |  7 ++--
 .../answer_informativeness/_repetitiveness.py |  7 ++--
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/rageval/metrics/answer_groundedness/_claim_faithfulness.py b/rageval/metrics/answer_groundedness/_claim_faithfulness.py
index 7ea840b..57b5a85 100644
--- a/rageval/metrics/answer_groundedness/_claim_faithfulness.py
+++ b/rageval/metrics/answer_groundedness/_claim_faithfulness.py
@@ -11,7 +11,7 @@
 
 _DESCRIPTION = """\
-ClaimFaithfulness is a metric that evaluates to what extent the answer follows the given evidence. 
+ClaimFaithfulness is a metric that evaluates to what extent the answer follows the given evidence.
 
 It is calculated by first using the open-source tool RefChecker to extract claims from the generated text, and then using the same tool to check whether the evidence entails each claim. The final score is the fraction of claims entailed by the evidence, providing insight into how faithful the model's outputs are to the given evidence.
 """
@@ -41,6 +41,7 @@
     'answer_informativeness'
 """
 
+
 @dataclass
 @add_attribute('mtype', 'answer_informativeness')
 @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class ClaimFaithfulness(Metric):
     """Estimates the faithfulness of claims contained in answers."""
@@ -93,28 +94,28 @@ def _compute_one(
             batch_responses=[answer],
             batch_questions=[question],
             max_new_tokens=1000
-        )
-        claims = [[c.content for c in res.claims] for res in extraction_results]
+            )
+        claims = [[c.content for c in res.claims] for res in extraction_results]
         merge_psg = False
         checking_results = self.checker.check(
-            batch_claims=claims,
-            batch_references=[context],
-            batch_questions=[question],
-            max_reference_segment_length=0,
-            merge_psg=merge_psg,
-            is_joint=True,
-            joint_check_num=5,
-            sagemaker_client=None,
-            sagemaker_params=None,
-            sagemaker_get_response_func=None,
-        )
+            batch_claims=claims,
+            batch_references=[context],
+            batch_questions=[question],
+            max_reference_segment_length=0,
+            merge_psg=merge_psg,
+            is_joint=True,
+            joint_check_num=5,
+            sagemaker_client=None,
+            sagemaker_params=None,
+            sagemaker_get_response_func=None,
+            )
         def to_bool(checking_results):
             if isinstance(checking_results, str):
                 return checking_results == "Entailment"
             return np.array([to_bool(res) for res in checking_results])
-
+
         retrieved2response = to_bool(checking_results)
         faithful = np.max(retrieved2response, axis=2)
         faithfulness_score = np.mean(faithful)
 
-        return faithfulness_score
\ No newline at end of file
+        return faithfulness_score
diff --git a/rageval/metrics/answer_informativeness/_claim_num.py b/rageval/metrics/answer_informativeness/_claim_num.py
index adc5c6f..561b417 100644
--- a/rageval/metrics/answer_informativeness/_claim_num.py
+++ b/rageval/metrics/answer_informativeness/_claim_num.py
@@ -8,7 +8,7 @@
 
 _DESCRIPTION = """\
-ClaimNum is a metric designed to evaluate the richness of claims generated by the model. 
+ClaimNum is a metric designed to evaluate the richness of claims generated by the model.
 
 It is calculated by first utilizing the open-source tool RefChecker to extract claims from the generated text. The ultimate measure is the total number of distinct claims identified, providing insight into the diversity of opinions presented in the model's outputs.
 """
@@ -38,6 +38,7 @@
     'answer_informativeness'
 """
 
+
 @dataclass
 @add_attribute('mtype', 'answer_informativeness')
 @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class ClaimNum(Metric):
     """Estimates the richness of claims contained in answers."""
@@ -89,7 +90,7 @@ def _compute_one(
             batch_questions=[question],
             max_new_tokens=1000
         )
-        claims = [[c.content for c in res.claims] for res in extraction_results]
+        claims = [[c.content for c in res.claims] for res in extraction_results]
         claim_num = len(claims[0])
 
-        return claim_num
\ No newline at end of file
+        return claim_num
diff --git a/rageval/metrics/answer_informativeness/_repetitiveness.py b/rageval/metrics/answer_informativeness/_repetitiveness.py
index 5a680d8..d430f35 100644
--- a/rageval/metrics/answer_informativeness/_repetitiveness.py
+++ b/rageval/metrics/answer_informativeness/_repetitiveness.py
@@ -11,7 +11,7 @@
 
 _DESCRIPTION = """\
-Repetitiveness is a metric that evaluates the repetitiveness/redundancy of the text generated by the model. 
+Repetitiveness is a metric that evaluates the repetitiveness/redundancy of the text generated by the model.
 
 It is calculated by splitting the answer into words and sentences, computing for each granularity the fraction of tokens that are repeated, and averaging the two ratios. The higher the score, the higher the redundancy.
 """
@@ -41,6 +41,7 @@
     'answer_informativeness'
 """
 
+
 @dataclass
 @add_attribute('mtype', 'answer_informativeness')
 @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class Repetitiveness(Metric):
     """Estimates the repetitiveness of answers."""
@@ -86,16 +87,14 @@ def _compute_one(
         def generate_splits(text):
             words = word_tokenize(text)
             sentences = sent_tokenize(text)
             return words, sentences
+
         def calculate_redundancy_one(text):
             words, sentences = generate_splits(text)
-
             redundancy_ratio = []
             for gram in [words, sentences]:
                 counts = Counter(gram)
                 repeated_grams = sum(count for count in counts.values() if count > 1)
                 redundancy_ratio.append(repeated_grams / len(gram) if len(gram) > 0 else 0)
-
             return np.mean(redundancy_ratio)
-        scores = []
 
         return calculate_redundancy_one(answer)

From 336a63859f707aaddb88ad6a64c6a1c9c2a06fd9 Mon Sep 17 00:00:00 2001
From: Wenshansilvia
Date: Mon, 18 Nov 2024 15:46:08 +0800
Subject: [PATCH 8/9] add claimnum claimfaithfulness repetitiveness

---
 .../_claim_faithfulness.py                    | 33 +++++++++----------
 rageval/metrics/base.py                       |  4 +--
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/rageval/metrics/answer_groundedness/_claim_faithfulness.py b/rageval/metrics/answer_groundedness/_claim_faithfulness.py
index 57b5a85..e345b06 100644
--- a/rageval/metrics/answer_groundedness/_claim_faithfulness.py
+++ b/rageval/metrics/answer_groundedness/_claim_faithfulness.py
@@ -90,25 +90,24 @@ def _compute_one(
         *args: Optional[Iterable],
     ) -> float:
         """Evaluating the faithfulness of claims contained in answers."""
-        extraction_results = self.extractor.extract(
-            batch_responses=[answer],
-            batch_questions=[question],
-            max_new_tokens=1000
-            )
+        extraction_results = self.extractor.extract(batch_responses=[answer],
+                                                    batch_questions=[question],
+                                                    max_new_tokens=1000
+                                                    )
         claims = [[c.content for c in res.claims] for res in extraction_results]
         merge_psg = False
-        checking_results = self.checker.check(
-            batch_claims=claims,
-            batch_references=[context],
-            batch_questions=[question],
-            max_reference_segment_length=0,
-            merge_psg=merge_psg,
-            is_joint=True,
-            joint_check_num=5,
-            sagemaker_client=None,
-            sagemaker_params=None,
-            sagemaker_get_response_func=None,
-            )
+        checking_results = self.checker.check(batch_claims=claims,
+                                              batch_references=[context],
+                                              batch_questions=[question],
+                                              max_reference_segment_length=0,
+                                              merge_psg=merge_psg,
+                                              is_joint=True,
+                                              joint_check_num=5,
+                                              sagemaker_client=None,
+                                              sagemaker_params=None,
+                                              sagemaker_get_response_func=None
+                                              )
+
         def to_bool(checking_results):
             if isinstance(checking_results, str):
                 return checking_results == "Entailment"
diff --git a/rageval/metrics/base.py b/rageval/metrics/base.py
index 0fb1140..c7fdfa1 100644
--- a/rageval/metrics/base.py
+++ b/rageval/metrics/base.py
@@ -96,8 +96,8 @@ def _compute_batch(
         scores = []
         if contexts:
             for pred_answer, ref_answer, context in tqdm(zip(pred_answers, ref_answers, contexts),
-                                                desc=f"Computing {self.name}",
-                                                total=len(pred_answers)):
+                                                         desc=f"Computing {self.name}",
+                                                         total=len(pred_answers)):
                 scores.append(self._compute_one(pred_answer, ref_answer, context))

From eb939d7698fdf0b5c7583876c896407a321a4ff8 Mon Sep 17 00:00:00 2001
From: Wenshansilvia
Date: Mon, 18 Nov 2024 17:16:04 +0800
Subject: [PATCH 9/9] add claimnum claimfaithfulness repetitiveness

---
 rageval/metrics/answer_groundedness/_claim_faithfulness.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git 
a/rageval/metrics/answer_groundedness/_claim_faithfulness.py b/rageval/metrics/answer_groundedness/_claim_faithfulness.py index e345b06..5af3537 100644 --- a/rageval/metrics/answer_groundedness/_claim_faithfulness.py +++ b/rageval/metrics/answer_groundedness/_claim_faithfulness.py @@ -107,7 +107,7 @@ def _compute_one( sagemaker_params=None, sagemaker_get_response_func=None ) - + def to_bool(checking_results): if isinstance(checking_results, str): return checking_results == "Entailment"
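
A minimal usage sketch of the three new metrics, mirroring the calls in the new unit tests. The model path, api_base and api_key are the placeholder values from the tests (not working credentials), the sample data below is illustrative, and ClaimNum and ClaimFaithfulness assume a reachable OpenAI-compatible endpoint for RefChecker:

    from datasets import Dataset

    from rageval.metrics import ClaimFaithfulness, ClaimNum, Repetitiveness

    # Illustrative sample; any parallel lists of questions/answers/contexts work.
    testset = Dataset.from_dict({
        "questions": ["When did the meeting take place?"],
        "answers": ["The meeting took place on 12 May 2017 in Beijing."],
        "contexts": [["The two leaders met at the Great Hall of the People in Beijing on 12 May 2017."]],
    })

    # Repetitiveness is purely lexical (NLTK tokenizers); no model backend is needed.
    metric = Repetitiveness()
    score, results = metric.compute(testset["answers"])  # mean ratio, plus one ratio per answer

    # ClaimNum extracts claims with RefChecker and returns the number of claims per answer.
    metric = ClaimNum(model="openai//home/gomall/models/Qwen2-7B-Instruct",
                      api_base="http://localhost:5000/v1",
                      api_key="sk-123456789")
    score, results = metric.compute(testset["answers"], testset["questions"])

    # ClaimFaithfulness also checks each extracted claim against the contexts;
    # contexts is passed by keyword, matching the final Metric.compute signature.
    metric = ClaimFaithfulness(model="openai//home/gomall/models/Qwen2-7B-Instruct",
                               api_base="http://localhost:5000/v1",
                               api_key="sk-123456789")
    score, results = metric.compute(pred_answers=testset["answers"],
                                    ref_answers=testset["questions"],
                                    contexts=testset["contexts"])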