Add ClaimNum, ClaimFaithfulness, and Repetitiveness metrics #126

Open

wants to merge 9 commits into base: main
6 changes: 3 additions & 3 deletions rageval/metrics/__init__.py
@@ -19,12 +19,12 @@
from .answer_groundedness._answer_citation_precision import AnswerCitationPrecision
from .answer_groundedness._answer_citation_recall import AnswerCitationRecall
from .answer_groundedness._context_reject_rate import ContextRejectRate
##from .answer_groundedness._claim_faithfulness import ClaimFaithfulness
from .answer_groundedness._claim_faithfulness import ClaimFaithfulness

# Metrics about the answer informativeness
##from .answer_informative._claim_num import ClaimNum
from .answer_informativeness._claim_num import ClaimNum
from .answer_informativeness._text_length import TextLength
##from .answer_informativeness._repetitiveness import Repetitiveness
from .answer_informativeness._repetitiveness import Repetitiveness
##from .answer_informativeness._pairwise_accuracy import PairwiseAccuracy
from .answer_informativeness._answer_distinct12 import AnswerDistinct

120 changes: 120 additions & 0 deletions rageval/metrics/answer_groundedness/_claim_faithfulness.py
@@ -1,0 +1,120 @@
from dataclasses import dataclass
from typing import Optional, Iterable
from refchecker.extractor import LLMExtractor
from refchecker.checker import LLMChecker

import evaluate
import datasets
import os
from rageval.metrics import Metric, add_attribute
import numpy as np


_DESCRIPTION = """\
ClaimFaithfulness is a metric that evaluates to what extent the answer follows the given evidence.

It is calculated by first using the open-source tool RefChecker to extract claims from the generated text, and then using the same tool to check whether the evidence entails each claim. The final score is the proportion of claims entailed by the evidence, providing insight into how faithful the model's output is to the given evidence.
"""

_KWARGS_DESCRIPTION = """\
Args:
name : str

Optional Args:
None

Functions:
_compute_one: Evaluating the faithfulness of claims generated.

Examples:
>>> from datasets import Dataset
>>> import rageval as rl
>>> sample = {
... "answers": [
... "A",
... "C",
... ]
... }
>>> dataset = Dataset.from_dict(sample)
>>> metric = ClaimFaithfulness(model = "openai//home/gomall/models/Qwen2-7B-Instruct", api_base = "http://project.gomall.ac.cn:30590/notebook/tensorboard/wangwenshan/1161/v1", api_key = "sk-123456789")
>>> metric.mtype
'answer_groundedness'
"""


@dataclass
@add_attribute('mtype', 'answer_groundedness')
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ClaimFaithfulness(Metric):
"""Estimates the faithfulness of claims contained in answers."""

name = "claim_faithfulness"

ALIAS = ['claim_faithfulness']

def __init__(self, model: str = "openai//home/gomall/models/Qwen2-7B-Instruct",
api_base: str = "http://localhost:5000/v1",
api_key: str = "sk-123456789"):
"""
Explicitly initialize ClaimFaithfulness.

Ensure all parent classes are initialized.
"""
self.extractor = LLMExtractor(model=model, batch_size=8, api_base=api_base)
self.checker = LLMChecker(model=model, batch_size=8, api_base=api_base)
os.environ['OPENAI_API_KEY'] = api_key
super().__init__()
self.info = evaluate.MetricInfo(
description=_DESCRIPTION,
inputs_description=_KWARGS_DESCRIPTION,
citation="",
homepage="",
features=datasets.Features(
{
"answers": datasets.Value("string"),
}
),
codebase_urls=[],
reference_urls=[]
)

def __repr__(self) -> str:
""":return: Formatted string representation of the metric."""
return f"{self.ALIAS[0]}" # pragma: no cover

def _compute_one(
self,
answer: str,
question: str,
context: str,
*args: Optional[Iterable],
) -> float:
"""Evaluating the richness of claims contained in answers."""
extraction_results = self.extractor.extract(batch_responses=[answer],

batch_questions=[question],
max_new_tokens=1000
)
claims = [[c.content for c in res.claims] for res in extraction_results]
merge_psg = False
checking_results = self.checker.check(batch_claims=claims,

batch_references=[context],
batch_questions=[question],
max_reference_segment_length=0,
merge_psg=merge_psg,
is_joint=True,
joint_check_num=5,
sagemaker_client=None,
sagemaker_params=None,
sagemaker_get_response_func=None
)

def to_bool(checking_results):
if isinstance(checking_results, str):
return checking_results == "Entailment"
return np.array([to_bool(res) for res in checking_results])

retrieved2response = to_bool(checking_results)
faithful = np.max(retrieved2response, axis=2)
faithfulness_score = np.mean(faithful)

return faithfulness_score

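For reviewers, the aggregation at the end of `_compute_one` may be easier to follow on a toy input. The sketch below is illustrative only: the `labels` array is a hypothetical stand-in for RefChecker's per-claim, per-reference verdicts, arranged in the [batch, claim, reference] layout implied by the `axis=2` reduction above; it reproduces the any-reference-entails-the-claim max followed by the mean over claims.

```python
import numpy as np

# Hypothetical checker verdicts for one answer: labels[batch][claim][reference].
labels = [[
    ["Entailment", "Neutral"],         # claim 1: entailed by the first reference
    ["Neutral", "Contradiction"],      # claim 2: entailed by neither reference
    ["Entailment", "Entailment"],      # claim 3: entailed by both references
]]

entailed = np.array(labels) == "Entailment"   # bool array of shape (1, 3, 2)
per_claim = np.max(entailed, axis=2)          # a claim is faithful if ANY reference entails it
faithfulness_score = np.mean(per_claim)       # fraction of faithful claims: 2 of 3 here
print(faithfulness_score)                     # 0.666...
```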
96 changes: 96 additions & 0 deletions rageval/metrics/answer_informativeness/_claim_num.py
@@ -1,0 +1,96 @@
from dataclasses import dataclass
from typing import Optional, Iterable
from refchecker.extractor import LLMExtractor
import evaluate
import datasets
import os
from rageval.metrics import Metric, add_attribute


_DESCRIPTION = """\
ClaimNum is a metric designed to evaluate the richness of the claims generated by the model.

It is calculated by first using the open-source tool RefChecker to extract claims from the generated text. The final score is the number of claims extracted, providing insight into the amount of information presented in the model's output.
"""

_KWARGS_DESCRIPTION = """\
Args:
name : str

Optional Args:
None

Functions:
_compute_one: Evaluating the richness of claims generated.

Examples:
>>> from datasets import Dataset
>>> import rageval as rl
>>> sample = {
... "answers": [
... "A",
... "C",
... ]
... }
>>> dataset = Dataset.from_dict(sample)
>>> metric = ClaimNum(model = "openai//home/gomall/models/Qwen2-7B-Instruct", api_base = "http://localhost:5000/v1", api_key = "sk-123456789")
>>> metric.mtype
'answer_informativeness'
"""


@dataclass
@add_attribute('mtype', 'answer_informativeness')
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ClaimNum(Metric):
"""Estimates the richness of claims contained in answers."""

name = "claim_num"

ALIAS = ['claim_num']

def __init__(self, model: str = "openai//home/gomall/models/Qwen2-7B-Instruct",
api_base: str = "http://localhost:5000/v1",
api_key: str = "sk-123456789"):
"""
Explicitly initialize ClaimNum.

Ensure all parent classes are initialized.
"""
self.extractor = LLMExtractor(model=model, batch_size=8, api_base=api_base)
os.environ['OPENAI_API_KEY'] = api_key
super().__init__()
self.info = evaluate.MetricInfo(
description=_DESCRIPTION,
inputs_description=_KWARGS_DESCRIPTION,
citation="",
homepage="",
features=datasets.Features(
{
"answers": datasets.Value("string"),
}
),
codebase_urls=[],
reference_urls=[]
)

def __repr__(self) -> str:
""":return: Formatted string representation of the metric."""
return f"{self.ALIAS[0]}" # pragma: no cover

def _compute_one(
self,
answer: str,
question: str,
*args: Optional[Iterable],
) -> float:
"""Evaluating the richness of claims contained in answers."""
extraction_results = self.extractor.extract(

batch_responses=[answer],
batch_questions=[question],
max_new_tokens=1000
)
claims = [[c.content for c in res.claims] for res in extraction_results]
claim_num = len(claims[0])

return claim_num

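The counting step of `ClaimNum._compute_one` can be sanity-checked without an LLM endpoint. This is a minimal sketch under the assumption that RefChecker's extraction results are objects exposing a `.claims` list whose items carry a `.content` attribute, as the code above relies on; `FakeClaim` and `FakeExtractionResult` are hypothetical stand-ins, not part of RefChecker.

```python
from dataclasses import dataclass
from typing import List

@dataclass
class FakeClaim:
    content: str

@dataclass
class FakeExtractionResult:
    claims: List[FakeClaim]

# Pretend the extractor returned one result with two claims for one answer.
extraction_results = [
    FakeExtractionResult(claims=[
        FakeClaim("Paris is the capital of France."),
        FakeClaim("Paris lies on the Seine."),
    ])
]

# Same two lines as in ClaimNum._compute_one:
claims = [[c.content for c in res.claims] for res in extraction_results]
claim_num = len(claims[0])
print(claim_num)  # 2
```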
100 changes: 100 additions & 0 deletions rageval/metrics/answer_informativeness/_repetitiveness.py
@@ -0,0 +1,100 @@
from dataclasses import dataclass
from typing import Optional, Iterable

import evaluate
import datasets
import os
from rageval.metrics import Metric, add_attribute
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter


_DESCRIPTION = """\
Repetitiveness is a metric that evaluates the repetitiveness/redundancy of the text generated by the model.

It is calculated by splitting the answer into word and sentence units, then counting, at each level, the occurrences of units that appear more than once and dividing by the total number of units; the word-level and sentence-level ratios are averaged. The higher the score, the more redundant the answer.
"""

_KWARGS_DESCRIPTION = """\
Args:
name : str

Optional Args:
None

Functions:
_compute_one: Evaluating the repetitiveness of answers generated.

Examples:
>>> from datasets import Dataset
>>> import rageval as rl
>>> sample = {
... "answers": [
... "A",
... "C",
... ]
... }
>>> dataset = Dataset.from_dict(sample)
>>> metric = Repetitiveness()
>>> metric.mtype
'answer_informativeness'
"""


@dataclass
@add_attribute('mtype', 'answer_informativeness')
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Repetitiveness(Metric):
"""Estimates the repetitiveness of answers."""

name = "repetitiveness"

ALIAS = ['repetitiveness']

def __init__(self):
"""
Explicitly initialize Repetitiveness.

Ensure all parent classes are initialized.
"""
super().__init__()
self.info = evaluate.MetricInfo(
description=_DESCRIPTION,
inputs_description=_KWARGS_DESCRIPTION,
citation="",
homepage="",
features=datasets.Features(
{
"answers": datasets.Value("string"),
}
),
codebase_urls=[],
reference_urls=[]
)

def __repr__(self) -> str:
""":return: Formatted string representation of the metric."""
return f"{self.ALIAS[0]}" # pragma: no cover

def _compute_one(
self,
answer: str,
*args: Optional[Iterable],
) -> float:
"""Evaluating the repetitiveness of answer."""
def generate_splits(text):
words = word_tokenize(text)
sentences = sent_tokenize(text)
return words, sentences

def calculate_redundancy_one(text):
words, sentences = generate_splits(text)
redundancy_ratio = []
for gram in [words, sentences]:
counts = Counter(gram)
repeated_grams = sum(count for count in counts.values() if count > 1)
redundancy_ratio.append(repeated_grams / len(gram) if len(gram) > 0 else 0)
return np.mean(redundancy_ratio)

return calculate_redundancy_one(answer)
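Because this metric has no LLM dependency, a quick worked example is easy to run. The sketch below assumes the NLTK `punkt` tokenizer data is available (newer NLTK releases may additionally require `punkt_tab`); for a text that repeats a sentence, the word-level and sentence-level repetition ratios are averaged exactly as in `calculate_redundancy_one`.

```python
from collections import Counter

import nltk
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download("punkt", quiet=True)  # tokenizer models (assumes network access or a local cache)

text = "I like tea. I like tea. Coffee is fine."

words = word_tokenize(text)      # 12 tokens including punctuation
sentences = sent_tokenize(text)  # 3 sentences, two of them identical

ratios = []
for gram in [words, sentences]:
    counts = Counter(gram)
    repeated = sum(c for c in counts.values() if c > 1)  # occurrences of units seen more than once
    ratios.append(repeated / len(gram) if gram else 0)

print(ratios)           # roughly [0.75, 0.667]: 9 of 12 word tokens repeat, 2 of 3 sentences repeat
print(np.mean(ratios))  # ~0.71, the value Repetitiveness would report for this answer
```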
15 changes: 13 additions & 2 deletions rageval/metrics/base.py
@@ -59,6 +59,7 @@
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
batch_size: Optional[int] = None,
contexts: Optional[Iterable] = None,
*args: Optional[Iterable],
) -> Tuple[float, List[float]]:
"""
@@ -67,7 +68,10 @@
Return average scores of all inputs and a score list for each example.
"""
self._validate_data(pred_answers, ref_answers, *args)
scores = self._compute_batch(pred_answers, ref_answers, *args)
if contexts:
scores = self._compute_batch(pred_answers, ref_answers, contexts, *args)

else:
scores = self._compute_batch(pred_answers, ref_answers, *args)

return np.average(scores), scores

@@ -76,6 +80,7 @@
self,
pred_answer: Optional[Iterable] = None,
ref_answer: Optional[Iterable] = None,
context: Optional[Iterable] = None,
*args: Optional[Iterable]
) -> float:
... # pragma: no cover
@@ -84,11 +89,17 @@
self,
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
contexts: Optional[Iterable] = None,
*args: Optional[Iterable]
) -> List[float]:
"""Compute the metric for a batch of predictions and references."""
scores = []
if (pred_answers and ref_answers): # if both columns exist
if contexts:
for pred_answer, ref_answer, context in tqdm(zip(pred_answers, ref_answers, contexts),

desc=f"Computing {self.name}",
total=len(pred_answers)):
scores.append(self._compute_one(pred_answer, ref_answer, context))

elif (pred_answers and ref_answers): # if both columns exist
for pred_answer, ref_answer in tqdm(zip(pred_answers, ref_answers),
desc=f"Computing {self.name}",
total=len(pred_answers)):
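With the base-class change above, `Metric.evaluate()` forwards an optional `contexts` iterable to `_compute_batch()`, which then calls `_compute_one(pred_answer, ref_answer, context)` per example. A hedged usage sketch follows; it assumes a reachable OpenAI-compatible endpoint and that `_validate_data` (not shown in this diff) accepts these columns. Note that, as wired here, the second positional argument reaching `ClaimFaithfulness._compute_one` is treated as the question, so the question column travels through the `ref_answers` slot.

```python
# Illustrative only — the endpoint, model path, and column values are placeholders.
from datasets import Dataset
from rageval.metrics import ClaimFaithfulness

sample = {
    "answers": ["Paris is the capital of France and lies on the Seine."],
    "questions": ["Tell me about Paris."],
    "contexts": ["Paris, on the Seine, has been the capital of France since 508 AD."],
}
dataset = Dataset.from_dict(sample)

metric = ClaimFaithfulness(
    model="openai//home/gomall/models/Qwen2-7B-Instruct",
    api_base="http://localhost:5000/v1",
    api_key="sk-123456789",
)

# evaluate() dispatches to the contexts-aware branch when `contexts` is given.
avg_score, scores = metric.evaluate(
    pred_answers=dataset["answers"],
    ref_answers=dataset["questions"],   # consumed as `question` by _compute_one
    contexts=dataset["contexts"],
)
print(avg_score, scores)
```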