
Commit

Merge pull request #122 from gomate-community/upgradepython
upgrade python to 3.10
bugtig6351 authored Oct 10, 2024
2 parents be31dae + 019a541 commit bc43863
Showing 16 changed files with 138 additions and 215 deletions.
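
Most of the diffs below apply the same refactor: each metric drops its _info() method, which returned a datasets.MetricInfo, and instead assigns self.info = evaluate.MetricInfo(...) in __init__, since the datasets metrics API (datasets.load_metric, datasets.MetricInfo) has been deprecated in favor of the standalone evaluate library. A minimal sketch of the new-style pattern — the class, alias, and feature names here are illustrative, not taken from the diff:

    import datasets
    import evaluate


    class ExampleMetric:
        """Illustrative metric skeleton following the post-refactor pattern."""

        ALIAS = ["example_metric"]

        def __init__(self):
            # metric metadata is now built eagerly in __init__ via evaluate.MetricInfo
            self.info = evaluate.MetricInfo(
                description="toy description",
                inputs_description="toy kwargs description",
                citation="",
                features=datasets.Features(
                    {
                        "answers": datasets.Value("string"),
                        "gt_answers": datasets.Sequence(datasets.Value("string")),
                    }
                ),
                reference_urls=[],
            )

        def __repr__(self) -> str:
            """:return: Formatted string representation of the metric."""
            return f"{self.ALIAS[0]}"
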
2 changes: 1 addition & 1 deletion .github/workflows/makefile.yml
@@ -17,7 +17,7 @@ jobs:
       - name: Setup Python version
         uses: actions/setup-python@v1
         with:
-          python-version: 3.8.18
+          python-version: 3.10.15
 
       - name: Install requirements
         run: make init
13 changes: 6 additions & 7 deletions rageval/metrics/answer_correctness/_answer_accuracy.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import List
+import evaluate
 
 import datasets
 
@@ -83,13 +84,7 @@ def __init__(self):
         Ensure all parent classes are initialized.
         """
         super().__init__()
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -104,6 +99,10 @@ def _info(self):
             reference_urls=["https://arxiv.org/abs/2009.03300"]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"
+
     def _compute_one(
         self,
         answer: str,
13 changes: 6 additions & 7 deletions rageval/metrics/answer_correctness/_answer_bert_score.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import List, Tuple
+import evaluate
 
 import datasets
 from rageval.metrics import Metric, add_attribute
@@ -88,13 +89,7 @@ def __init__(self, lang: str = "en", rescale_with_baseline=False):
         """Explicitly initialize the AnswerBERTScore to ensure all parent class initialized."""
         super().__init__()
         self.scorer = BERTScorer(lang=lang, rescale_with_baseline=rescale_with_baseline)
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -111,6 +106,10 @@ def _info(self):
             reference_urls=["https://openreview.net/forum?id=SkeHuCVFDr"]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"
+
     def _compute_one(
         self,
         pred_answers: str,
85 changes: 30 additions & 55 deletions rageval/metrics/answer_correctness/_answer_bleu.py
@@ -1,10 +1,10 @@
-import re
 from dataclasses import dataclass
 from typing import List, Tuple
 
+import evaluate
 import datasets
 
 from rageval.metrics import Metric, add_attribute
+from tqdm import tqdm
 
 
 _DESCRIPTION = """\
@@ -55,9 +55,9 @@
 'AnswerCorrectness'
 >>> score, results = metric.compute(dataset["answers"], dataset["gt_answers"], 1)
 >>> score
-0.3172992057845065
+0.3450835085970013
 >>> results[0]
-0.49697705300310346
+0.5401725898595141
 """


@@ -87,13 +87,7 @@ class AnswerBleuScore(Metric):
     def __init__(self):
         """Explicitly initialize the AnswerBleuScore to ensure all parent class initialized."""
         super().__init__()
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -111,56 +105,37 @@ def _info(self):
             reference_urls=["https://www.aclweb.org/anthology/P02-1040.pdf"]
         )
 
-    def _clean_special_tokens(self, sentence: str, subword: str) -> str:
-        """Clean special word in sentence"""
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
 
-        sentence = sentence.strip()
-        if subword is not None:
-            sentence = re.sub(subword, "", sentence)
-        return sentence
+    def compute(
+        self,
+        pred_answers: List[str],
+        ref_answers: List[List[str]],
+        batch_size: int,
+    ) -> Tuple[float, List[float]]:
+        """Compute the bleu score on both corpus level and instance level."""
+        bleu = evaluate.load("bleu")
+        # corpus level
+        bleu_result = bleu.compute(predictions=pred_answers, references=ref_answers)
+        score = bleu_result['bleu']
+        # instance level
+        scores = []
+        for pred_answer, ref_answer in tqdm(zip(pred_answers, ref_answers),
+                                            desc=f"Computing {self.name}",
+                                            total=len(pred_answers)):
+            scores.append(self._compute_one(pred_answer, ref_answer))
+        return score, scores
 
     def _compute_one(
         self,
         pred_answers: List[str],
         ref_answers: List[List[str]]
     ) -> List[float]:
-        """Compute the bleu score of a batch of answers."""
-        scores = []
-        bleu = datasets.load_metric("bleu")
-        for output, gt_answers in zip(pred_answers, ref_answers):
-            output_clean = self._clean_special_tokens(output, None)
-            predictions = [output_clean.split(' ')]
-            references = []
-            for gt_answer in gt_answers:
-                gt_answer_clean = self._clean_special_tokens(gt_answer, None)
-                references.append(list(gt_answer_clean.split(' ')))
-            bleu_result = bleu.compute(predictions=predictions, references=[references])
-            bleu_score = bleu_result['bleu']
-            scores.append(bleu_score)
-
-        return scores
+        """Compute the bleu score on an instance level."""
 
-    def compute(
-        self,
-        pred_answers: List[str],
-        ref_answers: List[List[str]],
-        batch_size: int,
-    ) -> Tuple[float, List[float]]:
-        """Evaluate the dataset."""
-
-        bleu = datasets.load_metric("bleu")
-        predictions = []
-        references = []
-        for output, gt_answers in zip(pred_answers, ref_answers):
-            output_clean = self._clean_special_tokens(output, None)
-            predictions.append(list(output_clean.split(' ')))
-            reference = []
-            for gt_answer in gt_answers:
-                gt_answer_clean = self._clean_special_tokens(gt_answer, None)
-                reference.append(list(gt_answer_clean.split(' ')))
-            references.append(reference)
-        bleu_result = bleu.compute(predictions=predictions, references=references)
+        bleu = evaluate.load("bleu")
+        bleu_result = bleu.compute(predictions=[pred_answers], references=[ref_answers])
         bleu_score = bleu_result['bleu']
-        scores = self._compute_one(pred_answers, ref_answers)
-
-        return bleu_score, scores
+        return bleu_score
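
For reference, a small standalone sketch of the evaluate BLEU API that the rewritten compute() and _compute_one() above rely on — the sample sentences are made up, and evaluate.load("bleu") fetches the metric script on first use:

    import evaluate

    bleu = evaluate.load("bleu")

    preds = ["the cat sat on the mat", "there is a dog in the yard"]
    refs = [["the cat sat on the mat"], ["a dog is in the yard"]]

    # corpus-level score over all predictions, as in the new compute()
    corpus_result = bleu.compute(predictions=preds, references=refs)
    print(corpus_result["bleu"])

    # instance-level score: wrap a single prediction and its references in lists,
    # mirroring the new _compute_one()
    single_result = bleu.compute(predictions=[preds[0]], references=[refs[0]])
    print(single_result["bleu"])
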
17 changes: 8 additions & 9 deletions rageval/metrics/answer_correctness/_answer_chrf.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import List, Tuple, Optional
+import evaluate
 
 import datasets
 from sacrebleu.metrics import CHRF
@@ -127,13 +128,7 @@ def __init__(
             whitespace=whitespace,
             eps_smoothing=eps_smoothing
         )
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -151,6 +146,10 @@ def _info(self):
             ]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
+
     def _validate_data(
         self,
         pred_answers: List[str],
@@ -159,9 +158,9 @@ def _validate_data(
         """Validate the input dataset."""
         super()._validate_data(pred_answers, ref_answers)
         if not all(isinstance(answer, str) for answer in pred_answers):
-            raise ValueError("The type of pred_answers should be a string.")
+            raise ValueError("The type of pred_answers should be a string.")  # pragma: no cover
         if not all(isinstance(a, list) and all(isinstance(item, str) for item in a) for a in ref_answers):
-            raise ValueError("The type of ref_answers should be a list of strings.")
+            raise ValueError("The type of ref_answers should be a list of strings.")  # pragma: no cover
 
     def _compute_one(
         self,
17 changes: 8 additions & 9 deletions rageval/metrics/answer_correctness/_answer_claim_recall.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import List, Callable, Tuple
+import evaluate
 
 import datasets
 import numpy as np
@@ -92,13 +93,7 @@ def __init__(self, nli_model: Callable, decompose_model: str = "gpt-3.5-turbo"):
         super().__init__()
         self.nli_model = nli_model
         self.decompose_model = decompose_model
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -113,6 +108,10 @@ def _info(self):
             reference_urls=["https://arxiv.org/abs/2305.14627"]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
+
     def _compute_one(
         self,
         answer: str,
@@ -164,9 +163,9 @@ def _compute_batch(
                 # use decompose_model to decompose the gt_answers into claims list
                 claims = [text_to_sents(gt_answer, self.decompose_model) for gt_answer in ref_answers]
             else:
-                raise ValueError("The type of gt_answers element should be list or string.")
+                raise ValueError("The type of gt_answers element should be list or string.")  # pragma: no cover
         else:
-            raise ValueError("The type of gt_answers should be list.")
+            raise ValueError("The type of gt_answers should be list.")  # pragma: no cover
 
         results = []
         for i, answer in tqdm(enumerate(pred_answers)):
14 changes: 6 additions & 8 deletions rageval/metrics/answer_correctness/_answer_disambig_f1.py
@@ -3,7 +3,7 @@
 from collections import Counter
 from dataclasses import dataclass
 from typing import List
-from tqdm import tqdm
+import evaluate
 
 import datasets
 import numpy as np
@@ -104,13 +104,7 @@ def __init__(self, model: str = "en_core_web_sm"):
         super().__init__()
         self.model = model
         self.nlp = spacy.load(model)
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -131,6 +125,10 @@ def _info(self):
             ]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
+
     def _normalize_text(self, s: str) -> str:
         def remove_articles(text):
             return re.sub(r'\b(a|an|the)\b', ' ', text)
34 changes: 15 additions & 19 deletions rageval/metrics/answer_groundedness/_context_reject_rate.py
@@ -3,7 +3,8 @@
 
 import datasets
 import numpy as np
-from datasets import Dataset
+import evaluate
+
 from langchain.schema import LLMResult
 from tqdm import tqdm
 
@@ -108,13 +109,7 @@ class ContextRejectRate(MetricWithLLM):
     def __init__(self, model: Callable):
         """Explicitly initialize the ContextRejectRate to ensure all parent class initialized."""
         super().__init__(model)
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -129,6 +124,10 @@ def _info(self):
             reference_urls=["https://arxiv.org/abs/2311.09210"]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
+
     def parse_llm_result(self, prompts: List[str], result: LLMResult):
         """Parse the results of LLM based on whether the answer contains the content specified by prompt."""
         responses = [[i.text for i in r] for r in result.generations]
@@ -151,17 +150,14 @@ def compute(
         """Evaluate the dataset."""
         scores = []
         length = len(questions)
-        if batch_size:
-            for start in tqdm(range(0, length, batch_size)):
-                end = start + batch_size
-                end = end if end < length else length
-                score = self._compute_batch(
-                    questions[start:end],
-                    contexts[start:end]
-                )
-                scores.extend(score)
-        else:
-            scores = self._compute_batch(questions, contexts)
+        for start in tqdm(range(0, length, batch_size)):
+            end = start + batch_size
+            end = end if end < length else length
+            score = self._compute_batch(
+                questions[start:end],
+                contexts[start:end]
+            )
+            scores.extend(score)
 
         return np.average(scores), scores
