
Commit

Merge pull request #122 from gomate-community/upgradepython
upgrade python to 3.10
bugtig6351 authored Oct 10, 2024
2 parents be31dae + 019a541 commit bc43863
Showing 16 changed files with 138 additions and 215 deletions.
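
Most of the diffs below apply the same refactor: each metric drops its _info() method, which returned a datasets.MetricInfo, and instead assigns self.info = evaluate.MetricInfo(...) in __init__, since the datasets metrics API (datasets.load_metric, datasets.MetricInfo) has been deprecated in favor of the standalone evaluate library. A minimal sketch of the new-style pattern — the class, alias, and feature names here are illustrative, not taken from the diff:

    import datasets
    import evaluate


    class ExampleMetric:
        """Illustrative metric skeleton following the post-refactor pattern."""

        ALIAS = ["example_metric"]

        def __init__(self):
            # metric metadata is now built eagerly in __init__ via evaluate.MetricInfo
            self.info = evaluate.MetricInfo(
                description="toy description",
                inputs_description="toy kwargs description",
                citation="",
                features=datasets.Features(
                    {
                        "answers": datasets.Value("string"),
                        "gt_answers": datasets.Sequence(datasets.Value("string")),
                    }
                ),
                reference_urls=[],
            )

        def __repr__(self) -> str:
            """:return: Formatted string representation of the metric."""
            return f"{self.ALIAS[0]}"
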
2 changes: 1 addition & 1 deletion .github/workflows/makefile.yml
@@ -17,7 +17,7 @@ jobs:
       - name: Setup Python version
         uses: actions/setup-python@v1
         with:
-          python-version: 3.8.18
+          python-version: 3.10.15
 
       - name: Install requirements
         run: make init
13 changes: 6 additions & 7 deletions rageval/metrics/answer_correctness/_answer_accuracy.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import List
+import evaluate
 
 import datasets
 
@@ -83,13 +84,7 @@ def __init__(self):
         Ensure all parent classes are initialized.
         """
         super().__init__()
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -104,6 +99,10 @@ def _info(self):
             reference_urls=["https://arxiv.org/abs/2009.03300"]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"
+
     def _compute_one(
         self,
         answer: str,
13 changes: 6 additions & 7 deletions rageval/metrics/answer_correctness/_answer_bert_score.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import List, Tuple
+import evaluate
 
 import datasets
 from rageval.metrics import Metric, add_attribute
@@ -88,13 +89,7 @@ def __init__(self, lang: str = "en", rescale_with_baseline=False):
         """Explicitly initialize the AnswerBERTScore to ensure all parent class initialized."""
         super().__init__()
         self.scorer = BERTScorer(lang=lang, rescale_with_baseline=rescale_with_baseline)
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -111,6 +106,10 @@ def _info(self):
             reference_urls=["https://openreview.net/forum?id=SkeHuCVFDr"]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"
+
     def _compute_one(
         self,
         pred_answers: str,
85 changes: 30 additions & 55 deletions rageval/metrics/answer_correctness/_answer_bleu.py
@@ -1,10 +1,10 @@
-import re
 from dataclasses import dataclass
 from typing import List, Tuple
 
+import evaluate
 import datasets
 
 from rageval.metrics import Metric, add_attribute
+from tqdm import tqdm
 
 
 _DESCRIPTION = """\
@@ -55,9 +55,9 @@
 'AnswerCorrectness'
 >>> score, results = metric.compute(dataset["answers"], dataset["gt_answers"], 1)
 >>> score
-0.3172992057845065
+0.3450835085970013
 >>> results[0]
-0.49697705300310346
+0.5401725898595141
 """


@@ -87,13 +87,7 @@ class AnswerBleuScore(Metric):
     def __init__(self):
         """Explicitly initialize the AnswerBleuScore to ensure all parent class initialized."""
         super().__init__()
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -111,56 +105,37 @@ def _info(self):
             reference_urls=["https://www.aclweb.org/anthology/P02-1040.pdf"]
         )
 
-    def _clean_special_tokens(self, sentence: str, subword: str) -> str:
-        """Clean special word in sentence"""
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
 
-        sentence = sentence.strip()
-        if subword is not None:
-            sentence = re.sub(subword, "", sentence)
-        return sentence
+    def compute(
+        self,
+        pred_answers: List[str],
+        ref_answers: List[List[str]],
+        batch_size: int,
+    ) -> Tuple[float, List[float]]:
+        """Compute the bleu score on both corpus level and instance level."""
+        bleu = evaluate.load("bleu")
+        # corpus level
+        bleu_result = bleu.compute(predictions=pred_answers, references=ref_answers)
+        score = bleu_result['bleu']
+        # instance level
+        scores = []
+        for pred_answer, ref_answer in tqdm(zip(pred_answers, ref_answers),
+                                            desc=f"Computing {self.name}",
+                                            total=len(pred_answers)):
+            scores.append(self._compute_one(pred_answer, ref_answer))
+        return score, scores
 
     def _compute_one(
         self,
         pred_answers: List[str],
         ref_answers: List[List[str]]
     ) -> List[float]:
-        """Compute the bleu score of a batch of answers."""
-        scores = []
-        bleu = datasets.load_metric("bleu")
-        for output, gt_answers in zip(pred_answers, ref_answers):
-            output_clean = self._clean_special_tokens(output, None)
-            predictions = [output_clean.split(' ')]
-            references = []
-            for gt_answer in gt_answers:
-                gt_answer_clean = self._clean_special_tokens(gt_answer, None)
-                references.append(list(gt_answer_clean.split(' ')))
-            bleu_result = bleu.compute(predictions=predictions, references=[references])
-            bleu_score = bleu_result['bleu']
-            scores.append(bleu_score)
-
-        return scores
+        """Compute the bleu score on an instance level."""
 
-    def compute(
-        self,
-        pred_answers: List[str],
-        ref_answers: List[List[str]],
-        batch_size: int,
-    ) -> Tuple[float, List[float]]:
-        """Evaluate the dataset."""
-
-        bleu = datasets.load_metric("bleu")
-        predictions = []
-        references = []
-        for output, gt_answers in zip(pred_answers, ref_answers):
-            output_clean = self._clean_special_tokens(output, None)
-            predictions.append(list(output_clean.split(' ')))
-            reference = []
-            for gt_answer in gt_answers:
-                gt_answer_clean = self._clean_special_tokens(gt_answer, None)
-                reference.append(list(gt_answer_clean.split(' ')))
-            references.append(reference)
-        bleu_result = bleu.compute(predictions=predictions, references=references)
+        bleu = evaluate.load("bleu")
+        bleu_result = bleu.compute(predictions=[pred_answers], references=[ref_answers])
         bleu_score = bleu_result['bleu']
-        scores = self._compute_one(pred_answers, ref_answers)
-
-        return bleu_score, scores
+        return bleu_score
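
For reference, a small standalone sketch of the evaluate BLEU API that the rewritten compute() and _compute_one() above rely on — the sample sentences are made up, and evaluate.load("bleu") fetches the metric script on first use:

    import evaluate

    bleu = evaluate.load("bleu")

    preds = ["the cat sat on the mat", "there is a dog in the yard"]
    refs = [["the cat sat on the mat"], ["a dog is in the yard"]]

    # corpus-level score over all predictions, as in the new compute()
    corpus_result = bleu.compute(predictions=preds, references=refs)
    print(corpus_result["bleu"])

    # instance-level score: wrap a single prediction and its references in lists,
    # mirroring the new _compute_one()
    single_result = bleu.compute(predictions=[preds[0]], references=[refs[0]])
    print(single_result["bleu"])
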
17 changes: 8 additions & 9 deletions rageval/metrics/answer_correctness/_answer_chrf.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import List, Tuple, Optional
+import evaluate
 
 import datasets
 from sacrebleu.metrics import CHRF
@@ -127,13 +128,7 @@ def __init__(
             whitespace=whitespace,
             eps_smoothing=eps_smoothing
         )
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -151,6 +146,10 @@ def _info(self):
             ]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
+
     def _validate_data(
         self,
         pred_answers: List[str],
@@ -159,9 +158,9 @@ def _validate_data(
         """Validate the input dataset."""
         super()._validate_data(pred_answers, ref_answers)
         if not all(isinstance(answer, str) for answer in pred_answers):
-            raise ValueError("The type of pred_answers should be a string.")
+            raise ValueError("The type of pred_answers should be a string.")  # pragma: no cover
         if not all(isinstance(a, list) and all(isinstance(item, str) for item in a) for a in ref_answers):
-            raise ValueError("The type of ref_answers should be a list of strings.")
+            raise ValueError("The type of ref_answers should be a list of strings.")  # pragma: no cover
 
     def _compute_one(
         self,
17 changes: 8 additions & 9 deletions rageval/metrics/answer_correctness/_answer_claim_recall.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import List, Callable, Tuple
+import evaluate
 
 import datasets
 import numpy as np
@@ -92,13 +93,7 @@ def __init__(self, nli_model: Callable, decompose_model: str = "gpt-3.5-turbo"):
         super().__init__()
         self.nli_model = nli_model
         self.decompose_model = decompose_model
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -113,6 +108,10 @@ def _info(self):
             reference_urls=["https://arxiv.org/abs/2305.14627"]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
+
     def _compute_one(
         self,
         answer: str,
@@ -164,9 +163,9 @@ def _compute_batch(
                 # use decompose_model to decompose the gt_answers into claims list
                 claims = [text_to_sents(gt_answer, self.decompose_model) for gt_answer in ref_answers]
             else:
-                raise ValueError("The type of gt_answers element should be list or string.")
+                raise ValueError("The type of gt_answers element should be list or string.")  # pragma: no cover
         else:
-            raise ValueError("The type of gt_answers should be list.")
+            raise ValueError("The type of gt_answers should be list.")  # pragma: no cover
 
         results = []
         for i, answer in tqdm(enumerate(pred_answers)):
14 changes: 6 additions & 8 deletions rageval/metrics/answer_correctness/_answer_disambig_f1.py
@@ -3,7 +3,7 @@
 from collections import Counter
 from dataclasses import dataclass
 from typing import List
-from tqdm import tqdm
+import evaluate
 
 import datasets
 import numpy as np
@@ -104,13 +104,7 @@ def __init__(self, model: str = "en_core_web_sm"):
         super().__init__()
         self.model = model
         self.nlp = spacy.load(model)
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -131,6 +125,10 @@ def _info(self):
             ]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
+
     def _normalize_text(self, s: str) -> str:
         def remove_articles(text):
             return re.sub(r'\b(a|an|the)\b', ' ', text)
34 changes: 15 additions & 19 deletions rageval/metrics/answer_groundedness/_context_reject_rate.py
@@ -3,7 +3,8 @@
 
 import datasets
 import numpy as np
-from datasets import Dataset
+import evaluate
+
 from langchain.schema import LLMResult
 from tqdm import tqdm
 
@@ -108,13 +109,7 @@ class ContextRejectRate(MetricWithLLM):
     def __init__(self, model: Callable):
         """Explicitly initialize the ContextRejectRate to ensure all parent class initialized."""
         super().__init__(model)
-
-    def __repr__(self) -> str:
-        """:return: Formatted string representation of the metric."""
-        return f"{self.ALIAS[0]}"
-
-    def _info(self):
-        return datasets.MetricInfo(
+        self.info = evaluate.MetricInfo(
             description=_DESCRIPTION,
             inputs_description=_KWARGS_DESCRIPTION,
             citation=_CITATION,
@@ -129,6 +124,10 @@ def _info(self):
             reference_urls=["https://arxiv.org/abs/2311.09210"]
         )
 
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}"  # pragma: no cover
+
     def parse_llm_result(self, prompts: List[str], result: LLMResult):
         """Parse the results of LLM based on whether the answer contains the content specified by prompt."""
         responses = [[i.text for i in r] for r in result.generations]
@@ -151,17 +150,14 @@ def compute(
         """Evaluate the dataset."""
         scores = []
         length = len(questions)
-        if batch_size:
-            for start in tqdm(range(0, length, batch_size)):
-                end = start + batch_size
-                end = end if end < length else length
-                score = self._compute_batch(
-                    questions[start:end],
-                    contexts[start:end]
-                )
-                scores.extend(score)
-        else:
-            scores = self._compute_batch(questions, contexts)
+        for start in tqdm(range(0, length, batch_size)):
+            end = start + batch_size
+            end = end if end < length else length
+            score = self._compute_batch(
+                questions[start:end],
+                contexts[start:end]
+            )
+            scores.extend(score)
 
         return np.average(scores), scores
