Add ClaimNum, ClaimFaithfulness, and Repetitiveness metrics #126

Open

wants to merge 9 commits into base: main
6 changes: 3 additions & 3 deletions rageval/metrics/__init__.py
@@ -19,12 +19,12 @@
from .answer_groundedness._answer_citation_precision import AnswerCitationPrecision
from .answer_groundedness._answer_citation_recall import AnswerCitationRecall
from .answer_groundedness._context_reject_rate import ContextRejectRate
##from .answer_groundedness._claim_faithfulness import ClaimFaithfulness
from .answer_groundedness._claim_faithfulness import ClaimFaithfulness

# Metrics about the answer informativeness
##from .answer_informative._claim_num import ClaimNum
from .answer_informativeness._claim_num import ClaimNum
from .answer_informativeness._text_length import TextLength
##from .answer_informativeness._repetitiveness import Repetitiveness
from .answer_informativeness._repetitiveness import Repetitiveness
##from .answer_informativeness._pairwise_accuracy import PairwiseAccuracy
from .answer_informativeness._answer_distinct12 import AnswerDistinct

120 changes: 120 additions & 0 deletions rageval/metrics/answer_groundedness/_claim_faithfulness.py
@@ -1,0 +1,120 @@
from dataclasses import dataclass
from typing import Optional, Iterable
from refchecker.extractor import LLMExtractor
from refchecker.checker import LLMChecker

import evaluate
import datasets
import os
from rageval.metrics import Metric, add_attribute
import numpy as np


_DESCRIPTION = """\
ClaimFaithfulness is a metric that evaluates to what extent the answer follows the given evidence.

It is calculated by first using the open-source tool RefChecker to extract claims from the generated text, and then using the same tool to check whether the evidence entails each claim. The final score is the proportion of claims entailed by the evidence, providing insight into how faithful the model's output is to the given evidence.
"""

_KWARGS_DESCRIPTION = """\
Args:
name : str

Optional Args:
None

Functions:
_compute_one: Evaluating the faithfulness of claims generated.

Examples:
>>> from datasets import Dataset
>>> import rageval as rl
>>> sample = {
... "answers": [
... "A",
... "C",
... ]
... }
>>> dataset = Dataset.from_dict(sample)
>>> metric = ClaimFaithfulness(model = "openai//home/gomall/models/Qwen2-7B-Instruct", api_base = "http://project.gomall.ac.cn:30590/notebook/tensorboard/wangwenshan/1161/v1", api_key = "sk-123456789")
>>> metric.mtype
'answer_groundedness'
"""


@dataclass
@add_attribute('mtype', 'answer_groundedness')
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ClaimFaithfulness(Metric):
"""Estimates the faithfulness of claims contained in answers."""

name = "claim_faithfulness"

ALIAS = ['claim_faithfulness']

def __init__(self, model: str = "openai//home/gomall/models/Qwen2-7B-Instruct",
api_base: str = "http://localhost:5000/v1",
api_key: str = "sk-123456789"):
"""
Explicitly initialize ClaimFaithfulness.

Ensure all parent classes are initialized.
"""
self.extractor = LLMExtractor(model=model, batch_size=8, api_base=api_base)
self.checker = LLMChecker(model=model, batch_size=8, api_base=api_base)
os.environ['OPENAI_API_KEY'] = api_key
super().__init__()
self.info = evaluate.MetricInfo(
description=_DESCRIPTION,
inputs_description=_KWARGS_DESCRIPTION,
citation="",
homepage="",
features=datasets.Features(
{
"answers": datasets.Value("string"),
}
),
codebase_urls=[],
reference_urls=[]
)

def __repr__(self) -> str:
""":return: Formatted string representation of the metric."""
return f"{self.ALIAS[0]}" # pragma: no cover

def _compute_one(
self,
answer: str,
question: str,
context: str,
*args: Optional[Iterable],
) -> float:
"""Evaluating the richness of claims contained in answers."""
extraction_results = self.extractor.extract(batch_responses=[answer],

batch_questions=[question],
max_new_tokens=1000
)
claims = [[c.content for c in res.claims] for res in extraction_results]
merge_psg = False
checking_results = self.checker.check(batch_claims=claims,

batch_references=[context],
batch_questions=[question],
max_reference_segment_length=0,
merge_psg=merge_psg,
is_joint=True,
joint_check_num=5,
sagemaker_client=None,
sagemaker_params=None,
sagemaker_get_response_func=None
)

def to_bool(checking_results):
if isinstance(checking_results, str):
return checking_results == "Entailment"
return np.array([to_bool(res) for res in checking_results])

retrieved2response = to_bool(checking_results)
faithful = np.max(retrieved2response, axis=2)
faithfulness_score = np.mean(faithful)

return faithfulness_score

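For reviewers, the aggregation at the end of `_compute_one` may be easier to follow on a toy input. The sketch below is illustrative only: the `labels` array is a hypothetical stand-in for RefChecker's per-claim, per-reference verdicts, arranged in the [batch, claim, reference] layout implied by the `axis=2` reduction above; it reproduces the any-reference-entails-the-claim max followed by the mean over claims.

```python
import numpy as np

# Hypothetical checker verdicts for one answer: labels[batch][claim][reference].
labels = [[
    ["Entailment", "Neutral"],         # claim 1: entailed by the first reference
    ["Neutral", "Contradiction"],      # claim 2: entailed by neither reference
    ["Entailment", "Entailment"],      # claim 3: entailed by both references
]]

entailed = np.array(labels) == "Entailment"   # bool array of shape (1, 3, 2)
per_claim = np.max(entailed, axis=2)          # a claim is faithful if ANY reference entails it
faithfulness_score = np.mean(per_claim)       # fraction of faithful claims: 2 of 3 here
print(faithfulness_score)                     # 0.666...
```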
96 changes: 96 additions & 0 deletions rageval/metrics/answer_informativeness/_claim_num.py
@@ -1,0 +1,96 @@
from dataclasses import dataclass
from typing import Optional, Iterable
from refchecker.extractor import LLMExtractor
import evaluate
import datasets
import os
from rageval.metrics import Metric, add_attribute


_DESCRIPTION = """\
ClaimNum is a metric designed to evaluate the richness of the claims generated by the model.

It is calculated by first using the open-source tool RefChecker to extract claims from the generated text. The final score is the number of claims extracted, providing insight into the amount of information presented in the model's output.
"""

_KWARGS_DESCRIPTION = """\
Args:
name : str

Optional Args:
None

Functions:
_compute_one: Evaluating the richness of claims generated.

Examples:
>>> from datasets import Dataset
>>> import rageval as rl
>>> sample = {
... "answers": [
... "A",
... "C",
... ]
... }
>>> dataset = Dataset.from_dict(sample)
>>> metric = ClaimNum(model = "openai//home/gomall/models/Qwen2-7B-Instruct", api_base = "http://localhost:5000/v1", api_key = "sk-123456789")
>>> metric.mtype
'answer_informativeness'
"""


@dataclass
@add_attribute('mtype', 'answer_informativeness')
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ClaimNum(Metric):
"""Estimates the richness of claims contained in answers."""

name = "claim_num"

ALIAS = ['claim_num']

def __init__(self, model: str = "openai//home/gomall/models/Qwen2-7B-Instruct",
api_base: str = "http://localhost:5000/v1",
api_key: str = "sk-123456789"):
"""
Explicitly initialize ClaimNum.

Ensure all parent classes are initialized.
"""
self.extractor = LLMExtractor(model=model, batch_size=8, api_base=api_base)
os.environ['OPENAI_API_KEY'] = api_key
super().__init__()
self.info = evaluate.MetricInfo(
description=_DESCRIPTION,
inputs_description=_KWARGS_DESCRIPTION,
citation="",
homepage="",
features=datasets.Features(
{
"answers": datasets.Value("string"),
}
),
codebase_urls=[],
reference_urls=[]
)

def __repr__(self) -> str:
""":return: Formatted string representation of the metric."""
return f"{self.ALIAS[0]}" # pragma: no cover

def _compute_one(
self,
answer: str,
question: str,
*args: Optional[Iterable],
) -> float:
"""Evaluating the richness of claims contained in answers."""
extraction_results = self.extractor.extract(

batch_responses=[answer],
batch_questions=[question],
max_new_tokens=1000
)
claims = [[c.content for c in res.claims] for res in extraction_results]
claim_num = len(claims[0])

return claim_num

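The counting step of `ClaimNum._compute_one` can be sanity-checked without an LLM endpoint. This is a minimal sketch under the assumption that RefChecker's extraction results are objects exposing a `.claims` list whose items carry a `.content` attribute, as the code above relies on; `FakeClaim` and `FakeExtractionResult` are hypothetical stand-ins, not part of RefChecker.

```python
from dataclasses import dataclass
from typing import List

@dataclass
class FakeClaim:
    content: str

@dataclass
class FakeExtractionResult:
    claims: List[FakeClaim]

# Pretend the extractor returned one result with two claims for one answer.
extraction_results = [
    FakeExtractionResult(claims=[
        FakeClaim("Paris is the capital of France."),
        FakeClaim("Paris lies on the Seine."),
    ])
]

# Same two lines as in ClaimNum._compute_one:
claims = [[c.content for c in res.claims] for res in extraction_results]
claim_num = len(claims[0])
print(claim_num)  # 2
```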
100 changes: 100 additions & 0 deletions rageval/metrics/answer_informativeness/_repetitiveness.py
@@ -0,0 +1,100 @@
from dataclasses import dataclass
from typing import Optional, Iterable

import evaluate
import datasets
import os
from rageval.metrics import Metric, add_attribute
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter


_DESCRIPTION = """\
Repetitiveness is a metric that evaluates the repetitiveness/redundancy of the text generated by the model.

It is calculated by splitting the answer into word and sentence units, then counting, at each level, the occurrences of units that appear more than once and dividing by the total number of units; the word-level and sentence-level ratios are averaged. The higher the score, the more redundant the answer.
"""

_KWARGS_DESCRIPTION = """\
Args:
name : str

Optional Args:
None

Functions:
_compute_one: Evaluating the repetitiveness of answers generated.

Examples:
>>> from datasets import Dataset
>>> import rageval as rl
>>> sample = {
... "answers": [
... "A",
... "C",
... ]
... }
>>> dataset = Dataset.from_dict(sample)
>>> metric = Repetitiveness()
>>> metric.mtype
'answer_informativeness'
"""


@dataclass
@add_attribute('mtype', 'answer_informativeness')
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Repetitiveness(Metric):
"""Estimates the repetitiveness of answers."""

name = "repetitiveness"

ALIAS = ['repetitiveness']

def __init__(self):
"""
Explicitly initialize Repetitiveness.

Ensure all parent classes are initialized.
"""
super().__init__()
self.info = evaluate.MetricInfo(
description=_DESCRIPTION,
inputs_description=_KWARGS_DESCRIPTION,
citation="",
homepage="",
features=datasets.Features(
{
"answers": datasets.Value("string"),
}
),
codebase_urls=[],
reference_urls=[]
)

def __repr__(self) -> str:
""":return: Formatted string representation of the metric."""
return f"{self.ALIAS[0]}" # pragma: no cover

def _compute_one(
self,
answer: str,
*args: Optional[Iterable],
) -> float:
"""Evaluating the repetitiveness of answer."""
def generate_splits(text):
words = word_tokenize(text)
sentences = sent_tokenize(text)
return words, sentences

def calculate_redundancy_one(text):
words, sentences = generate_splits(text)
redundancy_ratio = []
for gram in [words, sentences]:
counts = Counter(gram)
repeated_grams = sum(count for count in counts.values() if count > 1)
redundancy_ratio.append(repeated_grams / len(gram) if len(gram) > 0 else 0)
return np.mean(redundancy_ratio)

return calculate_redundancy_one(answer)
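Because this metric has no LLM dependency, a quick worked example is easy to run. The sketch below assumes the NLTK `punkt` tokenizer data is available (newer NLTK releases may additionally require `punkt_tab`); for a text that repeats a sentence, the word-level and sentence-level repetition ratios are averaged exactly as in `calculate_redundancy_one`.

```python
from collections import Counter

import nltk
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download("punkt", quiet=True)  # tokenizer models (assumes network access or a local cache)

text = "I like tea. I like tea. Coffee is fine."

words = word_tokenize(text)      # 12 tokens including punctuation
sentences = sent_tokenize(text)  # 3 sentences, two of them identical

ratios = []
for gram in [words, sentences]:
    counts = Counter(gram)
    repeated = sum(c for c in counts.values() if c > 1)  # occurrences of units seen more than once
    ratios.append(repeated / len(gram) if gram else 0)

print(ratios)           # roughly [0.75, 0.667]: 9 of 12 word tokens repeat, 2 of 3 sentences repeat
print(np.mean(ratios))  # ~0.71, the value Repetitiveness would report for this answer
```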
15 changes: 13 additions & 2 deletions rageval/metrics/base.py
@@ -59,6 +59,7 @@
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
batch_size: Optional[int] = None,
contexts: Optional[Iterable] = None,
*args: Optional[Iterable],
) -> Tuple[float, List[float]]:
"""
@@ -67,7 +68,10 @@
Return average scores of all inputs and a score list for each example.
"""
self._validate_data(pred_answers, ref_answers, *args)
scores = self._compute_batch(pred_answers, ref_answers, *args)
if contexts:
scores = self._compute_batch(pred_answers, ref_answers, contexts, *args)

else:
scores = self._compute_batch(pred_answers, ref_answers, *args)

return np.average(scores), scores

@@ -76,6 +80,7 @@
self,
pred_answer: Optional[Iterable] = None,
ref_answer: Optional[Iterable] = None,
context: Optional[Iterable] = None,
*args: Optional[Iterable]
) -> float:
... # pragma: no cover
@@ -84,11 +89,17 @@
self,
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
contexts: Optional[Iterable] = None,
*args: Optional[Iterable]
) -> List[float]:
"""Compute the metric for a batch of predictions and references."""
scores = []
if (pred_answers and ref_answers): # if both columns exist
if contexts:
for pred_answer, ref_answer, context in tqdm(zip(pred_answers, ref_answers, contexts),

desc=f"Computing {self.name}",
total=len(pred_answers)):
scores.append(self._compute_one(pred_answer, ref_answer, context))

elif (pred_answers and ref_answers): # if both columns exist
for pred_answer, ref_answer in tqdm(zip(pred_answers, ref_answers),
desc=f"Computing {self.name}",
total=len(pred_answers)):
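With the base-class change above, `Metric.evaluate()` forwards an optional `contexts` iterable to `_compute_batch()`, which then calls `_compute_one(pred_answer, ref_answer, context)` per example. A hedged usage sketch follows; it assumes a reachable OpenAI-compatible endpoint and that `_validate_data` (not shown in this diff) accepts these columns. Note that, as wired here, the second positional argument reaching `ClaimFaithfulness._compute_one` is treated as the question, so the question column travels through the `ref_answers` slot.

```python
# Illustrative only — the endpoint, model path, and column values are placeholders.
from datasets import Dataset
from rageval.metrics import ClaimFaithfulness

sample = {
    "answers": ["Paris is the capital of France and lies on the Seine."],
    "questions": ["Tell me about Paris."],
    "contexts": ["Paris, on the Seine, has been the capital of France since 508 AD."],
}
dataset = Dataset.from_dict(sample)

metric = ClaimFaithfulness(
    model="openai//home/gomall/models/Qwen2-7B-Instruct",
    api_base="http://localhost:5000/v1",
    api_key="sk-123456789",
)

# evaluate() dispatches to the contexts-aware branch when `contexts` is given.
avg_score, scores = metric.evaluate(
    pred_answers=dataset["answers"],
    ref_answers=dataset["questions"],   # consumed as `question` by _compute_one
    contexts=dataset["contexts"],
)
print(avg_score, scores)
```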