diff --git a/eval/benchs/halueval/eval_halueval_dialog.py b/eval/benchs/halueval/eval_halueval_dialog.py index 8cf0110..45968fd 100644 --- a/eval/benchs/halueval/eval_halueval_dialog.py +++ b/eval/benchs/halueval/eval_halueval_dialog.py @@ -51,7 +51,7 @@ def scoring(self, data_point: dict) -> dict: ) response = self.model.safe_request(query) - answer = response.strip().split() + answer = response.strip().split() # Extract the first word, such as "Yes", "No", "#Yes", "No." # Note: "".strip() returns [] instead of [""] answer = answer[0] if answer else "" diff --git a/eval/benchs/halueval/eval_halueval_qa.py b/eval/benchs/halueval/eval_halueval_qa.py index 6258eea..4476424 100644 --- a/eval/benchs/halueval/eval_halueval_qa.py +++ b/eval/benchs/halueval/eval_halueval_qa.py @@ -56,14 +56,13 @@ def scoring(self, data_point: dict) -> dict: ) response = self.model.safe_request(query) - answer = response.strip().split() + answer = response.strip().split() # Extract the first word, such as "Yes", "No", "#Yes", "No." # Note: "".strip() returns [] instead of [""] answer = answer[0] if answer else "" # Remove the leading "#", ".", "," answer = answer.strip("#").strip(".").strip(",") - return { "metrics": { "correct": ground_truth.lower() == answer.lower(), diff --git a/eval/benchs/halueval/eval_halueval_summa.py b/eval/benchs/halueval/eval_halueval_summa.py index 7f2201f..f974470 100644 --- a/eval/benchs/halueval/eval_halueval_summa.py +++ b/eval/benchs/halueval/eval_halueval_summa.py @@ -42,7 +42,7 @@ def scoring(self, data_point: dict) -> dict: ) response = self.model.safe_request(query) - answer = response.strip().split() + answer = response.strip().split() # Extract the first word, such as "Yes", "No", "#Yes", "No." # Note: "".strip() returns [] instead of [""] answer = answer[0] if answer else "" diff --git a/eval/benchs/uhgeval/eval_sele.py b/eval/benchs/uhgeval/eval_sele.py index 80a4da4..12f258f 100644 --- a/eval/benchs/uhgeval/eval_sele.py +++ b/eval/benchs/uhgeval/eval_sele.py @@ -41,8 +41,8 @@ def which_is_true(self, contn1: str, contn2: str, obj: dict) -> tuple[int, str]: """Given two continuations, determine which one is more accurate. Returns: - tuple: (answer, response). `answer` is 1 if the first continuation is more - accurate, 2 if the second one is more accurate, and -1 if error. `response` + tuple: (answer, response). `answer` is 1 if the first continuation is more + accurate, 2 if the second one is more accurate, and -1 if error. `response` is the model's response. """ query = PROMPT_TEMPLATE.format(