From 9bf9be216ec17679e32abd9e896207aa4fcd7de4 Mon Sep 17 00:00:00 2001 From: Thilo von Neumann Date: Thu, 29 Aug 2024 08:06:39 +0200 Subject: [PATCH] Add apply_dicp_assignment --- meeteval/wer/wer/di_cp.py | 83 ++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/meeteval/wer/wer/di_cp.py b/meeteval/wer/wer/di_cp.py index 339fc14c..4ae7219e 100644 --- a/meeteval/wer/wer/di_cp.py +++ b/meeteval/wer/wer/di_cp.py @@ -11,6 +11,7 @@ 'DICPErrorRate', 'greedy_di_cp_word_error_rate', 'greedy_di_cp_word_error_rate_multifile', + 'apply_dicp_assignment', ] @@ -19,33 +20,7 @@ class DICPErrorRate(ErrorRate): assignment: Tuple[int, ...] def apply_assignment(self, reference, hypothesis): - if reference != []: # Special case where we don't want to handle [] as SegLST - from meeteval.io.seglst import SegLST, asseglistconvertible - import meeteval - try: - h_conv = asseglistconvertible(hypothesis, py_convert=False) - except: - pass - else: - hypothesis = h_conv.to_seglst() - - if 'segment_index' in hypothesis.T.keys(): - hypothesis = hypothesis.groupby('segment_index').values() - else: - hypothesis = [[r] for r in hypothesis] - - assert len(hypothesis) == len(self.assignment), (len(hypothesis), len(self.assignment)) - hypothesis = meeteval.io.SegLST([ - {**s, 'speaker': a} - for r, a in zip(hypothesis, self.assignment) - for s in r - ]) - - return reference, h_conv.new(hypothesis) - h = {} - for a, h_ in zip(self.assignment, hypothesis): - h.setdefault(a, []).append(h_) - return reference, h + return apply_dicp_assignment(self.assignment, reference, hypothesis) @classmethod def from_dict(cls, d): @@ -134,3 +109,57 @@ def greedy_di_cp_word_error_rate_multifile( ), reference, hypothesis, partial=partial ) + + +def apply_dicp_assignment( + assignment: 'list[int | str] | tuple[int | str]', + reference: 'list[list[str]] | dict[str, list[str]] | SegLST', + hypothesis: 'list[str] | dict[str] | SegLST', +): + """ + Apply DI-cp assignment so that the hypothesis streams match the reference streams. + + Computing the standard WER on the output of this function yields the same + result as the DI-cpWER on the input of this function. + + Arguments: + assignment: The assignment of hypothesis segments to the reference + streams. The length of the assignment must match the number of + segments in the hypothesis. The assignment is a list of stream + labels, one entry for each stream. + reference: Is passed thorugh unchanged but used to determine the format + of the hypothesis output if it is not SegLST. + hypothesis: The hypothesis segments. This can be a list of lists of + segments, or a SegLST object. If it is a SegLST object, the + "segment_index" field is used to group the segments, if present. + + >>> assignment = ('A', 'A', 'B') + >>> apply_dicp_assignment(assignment, {'A': 'a c', 'B': 'd e'}, ['a', 'c d', 'e']) + ({'A': 'a c', 'B': 'd e'}, {'A': ['a', 'c d'], 'B': ['e']}) + + >>> assignment = (0, 0, 1) + >>> apply_dicp_assignment(assignment, ['a c', 'd e'], ['a', 'c d', 'e']) + (['a c', 'd e'], [['a', 'c d'], ['e']]) + + >>> assignment = ('A', ) + >>> apply_dicp_assignment(assignment, {'A': 'b', 'B': 'c'}, ['a']) + ({'A': 'b', 'B': 'c'}, {'A': ['a'], 'B': []}) + + >>> ref = meeteval.io.STM.parse('X 1 A 0.0 1.0 a b\\nX 1 A 1.0 2.0 c d\\nX 1 B 0.0 2.0 e f\\n') + >>> hyp = meeteval.io.STM.parse('X 1 1 0.0 2.0 c d\\nX 1 0 0.0 2.0 a b e f\\n') + >>> ref, hyp = apply_dicp_assignment((0, 1, 1), hyp, ref) + >>> print(ref.dumps()) + X 1 1 0.0 2.0 c d + X 1 0 0.0 2.0 a b e f + + >>> print(hyp.dumps()) + X 1 0 0.0 1.0 a b + X 1 1 1.0 2.0 c d + X 1 1 0.0 2.0 e f + + """ + # The assignment is identical to the ORC assignment, but with + # reference and hypothesis swapped. + from meeteval.wer.wer.orc import apply_orc_assignment + hypothesis, reference = apply_orc_assignment(assignment, hypothesis, reference) + return reference, hypothesis