Skip to content

Commit

Permalink
Allow user-supplied segment_index in ORC and tcORC
Browse files (browse the repository at this point in the history)
  • Loading branch information
thequilo committed Apr 11, 2024
1 parent 7e757de commit 71eda4f
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 4 deletions.
21 changes: 19 additions & 2 deletions meeteval/wer/wer/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,25 @@ def orc_word_error_rate(reference, hypothesis):
# come from the same segment. Do this before removing empty segments so that
# we can still find the original segments in the reference after the
# assignment.
for i, s in enumerate(reference):
s['segment_index'] = i
# If the input already contains a "segment_index" key, we use that because
# the user might have already assigned the segment index and split words
if 'segment_index' not in reference.T.keys():
reference = meeteval.io.SegLST([
{**s, 'segment_index': i}
for i, s in enumerate(reference)
])
else:
# Make sure that the user-supplied segment indices are correct.
# They should either be unique or every segment should contain only
# a single word
if (
len(reference) != len(reference.unique('segment_index'))
and any([' ' in s['words'] for s in reference])
):
raise ValueError(
'If supplied, the "segment_index" key must be unique or every '
'segment must contain only a single word.'
)

# Group by stream. For ORC-WER, only hypothesis must be grouped
hypothesis = hypothesis.groupby('speaker')
Expand Down
21 changes: 19 additions & 2 deletions meeteval/wer/wer/time_constrained_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,25 @@ def time_constrained_orc_wer(
# come from the same segment. Do this before removing empty segments so that
# we can still find the original segments in the reference after the
# assignment.
for i, s in enumerate(reference):
s['segment_index'] = i
# If the input already contains a "segment_index" key, we use that because
# the user might have already assigned the segment index and split words
if 'segment_index' not in reference.T.keys():
reference = meeteval.io.SegLST([
{**s, 'segment_index': i}
for i, s in enumerate(reference)
])
else:
# Make sure that the user-supplied segment indices are correct.
# They should either be unique or every segment should contain only
# a single word
if (
len(reference) != len(reference.unique('segment_index'))
and any([' ' in s['words'] for s in reference])
):
raise ValueError(
'If supplied, the "segment_index" key must be unique or every '
'segment must contain only a single word.'
)

# Group by stream. For ORC-WER, only hypothesis must be grouped
hypothesis = hypothesis.groupby('speaker')
Expand Down

0 comments on commit 71eda4f

Please sign in to comment.