diff --git a/meeteval/viz/__main__.py b/meeteval/viz/__main__.py index ca895bed..6fe1686c 100644 --- a/meeteval/viz/__main__.py +++ b/meeteval/viz/__main__.py @@ -78,7 +78,7 @@ def create_viz_folder( file = out / f"{session_id}.html" file.write_text(indent(doc.getvalue())) - print(f'Wrote {file.absolute()}') + print(f'Wrote file://{file.absolute()}') ########################################################################### @@ -88,7 +88,7 @@ def create_viz_folder( def get_wer(v): error_rate = meeteval.wer.combine_error_rates(*[ meeteval.wer.ErrorRate.from_dict( - av.data['info']['wer']['hypothesis']) + av.data['info']['wer']) for av in v.values() ]).error_rate return f'{error_rate * 100:.2f} %' @@ -171,7 +171,7 @@ def get_wer(v): for (i, alignment), av in v.items(): with tag('td'): with tag('span', klass='number'): - wer = av.data['info']['wer']['hypothesis']['error_rate'] + wer = av.data['info']['wer']['error_rate'] doc.text(f"{wer * 100:.2f} %") doc.text(' (') with tag('a', href=f'{session_id}_{i}_{alignment}.html'): @@ -221,7 +221,7 @@ def get_wer(v): with open(out / "index.html", "w") as text_file: text_file.write(indent(doc.getvalue())) - print(f'Open {(out / "index.html").absolute()}') + print(f'Open file://{(out / "index.html").absolute()}') def html( diff --git a/meeteval/viz/visualize.css b/meeteval/viz/visualize.css index 01c14138..6a4ea88d 100644 --- a/meeteval/viz/visualize.css +++ b/meeteval/viz/visualize.css @@ -53,8 +53,13 @@ code { } .pill.warn { - background-color: #f3ebc9; - border: 1px solid #ffcc00; + background-color: #ffff00; + border: 1px solid #eed202; +} + +.pill.warn:hover:not(.no-border) { + background-color: #ffff00; + border: 1px solid #eed202; } .pill:hover:not(.no-border) { @@ -132,9 +137,9 @@ code { } /* Icons */ -i { +i, .icon { display: inline-block; - font-size: 1.2em; + /* font-size: 1.2em; */ margin-right: 5px; } diff --git a/meeteval/viz/visualize.js b/meeteval/viz/visualize.js index 1678550a..8396cd3c 100644 --- a/meeteval/viz/visualize.js +++ b/meeteval/viz/visualize.js @@ -276,6 +276,12 @@ function alignment_visualization( settings.font_size = 12; } + const constants = { + utteranceMarkerOverhang: 3, // Overhang (left and right) of the utterance begin and end markers in pixels + utteranceMarkerDepth: 6, // Depth (height) of the utterance marker bracket in pixels + minStitchOffset: 10, // Minimum distance of the kink in the stitching line to the word in pixels + }; + var urlParams = new URLSearchParams(window.location.search); if (settings.encodeURL && urlParams.has('minimaps')) { settings.minimaps.number = urlParams.get('minimaps') @@ -1072,76 +1078,76 @@ class CanvasPlot { label = (label, value, icon=null, tooltip=null) => { var l = root.append("div").classed("pill", true) - if (icon) l.append("div").html(icon); + if (icon) l.append("div").classed("icon", true).html(icon); l.append("div").classed("info-label", true).text(label); l.append("div").classed("info-value", true).text(value); if (tooltip) addTooltip(l, tooltip); return l; } + console.log(info.wer) + label("ID:", info.filename); label("Length:", info.length.toFixed(2) + "s"); - label("WER:", (info.wer.hypothesis.error_rate * 100).toFixed(2) + "%", null, c => { - if (Object.keys(info.wer).length == 1){ - const wer = info.wer.hypothesis; - const wer_by_speakers = info.wer_by_speakers.hypothesis; - const table = c.append("table").classed("wer-table", true); - const head = table.append("thead") - const hr1 = head.append("tr"); - hr1.append("th"); - hr1.append("th"); - hr1.append("th"); - 
hr1.append("th"); - - // Determine header from alignment type. If it contians orc, write by stream, otherwise, write by spekaer - let breakdownHeader; - if (info.alignment_type.includes("orc")) { - breakdownHeader = "Counts by Stream"; - } else { - breakdownHeader = "Counts by Speaker"; - } - - hr1.append("th").text(breakdownHeader).attr("colspan", Object.keys(wer_by_speakers).length).style("border-bottom", "1px solid white"); - - const hr = head.append("tr") - hr.append("th").text(""); - hr.append("th"); - hr.append("th").text("Count"); - hr.append("th").text("Relative"); - Object.keys(wer_by_speakers).forEach(speaker => { hr.append("th").text(speaker); }); - const body = table.append("tbody"); - const words = body.append("tr"); - words.append("td").text("Ref. Words"); - words.append("td"); - words.append("td").text(wer.length); - words.append("td").text("100.0%"); - Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { words.append("td").text(wer.length); }); - const correct = body.append("tr"); - correct.append("td").text("Correct"); - correct.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["correct"]); - correct.append("td").text(wer.length - wer.substitutions - wer.deletions); - correct.append("td").text(((wer.length - wer.substitutions - wer.deletions)/wer.length * 100).toFixed(1) + "%"); - Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { correct.append("td").text(wer.length - wer.substitutions - wer.deletions); }); - const substitution = body.append("tr"); - substitution.append("td").text("Substitution"); - substitution.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["substitution"]); - substitution.append("td").text(wer.substitutions); - substitution.append("td").text((wer.substitutions / wer.length * 100).toFixed(1) + "%"); - Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { substitution.append("td").text(wer.substitutions); }); - const insertion = body.append("tr"); - insertion.append("td").text("Insertion"); - insertion.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["insertion"]); - insertion.append("td").text(wer.insertions); - insertion.append("td").text((wer.insertions / wer.length * 100).toFixed(1) + "%"); - Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { insertion.append("td").text(wer.insertions); }); - const deletion = body.append("tr"); - deletion.append("td").text("Deletion"); - deletion.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["deletion"]); - deletion.append("td").text(wer.deletions); - deletion.append("td").text((wer.deletions / wer.length * 100).toFixed(1) + "%"); - Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { deletion.append("td").text(wer.deletions); }); - c.append("div").classed("tooltip-info", true).text("Note: Values don't add up to 100% (except when Insertion=0)\nRef. Words = Correct + Substitution + Deletion\nHyp. Words = Correct + Substitution + Insertion"); + label("WER:", (info.wer.error_rate * 100).toFixed(2) + "%", null, c => { + const wer = info.wer; + const wer_by_speakers = info.wer_by_speakers; + const table = c.append("table").classed("wer-table", true); + const head = table.append("thead") + const hr1 = head.append("tr"); + hr1.append("th"); + hr1.append("th"); + hr1.append("th"); + hr1.append("th"); + + // Determine header from alignment type. 
If it contains orc, write by stream; otherwise, write by speaker + let breakdownHeader; + if (info.alignment_type.includes("orc")) { + breakdownHeader = "Counts by Stream"; + } else { + breakdownHeader = "Counts by Speaker"; } + + hr1.append("th").text(breakdownHeader).attr("colspan", Object.keys(wer_by_speakers).length).style("border-bottom", "1px solid white"); + + const hr = head.append("tr") + hr.append("th").text(""); + hr.append("th"); + hr.append("th").text("Count"); + hr.append("th").text("Relative"); + Object.keys(wer_by_speakers).forEach(speaker => { hr.append("th").text(speaker); }); + const body = table.append("tbody"); + const words = body.append("tr"); + words.append("td").text("Ref. Words"); + words.append("td"); + words.append("td").text(wer.length); + words.append("td").text("100.0%"); + Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { words.append("td").text(wer.length); }); + const correct = body.append("tr"); + correct.append("td").text("Correct"); + correct.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["correct"]); + correct.append("td").text(wer.length - wer.substitutions - wer.deletions); + correct.append("td").text(((wer.length - wer.substitutions - wer.deletions)/wer.length * 100).toFixed(1) + "%"); + Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { correct.append("td").text(wer.length - wer.substitutions - wer.deletions); }); + const substitution = body.append("tr"); + substitution.append("td").text("Substitution"); + substitution.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["substitution"]); + substitution.append("td").text(wer.substitutions); + substitution.append("td").text((wer.substitutions / wer.length * 100).toFixed(1) + "%"); + Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { substitution.append("td").text(wer.substitutions); }); + const insertion = body.append("tr"); + insertion.append("td").text("Insertion"); + insertion.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["insertion"]); + insertion.append("td").text(wer.insertions); + insertion.append("td").text((wer.insertions / wer.length * 100).toFixed(1) + "%"); + Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { insertion.append("td").text(wer.insertions); }); + const deletion = body.append("tr"); + deletion.append("td").text("Deletion"); + deletion.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["deletion"]); + deletion.append("td").text(wer.deletions); + deletion.append("td").text((wer.deletions / wer.length * 100).toFixed(1) + "%"); + Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { deletion.append("td").text(wer.deletions); }); + c.append("div").classed("tooltip-info", true).text("Note: Values don't add up to 100% (except when Insertion=0)\nRef. Words = Correct + Substitution + Deletion\nHyp. Words = Correct + Substitution + Insertion"); }); label("Alignment:", info.alignment_type, null, c => c.append('div').classed('wrap-60', true).html("The alignment algorithm used to generate this visualization. Available are:" + @@ -1163,7 +1169,7 @@ class CanvasPlot { "Reference self-overlap:", (info.wer.reference_self_overlap.overlap_rate * 100).toFixed(2) + "%", icons["warning"], - c => c.append('div').classed('wrap-40').text("Self-overlap is the percentage of time that a speaker annotation overlaps with itself. 
" + + c => c.append('div').classed('wrap-40', true).text("Self-overlap is the percentage of time that a speaker annotation overlaps with itself. " + "On the reference, this is usually an indication for annotation errors.\n" + "Extreme self-overlap can lead to unexpected WERs!") ).classed("warn", true); @@ -1171,7 +1177,8 @@ class CanvasPlot { "Hypothesis self-overlap:", (info.wer.hypothesis_self_overlap.overlap_rate * 100).toFixed(2) + "%", icons["warning"], - c => c.append('div').classed('wrap-40').text("Self-overlap is the percentage of time that a speaker annotation overlaps with itself. " + + c => c.append('div').classed('wrap-40', true).text("Self-overlap is the percentage of time that a speaker annotation overlaps with itself. " + + "On the hypothesis, this often indicates systematic errors.\n" + "Extreme self-overlap can lead to unexpected WERs!") ).classed("warn", true); } @@ -1543,52 +1550,68 @@ class CanvasPlot { right_center_time: right.center_time, start_time: Math.min(left.center_time, right.center_time), end_time: Math.max(left.center_time, right.center_time), + left_utterance: utterances[left.utterance_index], + right_utterance: utterances[right.utterance_index], } }) }); this.filtered_matches = this.matches; + // Precompute utterance x positions and widths + this.precompute_utterance_positions = () => { + console.log("Precomputing utterance positions") + const match_width = settings.match_width * this.plot.x.bandwidth() / 2; + const columnwidth = this.plot.x.bandwidth() / 2 - match_width; + + this.utterances.forEach(u => { + let x = this.plot.x(u.speaker); + let width = columnwidth; + + if (u.source === "hypothesis") { + x += this.plot.x.bandwidth() / 2 + match_width; + } + + if (u.utterance_overlaps) { + width = columnwidth * u.overlap_width; + x = x + width * u.overlap_shift; + width = width - 2*constants.utteranceMarkerOverhang; + } + u.x = x; + u.width = width; + }) + + }; + this.plot.onSizeChanged(this.precompute_utterance_positions); + this.utteranceSelectListeners = []; // Plot label this.plot.element.append("div").classed("plot-label", true).style("margin-left", this.plot.y_axis_padding_html + "px").text("Detailed matching"); + // Click handler for selecting utterances const self = this; - this.last_utterance_candidates_index = -1 this.plot.element.on("click", (event) => { const screenX = event.layerX * self.plot.dpr; // convert from html px to canvas px const screenY = event.layerY * self.plot.dpr; // convert from html px to canvas px const y = self.plot.y.invert(screenY); - // invert x band scale - const match_width = settings.match_width * self.plot.x.bandwidth() / 2; - const eachBand = self.plot.x.step(); - const index = Math.floor((screenX - self.plot.y_axis_padding) / eachBand); - const speaker = self.plot.x.domain()[index]; - - const within_speaker_coord = screenX - self.plot.x(speaker); - const source = ( - within_speaker_coord < self.plot.x.bandwidth() / 2 - match_width - ? "reference" - : ( - within_speaker_coord > self.plot.x.bandwidth() / 2 + match_width - ? 
"hypothesis" - : null - ) - ); - - if (source) { - const utterance_candidates = this.filtered_utterances.filter( - u => u.start_time < y && u.end_time > y && u.speaker === speaker && u.source === source - ) - if (utterance_candidates.length > 0) { - self.last_utterance_candidates_index = (self.last_utterance_candidates_index+1) % utterance_candidates.length - selectSegment(utterance_candidates[self.last_utterance_candidates_index]); - } - else selectSegment(null); - } else selectSegment(null); + // Brute force go through all utterances and check if the click is inside + // Use the precomputed x and width values + // This should be fast enough since this.filtered_utterances contains only the visible utterances + // and this action is not performed frequently + const utterance_candidates = this.filtered_utterances.filter( + u => u.start_time < y && u.end_time > y && u.x <= screenX && u.x + u.width >= screenX + ) + if (utterance_candidates.length > 0) { + selectSegment(utterance_candidates[0]); + // With the current layout, utterances should never overlap. + // Log a warning if this happens + if (utterance_candidates.length > 1) console.warn("Multiple utterances selected. This should not happen.") + } + else selectSegment(null); }) + // Scrolling with a mouse wheel this.wheel_tracker = {} let deltaY = 0; let hitCount = 0; @@ -1696,6 +1719,9 @@ class CanvasPlot { drawDetails() { const filtered_words = this.filtered_words; + + // Draw help message and exit if the amount of displayed words is too high + // This would lead to a very slow rendering and any information would be lost due to the scale if (filtered_words.length > 3000) { this.plot.context.font = `${30 * plot.dpr}px Arial`; this.plot.context.textAlign = "center"; @@ -1704,48 +1730,49 @@ class CanvasPlot { this.plot.context.fillText("Zoom in or select a smaller region in the minimap above", this.plot.width / 2, this.plot.height / 2); return; } + + // Precompute constants required later const filtered_utterances = this.filtered_utterances; const context = this.plot.context; - const draw_text = filtered_words.length < 400; const draw_boxes = filtered_words.length < 1000; const draw_utterance_markers = filtered_words.length < 2000; const match_width = settings.match_width * this.plot.x.bandwidth() / 2; - const stitch_offset = Math.min(10, match_width / 2); + const stitch_offset = Math.min(constants.minStitchOffset, match_width / 2); const rectwidth = this.plot.x.bandwidth() / 2 - match_width; - const bandwidth = this.plot.x.bandwidth() / 2; - // Draw background + // Draw background: Gray bands for each speaker + const min_y = this.plot.y.range()[0]; + const plot_height = this.plot.y.range()[1] - this.plot.y.range()[0]; + const width = this.plot.x.bandwidth(); for (let i = 0; i < this.plot.x.domain().length; i++) { const speaker = this.plot.x.domain()[i]; - const y = this.plot.y.range()[0]; const x = this.plot.x(speaker); - const width = this.plot.x.bandwidth(); - const height = this.plot.y.range()[1] - this.plot.y.range()[0]; context.fillStyle = "#eee"; - context.fillRect(x, y, width, height); + context.fillRect(x, min_y, width, plot_height); } - // Draw utterance begin/end markers - if (draw_utterance_markers) { - context.strokeStyle = "black"; - context.lineWidth = .1; + // Draw red lines in the background where the the selected segment + // starts and ends + if (draw_utterance_markers && this.state.selectedSegment) { + const [minX, maxX] = this.plot.x.range(); + context.lineWidth = .5; + context.strokeStyle = 'red'; - if 
(this.state.selectedSegment) { - const [minX, maxX] = this.plot.x.range(); - context.lineWidth = .5; - context.strokeStyle = 'red'; - var y = this.plot.y(this.state.selectedSegment.start_time) - 1; - context.beginPath(); - context.moveTo(minX, y); - context.lineTo(maxX, y); - y = this.plot.y(this.state.selectedSegment.end_time) + 1; - context.moveTo(minX, y); - context.lineTo(maxX, y); - context.stroke(); - } + // Start point + var y = this.plot.y(this.state.selectedSegment.start_time) - 1; + context.beginPath(); + context.moveTo(minX, y); + context.lineTo(maxX, y); + + // End point + y = this.plot.y(this.state.selectedSegment.end_time) + 1; + context.moveTo(minX, y); + context.lineTo(maxX, y); + context.stroke(); } + // Draw markers. This feature is not yet fully supported const filtered_markers = this.filtered_markers; // Draw a range marker on the left side of the plot with two lines spanning the full width if (filtered_markers) filtered_markers.forEach(m => { @@ -1779,42 +1806,45 @@ class CanvasPlot { // Draw word boxes filtered_words.forEach(d => { - const bandleft = this.plot.x(d.speaker); - let rectleft = bandleft; - if (d.source === "hypothesis") rectleft += bandwidth + match_width; + // Compute the actual horizontal position and width of the box + // considering overlaps with other utterances + const utterance = this.utterances[d['utterance_index']]; + // Fill the box with the color of the match if (d.matches?.length > 0 || d.highlight) { context.beginPath(); context.rect( - rectleft, + utterance.x, this.plot.y(d.start_time), - rectwidth, + utterance.width, this.plot.y(d.end_time) - this.plot.y(d.start_time)); if (d.highlight) context.fillStyle = settings.colors.highlight; else context.fillStyle = settings.colors[d.matches[0][1]]; } - context.fill(); + + // Draw box border context.strokeStyle = "gray"; context.lineWidth = 2; if (draw_boxes) context.stroke(); - // Stitches for insertion / deletion + // Draw (stub) stitches for insertion / deletion + // These do not connect to other words, but are drawn as a straight line + // ending in the space between reference and hypothesis if (d.matches?.length > 0) { - // TODO: support multiple matches const [match_index, match_type] = d.matches[0]; context.beginPath(); context.lineWidth = 2; context.strokeStyle = settings.colors[match_type]; if (match_type === 'insertion') { const y = this.plot.y(d.center_time); - context.moveTo(rectleft, y); - context.lineTo(rectleft - stitch_offset, y); + context.moveTo(utterance.x, y); + context.lineTo(utterance.x - stitch_offset, y); } else if (match_type === 'deletion') { const y = this.plot.y(d.center_time); - context.moveTo(rectleft + rectwidth, y); - context.lineTo(rectleft + rectwidth + stitch_offset, y); + context.moveTo(utterance.x + utterance.width, y); + context.lineTo(utterance.x + utterance.width + stitch_offset, y); } context.stroke(); } @@ -1827,59 +1857,56 @@ class CanvasPlot { context.beginPath(); const bandleft = this.plot.x(m.speaker); context.strokeStyle = settings.colors[m.match_type]; - context.moveTo(bandleft + rectwidth, this.plot.y(m.left_center_time)); + context.moveTo(m.left_utterance.x + m.left_utterance.width, this.plot.y(m.left_center_time)); context.lineTo(bandleft + rectwidth + stitch_offset, this.plot.y(m.left_center_time)); context.lineTo(bandleft + rectwidth + 2 * match_width - stitch_offset, this.plot.y(m.right_center_time)); - context.lineTo(bandleft + rectwidth + 2 * match_width, this.plot.y(m.right_center_time)); + context.lineTo(m.right_utterance.x, 
this.plot.y(m.right_center_time)); context.stroke(); }); // Draw word text + // This is done after the stitches so that the text is on top even if stitches or boxes overlap context.font = `${settings.font_size * this.plot.dpr}px Arial`; context.textAlign = "center"; context.textBaseline = "middle"; context.lineWidth = 1; if (draw_text) filtered_words.forEach(d => { - const bandleft = this.plot.x(d.speaker); - let rectleft = bandleft; - if (d.source === "hypothesis") rectleft += bandwidth + match_width; - - rectleft += rectwidth / 2; + const utterance = this.utterances[d['utterance_index']]; + let x = utterance.x + utterance.width / 2; // Center of the utterance let y_ = this.plot.y((d.start_time + d.end_time) / 2); if (d.matches === undefined) context.fillStyle = "gray"; else context.fillStyle = '#000'; - context.fillText(d.words, rectleft, y_); + context.fillText(d.words, x, y_); }) // Draw utterance begin and end markers - const markerLength = 6; - const markerOverhang = 3; + // This is done after drawing the word boxes so that the markers are visible + // even when the word boxes are crowded + const markerDepth = constants.utteranceMarkerDepth; + const markerOverhang = constants.utteranceMarkerOverhang; if (draw_utterance_markers) filtered_utterances.forEach(d => { context.strokeStyle = "black"; context.lineWidth = 1.5; context.beginPath(); // x is the left side of the marker - var x = this.plot.x(d.speaker); - const bandwidth = this.plot.x.bandwidth() / 2 - match_width; - if (d.source == "hypothesis") { - x += bandwidth + 2*match_width; - } + const x = d.x; + const width = d.width; // Begin marker var y = this.plot.y(d.start_time) - 1; - context.moveTo(x - markerOverhang, y + markerLength); + context.moveTo(x - markerOverhang, y + markerDepth); context.lineTo(x - markerOverhang, y); - context.lineTo(x + bandwidth + markerOverhang, y); - context.lineTo(x + bandwidth + markerOverhang, y + markerLength); + context.lineTo(x + width + markerOverhang, y); + context.lineTo(x + width + markerOverhang, y + markerDepth); // End marker y = this.plot.y(d.end_time) + 1; - context.moveTo(x - markerOverhang, y - markerLength); + context.moveTo(x - markerOverhang, y - markerDepth); context.lineTo(x - markerOverhang, y); - context.lineTo(x + bandwidth + markerOverhang, y); - context.lineTo(x + bandwidth + markerOverhang, y - markerLength); + context.lineTo(x + width + markerOverhang, y); + context.lineTo(x + width + markerOverhang, y - markerDepth); context.stroke(); // Draw marker that text is empty @@ -1889,7 +1916,7 @@ class CanvasPlot { context.textBaseline = "middle"; context.strokeStyle = "lightgray"; context.linewidth = 1; - const x_ = x + bandwidth / 2; + const x_ = x + d.width / 2; context.font = `italic ${settings.font_size * this.plot.dpr}px Arial`; context.fillStyle = "gray"; context.fillText('(empty segment)', x_, (this.plot.y(d.start_time) + this.plot.y(d.end_time)) / 2); @@ -1899,11 +1926,10 @@ class CanvasPlot { // Draw boundary around the selected utterance if (this.state.selectedSegment) { const d = this.state.selectedSegment; - const x = this.plot.x(d.speaker) + (d.source === "hypothesis" ? 
bandwidth + match_width : 0); context.beginPath(); context.strokeStyle = "red"; context.lineWidth = 3; - context.rect(x, this.plot.y(d.start_time), rectwidth, this.plot.y(d.end_time) - this.plot.y(d.start_time)); + context.rect(d.x, this.plot.y(d.start_time), d.width, this.plot.y(d.end_time) - this.plot.y(d.start_time)); context.stroke(); // Write begin time above begin marker @@ -1911,11 +1937,11 @@ class CanvasPlot { context.fillStyle = "gray"; context.textAlign = "center"; context.textBaseline = "bottom"; - context.fillText(`begin time: ${d.start_time.toFixed(2)}`, x + rectwidth / 2, this.plot.y(d.start_time) - 3); + context.fillText(`begin time: ${d.start_time.toFixed(2)}`, d.x + d.width / 2, this.plot.y(d.start_time) - 3); // Write end time below end marker context.textBaseline = "top"; - context.fillText(`end time: ${d.end_time.toFixed(2)}`, x + rectwidth / 2, this.plot.y(d.end_time) + 3); + context.fillText(`end time: ${d.end_time.toFixed(2)}`, d.x + d.width / 2, this.plot.y(d.end_time) + 3); } } diff --git a/meeteval/viz/visualize.py b/meeteval/viz/visualize.py index d165f64b..d077df64 100644 --- a/meeteval/viz/visualize.py +++ b/meeteval/viz/visualize.py @@ -1,7 +1,6 @@ import logging import os -import json - +from meeteval.wer.wer.utils import check_single_filename import urllib.request import meeteval @@ -10,7 +9,6 @@ logging.basicConfig(level=logging.ERROR) import dataclasses import functools -import shutil import uuid from pathlib import Path @@ -100,80 +98,13 @@ def nested_round(obj): raise TypeError(path) -def get_wer(t: SegLST, assignment_type, collar=5, hypothesis_key='hypothesis'): - """ - Compute the WER with the given assignment type and collar between the segments with `s['source'] = 'reference'` - and `s['source'] = hypothesis_key`. - """ - ref = t.filter(lambda s: s['source'] == 'reference') - hyp = t.filter(lambda s: s['source'] == hypothesis_key) - if assignment_type == 'cp': - from meeteval.wer.wer.cp import cp_word_error_rate - wer = cp_word_error_rate(ref, hyp) - elif assignment_type in ('tcp', 'ditcp'): - from meeteval.wer.wer.time_constrained import time_constrained_minimum_permutation_word_error_rate - # The visualization looks wrong if we don't sort segments - wer = time_constrained_minimum_permutation_word_error_rate( - ref, hyp, - collar=collar, - reference_sort='segment', - hypothesis_sort='segment', - reference_pseudo_word_level_timing='character_based', - hypothesis_pseudo_word_level_timing='character_based_points', - ) - else: - raise ValueError(assignment_type) - return wer - - -def apply_assignment(assignment, d: SegLST, source_key='hypothesis'): - """ - Apply the assignment to the given SegLST by replacing the "speaker" key of the hypothesis. - """ - # Both ref and hyp key can be missing or None - # This can happen when the filter function excludes a speaker completely - # TODO: Find a good way to name these and adjust apply_cp_assignment accordingly - assignment = dict( - ((b, a if a is not None else f'[{b}]') - for a, b in assignment) - ) - # We only want to change the labels for the hypothesis. 
This way, we can easily - # apply this function to the full set of words - return d.map( - lambda w: - {**w, 'speaker': assignment.get(w['speaker'], f"[{w['speaker']}]")} - if w.get('source', None) == source_key - else w - ) - -# def get_diarization_invariant_alignment(ref: SegLST, hyp: SegLST, collar=5): -# from meet_eval.dicpwer.dicp import greedy_di_tcp_error_rate -# words, _ = get_alignment(ref, hyp, 'tcp', collar=collar) -# -# wer = greedy_di_tcp_error_rate( -# list(ref.groupby('speaker').values()), -# [[[vv] for vv in v] for v in (ref.groupby('speaker')).values()], -# collar=collar -# ) -# -# hyp = wer.apply_assignment(sorted(hyp, key=lambda x: x['start_time'])) -# hyp = [ -# {**l, 'speaker2': k, } -# for k, v in hyp.items() -# for l in v -# ] -# -# _, alignment = get_alignment(ref, hyp, 'tcp', collar=collar) -# return words, alignment - - -def get_alignment(data, alignment_type, collar=5, hypothesis_key='hypothesis'): +def get_alignment(data, alignment_type, collar=5): # Extract hyps and ref from data. They have been merged earlier for easier processing - hyp = data.filter(lambda s: s['source'] == hypothesis_key) + hyp = data.filter(lambda s: s['source'] == 'hypothesis') ref = data.filter(lambda s: s['source'] == 'reference') - if alignment_type == 'cp': + if alignment_type == 'levenshtein': from meeteval.wer.wer.time_constrained import align # Set the collar large enough that all words overlap with all other words min_time = min(map(lambda x: x['start_time'], data)) @@ -188,7 +119,7 @@ def get_alignment(data, alignment_type, collar=5, hypothesis_key='hypothesis'): hypothesis_sort=False, style='seglst', ) - elif alignment_type == 'tcp': + elif alignment_type == 'time_constrained': from meeteval.wer.wer.time_constrained import align align = functools.partial( align, @@ -200,11 +131,8 @@ def get_alignment(data, alignment_type, collar=5, hypothesis_key='hypothesis'): hypothesis_sort=False, style='seglst', ) - elif alignment_type == 'ditcp': - raise NotImplementedError() - # return get_diarization_invariant_alignment(ref, hyp, collar=collar) else: - raise ValueError(alignment_type) + raise NotImplementedError(alignment_type) # Compute alignment and extract words ref = ref.sorted('start_time').groupby('speaker') @@ -237,69 +165,126 @@ def get_alignment(data, alignment_type, collar=5, hypothesis_key='hypothesis'): r.setdefault('matches', []).append((h['word_index'], 'substitution')) -def get_visualization_data(ref: SegLST, *hyp: SegLST, assignment='tcp', alignment_transform=None): +def solve_stream_assignment(ref, hyp, assignment): + """ + Computes the word error rate and applies the assignment to the reference and hypothesis. 
+ """ + if assignment == 'cp': + wer = meeteval.wer.wer.cp.cp_word_error_rate(ref, hyp) + ref, hyp = wer.apply_assignment(ref, hyp) + elif assignment == 'tcp': + wer = meeteval.wer.wer.time_constrained.time_constrained_minimum_permutation_word_error_rate( + ref, hyp, + collar=5, + reference_sort='segment', + hypothesis_sort='segment', + reference_pseudo_word_level_timing='character_based', + hypothesis_pseudo_word_level_timing='character_based_points', + ) + ref, hyp = wer.apply_assignment(ref, hyp) + elif assignment == 'tcorc': + wer = meeteval.wer.wer.time_constrained_orc.time_constrained_orc_wer( + ref, hyp, + collar=5, + reference_sort='segment', + hypothesis_sort='segment', + reference_pseudo_word_level_timing='character_based', + hypothesis_pseudo_word_level_timing='character_based_points', + ) + ref, hyp = wer.apply_assignment(ref, hyp) + elif assignment == 'orc': + wer = meeteval.wer.wer.orc.orc_word_error_rate(ref, hyp) + ref, hyp = wer.apply_assignment(ref, hyp) + else: + raise ValueError(assignment) + return wer, ref, hyp + + +def add_overlap_shift(utterances: SegLST): + """ + Adds the keys "overlap_shift" and "overlap_width" to each utterance. These + values are used to determine the width and horizontal position of each + utterance in the visualization such that they do not overlap visually, even if they + overlap temporally. + """ + for utterance in utterances: + # Find any other overlapping utterances + # TODO: Make this search more efficient + overlaps = [] + for other_utterance in utterances[:utterance['utterance_index']][::-1]: + if other_utterance['end_time'] > utterance['start_time']: + if other_utterance['source'] == utterance['source'] and other_utterance['speaker'] == utterance['speaker']: + overlaps.append(other_utterance['utterance_index']) + other_utterance['utterance_overlaps'].append(utterance['utterance_index']) + + # Compute shifts from the overlaps such that the utterances don't overlap + # This is a greedy approach that works well for most cases + utterance['utterance_overlaps'] = overlaps + if overlaps: + shifts = [ + utterances[o]['overlap_shift'] + for o in overlaps + ] + for shift in range(len(shifts) + 1): + if shift not in shifts: + break + utterance['overlap_shift'] = shift + else: + utterance['overlap_shift'] = 0 + + # Compute the width for each (sub)column and assign it to the utterance + # This should result in the largest possible width for each utterance + # such that no two utterances overlap + for utterance in utterances: + utterance['num_columns'] = max([utterances[o]['overlap_shift'] for o in utterance['utterance_overlaps']] + [utterance['overlap_shift']]) + 1 + + for utterance in utterances.sorted(lambda x: -x['num_columns']): + num_columns = max([utterances[o]['num_columns'] for o in utterance['utterance_overlaps']] + [utterance['num_columns']]) + + width = 1 / num_columns + utterance['overlap_width'] = width + + + +def get_visualization_data(ref: SegLST, hyp: SegLST, assignment='tcp', alignment_transform=None): + """ + Generates the data structure as required by the visualization frontend. + + Solves the stream assignment problem and computes the alignment between the reference and hypothesis. + Then, computes additional useful information for display in the visualization. 
+ """ ref = asseglst(ref) - hyp = [asseglst(h) for h in hyp] + hyp = asseglst(hyp) + check_single_filename(ref, hyp) data = { 'info': { 'filename': ref[0]['session_id'], 'alignment_type': assignment, - 'length': max([e['end_time'] for e in hyp[0] + ref]) - min([e['start_time'] for e in hyp[0] + ref]), + 'length': max([e['end_time'] for e in hyp + ref]) - min([e['start_time'] for e in hyp + ref]), } } - # Solve assignment when assignment is tcorc or orc - if assignment == 'tcorc': - assert len(hyp) == 1, len(hyp) - from meeteval.wer.wer.time_constrained_orc import time_constrained_orc_wer - # The visualization looks wrong if we don't sort segments - wer = time_constrained_orc_wer( - ref, *hyp, - collar=5, - reference_sort='segment', - hypothesis_sort='segment', - reference_pseudo_word_level_timing='character_based', - hypothesis_pseudo_word_level_timing='character_based_points', - ) - ref, hyp = wer.apply_assignment(ref, *hyp) - hyp = (hyp,) - assignment = 'tcp' - elif assignment == 'orc': - assert len(hyp) == 1, len(hyp) - from meeteval.wer.wer.orc import orc_word_error_rate - wer = orc_word_error_rate(ref, *hyp) - ref, hyp = wer.apply_assignment(ref, *hyp) - hyp = (hyp,) - assignment = 'cp' - - assert len(hyp) > 0, hyp + # Get and apply stream assignment + wer, ref, hyp = solve_stream_assignment(ref, hyp, assignment) + align_type = 'time_constrained' if assignment in ['tcp', 'tcorc'] else 'levenshtein' + if alignment_transform is None: alignment_transform = lambda x: x - ref_session_ids = set(ref.T['session_id']) - for h in hyp: - hyp_session_ids = set(h.T['session_id']) - assert 1 == len(ref_session_ids) and ref_session_ids == hyp_session_ids, f'Expect a single session ID/filename and the same for reference an hypothesis, got {ref_session_ids} and {hyp_session_ids}.' - # Add information about ref/hyp to each utterance ref = ref.map(lambda s: {**s, 'source': 'reference'}) - # TODO: how to encode hypothesis correctly? I want to be able to name them from outside. - # Use a new key, "system_name"? - if len(hyp) > 1: - hypothesis_keys = [f'hypothesis-{i}' for i in range(len(hyp))] - else: - hypothesis_keys = ['hypothesis'] - hyp = SegLST.merge(*[ - h.map(lambda s: {**s, 'source': hypothesis_keys[i]}) - for i, h in enumerate(hyp) - ]) + hyp = hyp.map(lambda s: {**s, 'source': 'hypothesis'}) u = ref + hyp # Sort by begin time. Otherwise, the alignment will be unintuitive and likely not what the user wanted u = u.sorted('start_time') + # Add utterance index + for i, utterance in enumerate(u): + utterance['utterance_index'] = i + # Convert to words so that the transformation can be applied w = get_pseudo_word_level_timings(u, 'character_based') w = w.map(lambda w: {**w, 'words': call_with_args(alignment_transform, w), 'original_words': w['words']}) @@ -308,20 +293,11 @@ def get_visualization_data(ref: SegLST, *hyp: SegLST, assignment='tcp', alignmen ignored_words = w.filter(lambda s: not s['words']) # .map(lambda s: {**s, 'match_type': 'ignored'}) w = w.filter(lambda s: s['words']) - # Get assignment using the word-level timestamps and filtered data - wers = {} - for k in hypothesis_keys: - wer = wers[k] = get_wer(w, assignment, collar=5, hypothesis_key=k) - u = apply_assignment(wer.assignment, u, source_key=k) - w = apply_assignment(wer.assignment, w, source_key=k) - ignored_words = apply_assignment(wer.assignment, ignored_words, source_key=k) - # Get the alignment using the filtered data. 
Add ignored words for visualization # Add running word index used by the alignment to refer to different words for i, word in enumerate(w): word['word_index'] = i - for k in hypothesis_keys: - get_alignment(w, assignment, collar=5, hypothesis_key=k) + get_alignment(w, align_type, collar=5) words = w + ignored_words # Map back to original_words @@ -345,6 +321,7 @@ def get_visualization_data(ref: SegLST, *hyp: SegLST, assignment='tcp', alignmen 'speaker', 'start_time', 'duration', + 'utterance_index', ] } def compress(m): @@ -365,18 +342,20 @@ def compress(m): else: data['words'] = words.segments + add_overlap_shift(u) + # Add utterances to data. Add total number of words to each utterance data['utterances'] = [{**l, 'total': len(l['words'].split())} for l in u] - data['info']['wer'] = {k: dataclasses.asdict(wer) for k, wer in wers.items()} + data['info']['wer'] = dataclasses.asdict(wer) - def wer_by_speaker(hypothesis_key, speaker): + def wer_by_speaker(speaker): # Get all words from this speaker words_ = words.filter(lambda s: s['speaker'] == speaker) # Get all hypothesis words. From this we can find the number of insertions, substitutions and correct matches. # Ignore any words that are not matched (i.e., don't have a "matches" key) - hyp_words = words_.filter(lambda s: s['source'] == k and 'matches' in s) + hyp_words = words_.filter(lambda s: s['source'] == 'hypothesis' and 'matches' in s) insertions = len(hyp_words.filter(lambda s: s['matches'][0][1] == 'insertion')) substitutions = len(hyp_words.filter(lambda s: s['matches'][0][1] == 'substitution')) # correct = len(hyp_words.filter(lambda s: s['matches'][0][1] == 'correct')) @@ -386,7 +365,7 @@ def wer_by_speaker(hypothesis_key, speaker): # The number of deletions is the number of reference words that are not matched with a hypothesis word. ref_words = words_.filter(lambda s: s['source'] == 'reference' and 'matches' in s) deletions = len(ref_words.filter( - lambda s: not [w for w, _ in s['matches'] if w is not None and words[w]['source'] == hypothesis_key])) + lambda s: not [w for w, _ in s['matches'] if w is not None and words[w]['source'] == 'hypothesis'])) return dataclasses.asdict(ErrorRate( errors=insertions + deletions + substitutions, @@ -399,11 +378,8 @@ def wer_by_speaker(hypothesis_key, speaker): )) data['info']['wer_by_speakers'] = { - k: { - speaker: wer_by_speaker(k, speaker) - for speaker in list(ref.unique('speaker')) - } - for k in hypothesis_keys + speaker: wer_by_speaker(speaker) + for speaker in list(ref.unique('speaker')) } return data @@ -625,7 +601,7 @@ def load_cdn(name, url): function exec() {{ // Wait for d3 to load if (typeof d3 !== 'undefined') alignment_visualization( - {dumps_json(self.data, indent=None, sort_keys=False, separators=(',', ':'), float_round=4)}, + {dumps_json(self.data, indent=1 if self.js_debug else None, sort_keys=False, separators=(',', ':'), float_round=4)}, "#{element_id}", {{ colors: {self._get_colormap()},
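A minimal usage sketch of the reworked single-hypothesis entry point from this patch. The STM paths are placeholders and meeteval.io.load is assumed as the loader; get_visualization_data, the assignment options handled by solve_stream_assignment, and the flat info['wer'] layout are taken from the diff above.

    import meeteval
    from meeteval.viz.visualize import get_visualization_data

    # Placeholder inputs: one reference and exactly one hypothesis for the
    # same single session (enforced by check_single_filename).
    ref = meeteval.io.load('ref.stm')
    hyp = meeteval.io.load('hyp.stm')

    # 'assignment' selects the stream assignment algorithm solved in
    # solve_stream_assignment: 'cp', 'tcp', 'orc' or 'tcorc'.
    data = get_visualization_data(ref, hyp, assignment='tcp')

    # info['wer'] is now a flat ErrorRate dict (no 'hypothesis' sub-key),
    # matching the simplified lookups in visualize.js and __main__.py.
    print(data['info']['wer']['error_rate'])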