diff --git a/meeteval/viz/__main__.py b/meeteval/viz/__main__.py index ca895bed..6fe1686c 100644 --- a/meeteval/viz/__main__.py +++ b/meeteval/viz/__main__.py @@ -78,7 +78,7 @@ def create_viz_folder( file = out / f"{session_id}.html" file.write_text(indent(doc.getvalue())) - print(f'Wrote {file.absolute()}') + print(f'Wrote file://{file.absolute()}') ########################################################################### @@ -88,7 +88,7 @@ def create_viz_folder( def get_wer(v): error_rate = meeteval.wer.combine_error_rates(*[ meeteval.wer.ErrorRate.from_dict( - av.data['info']['wer']['hypothesis']) + av.data['info']['wer']) for av in v.values() ]).error_rate return f'{error_rate * 100:.2f} %' @@ -171,7 +171,7 @@ def get_wer(v): for (i, alignment), av in v.items(): with tag('td'): with tag('span', klass='number'): - wer = av.data['info']['wer']['hypothesis']['error_rate'] + wer = av.data['info']['wer']['error_rate'] doc.text(f"{wer * 100:.2f} %") doc.text(' (') with tag('a', href=f'{session_id}_{i}_{alignment}.html'): @@ -221,7 +221,7 @@ def get_wer(v): with open(out / "index.html", "w") as text_file: text_file.write(indent(doc.getvalue())) - print(f'Open {(out / "index.html").absolute()}') + print(f'Open file://{(out / "index.html").absolute()}') def html( diff --git a/meeteval/viz/visualize.css b/meeteval/viz/visualize.css index 01c14138..6a4ea88d 100644 --- a/meeteval/viz/visualize.css +++ b/meeteval/viz/visualize.css @@ -53,8 +53,13 @@ code { } .pill.warn { - background-color: #f3ebc9; - border: 1px solid #ffcc00; + background-color: #ffff00; + border: 1px solid #eed202; +} + +.pill.warn:hover:not(.no-border) { + background-color: #ffff00; + border: 1px solid #eed202; } .pill:hover:not(.no-border) { @@ -132,9 +137,9 @@ code { } /* Icons */ -i { +i, .icon { display: inline-block; - font-size: 1.2em; + /* font-size: 1.2em; */ margin-right: 5px; } diff --git a/meeteval/viz/visualize.js b/meeteval/viz/visualize.js index 1678550a..8396cd3c 100644 --- a/meeteval/viz/visualize.js +++ b/meeteval/viz/visualize.js @@ -276,6 +276,12 @@ function alignment_visualization( settings.font_size = 12; } + const constants = { + utteranceMarkerOverhang: 3, // Overhang (left and right) of the utterance begin and end markers in pixels + utteranceMarkerDepth: 6, // Depth (height) of the utterance marker bracket in pixels + minStitchOffset: 10, // Minimum distance of the kink in the stitching line to the word in pixels + }; + var urlParams = new URLSearchParams(window.location.search); if (settings.encodeURL && urlParams.has('minimaps')) { settings.minimaps.number = urlParams.get('minimaps') @@ -1072,76 +1078,76 @@ class CanvasPlot { label = (label, value, icon=null, tooltip=null) => { var l = root.append("div").classed("pill", true) - if (icon) l.append("div").html(icon); + if (icon) l.append("div").classed("icon", true).html(icon); l.append("div").classed("info-label", true).text(label); l.append("div").classed("info-value", true).text(value); if (tooltip) addTooltip(l, tooltip); return l; } + console.log(info.wer) + label("ID:", info.filename); label("Length:", info.length.toFixed(2) + "s"); - label("WER:", (info.wer.hypothesis.error_rate * 100).toFixed(2) + "%", null, c => { - if (Object.keys(info.wer).length == 1){ - const wer = info.wer.hypothesis; - const wer_by_speakers = info.wer_by_speakers.hypothesis; - const table = c.append("table").classed("wer-table", true); - const head = table.append("thead") - const hr1 = head.append("tr"); - hr1.append("th"); - hr1.append("th"); - hr1.append("th"); - 
hr1.append("th"); - - // Determine header from alignment type. If it contians orc, write by stream, otherwise, write by spekaer - let breakdownHeader; - if (info.alignment_type.includes("orc")) { - breakdownHeader = "Counts by Stream"; - } else { - breakdownHeader = "Counts by Speaker"; - } - - hr1.append("th").text(breakdownHeader).attr("colspan", Object.keys(wer_by_speakers).length).style("border-bottom", "1px solid white"); - - const hr = head.append("tr") - hr.append("th").text(""); - hr.append("th"); - hr.append("th").text("Count"); - hr.append("th").text("Relative"); - Object.keys(wer_by_speakers).forEach(speaker => { hr.append("th").text(speaker); }); - const body = table.append("tbody"); - const words = body.append("tr"); - words.append("td").text("Ref. Words"); - words.append("td"); - words.append("td").text(wer.length); - words.append("td").text("100.0%"); - Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { words.append("td").text(wer.length); }); - const correct = body.append("tr"); - correct.append("td").text("Correct"); - correct.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["correct"]); - correct.append("td").text(wer.length - wer.substitutions - wer.deletions); - correct.append("td").text(((wer.length - wer.substitutions - wer.deletions)/wer.length * 100).toFixed(1) + "%"); - Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { correct.append("td").text(wer.length - wer.substitutions - wer.deletions); }); - const substitution = body.append("tr"); - substitution.append("td").text("Substitution"); - substitution.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["substitution"]); - substitution.append("td").text(wer.substitutions); - substitution.append("td").text((wer.substitutions / wer.length * 100).toFixed(1) + "%"); - Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { substitution.append("td").text(wer.substitutions); }); - const insertion = body.append("tr"); - insertion.append("td").text("Insertion"); - insertion.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["insertion"]); - insertion.append("td").text(wer.insertions); - insertion.append("td").text((wer.insertions / wer.length * 100).toFixed(1) + "%"); - Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { insertion.append("td").text(wer.insertions); }); - const deletion = body.append("tr"); - deletion.append("td").text("Deletion"); - deletion.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["deletion"]); - deletion.append("td").text(wer.deletions); - deletion.append("td").text((wer.deletions / wer.length * 100).toFixed(1) + "%"); - Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { deletion.append("td").text(wer.deletions); }); - c.append("div").classed("tooltip-info", true).text("Note: Values don't add up to 100% (except when Insertion=0)\nRef. Words = Correct + Substitution + Deletion\nHyp. Words = Correct + Substitution + Insertion"); + label("WER:", (info.wer.error_rate * 100).toFixed(2) + "%", null, c => { + const wer = info.wer; + const wer_by_speakers = info.wer_by_speakers; + const table = c.append("table").classed("wer-table", true); + const head = table.append("thead") + const hr1 = head.append("tr"); + hr1.append("th"); + hr1.append("th"); + hr1.append("th"); + hr1.append("th"); + + // Determine header from alignment type. 
If it contains orc, write by stream; otherwise, write by speaker + let breakdownHeader; + if (info.alignment_type.includes("orc")) { + breakdownHeader = "Counts by Stream"; + } else { + breakdownHeader = "Counts by Speaker"; } + + hr1.append("th").text(breakdownHeader).attr("colspan", Object.keys(wer_by_speakers).length).style("border-bottom", "1px solid white"); + + const hr = head.append("tr") + hr.append("th").text(""); + hr.append("th"); + hr.append("th").text("Count"); + hr.append("th").text("Relative"); + Object.keys(wer_by_speakers).forEach(speaker => { hr.append("th").text(speaker); }); + const body = table.append("tbody"); + const words = body.append("tr"); + words.append("td").text("Ref. Words"); + words.append("td"); + words.append("td").text(wer.length); + words.append("td").text("100.0%"); + Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { words.append("td").text(wer.length); }); + const correct = body.append("tr"); + correct.append("td").text("Correct"); + correct.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["correct"]); + correct.append("td").text(wer.length - wer.substitutions - wer.deletions); + correct.append("td").text(((wer.length - wer.substitutions - wer.deletions)/wer.length * 100).toFixed(1) + "%"); + Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { correct.append("td").text(wer.length - wer.substitutions - wer.deletions); }); + const substitution = body.append("tr"); + substitution.append("td").text("Substitution"); + substitution.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["substitution"]); + substitution.append("td").text(wer.substitutions); + substitution.append("td").text((wer.substitutions / wer.length * 100).toFixed(1) + "%"); + Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { substitution.append("td").text(wer.substitutions); }); + const insertion = body.append("tr"); + insertion.append("td").text("Insertion"); + insertion.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["insertion"]); + insertion.append("td").text(wer.insertions); + insertion.append("td").text((wer.insertions / wer.length * 100).toFixed(1) + "%"); + Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { insertion.append("td").text(wer.insertions); }); + const deletion = body.append("tr"); + deletion.append("td").text("Deletion"); + deletion.append("td").append("div").classed("legend-color", true).style("background-color", settings.colors["deletion"]); + deletion.append("td").text(wer.deletions); + deletion.append("td").text((wer.deletions / wer.length * 100).toFixed(1) + "%"); + Object.entries(wer_by_speakers).forEach(([speaker, wer]) => { deletion.append("td").text(wer.deletions); }); + c.append("div").classed("tooltip-info", true).text("Note: Values don't add up to 100% (except when Insertion=0)\nRef. Words = Correct + Substitution + Deletion\nHyp. Words = Correct + Substitution + Insertion"); }); label("Alignment:", info.alignment_type, null, c => c.append('div').classed('wrap-60', true).html("The alignment algorithm used to generate this visualization. Available are:" + @@ -1163,7 +1169,7 @@ class CanvasPlot { "Reference self-overlap:", (info.wer.reference_self_overlap.overlap_rate * 100).toFixed(2) + "%", icons["warning"], - c => c.append('div').classed('wrap-40').text("Self-overlap is the percentage of time that a speaker annotation overlaps with itself. 
" + + c => c.append('div').classed('wrap-40', true).text("Self-overlap is the percentage of time that a speaker annotation overlaps with itself. " + "On the reference, this is usually an indication for annotation errors.\n" + "Extreme self-overlap can lead to unexpected WERs!") ).classed("warn", true); @@ -1171,7 +1177,8 @@ class CanvasPlot { "Hypothesis self-overlap:", (info.wer.hypothesis_self_overlap.overlap_rate * 100).toFixed(2) + "%", icons["warning"], - c => c.append('div').classed('wrap-40').text("Self-overlap is the percentage of time that a speaker annotation overlaps with itself. " + + c => c.append('div').classed('wrap-40', true).text("Self-overlap is the percentage of time that a speaker annotation overlaps with itself. " + + "On the hypothesis, this often indicates systematic errors.\n" + "Extreme self-overlap can lead to unexpected WERs!") ).classed("warn", true); } @@ -1543,52 +1550,68 @@ class CanvasPlot { right_center_time: right.center_time, start_time: Math.min(left.center_time, right.center_time), end_time: Math.max(left.center_time, right.center_time), + left_utterance: utterances[left.utterance_index], + right_utterance: utterances[right.utterance_index], } }) }); this.filtered_matches = this.matches; + // Precompute utterance x positions and widths + this.precompute_utterance_positions = () => { + console.log("Precomputing utterance positions") + const match_width = settings.match_width * this.plot.x.bandwidth() / 2; + const columnwidth = this.plot.x.bandwidth() / 2 - match_width; + + this.utterances.forEach(u => { + let x = this.plot.x(u.speaker); + let width = columnwidth; + + if (u.source === "hypothesis") { + x += this.plot.x.bandwidth() / 2 + match_width; + } + + if (u.utterance_overlaps) { + width = columnwidth * u.overlap_width; + x = x + width * u.overlap_shift; + width = width - 2*constants.utteranceMarkerOverhang; + } + u.x = x; + u.width = width; + }) + + }; + this.plot.onSizeChanged(this.precompute_utterance_positions); + this.utteranceSelectListeners = []; // Plot label this.plot.element.append("div").classed("plot-label", true).style("margin-left", this.plot.y_axis_padding_html + "px").text("Detailed matching"); + // Click handler for selecting utterances const self = this; - this.last_utterance_candidates_index = -1 this.plot.element.on("click", (event) => { const screenX = event.layerX * self.plot.dpr; // convert from html px to canvas px const screenY = event.layerY * self.plot.dpr; // convert from html px to canvas px const y = self.plot.y.invert(screenY); - // invert x band scale - const match_width = settings.match_width * self.plot.x.bandwidth() / 2; - const eachBand = self.plot.x.step(); - const index = Math.floor((screenX - self.plot.y_axis_padding) / eachBand); - const speaker = self.plot.x.domain()[index]; - - const within_speaker_coord = screenX - self.plot.x(speaker); - const source = ( - within_speaker_coord < self.plot.x.bandwidth() / 2 - match_width - ? "reference" - : ( - within_speaker_coord > self.plot.x.bandwidth() / 2 + match_width - ? 
"hypothesis" - : null - ) - ); - - if (source) { - const utterance_candidates = this.filtered_utterances.filter( - u => u.start_time < y && u.end_time > y && u.speaker === speaker && u.source === source - ) - if (utterance_candidates.length > 0) { - self.last_utterance_candidates_index = (self.last_utterance_candidates_index+1) % utterance_candidates.length - selectSegment(utterance_candidates[self.last_utterance_candidates_index]); - } - else selectSegment(null); - } else selectSegment(null); + // Brute force go through all utterances and check if the click is inside + // Use the precomputed x and width values + // This should be fast enough since this.filtered_utterances contains only the visible utterances + // and this action is not performed frequently + const utterance_candidates = this.filtered_utterances.filter( + u => u.start_time < y && u.end_time > y && u.x <= screenX && u.x + u.width >= screenX + ) + if (utterance_candidates.length > 0) { + selectSegment(utterance_candidates[0]); + // With the current layout, utterances should never overlap. + // Log a warning if this happens + if (utterance_candidates.length > 1) console.warn("Multiple utterances selected. This should not happen.") + } + else selectSegment(null); }) + // Scrolling with a mouse wheel this.wheel_tracker = {} let deltaY = 0; let hitCount = 0; @@ -1696,6 +1719,9 @@ class CanvasPlot { drawDetails() { const filtered_words = this.filtered_words; + + // Draw help message and exit if the amount of displayed words is too high + // This would lead to a very slow rendering and any information would be lost due to the scale if (filtered_words.length > 3000) { this.plot.context.font = `${30 * plot.dpr}px Arial`; this.plot.context.textAlign = "center"; @@ -1704,48 +1730,49 @@ class CanvasPlot { this.plot.context.fillText("Zoom in or select a smaller region in the minimap above", this.plot.width / 2, this.plot.height / 2); return; } + + // Precompute constants required later const filtered_utterances = this.filtered_utterances; const context = this.plot.context; - const draw_text = filtered_words.length < 400; const draw_boxes = filtered_words.length < 1000; const draw_utterance_markers = filtered_words.length < 2000; const match_width = settings.match_width * this.plot.x.bandwidth() / 2; - const stitch_offset = Math.min(10, match_width / 2); + const stitch_offset = Math.min(constants.minStitchOffset, match_width / 2); const rectwidth = this.plot.x.bandwidth() / 2 - match_width; - const bandwidth = this.plot.x.bandwidth() / 2; - // Draw background + // Draw background: Gray bands for each speaker + const min_y = this.plot.y.range()[0]; + const plot_height = this.plot.y.range()[1] - this.plot.y.range()[0]; + const width = this.plot.x.bandwidth(); for (let i = 0; i < this.plot.x.domain().length; i++) { const speaker = this.plot.x.domain()[i]; - const y = this.plot.y.range()[0]; const x = this.plot.x(speaker); - const width = this.plot.x.bandwidth(); - const height = this.plot.y.range()[1] - this.plot.y.range()[0]; context.fillStyle = "#eee"; - context.fillRect(x, y, width, height); + context.fillRect(x, min_y, width, plot_height); } - // Draw utterance begin/end markers - if (draw_utterance_markers) { - context.strokeStyle = "black"; - context.lineWidth = .1; + // Draw red lines in the background where the the selected segment + // starts and ends + if (draw_utterance_markers && this.state.selectedSegment) { + const [minX, maxX] = this.plot.x.range(); + context.lineWidth = .5; + context.strokeStyle = 'red'; - if 
(this.state.selectedSegment) { - const [minX, maxX] = this.plot.x.range(); - context.lineWidth = .5; - context.strokeStyle = 'red'; - var y = this.plot.y(this.state.selectedSegment.start_time) - 1; - context.beginPath(); - context.moveTo(minX, y); - context.lineTo(maxX, y); - y = this.plot.y(this.state.selectedSegment.end_time) + 1; - context.moveTo(minX, y); - context.lineTo(maxX, y); - context.stroke(); - } + // Start point + var y = this.plot.y(this.state.selectedSegment.start_time) - 1; + context.beginPath(); + context.moveTo(minX, y); + context.lineTo(maxX, y); + + // End point + y = this.plot.y(this.state.selectedSegment.end_time) + 1; + context.moveTo(minX, y); + context.lineTo(maxX, y); + context.stroke(); } + // Draw markers. This feature is not yet fully supported const filtered_markers = this.filtered_markers; // Draw a range marker on the left side of the plot with two lines spanning the full width if (filtered_markers) filtered_markers.forEach(m => { @@ -1779,42 +1806,45 @@ class CanvasPlot { // Draw word boxes filtered_words.forEach(d => { - const bandleft = this.plot.x(d.speaker); - let rectleft = bandleft; - if (d.source === "hypothesis") rectleft += bandwidth + match_width; + // Compute the actual horizontal position and width of the box + // considering overlaps with other utterances + const utterance = this.utterances[d['utterance_index']]; + // Fill the box with the color of the match if (d.matches?.length > 0 || d.highlight) { context.beginPath(); context.rect( - rectleft, + utterance.x, this.plot.y(d.start_time), - rectwidth, + utterance.width, this.plot.y(d.end_time) - this.plot.y(d.start_time)); if (d.highlight) context.fillStyle = settings.colors.highlight; else context.fillStyle = settings.colors[d.matches[0][1]]; } - context.fill(); + + // Draw box border context.strokeStyle = "gray"; context.lineWidth = 2; if (draw_boxes) context.stroke(); - // Stitches for insertion / deletion + // Draw (stub) stitches for insertion / deletion + // These do not connect to other words, but are drawn as a straight line + // ending in the space between reference and hypothesis if (d.matches?.length > 0) { - // TODO: support multiple matches const [match_index, match_type] = d.matches[0]; context.beginPath(); context.lineWidth = 2; context.strokeStyle = settings.colors[match_type]; if (match_type === 'insertion') { const y = this.plot.y(d.center_time); - context.moveTo(rectleft, y); - context.lineTo(rectleft - stitch_offset, y); + context.moveTo(utterance.x, y); + context.lineTo(utterance.x - stitch_offset, y); } else if (match_type === 'deletion') { const y = this.plot.y(d.center_time); - context.moveTo(rectleft + rectwidth, y); - context.lineTo(rectleft + rectwidth + stitch_offset, y); + context.moveTo(utterance.x + utterance.width, y); + context.lineTo(utterance.x + utterance.width + stitch_offset, y); } context.stroke(); } @@ -1827,59 +1857,56 @@ class CanvasPlot { context.beginPath(); const bandleft = this.plot.x(m.speaker); context.strokeStyle = settings.colors[m.match_type]; - context.moveTo(bandleft + rectwidth, this.plot.y(m.left_center_time)); + context.moveTo(m.left_utterance.x + m.left_utterance.width, this.plot.y(m.left_center_time)); context.lineTo(bandleft + rectwidth + stitch_offset, this.plot.y(m.left_center_time)); context.lineTo(bandleft + rectwidth + 2 * match_width - stitch_offset, this.plot.y(m.right_center_time)); - context.lineTo(bandleft + rectwidth + 2 * match_width, this.plot.y(m.right_center_time)); + context.lineTo(m.right_utterance.x, 
this.plot.y(m.right_center_time)); context.stroke(); }); // Draw word text + // This is done after the stitches so that the text is on top even if stitches or boxes overlap context.font = `${settings.font_size * this.plot.dpr}px Arial`; context.textAlign = "center"; context.textBaseline = "middle"; context.lineWidth = 1; if (draw_text) filtered_words.forEach(d => { - const bandleft = this.plot.x(d.speaker); - let rectleft = bandleft; - if (d.source === "hypothesis") rectleft += bandwidth + match_width; - - rectleft += rectwidth / 2; + const utterance = this.utterances[d['utterance_index']]; + let x = utterance.x + utterance.width / 2; // Center of the utterance let y_ = this.plot.y((d.start_time + d.end_time) / 2); if (d.matches === undefined) context.fillStyle = "gray"; else context.fillStyle = '#000'; - context.fillText(d.words, rectleft, y_); + context.fillText(d.words, x, y_); }) // Draw utterance begin and end markers - const markerLength = 6; - const markerOverhang = 3; + // This is done after drawing the word boxes so that the markers are visible + // even when the word boxes are crowded + const markerDepth = constants.utteranceMarkerDepth; + const markerOverhang = constants.utteranceMarkerOverhang; if (draw_utterance_markers) filtered_utterances.forEach(d => { context.strokeStyle = "black"; context.lineWidth = 1.5; context.beginPath(); // x is the left side of the marker - var x = this.plot.x(d.speaker); - const bandwidth = this.plot.x.bandwidth() / 2 - match_width; - if (d.source == "hypothesis") { - x += bandwidth + 2*match_width; - } + const x = d.x; + const width = d.width; // Begin marker var y = this.plot.y(d.start_time) - 1; - context.moveTo(x - markerOverhang, y + markerLength); + context.moveTo(x - markerOverhang, y + markerDepth); context.lineTo(x - markerOverhang, y); - context.lineTo(x + bandwidth + markerOverhang, y); - context.lineTo(x + bandwidth + markerOverhang, y + markerLength); + context.lineTo(x + width + markerOverhang, y); + context.lineTo(x + width + markerOverhang, y + markerDepth); // End marker y = this.plot.y(d.end_time) + 1; - context.moveTo(x - markerOverhang, y - markerLength); + context.moveTo(x - markerOverhang, y - markerDepth); context.lineTo(x - markerOverhang, y); - context.lineTo(x + bandwidth + markerOverhang, y); - context.lineTo(x + bandwidth + markerOverhang, y - markerLength); + context.lineTo(x + width + markerOverhang, y); + context.lineTo(x + width + markerOverhang, y - markerDepth); context.stroke(); // Draw marker that text is empty @@ -1889,7 +1916,7 @@ class CanvasPlot { context.textBaseline = "middle"; context.strokeStyle = "lightgray"; context.linewidth = 1; - const x_ = x + bandwidth / 2; + const x_ = x + d.width / 2; context.font = `italic ${settings.font_size * this.plot.dpr}px Arial`; context.fillStyle = "gray"; context.fillText('(empty segment)', x_, (this.plot.y(d.start_time) + this.plot.y(d.end_time)) / 2); @@ -1899,11 +1926,10 @@ class CanvasPlot { // Draw boundary around the selected utterance if (this.state.selectedSegment) { const d = this.state.selectedSegment; - const x = this.plot.x(d.speaker) + (d.source === "hypothesis" ? 
bandwidth + match_width : 0); context.beginPath(); context.strokeStyle = "red"; context.lineWidth = 3; - context.rect(x, this.plot.y(d.start_time), rectwidth, this.plot.y(d.end_time) - this.plot.y(d.start_time)); + context.rect(d.x, this.plot.y(d.start_time), d.width, this.plot.y(d.end_time) - this.plot.y(d.start_time)); context.stroke(); // Write begin time above begin marker @@ -1911,11 +1937,11 @@ class CanvasPlot { context.fillStyle = "gray"; context.textAlign = "center"; context.textBaseline = "bottom"; - context.fillText(`begin time: ${d.start_time.toFixed(2)}`, x + rectwidth / 2, this.plot.y(d.start_time) - 3); + context.fillText(`begin time: ${d.start_time.toFixed(2)}`, d.x + d.width / 2, this.plot.y(d.start_time) - 3); // Write end time below end marker context.textBaseline = "top"; - context.fillText(`end time: ${d.end_time.toFixed(2)}`, x + rectwidth / 2, this.plot.y(d.end_time) + 3); + context.fillText(`end time: ${d.end_time.toFixed(2)}`, d.x + d.width / 2, this.plot.y(d.end_time) + 3); } } diff --git a/meeteval/viz/visualize.py b/meeteval/viz/visualize.py index d165f64b..d077df64 100644 --- a/meeteval/viz/visualize.py +++ b/meeteval/viz/visualize.py @@ -1,7 +1,6 @@ import logging import os -import json - +from meeteval.wer.wer.utils import check_single_filename import urllib.request import meeteval @@ -10,7 +9,6 @@ logging.basicConfig(level=logging.ERROR) import dataclasses import functools -import shutil import uuid from pathlib import Path @@ -100,80 +98,13 @@ def nested_round(obj): raise TypeError(path) -def get_wer(t: SegLST, assignment_type, collar=5, hypothesis_key='hypothesis'): - """ - Compute the WER with the given assignment type and collar between the segments with `s['source'] = 'reference'` - and `s['source'] = hypothesis_key`. - """ - ref = t.filter(lambda s: s['source'] == 'reference') - hyp = t.filter(lambda s: s['source'] == hypothesis_key) - if assignment_type == 'cp': - from meeteval.wer.wer.cp import cp_word_error_rate - wer = cp_word_error_rate(ref, hyp) - elif assignment_type in ('tcp', 'ditcp'): - from meeteval.wer.wer.time_constrained import time_constrained_minimum_permutation_word_error_rate - # The visualization looks wrong if we don't sort segments - wer = time_constrained_minimum_permutation_word_error_rate( - ref, hyp, - collar=collar, - reference_sort='segment', - hypothesis_sort='segment', - reference_pseudo_word_level_timing='character_based', - hypothesis_pseudo_word_level_timing='character_based_points', - ) - else: - raise ValueError(assignment_type) - return wer - - -def apply_assignment(assignment, d: SegLST, source_key='hypothesis'): - """ - Apply the assignment to the given SegLST by replacing the "speaker" key of the hypothesis. - """ - # Both ref and hyp key can be missing or None - # This can happen when the filter function excludes a speaker completely - # TODO: Find a good way to name these and adjust apply_cp_assignment accordingly - assignment = dict( - ((b, a if a is not None else f'[{b}]') - for a, b in assignment) - ) - # We only want to change the labels for the hypothesis. 
This way, we can easily - # apply this function to the full set of words - return d.map( - lambda w: - {**w, 'speaker': assignment.get(w['speaker'], f"[{w['speaker']}]")} - if w.get('source', None) == source_key - else w - ) - -# def get_diarization_invariant_alignment(ref: SegLST, hyp: SegLST, collar=5): -# from meet_eval.dicpwer.dicp import greedy_di_tcp_error_rate -# words, _ = get_alignment(ref, hyp, 'tcp', collar=collar) -# -# wer = greedy_di_tcp_error_rate( -# list(ref.groupby('speaker').values()), -# [[[vv] for vv in v] for v in (ref.groupby('speaker')).values()], -# collar=collar -# ) -# -# hyp = wer.apply_assignment(sorted(hyp, key=lambda x: x['start_time'])) -# hyp = [ -# {**l, 'speaker2': k, } -# for k, v in hyp.items() -# for l in v -# ] -# -# _, alignment = get_alignment(ref, hyp, 'tcp', collar=collar) -# return words, alignment - - -def get_alignment(data, alignment_type, collar=5, hypothesis_key='hypothesis'): +def get_alignment(data, alignment_type, collar=5): # Extract hyps and ref from data. They have been merged earlier for easier processing - hyp = data.filter(lambda s: s['source'] == hypothesis_key) + hyp = data.filter(lambda s: s['source'] == 'hypothesis') ref = data.filter(lambda s: s['source'] == 'reference') - if alignment_type == 'cp': + if alignment_type == 'levenshtein': from meeteval.wer.wer.time_constrained import align # Set the collar large enough that all words overlap with all other words min_time = min(map(lambda x: x['start_time'], data)) @@ -188,7 +119,7 @@ def get_alignment(data, alignment_type, collar=5, hypothesis_key='hypothesis'): hypothesis_sort=False, style='seglst', ) - elif alignment_type == 'tcp': + elif alignment_type == 'time_constrained': from meeteval.wer.wer.time_constrained import align align = functools.partial( align, @@ -200,11 +131,8 @@ def get_alignment(data, alignment_type, collar=5, hypothesis_key='hypothesis'): hypothesis_sort=False, style='seglst', ) - elif alignment_type == 'ditcp': - raise NotImplementedError() - # return get_diarization_invariant_alignment(ref, hyp, collar=collar) else: - raise ValueError(alignment_type) + raise NotImplementedError(alignment_type) # Compute alignment and extract words ref = ref.sorted('start_time').groupby('speaker') @@ -237,69 +165,126 @@ def get_alignment(data, alignment_type, collar=5, hypothesis_key='hypothesis'): r.setdefault('matches', []).append((h['word_index'], 'substitution')) -def get_visualization_data(ref: SegLST, *hyp: SegLST, assignment='tcp', alignment_transform=None): +def solve_stream_assignment(ref, hyp, assignment): + """ + Computes the word error rate and applies the assignment to the reference and hypothesis. 
+ """ + if assignment == 'cp': + wer = meeteval.wer.wer.cp.cp_word_error_rate(ref, hyp) + ref, hyp = wer.apply_assignment(ref, hyp) + elif assignment == 'tcp': + wer = meeteval.wer.wer.time_constrained.time_constrained_minimum_permutation_word_error_rate( + ref, hyp, + collar=5, + reference_sort='segment', + hypothesis_sort='segment', + reference_pseudo_word_level_timing='character_based', + hypothesis_pseudo_word_level_timing='character_based_points', + ) + ref, hyp = wer.apply_assignment(ref, hyp) + elif assignment == 'tcorc': + wer = meeteval.wer.wer.time_constrained_orc.time_constrained_orc_wer( + ref, hyp, + collar=5, + reference_sort='segment', + hypothesis_sort='segment', + reference_pseudo_word_level_timing='character_based', + hypothesis_pseudo_word_level_timing='character_based_points', + ) + ref, hyp = wer.apply_assignment(ref, hyp) + elif assignment == 'orc': + wer = meeteval.wer.wer.orc.orc_word_error_rate(ref, hyp) + ref, hyp = wer.apply_assignment(ref, hyp) + else: + raise ValueError(assignment) + return wer, ref, hyp + + +def add_overlap_shift(utterances: SegLST): + """ + Adds the keys "overlap_shift" and "overlap_width" to each utterance. These + values are used to determine the width and horizontal position of each + utterance in the visualization such that they do not overlap visually, even if they + overlap temporally. + """ + for utterance in utterances: + # Find any other overlapping utterances + # TODO: Make this search more efficient + overlaps = [] + for other_utterance in utterances[:utterance['utterance_index']][::-1]: + if other_utterance['end_time'] > utterance['start_time']: + if other_utterance['source'] == utterance['source'] and other_utterance['speaker'] == utterance['speaker']: + overlaps.append(other_utterance['utterance_index']) + other_utterance['utterance_overlaps'].append(utterance['utterance_index']) + + # Compute shifts from the overlaps such that the utterances don't overlap + # This is a greedy approach that works well for most cases + utterance['utterance_overlaps'] = overlaps + if overlaps: + shifts = [ + utterances[o]['overlap_shift'] + for o in overlaps + ] + for shift in range(len(shifts) + 1): + if shift not in shifts: + break + utterance['overlap_shift'] = shift + else: + utterance['overlap_shift'] = 0 + + # Compute the width for each (sub)column and assign it to the utterance + # This should result in the largest possible width for each utterance + # such that no two utterances overlap + for utterance in utterances: + utterance['num_columns'] = max([utterances[o]['overlap_shift'] for o in utterance['utterance_overlaps']] + [utterance['overlap_shift']]) + 1 + + for utterance in utterances.sorted(lambda x: -x['num_columns']): + num_columns = max([utterances[o]['num_columns'] for o in utterance['utterance_overlaps']] + [utterance['num_columns']]) + + width = 1 / num_columns + utterance['overlap_width'] = width + + + +def get_visualization_data(ref: SegLST, hyp: SegLST, assignment='tcp', alignment_transform=None): + """ + Generates the data structure as required by the visualization frontend. + + Solves the stream assignment problem and computes the alignment between the reference and hypothesis. + Then, computes additional useful information for display in the visualization. 
+ """ ref = asseglst(ref) - hyp = [asseglst(h) for h in hyp] + hyp = asseglst(hyp) + check_single_filename(ref, hyp) data = { 'info': { 'filename': ref[0]['session_id'], 'alignment_type': assignment, - 'length': max([e['end_time'] for e in hyp[0] + ref]) - min([e['start_time'] for e in hyp[0] + ref]), + 'length': max([e['end_time'] for e in hyp + ref]) - min([e['start_time'] for e in hyp + ref]), } } - # Solve assignment when assignment is tcorc or orc - if assignment == 'tcorc': - assert len(hyp) == 1, len(hyp) - from meeteval.wer.wer.time_constrained_orc import time_constrained_orc_wer - # The visualization looks wrong if we don't sort segments - wer = time_constrained_orc_wer( - ref, *hyp, - collar=5, - reference_sort='segment', - hypothesis_sort='segment', - reference_pseudo_word_level_timing='character_based', - hypothesis_pseudo_word_level_timing='character_based_points', - ) - ref, hyp = wer.apply_assignment(ref, *hyp) - hyp = (hyp,) - assignment = 'tcp' - elif assignment == 'orc': - assert len(hyp) == 1, len(hyp) - from meeteval.wer.wer.orc import orc_word_error_rate - wer = orc_word_error_rate(ref, *hyp) - ref, hyp = wer.apply_assignment(ref, *hyp) - hyp = (hyp,) - assignment = 'cp' - - assert len(hyp) > 0, hyp + # Get and apply stream assignment + wer, ref, hyp = solve_stream_assignment(ref, hyp, assignment) + align_type = 'time_constrained' if assignment in ['tcp', 'tcorc'] else 'levenshtein' + if alignment_transform is None: alignment_transform = lambda x: x - ref_session_ids = set(ref.T['session_id']) - for h in hyp: - hyp_session_ids = set(h.T['session_id']) - assert 1 == len(ref_session_ids) and ref_session_ids == hyp_session_ids, f'Expect a single session ID/filename and the same for reference an hypothesis, got {ref_session_ids} and {hyp_session_ids}.' - # Add information about ref/hyp to each utterance ref = ref.map(lambda s: {**s, 'source': 'reference'}) - # TODO: how to encode hypothesis correctly? I want to be able to name them from outside. - # Use a new key, "system_name"? - if len(hyp) > 1: - hypothesis_keys = [f'hypothesis-{i}' for i in range(len(hyp))] - else: - hypothesis_keys = ['hypothesis'] - hyp = SegLST.merge(*[ - h.map(lambda s: {**s, 'source': hypothesis_keys[i]}) - for i, h in enumerate(hyp) - ]) + hyp = hyp.map(lambda s: {**s, 'source': 'hypothesis'}) u = ref + hyp # Sort by begin time. Otherwise, the alignment will be unintuitive and likely not what the user wanted u = u.sorted('start_time') + # Add utterance index + for i, utterance in enumerate(u): + utterance['utterance_index'] = i + # Convert to words so that the transformation can be applied w = get_pseudo_word_level_timings(u, 'character_based') w = w.map(lambda w: {**w, 'words': call_with_args(alignment_transform, w), 'original_words': w['words']}) @@ -308,20 +293,11 @@ def get_visualization_data(ref: SegLST, *hyp: SegLST, assignment='tcp', alignmen ignored_words = w.filter(lambda s: not s['words']) # .map(lambda s: {**s, 'match_type': 'ignored'}) w = w.filter(lambda s: s['words']) - # Get assignment using the word-level timestamps and filtered data - wers = {} - for k in hypothesis_keys: - wer = wers[k] = get_wer(w, assignment, collar=5, hypothesis_key=k) - u = apply_assignment(wer.assignment, u, source_key=k) - w = apply_assignment(wer.assignment, w, source_key=k) - ignored_words = apply_assignment(wer.assignment, ignored_words, source_key=k) - # Get the alignment using the filtered data. 
Add ignored words for visualization # Add running word index used by the alignment to refer to different words for i, word in enumerate(w): word['word_index'] = i - for k in hypothesis_keys: - get_alignment(w, assignment, collar=5, hypothesis_key=k) + get_alignment(w, align_type, collar=5) words = w + ignored_words # Map back to original_words @@ -345,6 +321,7 @@ def get_visualization_data(ref: SegLST, *hyp: SegLST, assignment='tcp', alignmen 'speaker', 'start_time', 'duration', + 'utterance_index', ] } def compress(m): @@ -365,18 +342,20 @@ def compress(m): else: data['words'] = words.segments + add_overlap_shift(u) + # Add utterances to data. Add total number of words to each utterance data['utterances'] = [{**l, 'total': len(l['words'].split())} for l in u] - data['info']['wer'] = {k: dataclasses.asdict(wer) for k, wer in wers.items()} + data['info']['wer'] = dataclasses.asdict(wer) - def wer_by_speaker(hypothesis_key, speaker): + def wer_by_speaker(speaker): # Get all words from this speaker words_ = words.filter(lambda s: s['speaker'] == speaker) # Get all hypothesis words. From this we can find the number of insertions, substitutions and correct matches. # Ignore any words that are not matched (i.e., don't have a "matches" key) - hyp_words = words_.filter(lambda s: s['source'] == k and 'matches' in s) + hyp_words = words_.filter(lambda s: s['source'] == 'hypothesis' and 'matches' in s) insertions = len(hyp_words.filter(lambda s: s['matches'][0][1] == 'insertion')) substitutions = len(hyp_words.filter(lambda s: s['matches'][0][1] == 'substitution')) # correct = len(hyp_words.filter(lambda s: s['matches'][0][1] == 'correct')) @@ -386,7 +365,7 @@ def wer_by_speaker(hypothesis_key, speaker): # The number of deletions is the number of reference words that are not matched with a hypothesis word. ref_words = words_.filter(lambda s: s['source'] == 'reference' and 'matches' in s) deletions = len(ref_words.filter( - lambda s: not [w for w, _ in s['matches'] if w is not None and words[w]['source'] == hypothesis_key])) + lambda s: not [w for w, _ in s['matches'] if w is not None and words[w]['source'] == 'hypothesis'])) return dataclasses.asdict(ErrorRate( errors=insertions + deletions + substitutions, @@ -399,11 +378,8 @@ def wer_by_speaker(hypothesis_key, speaker): )) data['info']['wer_by_speakers'] = { - k: { - speaker: wer_by_speaker(k, speaker) - for speaker in list(ref.unique('speaker')) - } - for k in hypothesis_keys + speaker: wer_by_speaker(speaker) + for speaker in list(ref.unique('speaker')) } return data @@ -625,7 +601,7 @@ def load_cdn(name, url): function exec() {{ // Wait for d3 to load if (typeof d3 !== 'undefined') alignment_visualization( - {dumps_json(self.data, indent=None, sort_keys=False, separators=(',', ':'), float_round=4)}, + {dumps_json(self.data, indent=1 if self.js_debug else None, sort_keys=False, separators=(',', ':'), float_round=4)}, "#{element_id}", {{ colors: {self._get_colormap()},
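A minimal usage sketch of the reworked single-hypothesis entry point from this patch. The STM paths are placeholders and meeteval.io.load is assumed as the loader; get_visualization_data, the assignment options handled by solve_stream_assignment, and the flat info['wer'] layout are taken from the diff above.

    import meeteval
    from meeteval.viz.visualize import get_visualization_data

    # Placeholder inputs: one reference and exactly one hypothesis for the
    # same single session (enforced by check_single_filename).
    ref = meeteval.io.load('ref.stm')
    hyp = meeteval.io.load('hyp.stm')

    # 'assignment' selects the stream assignment algorithm solved in
    # solve_stream_assignment: 'cp', 'tcp', 'orc' or 'tcorc'.
    data = get_visualization_data(ref, hyp, assignment='tcp')

    # info['wer'] is now a flat ErrorRate dict (no 'hypothesis' sub-key),
    # matching the simplified lookups in visualize.js and __main__.py.
    print(data['info']['wer']['error_rate'])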