pyannote · hbredin · Oct 23, 2024 · Oct 21, 2024 · Oct 21, 2024 · Oct 23, 2024
diff --git a/pyannote/audio/pipelines/speech_separation.py b/pyannote/audio/pipelines/speech_separation.py
@@ -124,7 +124,7 @@ class SpeechSeparation(SpeakerDiarizationMixin, Pipeline):
 
     def __init__(
         self,
-        segmentation: PipelineModel = None,
+        segmentation: PipelineModel = "pyannote/separation-ami-1.0",
         segmentation_step: float = 0.1,
         embedding: PipelineModel = "speechbrain/spkrec-ecapa-voxceleb@5c0be3875fda05e81f3c004ed8c7c06be308de1e",
         embedding_exclude_overlap: bool = False,
@@ -698,6 +698,15 @@ def apply(
         # strings and integers when reference is available and some hypothesis
         # speakers are not present in the reference)
 
+        # re-order sources so that they match
+        # the order given by diarization.labels()
+        inverse_mapping = {label: index for index, label in mapping.items()}
+        original_sliding_window = sources.sliding_window
+        data = sources.data[
+            :, [inverse_mapping[label] for label in diarization.labels()]
+        ]
+        sources = SlidingWindowFeature(data, original_sliding_window)
+
         if not return_embeddings:
             return diarization, sources
 
@@ -717,7 +726,6 @@ def apply(
 
         # re-order centroids so that they match
         # the order given by diarization.labels()
-        inverse_mapping = {label: index for index, label in mapping.items()}
         centroids = centroids[
             [inverse_mapping[label] for label in diarization.labels()]
         ]