fix: infer library type relationship (#157)

* refactor: compare alignments between mapped reads only * fix: update get lib type test * refactor: helper function for getlibtype * feat: only map when lib source is inferred * update tests * fix orientation tests * fix orientation tests * refactor scripts * update get lib type and get read orient * refactor get lib type * refactor: update get lib type and tests * refactor: update lib type and orientation * refactor: update concordant read counting, mapping * update debug messages * update comments in mapping
zavolanlab · Jan 21, 2024 · 7b65c43 · 7b65c43
1 parent 930b741
commit 7b65c43
Show file tree

Hide file tree

Showing 6 changed files with 220 additions and 66 deletions.
diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py
@@ -113,7 +113,7 @@ def _evaluate_mate_relationship(
             ids_2: As `ids_1` but for the putative second mate file.
         """
         self.results.relationship = StatesTypeRelationship.not_mates
-        if ids_1 == ids_2:
+        if ids_1 and ids_2 and ids_1 == ids_2:
             if (
                 self.results.file_1 == StatesType.first_mate and
                 self.results.file_2 == StatesType.second_mate
@@ -127,13 +127,23 @@ def _evaluate_mate_relationship(
                 self.mapping.library_type.relationship = (
                     StatesTypeRelationship.split_mates
                 )
-        else:
+        elif (
+            self.library_source.file_1.short_name is not None or
+            self.library_source.file_2.short_name is not None
+        ):
+            LOGGER.debug("Determining mate relationship by alignment...")
             self.mapping.library_type.relationship \
                 = StatesTypeRelationship.not_available
             self.mapping.library_source = self.library_source
             self.mapping.paths = self.path_1, self.path_2
             self.mapping.evaluate()
             self._align_mates()
+        else:
+            self.results.relationship = StatesTypeRelationship.not_available
+            LOGGER.debug(
+                "Sequence IDs and library source are not determined, "
+                "mate relationship cannot be inferred."
+            )
 
     def _align_mates(self):
         """Decide mate relationship by alignment."""
@@ -144,19 +154,24 @@ def _align_mates(self):
         samfile1 = pysam.AlignmentFile(str(alignment_1), 'r')
         samfile2 = pysam.AlignmentFile(str(alignment_2), 'r')
 
+        seq_id1 = None
+        seq_id2 = None
         previous_seq_id1 = None
         previous_seq_id2 = None
 
         reads1 = []  # List to store alignments for one read from file1
-        mate1 = []  # List to store alignments for each read
+        mate1 = []  # List to store alignments for each read from file1
         reads2 = []  # List to store alignments for one read from file2
+        mate2 = []  # List to store alignments for each read from file2
 
         concordant = 0
 
         for read1 in samfile1:
             seq_id1 = read1.query_name
-            if seq_id1 != previous_seq_id1 \
-                    and previous_seq_id1 is not None:
+            if (
+                seq_id1 != previous_seq_id1 and
+                previous_seq_id1 is not None
+            ):
                 mate1.append(reads1.copy())
                 reads1.clear()
             if read1.reference_end:
@@ -167,35 +182,63 @@ def _align_mates(self):
         read_counter = 0
         for read2 in samfile2:
             seq_id2 = read2.query_name
-            if seq_id2 != previous_seq_id2 \
-                    and previous_seq_id2 is not None:
-                if self._compare_alignments(mate1[read_counter], reads2):
+            if (
+                seq_id2 != previous_seq_id2 and
+                previous_seq_id2 is not None
+            ):
+                mate2.append(reads2.copy())
+                if self._compare_alignments(
+                    mate1[read_counter], reads2
+                ):
                     concordant += 1
                 reads2.clear()
                 read_counter += 1
             if read2.reference_end:
                 reads2.append(read2)
             previous_seq_id2 = seq_id2
-
-        if self._compare_alignments(mate1[read_counter], reads2):
+        mate2.append(reads2.copy())
+        if self._compare_alignments(
+            mate1[read_counter], reads2
+        ):
             concordant += 1
 
-        if (concordant / read_counter) >= self.cutoff:
-            self.results.relationship = (
-                StatesTypeRelationship.split_mates
-            )
-            self.mapping.library_type.relationship \
-                = StatesTypeRelationship.split_mates
-            self.mapping.mapped = False
-            self.mapping.star_dirs = []
-        else:
-            self.results.relationship = (
-                StatesTypeRelationship.not_mates
-            )
+        aligned_mate1 = len(list(filter(None, mate1)))
+        aligned_mate2 = len(list(filter(None, mate2)))
+
+        LOGGER.debug(f"Number of aligned reads file 1: {aligned_mate1}")
+        LOGGER.debug(f"Number of aligned reads file 2: {aligned_mate2}")
+        LOGGER.debug(f"Number of concordant reads: {concordant}")
+
+        self._update_relationship(
+            concordant, min(aligned_mate1, aligned_mate2)
+        )
 
         samfile1.close()
         samfile2.close()
 
+    def _update_relationship(self, concordant, aligned_reads):
+        """Helper function to update relationship based on alignment."""
+        try:
+            ratio = concordant / aligned_reads
+        except ZeroDivisionError:
+            self.results.relationship = (
+                StatesTypeRelationship.not_available
+            )
+        else:
+            if ratio >= self.cutoff:
+                self.results.relationship = (
+                    StatesTypeRelationship.split_mates
+                )
+                self.mapping.library_type.relationship = (
+                    StatesTypeRelationship.split_mates
+                )
+                self.mapping.mapped = False
+                self.mapping.star_dirs = []
+            else:
+                self.results.relationship = (
+                    StatesTypeRelationship.not_mates
+                )
+
     class AlignedSegment:
         """Placeholder class for mypy "Missing attribute"
         error in _compare_alignments(), the actual object used
@@ -302,44 +345,47 @@ def evaluate(self) -> None:
                     self.result = StatesType.not_available
                     raise FileProblem(f"File is empty: {self.path}") from exc
 
-                if self.seq_id_format is None:
+                if self.seq_id_format is not None:
+                    LOGGER.debug(
+                        "Sequence identifier format: "
+                        f"{self.seq_id_format.name}"
+                    )
+                else:
                     self.result = StatesType.not_available
-                    raise MetadataWarning(
+                    LOGGER.debug(
                         "Could not determine sequence identifier format."
                     )
-                LOGGER.debug(
-                    f"Sequence identifier format: {self.seq_id_format.name}"
-                )
 
                 # Ensure that remaining records are compatible with sequence
                 # identifier format and library type determined from first
                 # record
-                LOGGER.debug(
-                    "Checking consistency of remaining reads with initially "
-                    "determined identifier format and library type..."
-                )
-                for record in seq_iter:
-                    records += 1
-                    try:
-                        self._get_read_type(
-                            seq_id=record[0],
-                            regex=self.seq_id_format.value,
-                        )
-                    except (
-                        InconsistentFastqIdentifiers,
-                        UnknownFastqIdentifier,
-                    ) as exc:
-                        self.result = StatesType.not_available
-                        raise MetadataWarning(
-                            f"{type(exc).__name__}: {str(exc)}"
-                        ) from exc
+                if self.seq_id_format is not None:
+                    LOGGER.debug(
+                        "Checking consistency of remaining reads with "
+                        "initially determined identifier format "
+                        "and library type..."
+                    )
+                    for record in seq_iter:
+                        records += 1
+                        try:
+                            self._get_read_type(
+                                seq_id=record[0],
+                                regex=self.seq_id_format.value,
+                            )
+                        except (
+                            InconsistentFastqIdentifiers,
+                            UnknownFastqIdentifier,
+                        ) as exc:
+                            self.result = StatesType.not_available
+                            raise MetadataWarning(
+                                f"{type(exc).__name__}: {str(exc)}"
+                            ) from exc
+                    LOGGER.debug(f"Total records processed: {records}")
 
         except (OSError, ValueError) as exc:
             self.result = StatesType.not_available
             raise FileProblem(f"{type(exc).__name__}: {str(exc)}") from exc
 
-        LOGGER.debug(f"Total records processed: {records}")
-
     def _get_read_type(
         self,
         seq_id: str,

diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py
@@ -75,7 +75,11 @@ def evaluate(self) -> ResultsOrientation:
         self.mapping.transcripts_file = self.transcripts_file
         self.mapping.tmp_dir = self.tmp_dir
 
-        if not self.mapping.mapped:
+        if not self.mapping.mapped and (
+            self.library_source.file_1.short_name is not None or
+            self.library_source.file_2.short_name is not None
+        ):
+            LOGGER.debug("Determining read relationship by alignment...")
             self.mapping.evaluate()
 
         return self.process_alignments(star_dirs=self.mapping.star_dirs)

diff --git a/htsinfer/mapping.py b/htsinfer/mapping.py
@@ -51,11 +51,7 @@ def __init__(
         self.star_dirs: List[Path] = []
 
     def evaluate(self):
-        """Infer read orientation.
-
-        Returns:
-            Orientation results object.
-        """
+        """Align FASTQ files to reference transcripts with STAR."""
 
         # get transcripts for current organims
         transcripts = self.subset_transcripts_by_organism()
@@ -270,6 +266,10 @@ def prepare_star_alignment_commands(
     ) -> Dict[Path, List[str]]:
         """Prepare STAR alignment commands.
 
+        Input FASTQ files are assumed to be sorted according to reference names
+        or coordinates, the order of input reads is kept with the option
+        "PairedKeepInputOrder", no additional sorting of aligned reads is done.
+
         Args:
             index_dir: Path to directory containing STAR index.
 
@@ -299,6 +299,7 @@ def build_star_command(
                 "--runThreadN", f"{str(self.threads_star)}",
                 "--genomeDir", f"{str(index_dir)}",
                 "--outFilterMultimapNmax", "50",
+                "--outSAMorder", "PairedKeepInputOrder",
                 "--outSAMunmapped", "Within", "KeepPairs",
             ]
             cmd: List[str] = cmd_base[:]