From 70f425214c77be96c3cc01a38ced88cf2e0193fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Mon, 8 Jan 2024 16:16:25 +0100
Subject: [PATCH] refactor: update get lib type and tests

---
 htsinfer/get_library_type.py     | 35 +++++++++--------
 htsinfer/get_read_orientation.py |  7 ++--
 tests/test_get_library_type.py   | 65 +++++++++++++++++++++++++++++++-
 3 files changed, 85 insertions(+), 22 deletions(-)

diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py
index 19665c9..19d870f 100644
--- a/htsinfer/get_library_type.py
+++ b/htsinfer/get_library_type.py
@@ -127,8 +127,10 @@ def _evaluate_mate_relationship(
                 self.mapping.library_type.relationship = (
                     StatesTypeRelationship.split_mates
                 )
-        elif (self.library_source.file_1.short_name is not None
-              and self.library_source.file_2.short_name is not None):
+        elif (
+            self.library_source.file_1.short_name is not None and
+            self.library_source.file_2.short_name is not None
+        ):
             self.mapping.library_type.relationship \
                 = StatesTypeRelationship.not_available
             self.mapping.library_source = self.library_source
@@ -136,6 +138,7 @@ def _evaluate_mate_relationship(
             self.mapping.evaluate()
             self._align_mates()
         else:
+            self.results.relationship = StatesTypeRelationship.not_available
             LOGGER.debug(
                 "Library source is not determined, "
                 "mate relationship cannot be inferred by alignment."
@@ -195,8 +198,8 @@ def _align_mates(self):
             mate1[read_counter], reads2
         ):
             concordant += 1
-        LOGGER.debug(f"Number of mapped reads file 1: {len(mate1)}")
-        LOGGER.debug(f"Number of mapped reads file 2: {read_counter}")
+        LOGGER.debug(f"Number of aligned reads file 1: {len(mate1)}")
+        LOGGER.debug(f"Number of aligned reads file 2: {read_counter}")
         LOGGER.debug(f"Number of concordant reads: {concordant}")
         self._update_relationship(concordant, read_counter)
 
@@ -331,25 +334,26 @@ def evaluate(self) -> None:
                     self.result = StatesType.not_available
                     raise FileProblem(f"File is empty: {self.path}") from exc
 
-                if self.seq_id_format is None:
-                    self.result = StatesType.not_available
+                if self.seq_id_format is not None:
                     LOGGER.debug(
-                        "Could not determine sequence identifier format."
+                        "Sequence identifier format: "
+                        f"{self.seq_id_format.name}"
                     )
                 else:
+                    self.result = StatesType.not_available
                     LOGGER.debug(
-                        "Sequence identifier format: "
-                        f"{self.seq_id_format.name}"
+                        "Could not determine sequence identifier format."
                     )
 
                 # Ensure that remaining records are compatible with sequence
                 # identifier format and library type determined from first
                 # record
-                LOGGER.debug(
-                    "Checking consistency of remaining reads with initially "
-                    "determined identifier format and library type..."
-                )
                 if self.seq_id_format is not None:
+                    LOGGER.debug(
+                        "Checking consistency of remaining reads with "
+                        "initially determined identifier format "
+                        "and library type..."
+                    )
                     for record in seq_iter:
                         records += 1
                         try:
@@ -366,11 +370,6 @@ def evaluate(self) -> None:
                                 f"{type(exc).__name__}: {str(exc)}"
                             ) from exc
                     LOGGER.debug(f"Total records processed: {records}")
-                else:
-                    LOGGER.debug(
-                        "Could not determine sequence identifier format. "
-                        "Skipping consistency check for the remaining reads."
-                    )
 
         except (OSError, ValueError) as exc:
             self.result = StatesType.not_available
diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py
index 93b7c01..f66794b 100644
--- a/htsinfer/get_read_orientation.py
+++ b/htsinfer/get_read_orientation.py
@@ -75,9 +75,10 @@ def evaluate(self) -> ResultsOrientation:
         self.mapping.transcripts_file = self.transcripts_file
         self.mapping.tmp_dir = self.tmp_dir
 
-        if not self.mapping.mapped \
-                and (self.library_source.file_1.short_name is not None
-                     or self.library_source.file_2.short_name is not None):
+        if not self.mapping.mapped and (
+            self.library_source.file_1.short_name is not None or
+            self.library_source.file_2.short_name is not None
+        ):
             self.mapping.evaluate()
         else:
             LOGGER.debug(
diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py
index 2ae8c96..e1a0dce 100644
--- a/tests/test_get_library_type.py
+++ b/tests/test_get_library_type.py
@@ -29,6 +29,7 @@
     FILE_INCONSISTENT_IDS_SINGLE_OLD_NEW,
     FILE_MATE_1,
     FILE_MATE_2,
+    FILE_UNKNOWN_SEQ_ID,
     FILE_IDS_NOT_MATCH_1,
     FILE_IDS_NOT_MATCH_2,
     FILE_TRANSCRIPTS,
@@ -148,11 +149,11 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir):
         CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
         CONFIG.results.library_source = ResultsSource(
             file_1=Source(short_name="hsapiens", taxon_id=9606),
             file_2=Source(short_name="hsapiens", taxon_id=9606),
         )
-        CONFIG.args.tmp_dir = tmpdir
         MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2)
         MAPPING.transcripts_file = FILE_TRANSCRIPTS
         MAPPING.tmp_dir = tmpdir
@@ -166,6 +167,62 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir):
             StatesTypeRelationship.not_available
         )
 
+    def test_update_relationship_not_mates(self, tmpdir):
+        """Test `_update_relationship` when no reads are concordant."""
+        CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2)
+        MAPPING.transcripts_file = FILE_TRANSCRIPTS
+        MAPPING.tmp_dir = tmpdir
+
+        test_instance = GetLibType(config=CONFIG, mapping=MAPPING)
+        test_instance.results.file_1 = StatesType.not_available
+        test_instance.results.file_2 = StatesType.not_available
+
+        # Simulate 0 of 20 reads being concordant (ratio below the cutoff)
+        concordant = 0
+        read_counter = 20
+
+        # Call the method directly instead of going through `_align_mates`
+        test_instance._update_relationship(concordant, read_counter)
+
+        assert (
+            test_instance.results.relationship ==
+            StatesTypeRelationship.not_mates
+        )
+        assert (
+            test_instance.mapping.library_type.relationship ==
+            StatesTypeRelationship.not_available
+        )
+
+    def test_evaluate_mate_relationship_not_determined(self, tmpdir):
+        """Test mate relationship evaluation logic when the library source
+        is not determined.
+        """
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        CONFIG.results.library_source = ResultsSource(
+            file_1=Source(),
+            file_2=Source(),
+        )
+        test_instance = GetLibType(config=CONFIG, mapping=MAPPING)
+        test_instance.results.file_1 = StatesType.not_available
+        test_instance.results.file_2 = StatesType.not_available
+
+        # Call with disjoint ID lists (no identifier shared between files)
+        test_instance._evaluate_mate_relationship(
+            ids_1=["A", "B", "C"], ids_2=["D", "E", "F"]
+        )
+
+        assert (
+            test_instance.results.relationship ==
+            StatesTypeRelationship.not_available
+        )
+
     def test_evaluate_split_mates_not_matching_ids(self, tmpdir):
         """Test mate relationship evaluation logic with input files that are
         not mates from a paired-end library.
@@ -304,6 +361,12 @@ def test_get_read_type_no_match(self):
                 regex=SeqIdFormats['Casava >=1.8'].value,
             )
 
+    def test_evaluate_unknown_identifier_format(self):
+        """Test that an unknown seq ID format yields `not_available`."""
+        test_instance = GetFastqType(path=FILE_UNKNOWN_SEQ_ID)
+        test_instance.evaluate()
+        assert test_instance.result == StatesType.not_available
+
     def test_get_read_type_single_pass(self):
         """Read identifier is consistent with previous state."""
         test_instance = GetFastqType(path=FILE_DUMMY)