From 70f425214c77be96c3cc01a38ced88cf2e0193fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Mon, 8 Jan 2024 16:16:25 +0100 Subject: [PATCH] refactor: update get lib type and tests --- htsinfer/get_library_type.py | 35 +++++++++-------- htsinfer/get_read_orientation.py | 7 ++-- tests/test_get_library_type.py | 65 +++++++++++++++++++++++++++++++- 3 files changed, 85 insertions(+), 22 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 19665c9..19d870f 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -127,8 +127,10 @@ def _evaluate_mate_relationship( self.mapping.library_type.relationship = ( StatesTypeRelationship.split_mates ) - elif (self.library_source.file_1.short_name is not None - and self.library_source.file_2.short_name is not None): + elif ( + self.library_source.file_1.short_name is not None and + self.library_source.file_2.short_name is not None + ): self.mapping.library_type.relationship \ = StatesTypeRelationship.not_available self.mapping.library_source = self.library_source @@ -136,6 +138,7 @@ def _evaluate_mate_relationship( self.mapping.evaluate() self._align_mates() else: + self.results.relationship = StatesTypeRelationship.not_available LOGGER.debug( "Library source is not determined, " "mate relationship cannot be inferred by alignment." @@ -195,8 +198,8 @@ def _align_mates(self): mate1[read_counter], reads2 ): concordant += 1 - LOGGER.debug(f"Number of mapped reads file 1: {len(mate1)}") - LOGGER.debug(f"Number of mapped reads file 2: {read_counter}") + LOGGER.debug(f"Number of aligned reads file 1: {len(mate1)}") + LOGGER.debug(f"Number of aligned reads file 2: {read_counter}") LOGGER.debug(f"Number of concordant reads: {concordant}") self._update_relationship(concordant, read_counter) @@ -331,25 +334,26 @@ def evaluate(self) -> None: self.result = StatesType.not_available raise FileProblem(f"File is empty: {self.path}") from exc - if self.seq_id_format is None: - self.result = StatesType.not_available + if self.seq_id_format is not None: LOGGER.debug( - "Could not determine sequence identifier format." + "Sequence identifier format: " + f"{self.seq_id_format.name}" ) else: + self.result = StatesType.not_available LOGGER.debug( - "Sequence identifier format: " - f"{self.seq_id_format.name}" + "Could not determine sequence identifier format." ) # Ensure that remaining records are compatible with sequence # identifier format and library type determined from first # record - LOGGER.debug( - "Checking consistency of remaining reads with initially " - "determined identifier format and library type..." - ) if self.seq_id_format is not None: + LOGGER.debug( + "Checking consistency of remaining reads with " + "initially determined identifier format " + "and library type..." + ) for record in seq_iter: records += 1 try: @@ -366,11 +370,6 @@ def evaluate(self) -> None: f"{type(exc).__name__}: {str(exc)}" ) from exc LOGGER.debug(f"Total records processed: {records}") - else: - LOGGER.debug( - "Could not determine sequence identifier format. " - "Skipping consistency check for the remaining reads." - ) except (OSError, ValueError) as exc: self.result = StatesType.not_available diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index 93b7c01..f66794b 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -75,9 +75,10 @@ def evaluate(self) -> ResultsOrientation: self.mapping.transcripts_file = self.transcripts_file self.mapping.tmp_dir = self.tmp_dir - if not self.mapping.mapped \ - and (self.library_source.file_1.short_name is not None - or self.library_source.file_2.short_name is not None): + if not self.mapping.mapped and ( + self.library_source.file_1.short_name is not None or + self.library_source.file_2.short_name is not None + ): self.mapping.evaluate() else: LOGGER.debug( diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py index 2ae8c96..e1a0dce 100644 --- a/tests/test_get_library_type.py +++ b/tests/test_get_library_type.py @@ -29,6 +29,7 @@ FILE_INCONSISTENT_IDS_SINGLE_OLD_NEW, FILE_MATE_1, FILE_MATE_2, + FILE_UNKNOWN_SEQ_ID, FILE_IDS_NOT_MATCH_1, FILE_IDS_NOT_MATCH_2, FILE_TRANSCRIPTS, @@ -148,11 +149,11 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir): CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir CONFIG.results.library_source = ResultsSource( file_1=Source(short_name="hsapiens", taxon_id=9606), file_2=Source(short_name="hsapiens", taxon_id=9606), ) - CONFIG.args.tmp_dir = tmpdir MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2) MAPPING.transcripts_file = FILE_TRANSCRIPTS MAPPING.tmp_dir = tmpdir @@ -166,6 +167,62 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir): StatesTypeRelationship.not_available ) + def test_update_relationship_not_mates(self, tmpdir): + """Test update_relationship logic.""" + CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2) + MAPPING.transcripts_file = FILE_TRANSCRIPTS + MAPPING.tmp_dir = tmpdir + + test_instance = GetLibType(config=CONFIG, mapping=MAPPING) + test_instance.results.file_1 = StatesType.not_available + test_instance.results.file_2 = StatesType.not_available + + # Simulate a scenario where ratio is below the cutoff + concordant = 0 + read_counter = 20 + + # Call the _update_relationship method + test_instance._update_relationship(concordant, read_counter) + + assert ( + test_instance.results.relationship == + StatesTypeRelationship.not_mates + ) + assert ( + test_instance.mapping.library_type.relationship == + StatesTypeRelationship.not_available + ) + + def test_evaluate_mate_relationship_not_determined(self, tmpdir): + """Test mate relationship evaluation logic when + library source is not determined. + """ + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.results.library_source = ResultsSource( + file_1=Source(), + file_2=Source(), + ) + test_instance = GetLibType(config=CONFIG, mapping=MAPPING) + test_instance.results.file_1 = StatesType.not_available + test_instance.results.file_2 = StatesType.not_available + + # Call the _evaluate_mate_relationship method + test_instance._evaluate_mate_relationship( + ids_1=["A", "B", "C"], ids_2=["D", "E", "F"] + ) + + assert ( + test_instance.results.relationship == + StatesTypeRelationship.not_available + ) + def test_evaluate_split_mates_not_matching_ids(self, tmpdir): """Test mate relationship evaluation logic with input files that are not mates from a paired-end library. @@ -304,6 +361,12 @@ def test_get_read_type_no_match(self): regex=SeqIdFormats['Casava >=1.8'].value, ) + def test_evaluate_unknown_identifier_format(self): + """Test scenario where seq_id format cannot be determined.""" + test_instance = GetFastqType(path=FILE_UNKNOWN_SEQ_ID) + test_instance.evaluate() + assert test_instance.result == StatesType.not_available + def test_get_read_type_single_pass(self): """Read identifier is consistent with previous state.""" test_instance = GetFastqType(path=FILE_DUMMY)