From 69ea74cb8325a5e98ace58157f42df044941fbda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <51365402+balajtimate@users.noreply.github.com> Date: Tue, 23 Jan 2024 14:38:54 +0100 Subject: [PATCH] feat: improve lib type inference for SRA PE samples (#161) * feat: add asumed mates * update get lib type tests * lower lib_type_mate_cutoff --- htsinfer/cli.py | 2 +- htsinfer/get_library_type.py | 33 ++++++++++++++++++++++++++++++--- htsinfer/models.py | 4 +++- tests/test_get_library_type.py | 34 +++++++++++++++++++++++++++++++--- 4 files changed, 65 insertions(+), 8 deletions(-) diff --git a/htsinfer/cli.py b/htsinfer/cli.py index 217d6ce..6850600 100644 --- a/htsinfer/cli.py +++ b/htsinfer/cli.py @@ -232,7 +232,7 @@ def __call__( dest="lib_type_mates_cutoff", metavar="FLOAT", type=float, - default=0.95, + default=0.85, help=( "minimum fraction of mates that can be mapped to compatible loci " "and are considered concordant pairs / all mates" diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 37dad4f..e4aea81 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -127,6 +127,21 @@ def _evaluate_mate_relationship( self.mapping.library_type.relationship = ( StatesTypeRelationship.split_mates ) + # Infer mate relationship, even when assumed to be single + elif ( + self.results.file_1 == StatesType.single and + self.results.file_2 == StatesType.single + ) and ( + self.library_source.file_1.short_name is not None or + self.library_source.file_2.short_name is not None + ): + LOGGER.debug("Determining mate relationship by alignment...") + self.mapping.library_type.relationship \ + = StatesTypeRelationship.not_available + self.mapping.library_source = self.library_source + self.mapping.paths = self.path_1, self.path_2 + self.mapping.evaluate() + self._align_mates() elif ( self.library_source.file_1.short_name is not None or self.library_source.file_2.short_name is not None @@ -209,15 +224,15 @@ def _align_mates(self): LOGGER.debug(f"Number of aligned reads file 2: {aligned_mate2}") LOGGER.debug(f"Number of concordant reads: {concordant}") - self._update_relationship( + self._update_relationship_type( concordant, min(aligned_mate1, aligned_mate2) ) samfile1.close() samfile2.close() - def _update_relationship(self, concordant, aligned_reads): - """Helper function to update relationship based on alignment.""" + def _update_relationship_type(self, concordant, aligned_reads): + """Helper function to update relationship and type.""" try: ratio = concordant / aligned_reads except ZeroDivisionError: @@ -238,6 +253,18 @@ def _update_relationship(self, concordant, aligned_reads): self.results.relationship = ( StatesTypeRelationship.not_mates ) + if self.results.relationship == ( + StatesTypeRelationship.split_mates + ) and ( + self.results.file_1 == StatesType.single and + self.results.file_2 == StatesType.single + ) or ( + self.results.file_1 == StatesType.not_available and + self.results.file_2 == StatesType.not_available + ): + # Update first and second relationship + self.results.file_1 = StatesType.first_mate_assumed + self.results.file_2 = StatesType.second_mate_assumed class AlignedSegment: """Placeholder class for mypy "Missing attribute" diff --git a/htsinfer/models.py b/htsinfer/models.py index 6b8c2f9..e8e2778 100644 --- a/htsinfer/models.py +++ b/htsinfer/models.py @@ -155,9 +155,11 @@ class StatesType(Enum): that the library represents a single-end library. """ first_mate = "first_mate" + first_mate_assumed = "first_mate_assumed" mixed_mates = "mixed_mates" not_available = None second_mate = "second_mate" + second_mate_assumed = "second_mate_assumed" single = "single" @@ -438,7 +440,7 @@ class Args(BaseModel): lib_source_min_match_pct: float = 2 lib_source_min_freq_ratio: float = 2 lib_type_max_distance: int = 1000 - lib_type_mates_cutoff: float = 0.95 + lib_type_mates_cutoff: float = 0.85 read_orientation_min_mapped_reads: int = 20 read_orientation_min_fraction: float = 0.75 path_1_processed: Path = Path() diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py index e1a0dce..6c1bc65 100644 --- a/tests/test_get_library_type.py +++ b/tests/test_get_library_type.py @@ -113,6 +113,34 @@ def test_evaluate_mate_relationship_split_mates(self): StatesTypeRelationship.split_mates ) + def test_evaluate_mate_relationship_assumed_single(self, tmpdir): + """Test mate relationship evaluation logic with input files being + mates of a paired-end library but assumed single based on seq_ids. + """ + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.results.library_source = ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(short_name="hsapiens", taxon_id=9606), + ) + MAPPING.paths = (FILE_MATE_1, FILE_MATE_2) + MAPPING.transcripts_file = FILE_TRANSCRIPTS + MAPPING.tmp_dir = tmpdir + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) + test_instance.results.file_1 = StatesType.single + test_instance.results.file_2 = StatesType.single + test_instance._evaluate_mate_relationship( + ids_1=["A", "B", "C"], + ids_2=["A", "B", "C"], + ) + assert ( + test_instance.results.relationship == + StatesTypeRelationship.not_available + ) + def test_evaluate_mate_relationship_not_mates(self, tmpdir): """Test mate relationship evaluation logic with input files that are mates, but the relationship is not enough to trigger split_mates. @@ -167,7 +195,7 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir): StatesTypeRelationship.not_available ) - def test_update_relationship_not_mates(self, tmpdir): + def test_update_relationship_type_not_mates(self, tmpdir): """Test update_relationship logic.""" CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 CONFIG.args.path_2_processed = FILE_MATE_2 @@ -185,8 +213,8 @@ def test_update_relationship_not_mates(self, tmpdir): concordant = 0 read_counter = 20 - # Call the _update_relationship method - test_instance._update_relationship(concordant, read_counter) + # Call the _update_relationship_type method + test_instance._update_relationship_type(concordant, read_counter) assert ( test_instance.results.relationship ==