Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improve lib type inference for SRA PE samples #161

Merged
merged 3 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion htsinfer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def __call__(
dest="lib_type_mates_cutoff",
metavar="FLOAT",
type=float,
default=0.95,
default=0.85,
help=(
"minimum fraction of mates that can be mapped to compatible loci "
"and are considered concordant pairs / all mates"
Expand Down
33 changes: 30 additions & 3 deletions htsinfer/get_library_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,21 @@ def _evaluate_mate_relationship(
self.mapping.library_type.relationship = (
StatesTypeRelationship.split_mates
)
# Infer mate relationship, even when assumed to be single
elif (
self.results.file_1 == StatesType.single and
self.results.file_2 == StatesType.single
) and (
self.library_source.file_1.short_name is not None or
self.library_source.file_2.short_name is not None
):
LOGGER.debug("Determining mate relationship by alignment...")
self.mapping.library_type.relationship \
= StatesTypeRelationship.not_available
self.mapping.library_source = self.library_source
self.mapping.paths = self.path_1, self.path_2
self.mapping.evaluate()
self._align_mates()
elif (
self.library_source.file_1.short_name is not None or
self.library_source.file_2.short_name is not None
Expand Down Expand Up @@ -209,15 +224,15 @@ def _align_mates(self):
LOGGER.debug(f"Number of aligned reads file 2: {aligned_mate2}")
LOGGER.debug(f"Number of concordant reads: {concordant}")

self._update_relationship(
self._update_relationship_type(
concordant, min(aligned_mate1, aligned_mate2)
)

samfile1.close()
samfile2.close()

def _update_relationship(self, concordant, aligned_reads):
"""Helper function to update relationship based on alignment."""
def _update_relationship_type(self, concordant, aligned_reads):
"""Helper function to update relationship and type."""
try:
ratio = concordant / aligned_reads
except ZeroDivisionError:
Expand All @@ -238,6 +253,18 @@ def _update_relationship(self, concordant, aligned_reads):
self.results.relationship = (
StatesTypeRelationship.not_mates
)
if self.results.relationship == (
StatesTypeRelationship.split_mates
) and (
self.results.file_1 == StatesType.single and
self.results.file_2 == StatesType.single
) or (
self.results.file_1 == StatesType.not_available and
self.results.file_2 == StatesType.not_available
):
# Update first and second relationship
self.results.file_1 = StatesType.first_mate_assumed
self.results.file_2 = StatesType.second_mate_assumed

class AlignedSegment:
"""Placeholder class for mypy "Missing attribute"
Expand Down
4 changes: 3 additions & 1 deletion htsinfer/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,9 +155,11 @@ class StatesType(Enum):
that the library represents a single-end library.
"""
first_mate = "first_mate"
first_mate_assumed = "first_mate_assumed"
mixed_mates = "mixed_mates"
not_available = None
second_mate = "second_mate"
second_mate_assumed = "second_mate_assumed"
single = "single"


Expand Down Expand Up @@ -438,7 +440,7 @@ class Args(BaseModel):
lib_source_min_match_pct: float = 2
lib_source_min_freq_ratio: float = 2
lib_type_max_distance: int = 1000
lib_type_mates_cutoff: float = 0.95
lib_type_mates_cutoff: float = 0.85
read_orientation_min_mapped_reads: int = 20
read_orientation_min_fraction: float = 0.75
path_1_processed: Path = Path()
Expand Down
34 changes: 31 additions & 3 deletions tests/test_get_library_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,34 @@ def test_evaluate_mate_relationship_split_mates(self):
StatesTypeRelationship.split_mates
)

def test_evaluate_mate_relationship_assumed_single(self, tmpdir):
"""Test mate relationship evaluation logic with input files being
mates of a paired-end library but assumed single based on seq_ids.
"""
CONFIG.args.path_1_processed = FILE_MATE_1
CONFIG.args.path_2_processed = FILE_MATE_2
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
CONFIG.args.tmp_dir = tmpdir
CONFIG.results.library_source = ResultsSource(
file_1=Source(short_name="hsapiens", taxon_id=9606),
file_2=Source(short_name="hsapiens", taxon_id=9606),
)
MAPPING.paths = (FILE_MATE_1, FILE_MATE_2)
MAPPING.transcripts_file = FILE_TRANSCRIPTS
MAPPING.tmp_dir = tmpdir
test_instance = GetLibType(config=CONFIG,
mapping=MAPPING)
test_instance.results.file_1 = StatesType.single
test_instance.results.file_2 = StatesType.single
test_instance._evaluate_mate_relationship(
ids_1=["A", "B", "C"],
ids_2=["A", "B", "C"],
)
assert (
test_instance.results.relationship ==
StatesTypeRelationship.not_available
)

def test_evaluate_mate_relationship_not_mates(self, tmpdir):
"""Test mate relationship evaluation logic with input files that are
mates, but the relationship is not enough to trigger split_mates.
Expand Down Expand Up @@ -167,7 +195,7 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir):
StatesTypeRelationship.not_available
)

def test_update_relationship_not_mates(self, tmpdir):
def test_update_relationship_type_not_mates(self, tmpdir):
"""Test update_relationship logic."""
CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1
CONFIG.args.path_2_processed = FILE_MATE_2
Expand All @@ -185,8 +213,8 @@ def test_update_relationship_not_mates(self, tmpdir):
concordant = 0
read_counter = 20

# Call the _update_relationship method
test_instance._update_relationship(concordant, read_counter)
# Call the _update_relationship_type method
test_instance._update_relationship_type(concordant, read_counter)

assert (
test_instance.results.relationship ==
Expand Down
Loading