Skip to content

Commit

Permalink
refactor: update get lib type and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
balajtimate committed Jan 8, 2024
1 parent 3fbaceb commit 70f4252
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 22 deletions.
35 changes: 17 additions & 18 deletions htsinfer/get_library_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,15 +127,18 @@ def _evaluate_mate_relationship(
self.mapping.library_type.relationship = (
StatesTypeRelationship.split_mates
)
elif (self.library_source.file_1.short_name is not None
and self.library_source.file_2.short_name is not None):
elif (
self.library_source.file_1.short_name is not None and
self.library_source.file_2.short_name is not None
):
self.mapping.library_type.relationship \
= StatesTypeRelationship.not_available
self.mapping.library_source = self.library_source
self.mapping.paths = self.path_1, self.path_2
self.mapping.evaluate()
self._align_mates()
else:
self.results.relationship = StatesTypeRelationship.not_available
LOGGER.debug(
"Library source is not determined, "
"mate relationship cannot be inferred by alignment."
Expand Down Expand Up @@ -195,8 +198,8 @@ def _align_mates(self):
mate1[read_counter], reads2
):
concordant += 1
LOGGER.debug(f"Number of mapped reads file 1: {len(mate1)}")
LOGGER.debug(f"Number of mapped reads file 2: {read_counter}")
LOGGER.debug(f"Number of aligned reads file 1: {len(mate1)}")
LOGGER.debug(f"Number of aligned reads file 2: {read_counter}")
LOGGER.debug(f"Number of concordant reads: {concordant}")
self._update_relationship(concordant, read_counter)

Expand Down Expand Up @@ -331,25 +334,26 @@ def evaluate(self) -> None:
self.result = StatesType.not_available
raise FileProblem(f"File is empty: {self.path}") from exc

if self.seq_id_format is None:
self.result = StatesType.not_available
if self.seq_id_format is not None:
LOGGER.debug(
"Could not determine sequence identifier format."
"Sequence identifier format: "
f"{self.seq_id_format.name}"
)
else:
self.result = StatesType.not_available
LOGGER.debug(
"Sequence identifier format: "
f"{self.seq_id_format.name}"
"Could not determine sequence identifier format."
)

# Ensure that remaining records are compatible with sequence
# identifier format and library type determined from first
# record
LOGGER.debug(
"Checking consistency of remaining reads with initially "
"determined identifier format and library type..."
)
if self.seq_id_format is not None:
LOGGER.debug(
"Checking consistency of remaining reads with "
"initially determined identifier format "
"and library type..."
)
for record in seq_iter:
records += 1
try:
Expand All @@ -366,11 +370,6 @@ def evaluate(self) -> None:
f"{type(exc).__name__}: {str(exc)}"
) from exc
LOGGER.debug(f"Total records processed: {records}")
else:
LOGGER.debug(
"Could not determine sequence identifier format. "
"Skipping consistency check for the remaining reads."
)

except (OSError, ValueError) as exc:
self.result = StatesType.not_available
Expand Down
7 changes: 4 additions & 3 deletions htsinfer/get_read_orientation.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,10 @@ def evaluate(self) -> ResultsOrientation:
self.mapping.transcripts_file = self.transcripts_file
self.mapping.tmp_dir = self.tmp_dir

if not self.mapping.mapped \
and (self.library_source.file_1.short_name is not None
or self.library_source.file_2.short_name is not None):
if not self.mapping.mapped and (
self.library_source.file_1.short_name is not None or
self.library_source.file_2.short_name is not None
):
self.mapping.evaluate()
else:
LOGGER.debug(
Expand Down
65 changes: 64 additions & 1 deletion tests/test_get_library_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
FILE_INCONSISTENT_IDS_SINGLE_OLD_NEW,
FILE_MATE_1,
FILE_MATE_2,
FILE_UNKNOWN_SEQ_ID,
FILE_IDS_NOT_MATCH_1,
FILE_IDS_NOT_MATCH_2,
FILE_TRANSCRIPTS,
Expand Down Expand Up @@ -148,11 +149,11 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir):
CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1
CONFIG.args.path_2_processed = FILE_MATE_2
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
CONFIG.args.tmp_dir = tmpdir
CONFIG.results.library_source = ResultsSource(
file_1=Source(short_name="hsapiens", taxon_id=9606),
file_2=Source(short_name="hsapiens", taxon_id=9606),
)
CONFIG.args.tmp_dir = tmpdir
MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2)
MAPPING.transcripts_file = FILE_TRANSCRIPTS
MAPPING.tmp_dir = tmpdir
Expand All @@ -166,6 +167,62 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir):
StatesTypeRelationship.not_available
)

def test_update_relationship_not_mates(self, tmpdir):
    """Check the outcome of ``_update_relationship`` with no concordance.

    Zero concordant reads out of twenty are handed to the method. The
    results relationship is then expected to be ``not_mates`` and the
    mapping's library type relationship ``not_available``.
    """
    # Point the shared config and mapping objects at the input files.
    input_paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2)
    CONFIG.args.path_1_processed, CONFIG.args.path_2_processed = input_paths
    CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
    CONFIG.args.tmp_dir = tmpdir
    MAPPING.paths = input_paths
    MAPPING.transcripts_file = FILE_TRANSCRIPTS
    MAPPING.tmp_dir = tmpdir

    lib_type = GetLibType(config=CONFIG, mapping=MAPPING)
    lib_type.results.file_1 = lib_type.results.file_2 = (
        StatesType.not_available
    )

    # 0 concordant reads out of 20: the ratio is below the cutoff.
    lib_type._update_relationship(0, 20)

    observed = lib_type.results.relationship
    assert observed == StatesTypeRelationship.not_mates

    observed_mapping = lib_type.mapping.library_type.relationship
    assert observed_mapping == StatesTypeRelationship.not_available

def test_evaluate_mate_relationship_not_determined(self, tmpdir):
    """Mate relationship evaluation without a determined library source.

    Both files get a default ``Source()`` as library source and the two
    identifier lists share no entries; the results relationship is
    expected to be ``not_available``.
    """
    CONFIG.args.path_1_processed = FILE_MATE_1
    CONFIG.args.path_2_processed = FILE_MATE_2
    CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
    CONFIG.args.tmp_dir = tmpdir
    # Default-constructed sources stand in for an undetermined source.
    undetermined_source = ResultsSource(file_1=Source(), file_2=Source())
    CONFIG.results.library_source = undetermined_source

    lib_type = GetLibType(config=CONFIG, mapping=MAPPING)
    lib_type.results.file_1 = lib_type.results.file_2 = (
        StatesType.not_available
    )

    first_ids = ["A", "B", "C"]
    second_ids = ["D", "E", "F"]
    lib_type._evaluate_mate_relationship(ids_1=first_ids, ids_2=second_ids)

    observed = lib_type.results.relationship
    assert observed == StatesTypeRelationship.not_available

def test_evaluate_split_mates_not_matching_ids(self, tmpdir):
"""Test mate relationship evaluation logic with input files that are
not mates from a paired-end library.
Expand Down Expand Up @@ -304,6 +361,12 @@ def test_get_read_type_no_match(self):
regex=SeqIdFormats['Casava >=1.8'].value,
)

def test_evaluate_unknown_identifier_format(self):
    """Evaluating ``FILE_UNKNOWN_SEQ_ID`` must yield ``not_available``."""
    fastq_type = GetFastqType(path=FILE_UNKNOWN_SEQ_ID)
    fastq_type.evaluate()
    outcome = fastq_type.result
    assert outcome == StatesType.not_available

def test_get_read_type_single_pass(self):
"""Read identifier is consistent with previous state."""
test_instance = GetFastqType(path=FILE_DUMMY)
Expand Down

0 comments on commit 70f4252

Please sign in to comment.