Skip to content

Commit

Permalink
fix: infer library type relationship (#157)
Browse files Browse the repository at this point in the history
* refactor: compare alignments between mapped reads only

* fix: update get lib type test

* refactor: helper function for getlibtype

* feat: only map when lib source is inferred

* update tests

* fix orientation tests

* fix orientation tests

* refactor scripts

* update get lib type and get read orient

* refactor get lib type

* refactor: update get lib type and tests

* refactor: update lib type and orientation

* refactor: update concordant read counting, mapping

* update debug messages

* update comments in mapping
  • Loading branch information
balajtimate authored Jan 21, 2024
1 parent 930b741 commit 7b65c43
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 66 deletions.
142 changes: 94 additions & 48 deletions htsinfer/get_library_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def _evaluate_mate_relationship(
ids_2: As `ids_1` but for the putative second mate file.
"""
self.results.relationship = StatesTypeRelationship.not_mates
if ids_1 == ids_2:
if ids_1 and ids_2 and ids_1 == ids_2:
if (
self.results.file_1 == StatesType.first_mate and
self.results.file_2 == StatesType.second_mate
Expand All @@ -127,13 +127,23 @@ def _evaluate_mate_relationship(
self.mapping.library_type.relationship = (
StatesTypeRelationship.split_mates
)
else:
elif (
self.library_source.file_1.short_name is not None or
self.library_source.file_2.short_name is not None
):
LOGGER.debug("Determining mate relationship by alignment...")
self.mapping.library_type.relationship \
= StatesTypeRelationship.not_available
self.mapping.library_source = self.library_source
self.mapping.paths = self.path_1, self.path_2
self.mapping.evaluate()
self._align_mates()
else:
self.results.relationship = StatesTypeRelationship.not_available
LOGGER.debug(
"Sequence IDs and library source are not determined, "
"mate relationship cannot be inferred."
)

def _align_mates(self):
"""Decide mate relationship by alignment."""
Expand All @@ -144,19 +154,24 @@ def _align_mates(self):
samfile1 = pysam.AlignmentFile(str(alignment_1), 'r')
samfile2 = pysam.AlignmentFile(str(alignment_2), 'r')

seq_id1 = None
seq_id2 = None
previous_seq_id1 = None
previous_seq_id2 = None

reads1 = [] # List to store alignments for one read from file1
mate1 = [] # List to store alignments for each read
mate1 = [] # List to store alignments for each read from file1
reads2 = [] # List to store alignments for one read from file2
mate2 = [] # List to store alignments for each read from file2

concordant = 0

for read1 in samfile1:
seq_id1 = read1.query_name
if seq_id1 != previous_seq_id1 \
and previous_seq_id1 is not None:
if (
seq_id1 != previous_seq_id1 and
previous_seq_id1 is not None
):
mate1.append(reads1.copy())
reads1.clear()
if read1.reference_end:
Expand All @@ -167,35 +182,63 @@ def _align_mates(self):
read_counter = 0
for read2 in samfile2:
seq_id2 = read2.query_name
if seq_id2 != previous_seq_id2 \
and previous_seq_id2 is not None:
if self._compare_alignments(mate1[read_counter], reads2):
if (
seq_id2 != previous_seq_id2 and
previous_seq_id2 is not None
):
mate2.append(reads2.copy())
if self._compare_alignments(
mate1[read_counter], reads2
):
concordant += 1
reads2.clear()
read_counter += 1
if read2.reference_end:
reads2.append(read2)
previous_seq_id2 = seq_id2

if self._compare_alignments(mate1[read_counter], reads2):
mate2.append(reads2.copy())
if self._compare_alignments(
mate1[read_counter], reads2
):
concordant += 1

if (concordant / read_counter) >= self.cutoff:
self.results.relationship = (
StatesTypeRelationship.split_mates
)
self.mapping.library_type.relationship \
= StatesTypeRelationship.split_mates
self.mapping.mapped = False
self.mapping.star_dirs = []
else:
self.results.relationship = (
StatesTypeRelationship.not_mates
)
aligned_mate1 = len(list(filter(None, mate1)))
aligned_mate2 = len(list(filter(None, mate2)))

LOGGER.debug(f"Number of aligned reads file 1: {aligned_mate1}")
LOGGER.debug(f"Number of aligned reads file 2: {aligned_mate2}")
LOGGER.debug(f"Number of concordant reads: {concordant}")

self._update_relationship(
concordant, min(aligned_mate1, aligned_mate2)
)

samfile1.close()
samfile2.close()

def _update_relationship(self, concordant, aligned_reads):
"""Helper function to update relationship based on alignment."""
try:
ratio = concordant / aligned_reads
except ZeroDivisionError:
self.results.relationship = (
StatesTypeRelationship.not_available
)
else:
if ratio >= self.cutoff:
self.results.relationship = (
StatesTypeRelationship.split_mates
)
self.mapping.library_type.relationship = (
StatesTypeRelationship.split_mates
)
self.mapping.mapped = False
self.mapping.star_dirs = []
else:
self.results.relationship = (
StatesTypeRelationship.not_mates
)

class AlignedSegment:
"""Placeholder class for mypy "Missing attribute"
error in _compare_alignments(), the actual object used
Expand Down Expand Up @@ -302,44 +345,47 @@ def evaluate(self) -> None:
self.result = StatesType.not_available
raise FileProblem(f"File is empty: {self.path}") from exc

if self.seq_id_format is None:
if self.seq_id_format is not None:
LOGGER.debug(
"Sequence identifier format: "
f"{self.seq_id_format.name}"
)
else:
self.result = StatesType.not_available
raise MetadataWarning(
LOGGER.debug(
"Could not determine sequence identifier format."
)
LOGGER.debug(
f"Sequence identifier format: {self.seq_id_format.name}"
)

# Ensure that remaining records are compatible with sequence
# identifier format and library type determined from first
# record
LOGGER.debug(
"Checking consistency of remaining reads with initially "
"determined identifier format and library type..."
)
for record in seq_iter:
records += 1
try:
self._get_read_type(
seq_id=record[0],
regex=self.seq_id_format.value,
)
except (
InconsistentFastqIdentifiers,
UnknownFastqIdentifier,
) as exc:
self.result = StatesType.not_available
raise MetadataWarning(
f"{type(exc).__name__}: {str(exc)}"
) from exc
if self.seq_id_format is not None:
LOGGER.debug(
"Checking consistency of remaining reads with "
"initially determined identifier format "
"and library type..."
)
for record in seq_iter:
records += 1
try:
self._get_read_type(
seq_id=record[0],
regex=self.seq_id_format.value,
)
except (
InconsistentFastqIdentifiers,
UnknownFastqIdentifier,
) as exc:
self.result = StatesType.not_available
raise MetadataWarning(
f"{type(exc).__name__}: {str(exc)}"
) from exc
LOGGER.debug(f"Total records processed: {records}")

except (OSError, ValueError) as exc:
self.result = StatesType.not_available
raise FileProblem(f"{type(exc).__name__}: {str(exc)}") from exc

LOGGER.debug(f"Total records processed: {records}")

def _get_read_type(
self,
seq_id: str,
Expand Down
6 changes: 5 additions & 1 deletion htsinfer/get_read_orientation.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,11 @@ def evaluate(self) -> ResultsOrientation:
self.mapping.transcripts_file = self.transcripts_file
self.mapping.tmp_dir = self.tmp_dir

if not self.mapping.mapped:
if not self.mapping.mapped and (
self.library_source.file_1.short_name is not None or
self.library_source.file_2.short_name is not None
):
LOGGER.debug("Determining read relationship by alignment...")
self.mapping.evaluate()

return self.process_alignments(star_dirs=self.mapping.star_dirs)
Expand Down
11 changes: 6 additions & 5 deletions htsinfer/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,7 @@ def __init__(
self.star_dirs: List[Path] = []

def evaluate(self):
"""Infer read orientation.
Returns:
Orientation results object.
"""
"""Align FASTQ files to reference transcripts with STAR."""

# get transcripts for current organims
transcripts = self.subset_transcripts_by_organism()
Expand Down Expand Up @@ -270,6 +266,10 @@ def prepare_star_alignment_commands(
) -> Dict[Path, List[str]]:
"""Prepare STAR alignment commands.
Input FASTQ files are assumed to be sorted according to reference names
or coordinates, the order of input reads is kept with the option
"PairedKeepInputOrder", no additional sorting of aligned reads is done.
Args:
index_dir: Path to directory containing STAR index.
Expand Down Expand Up @@ -299,6 +299,7 @@ def build_star_command(
"--runThreadN", f"{str(self.threads_star)}",
"--genomeDir", f"{str(index_dir)}",
"--outFilterMultimapNmax", "50",
"--outSAMorder", "PairedKeepInputOrder",
"--outSAMunmapped", "Within", "KeepPairs",
]
cmd: List[str] = cmd_base[:]
Expand Down
Loading

0 comments on commit 7b65c43

Please sign in to comment.