Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure bookkeeping of multiple contigs for the same process ID #50

Merged
merged 20 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
1e4db92
YAML and CSV are optional
rvosa Nov 6, 2024
99a8ecf
added institute/branch selector
rvosa Nov 7, 2024
315d11d
now urlencodes file names to handle spaces and such
rvosa Nov 7, 2024
866c503
added error handling for taxonomy match
rvosa Nov 7, 2024
f72e505
added comment for how to handle HMM files
rvosa Nov 7, 2024
708cd12
added comment for how to decide the translation table
rvosa Nov 7, 2024
6130e81
script to filter results to 'pass' and 'fail'
rvosa Nov 13, 2024
c950a42
reduced default verbosity
rvosa Nov 14, 2024
377ca57
adding structural validator
rvosa Nov 14, 2024
bfdf140
the sequence.id is now the default biopython behaviour, i.e. the firs…
rvosa Nov 25, 2024
7cfc13d
tests updated to reflect that the fasta parser doesn't return the pro…
rvosa Nov 25, 2024
ed3fe09
tests updated to reflect that the fasta parser doesn't return the pro…
rvosa Nov 25, 2024
42b0b32
updated to reflect that the fasta parser doesn't return the process I…
rvosa Nov 25, 2024
70606da
result object now tracks the full sequence ID rather than the process…
rvosa Nov 25, 2024
95ce9af
Columns need to be initialized separately.
rvosa Nov 25, 2024
9198b03
Updated to reflect the new signatures for SequenceHandler.parse_fasta…
rvosa Nov 25, 2024
2178cbd
the name of the fasta file is not attached as ancillary but using the…
rvosa Nov 25, 2024
3be601f
res.identification_rank needs to be set from the scoped config. Theor…
rvosa Nov 25, 2024
9e58150
needed other mock config object so that it clones
rvosa Nov 25, 2024
7e88f7d
the MGE FASTA files seem to have developed underscores as some magic …
rvosa Nov 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions barcode_validator/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,6 @@ def main(fasta_file_path, yaml_file_path, analytics_file_path, logger, config):
if analytics_file_path is not None:
rs.add_csv_file(analytics_file_path)

# Add fasta file path to each result
for result in rs.results:
result.add_ancillary('fasta_file', fasta_file_path)

print(rs) # print TSV results

logger.info("Analysis completed")
Expand All @@ -36,8 +32,8 @@ def main(fasta_file_path, yaml_file_path, analytics_file_path, logger, config):
# Process command line arguments
parser = argparse.ArgumentParser(description="Analyze DNA sequences from a FASTA file.")
parser.add_argument("-f", "--fasta_file", required=True, help="Path to the input FASTA file")
parser.add_argument("-y", "--yaml_file", required=True, help="Path to the assembly param YAML file")
parser.add_argument("-a", "--analytics_file", required=True, help="Path to the analytics CSV file")
parser.add_argument("-y", "--yaml_file", required=False, help="Path to the assembly param YAML file")
parser.add_argument("-a", "--analytics_file", required=False, help="Path to the analytics CSV file")
parser.add_argument("-c", "--config_file", required=True, help="Path to the configuration YAML file")
parser.add_argument("-v", "--verbosity", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
help="Set the logging verbosity (default: INFO)")
Expand Down
43 changes: 25 additions & 18 deletions barcode_validator/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ def align_to_hmm(self, sequence):
:param sequence: A BioPython SeqRecord object
:return: A BioPython SeqRecord object containing the aligned sequence
"""

# TODO: inside this method, the HMM file should be inferred from the `marker_code` attribute of the sequence.
# For example, if `marker_code` == 'COI-5P', the HMM file should be 'COI-5P.hmm', which should be set via
# self.hmmalign.set_hmmfile().
self.logger.info("Aligning sequence to HMM")
if len(sequence.seq) == 0:
self.logger.warning("Empty sequence provided for alignment")
Expand All @@ -64,6 +68,9 @@ def align_to_hmm(self, sequence):

# Run hmmalign and parse the output
return_code = self.hmmalign.run()
if return_code != 0:
self.logger.error(f"hmmalign failed with return code {return_code}")
return None
self.logger.debug(f"Going to parse outfile {temp_output.name}")
aligned_sequence = next(SeqIO.parse(temp_output.name, 'stockholm'))

Expand Down Expand Up @@ -113,19 +120,15 @@ def unalign_sequence(self, sequence):
self.logger.info("Removing gaps from aligned sequence")
if isinstance(sequence, SeqRecord):
# Convert Seq to string, remove gaps, then convert back to Seq
unaligned_sequence = str(sequence.seq).replace('-', '').replace('~', '')
return SeqRecord(
Seq(unaligned_sequence),
id=sequence.id,
name=sequence.name,
description=sequence.description
)
unaligned_sequence = str(sequence.seq).replace('-', '').replace('~', '').replace('_', '')
sequence.seq = Seq(unaligned_sequence)
return sequence
elif isinstance(sequence, Seq):
# If it's just a Seq object, convert to string, remove gaps, then back to Seq
return Seq(str(sequence).replace('-', '').replace('~', ''))
return Seq(str(sequence).replace('-', '').replace('~', '').replace('_', ''))
elif isinstance(sequence, str):
# If it's a string, just remove the gaps
return sequence.replace('-', '').replace('~', '')
return sequence.replace('-', '').replace('~', '').replace('_', '')
else:
raise TypeError(f"Unexpected type for sequence: {type(sequence)}")

Expand Down Expand Up @@ -179,17 +182,17 @@ def translate_sequence(self, dna_sequence, table_idx):

def parse_fasta(self, file_path):
"""
Parse a FASTA file and yield the process ID, sequence record, and optional JSON configuration for each entry.
The process ID is the first part of the sequence ID, which is assumed to be separated by an underscore.
Parse a FASTA file and yield the sequence record, and optional JSON configuration for each entry.
The process ID is the first part of the sequence ID, which is assumed to be separated by an underscore
and is attached under record.annotations['bcdm_fields']['processid'].
Any JSON configuration data after the first '{' in the header is parsed and returned.

:param file_path: Local path to the FASTA file
:yield: A tuple containing the process ID, the sequence record, and the JSON configuration (or None) for each entry
:yield: A tuple containing the sequence record, and the JSON configuration (or None) for each entry
"""
self.logger.info(f"Parsing FASTA file: {file_path}")
with open(file_path, 'r') as file:
for record in SeqIO.parse(file, 'fasta'):
process_id = record.id.split('_')[0]

# Attempt to parse JSON from the description
json_config = None
Expand All @@ -201,14 +204,18 @@ def parse_fasta(self, file_path):
# Remove the JSON part from the description
record.description = record.description[:json_start].strip()
except json.JSONDecodeError as e:
self.logger.warning(f"Failed to parse JSON for {process_id}: {e}")

record.id = process_id
self.logger.debug(f"Parsed process ID: {process_id}")
self.logger.warning(f"Failed to parse JSON for {record.id}: {e}")

# Log results
if 'bcdm_fields' not in record.annotations:
record.annotations['bcdm_fields'] = {}
record.annotations['bcdm_fields']['processid'] = record.id.split('_')[0]
self.logger.debug(f"Process ID: {record.annotations['bcdm_fields']['processid']}")
self.logger.debug(f"Sequence ID: {record.id}")
self.logger.debug(f"Sequence length: {len(record.seq)}")
self.logger.debug(f"JSON config: {json_config}")

yield process_id, self.unalign_sequence(record), json_config
yield self.unalign_sequence(record), json_config

def get_stop_codons(self, amino_acid_sequence):
"""
Expand Down
29 changes: 16 additions & 13 deletions barcode_validator/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,25 @@ def validate_fasta(self, fasta_file_path: str, config: Config) -> List[DNAAnalys
"""
results = []
sh = SequenceHandler(config)
for process_id, record, json_config in sh.parse_fasta(fasta_file_path):
for record, json_config in sh.parse_fasta(fasta_file_path):
scoped_config = config.local_clone(json_config)
result = self.validate_record(process_id, record, scoped_config)
result = DNAAnalysisResult(record.id, fasta_file_path)
result.level = scoped_config.get('level')
self.validate_record(record, scoped_config, result)
results.append(result)
return results

def validate_record(self, process_id: str, record: SeqRecord, config: Config) -> DNAAnalysisResult:
def validate_record(self, record: SeqRecord, config: Config, result: DNAAnalysisResult) -> None:
"""
Validate a single DNA sequence record.
:param process_id: A process ID
:param record: A Bio.SeqRecord object
:param config: A Config object
:param result: A DNAAnalysisResult object
:return: A DNAAnalysisResult object
"""
result = DNAAnalysisResult(process_id)
self.validate_sequence_quality(config, record, result)
self.validate_taxonomy(config, record, result)

# Return the result object
return result

def validate_taxonomy(self, config: Config, record: SeqRecord, result: DNAAnalysisResult) -> None:
"""
Validate the taxonomy of a DNA sequence record.
Expand All @@ -76,10 +74,10 @@ def validate_taxonomy(self, config: Config, record: SeqRecord, result: DNAAnalys
"""

# Lookup expected taxon in BOLD tree
sp = self.get_node_by_processid(result.process_id)
sp = self.get_node_by_processid(record.annotations['bcdm_fields']['processid'])
if sp is None:
self.logger.warning(f"Process ID {result.process_id} not found in BOLD tree.")
result.error = f"{result.process_id} not in BOLD"
self.logger.warning(f"Process ID {record.annotations['bcdm_fields']['processid']} not found in BOLD tree.")
result.error = f"{record.annotations['bcdm_fields']['processid']} not in BOLD"
else:

# Traverse BOLD tree to find the expected taxon at the specified rank
Expand All @@ -98,7 +96,7 @@ def validate_taxonomy(self, config: Config, record: SeqRecord, result: DNAAnalys

# Handle BLAST failure
if obs_taxon is None:
self.logger.warning(f"Local BLAST failed for {result.process_id}")
self.logger.warning(f"Local BLAST failed for {record.annotations['bcdm_fields']['processid']}")
result.error = f"Local BLAST failed for sequence '{record.seq}'"
else:
result.obs_taxon = obs_taxon
Expand Down Expand Up @@ -158,9 +156,14 @@ def validate_sequence_quality(self, config: Config, record: SeqRecord, result: D
# Compute marker quality metrics
aligned_sequence = sh.align_to_hmm(record)
if aligned_sequence is None:
self.logger.warning(f"Alignment failed for {result.process_id}")
self.logger.warning(f"Alignment failed for {result.sequence_id}")
result.error = f"Alignment failed for sequence '{record.seq}'"
else:

# TODO: make it so that the translation table is inferred from the BCDM annotations of the sequence.
# This should be a combination of the taxonomy and the marker code, where the latter should tell us
# if the marker is nuclear or mitochondrial and the former should tell us the translation table.
# https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
amino_acid_sequence = sh.translate_sequence(aligned_sequence, config.get('translation_table'))
result.stop_codons = sh.get_stop_codons(amino_acid_sequence)
result.seq_length = sh.marker_seqlength(aligned_sequence)
Expand Down
Loading