Skip to content

Commit

Permalink
Merge pull request #48 from naturalis/main
Browse files Browse the repository at this point in the history
merge triage
  • Loading branch information
rvosa authored Nov 13, 2024
2 parents 012d253 + 6130e81 commit f258d84
Show file tree
Hide file tree
Showing 4 changed files with 332 additions and 35 deletions.
4 changes: 4 additions & 0 deletions barcode_validator/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ def align_to_hmm(self, sequence):
:param sequence: A BioPython SeqRecord object
:return: A BioPython SeqRecord object containing the aligned sequence
"""

# TODO: inside this method, the HMM file should be inferred from the `marker_code` attribute of the sequence.
# For example, if `marker_code` == 'COI-5P', the HMM file should be 'COI-5P.hmm', which should be set via
# self.hmmalign.set_hmmfile().
self.logger.info("Aligning sequence to HMM")
if len(sequence.seq) == 0:
self.logger.warning("Empty sequence provided for alignment")
Expand Down
5 changes: 5 additions & 0 deletions barcode_validator/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,11 @@ def validate_sequence_quality(self, config: Config, record: SeqRecord, result: D
self.logger.warning(f"Alignment failed for {result.process_id}")
result.error = f"Alignment failed for sequence '{record.seq}'"
else:

# TODO: make it so that the translation table is inferred from the BCDM annotations of the sequence.
# This should be a combination of the taxonomy and the marker code, where the latter should tell us
# if the marker is nuclear or mitochondrial and the former should tell us the translation table.
# https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
amino_acid_sequence = sh.translate_sequence(aligned_sequence, config.get('translation_table'))
result.stop_codons = sh.get_stop_codons(amino_acid_sequence)
result.seq_length = sh.marker_seqlength(aligned_sequence)
Expand Down
250 changes: 250 additions & 0 deletions barcode_validator/triage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
import argparse
import logging
import os
import csv
import sys
from Bio import SeqIO
from pathlib import Path


def setup_logging(verbosity):
"""
Configure logging format and level based on verbosity flag count.
Args:
verbosity (int): Number of verbose flags (-v) provided:
0: WARNING level (default)
1: INFO level
2: DEBUG level
"""
levels = {
0: logging.WARNING,
1: logging.INFO,
2: logging.DEBUG
}
# If verbosity is higher than 2, cap it at DEBUG level
level = levels.get(min(verbosity, 2), logging.DEBUG)

logging.basicConfig(
level=level,
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
logging.debug(f"Logging level set to: {logging.getLevelName(level)}")


def parse_args():
"""
Parse command line arguments.
Returns:
argparse.Namespace: Object containing the following attributes:
- tsv (str): Path to input TSV file with sequence metadata
- fasta (str): Path to input FASTA file with sequences
- good_dir (str): Path to output directory for good sequences
- bad_dir (str): Path to output directory for bad sequences
- verbose (int): Count of verbose flags (0-2)
"""
parser = argparse.ArgumentParser(description='Triage FASTA sequences based on quality criteria.')
parser.add_argument('--tsv', required=True, help='Input TSV file with sequence metadata')
parser.add_argument('--fasta', required=True, help='Input FASTA file with sequences')
parser.add_argument('--good_dir', required=True, help='Output directory for good sequences')
parser.add_argument('--bad_dir', required=True, help='Output directory for bad sequences')
parser.add_argument('-v', '--verbose', action='count', default=0,
help='Increase output verbosity (use -v for INFO, -v -v for DEBUG)')
return parser.parse_args()


def read_tsv_data(tsv_file):
"""
Read TSV file and return data as dictionary keyed by process_id.
Args:
tsv_file (str): Path to the input TSV file containing sequence metadata.
Returns:
dict: Dictionary where keys are process_ids and values are dictionaries
containing the corresponding row data from the TSV file.
"""
data = {}
with open(tsv_file, 'r') as f:
reader = csv.DictReader(f, delimiter='\t')
logging.debug(f"Reading TSV file: {tsv_file}")
for row in reader:
try:
data[row['process_id']] = row
except KeyError:
print(f"Error: Missing process_id in file {tsv_file}")
sys.exit(1)
logging.debug(f"Read {len(data)} records from TSV file")
return data


def check_sequence(seq_id, metadata):
"""
Check if sequence meets quality criteria.
Args:
seq_id (str): Identifier of the sequence being checked.
metadata (dict): Dictionary containing sequence metadata with the following keys:
- nuc_basecount (str): Number of nucleotide bases
- stop_codons (str): Number of stop codons
- ambig_basecount (str): Number of ambiguous bases
- obs_taxon (str): Comma-separated list of observed taxa
- identification (str): Expected taxon identification
Returns:
tuple: A pair containing:
- bool: True if sequence passes all criteria, False otherwise
- list: List of strings indicating which criteria failed (empty if all passed)
"""
failed_criteria = []
logging.debug(f"Checking sequence {seq_id}")

# Check nuc_basecount >= 500
nuc_count = 0
if metadata['nuc_basecount'] != 'None':
nuc_count = int(metadata['nuc_basecount'])
if nuc_count >= 500:
logging.info(f"Sequence {seq_id} passes nucleotide base count check ({nuc_count} bases)")
else:
logging.warning(f"Sequence {seq_id} fails nucleotide base count check ({nuc_count} bases)")
failed_criteria.append('nuc_basecount')

# Check stop_codons == 0
stop_count = int(metadata['stop_codons'])
if stop_count == 0:
logging.info(f"Sequence {seq_id} passes stop codons check (no stop codons)")
else:
logging.warning(f"Sequence {seq_id} fails stop codons check ({stop_count} stop codons)")
failed_criteria.append('stop_codons')

# Check ambig_basecount <= 6
ambig_count = 7
if metadata['ambig_basecount'] != 'None':
ambig_count = int(metadata['ambig_basecount'])
if ambig_count <= 6:
logging.info(f"Sequence {seq_id} passes ambiguous base count check ({ambig_count} ambiguous bases)")
else:
logging.warning(f"Sequence {seq_id} fails ambiguous base count check ({ambig_count} ambiguous bases)")
failed_criteria.append('ambig_basecount')

# Check if identification is in obs_taxon
obs_taxa = {taxon.strip() for taxon in metadata['obs_taxon'].split(',')}
if metadata['identification'] in obs_taxa:
logging.info(f"Sequence {seq_id} passes taxonomy check (identification: {metadata['identification']})")
else:
logging.warning(
f"Sequence {seq_id} fails taxonomy check (identification '{metadata['identification']}' "
f"not found in observed taxa: {', '.join(obs_taxa)})"
)
failed_criteria.append('taxonomy')

logging.debug(
f"Sequence {seq_id} check complete. Failed criteria: {', '.join(failed_criteria) if failed_criteria else 'none'}")
return len(failed_criteria) == 0, failed_criteria


def process_sequences(args):
"""
Process sequences and write to appropriate output directories.
Args:
args (argparse.Namespace): Command line arguments containing:
- tsv (str): Path to input TSV file
- fasta (str): Path to input FASTA file
- good_dir (str): Path to output directory for good sequences
- bad_dir (str): Path to output directory for bad sequences
Side effects:
- Creates output directories if they don't exist
- Writes filtered FASTA and TSV files to good_dir and bad_dir
- Logs processing results using the logging module
"""
# Create output directories if they don't exist
Path(args.good_dir).mkdir(parents=True, exist_ok=True)
Path(args.bad_dir).mkdir(parents=True, exist_ok=True)
logging.debug(f"Created output directories: {args.good_dir}, {args.bad_dir}")

# Read TSV data
metadata = read_tsv_data(args.tsv)

# Prepare output files
good_fasta = os.path.join(args.good_dir, os.path.basename(args.fasta))
bad_fasta = os.path.join(args.bad_dir, os.path.basename(args.fasta))
good_tsv = os.path.join(args.good_dir, os.path.basename(args.tsv))
bad_tsv = os.path.join(args.bad_dir, os.path.basename(args.tsv))

logging.debug(f"Output files prepared:\n"
f"Good FASTA: {good_fasta}\n"
f"Bad FASTA: {bad_fasta}\n"
f"Good TSV: {good_tsv}\n"
f"Bad TSV: {bad_tsv}")

# Process sequences
good_records = []
bad_records = []
good_metadata = []
bad_metadata = []

logging.info(f"Starting sequence processing from {args.fasta}")
for record in SeqIO.parse(args.fasta, 'fasta'):
seq_id = record.id.split('_')[0]
logging.debug(f"Processing sequence: {seq_id}")

if seq_id not in metadata:
logging.error(f"No metadata found for sequence {seq_id}")
continue
try:
passes_check, failed_criteria = check_sequence(seq_id, metadata[seq_id])
except KeyError as e:
logging.error(f"Error processing sequence {seq_id}: {e} in {args.tsv}")
continue

if passes_check:
good_records.append(record)
good_metadata.append(metadata[seq_id])
else:
bad_records.append(record)
bad_metadata.append(metadata[seq_id])

# Write FASTA files
SeqIO.write(good_records, good_fasta, 'fasta')
SeqIO.write(bad_records, bad_fasta, 'fasta')
logging.debug(f"Written FASTA files:\n"
f"Good sequences: {good_fasta} ({len(good_records)} records)\n"
f"Bad sequences: {bad_fasta} ({len(bad_records)} records)")

# Write TSV files
fieldnames = next(csv.DictReader(open(args.tsv), delimiter='\t')).keys()

def write_tsv(filename, data):
with open(filename, 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
writer.writeheader()
writer.writerows(data)
logging.debug(f"Written TSV file: {filename} ({len(data)} records)")

write_tsv(good_tsv, good_metadata)
write_tsv(bad_tsv, bad_metadata)

logging.info(
f"Processing complete. Processed {len(good_records)} good sequences and {len(bad_records)} bad sequences")


def main():
"""
Main function to run the script.
This function:
1. Parses command line arguments
2. Sets up logging configuration based on verbosity level
3. Processes and triages sequences based on quality criteria
"""
args = parse_args()
setup_logging(args.verbose)
process_sequences(args)


if __name__ == "__main__":
main()
Loading

0 comments on commit f258d84

Please sign in to comment.