naturalis · rvosa · Nov 25, 2024 · Nov 6, 2024 · Nov 7, 2024 · Nov 7, 2024
diff --git a/barcode_validator/__main__.py b/barcode_validator/__main__.py
@@ -23,10 +23,6 @@ def main(fasta_file_path, yaml_file_path, analytics_file_path, logger, config):
     if analytics_file_path is not None:
         rs.add_csv_file(analytics_file_path)
 
-    # Add fasta file path to each result
-    for result in rs.results:
-        result.add_ancillary('fasta_file', fasta_file_path)
-
     print(rs)  # print TSV results
 
     logger.info("Analysis completed")
@@ -36,8 +32,8 @@ def main(fasta_file_path, yaml_file_path, analytics_file_path, logger, config):
     # Process command line arguments
     parser = argparse.ArgumentParser(description="Analyze DNA sequences from a FASTA file.")
     parser.add_argument("-f", "--fasta_file", required=True, help="Path to the input FASTA file")
-    parser.add_argument("-y", "--yaml_file", required=True, help="Path to the assembly param YAML file")
-    parser.add_argument("-a", "--analytics_file", required=True, help="Path to the analytics CSV file")
+    parser.add_argument("-y", "--yaml_file", required=False, help="Path to the assembly param YAML file")
+    parser.add_argument("-a", "--analytics_file", required=False, help="Path to the analytics CSV file")
     parser.add_argument("-c", "--config_file", required=True, help="Path to the configuration YAML file")
     parser.add_argument("-v", "--verbosity", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                         help="Set the logging verbosity (default: INFO)")

diff --git a/barcode_validator/alignment.py b/barcode_validator/alignment.py
@@ -47,6 +47,10 @@ def align_to_hmm(self, sequence):
         :param sequence: A BioPython SeqRecord object
         :return: A BioPython SeqRecord object containing the aligned sequence
         """
+
+        # TODO: inside this method, the HMM file should be inferred from the `marker_code` attribute of the sequence.
+        # For example, if `marker_code` == 'COI-5P', the HMM file should be 'COI-5P.hmm', which should be set via
+        # self.hmmalign.set_hmmfile().
         self.logger.info("Aligning sequence to HMM")
         if len(sequence.seq) == 0:
             self.logger.warning("Empty sequence provided for alignment")
@@ -64,6 +68,9 @@ def align_to_hmm(self, sequence):
 
                 # Run hmmalign and parse the output
                 return_code = self.hmmalign.run()
+                if return_code != 0:
+                    self.logger.error(f"hmmalign failed with return code {return_code}")
+                    return None
                 self.logger.debug(f"Going to parse outfile {temp_output.name}")
                 aligned_sequence = next(SeqIO.parse(temp_output.name, 'stockholm'))
 
@@ -113,19 +120,15 @@ def unalign_sequence(self, sequence):
         self.logger.info("Removing gaps from aligned sequence")
         if isinstance(sequence, SeqRecord):
             # Convert Seq to string, remove gaps, then convert back to Seq
-            unaligned_sequence = str(sequence.seq).replace('-', '').replace('~', '')
-            return SeqRecord(
-                Seq(unaligned_sequence),
-                id=sequence.id,
-                name=sequence.name,
-                description=sequence.description
-            )
+            unaligned_sequence = str(sequence.seq).replace('-', '').replace('~', '').replace('_', '')
+            sequence.seq = Seq(unaligned_sequence)
+            return sequence
         elif isinstance(sequence, Seq):
             # If it's just a Seq object, convert to string, remove gaps, then back to Seq
-            return Seq(str(sequence).replace('-', '').replace('~', ''))
+            return Seq(str(sequence).replace('-', '').replace('~', '').replace('_', ''))
         elif isinstance(sequence, str):
             # If it's a string, just remove the gaps
-            return sequence.replace('-', '').replace('~', '')
+            return sequence.replace('-', '').replace('~', '').replace('_', '')
         else:
             raise TypeError(f"Unexpected type for sequence: {type(sequence)}")
 
@@ -179,17 +182,17 @@ def translate_sequence(self, dna_sequence, table_idx):
 
     def parse_fasta(self, file_path):
         """
-        Parse a FASTA file and yield the process ID, sequence record, and optional JSON configuration for each entry.
-        The process ID is the first part of the sequence ID, which is assumed to be separated by an underscore.
+        Parse a FASTA file and yield the sequence record, and optional JSON configuration for each entry.
+        The process ID is the first part of the sequence ID, which is assumed to be separated by an underscore
+        and is attached under record.annotations['bcdm_fields']['processid'].
         Any JSON configuration data after the first '{' in the header is parsed and returned.
 
         :param file_path: Local path to the FASTA file
-        :yield: A tuple containing the process ID, the sequence record, and the JSON configuration (or None) for each entry
+        :yield: A tuple containing the sequence record, and the JSON configuration (or None) for each entry
         """
         self.logger.info(f"Parsing FASTA file: {file_path}")
         with open(file_path, 'r') as file:
             for record in SeqIO.parse(file, 'fasta'):
-                process_id = record.id.split('_')[0]
 
                 # Attempt to parse JSON from the description
                 json_config = None
@@ -201,14 +204,18 @@ def parse_fasta(self, file_path):
                         # Remove the JSON part from the description
                         record.description = record.description[:json_start].strip()
                     except json.JSONDecodeError as e:
-                        self.logger.warning(f"Failed to parse JSON for {process_id}: {e}")
-
-                record.id = process_id
-                self.logger.debug(f"Parsed process ID: {process_id}")
+                        self.logger.warning(f"Failed to parse JSON for {record.id}: {e}")
+
+                # Log results
+                if 'bcdm_fields' not in record.annotations:
+                    record.annotations['bcdm_fields'] = {}
+                record.annotations['bcdm_fields']['processid'] = record.id.split('_')[0]
+                self.logger.debug(f"Process ID: {record.annotations['bcdm_fields']['processid']}")
+                self.logger.debug(f"Sequence ID: {record.id}")
                 self.logger.debug(f"Sequence length: {len(record.seq)}")
                 self.logger.debug(f"JSON config: {json_config}")
 
-                yield process_id, self.unalign_sequence(record), json_config
+                yield self.unalign_sequence(record), json_config
 
     def get_stop_codons(self, amino_acid_sequence):
         """

diff --git a/barcode_validator/core.py b/barcode_validator/core.py
@@ -46,27 +46,25 @@ def validate_fasta(self, fasta_file_path: str, config: Config) -> List[DNAAnalys
         """
         results = []
         sh = SequenceHandler(config)
-        for process_id, record, json_config in sh.parse_fasta(fasta_file_path):
+        for record, json_config in sh.parse_fasta(fasta_file_path):
             scoped_config = config.local_clone(json_config)
-            result = self.validate_record(process_id, record, scoped_config)
+            result = DNAAnalysisResult(record.id, fasta_file_path)
+            result.level = scoped_config.get('level')
+            self.validate_record(record, scoped_config, result)
             results.append(result)
         return results
 
-    def validate_record(self, process_id: str, record: SeqRecord, config: Config) -> DNAAnalysisResult:
+    def validate_record(self, record: SeqRecord, config: Config, result: DNAAnalysisResult) -> None:
         """
         Validate a single DNA sequence record.
-        :param process_id: A process ID
         :param record: A Bio.SeqRecord object
         :param config: A Config object
+        :param result: A DNAAnalysisResult object
         :return: A DNAAnalysisResult object
         """
-        result = DNAAnalysisResult(process_id)
         self.validate_sequence_quality(config, record, result)
         self.validate_taxonomy(config, record, result)
 
-        # Return the result object
-        return result
-
     def validate_taxonomy(self, config: Config, record: SeqRecord, result: DNAAnalysisResult) -> None:
         """
         Validate the taxonomy of a DNA sequence record.
@@ -76,10 +74,10 @@ def validate_taxonomy(self, config: Config, record: SeqRecord, result: DNAAnalys
         """
 
         # Lookup expected taxon in BOLD tree
-        sp = self.get_node_by_processid(result.process_id)
+        sp = self.get_node_by_processid(record.annotations['bcdm_fields']['processid'])
         if sp is None:
-            self.logger.warning(f"Process ID {result.process_id} not found in BOLD tree.")
-            result.error = f"{result.process_id} not in BOLD"
+            self.logger.warning(f"Process ID {record.annotations['bcdm_fields']['processid']} not found in BOLD tree.")
+            result.error = f"{record.annotations['bcdm_fields']['processid']} not in BOLD"
         else:
 
             # Traverse BOLD tree to find the expected taxon at the specified rank
@@ -98,7 +96,7 @@ def validate_taxonomy(self, config: Config, record: SeqRecord, result: DNAAnalys
 
             # Handle BLAST failure
             if obs_taxon is None:
-                self.logger.warning(f"Local BLAST failed for {result.process_id}")
+                self.logger.warning(f"Local BLAST failed for {record.annotations['bcdm_fields']['processid']}")
                 result.error = f"Local BLAST failed for sequence '{record.seq}'"
             else:
                 result.obs_taxon = obs_taxon
@@ -158,9 +156,14 @@ def validate_sequence_quality(self, config: Config, record: SeqRecord, result: D
         # Compute marker quality metrics
         aligned_sequence = sh.align_to_hmm(record)
         if aligned_sequence is None:
-            self.logger.warning(f"Alignment failed for {result.process_id}")
+            self.logger.warning(f"Alignment failed for {result.sequence_id}")
             result.error = f"Alignment failed for sequence '{record.seq}'"
         else:
+
+            # TODO: make it so that the translation table is inferred from the BCDM annotations of the sequence.
+            # This should be a combination of the taxonomy and the marker code, where the latter should tell us
+            # if the marker is nuclear or mitochondrial and the former should tell us the translation table.
+            # https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
             amino_acid_sequence = sh.translate_sequence(aligned_sequence, config.get('translation_table'))
             result.stop_codons = sh.get_stop_codons(amino_acid_sequence)
             result.seq_length = sh.marker_seqlength(aligned_sequence)