prec: handle rare sequence alignment issue

vistalab-technion · Mar 3, 2024 · b26845b
1 parent 8d7dabe
commit b26845b
Showing 1 changed file with 8 additions and 0 deletions.
diff --git a/src/pp5/prec.py b/src/pp5/prec.py
@@ -1217,6 +1217,14 @@ def _find_unp_alignment(self, pdb_aa_seq: str, unp_aa_seq: str) -> Dict[int, int
         aligner = PairwiseAligner(
             substitution_matrix=BLOSUM80, open_gap_score=-10, extend_gap_score=-0.5
         )
+
+        # In rare cases, there could be unknown letters in the sequences. This causes
+        # the alignment to break. Replace with "X" which the aligner can handle.
+        unknown_aas = set(pdb_aa_seq).union(set(unp_aa_seq)) - set(ACIDS_1TO3)
+        for unk_aa in unknown_aas:  # usually there are none
+            unp_aa_seq = unp_aa_seq.replace(unk_aa, UNKNOWN_AA)
+            pdb_aa_seq = pdb_aa_seq.replace(unk_aa, UNKNOWN_AA)
+
         multi_alignments = aligner.align(pdb_aa_seq, unp_aa_seq)
         alignment = sorted(multi_alignments, key=lambda a: a.score)[-1]
         LOGGER.info(f"{self}: PDB to UNP sequence alignment score={alignment.score}")