From 8269403f9f43dd83a2134c929178c0207cdb1ad4 Mon Sep 17 00:00:00 2001
From: Aviv Rosenberg <aviv.rosenberg@gmail.com>
Date: Sun, 3 Mar 2024 05:26:43 +0200
Subject: [PATCH 1/3] backbone: handle rare issue with negative bfactors

---
 src/pp5/backbone.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/pp5/backbone.py b/src/pp5/backbone.py
index d3c5ad7..8948fed 100644
--- a/src/pp5/backbone.py
+++ b/src/pp5/backbone.py
@@ -37,7 +37,12 @@ def atom_location_sigma(atom: Atom) -> float:
     :param atom: The atom to calculate the sigma for.
     :return: The sigma in Angstroms.
     """
-    return math.sqrt(atom.get_bfactor() / CONST_8PI2)
+    bfactor = atom.get_bfactor()
+    if bfactor < 0:
+        # Rarely, a PDB file reports a negative B-factor (e.g. 1D9U:B). This is not
+        # physically meaningful and math.sqrt would raise, so report sigma as NaN.
+        return float("nan")
+    return math.sqrt(bfactor / CONST_8PI2)
 
 
 def residue_backbone_atoms(res: Residue) -> Sequence[Atom]:

From 8d7dabe55817137fa62a6c37ecfa635972cbfbce Mon Sep 17 00:00:00 2001
From: Aviv Rosenberg <aviv.rosenberg@gmail.com>
Date: Sun, 3 Mar 2024 05:51:28 +0200
Subject: [PATCH 2/3] contacts: handle rare cases where a residue itself can be
 disordered

---
 src/pp5/contacts.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/pp5/contacts.py b/src/pp5/contacts.py
index 430ea63..faa2693 100644
--- a/src/pp5/contacts.py
+++ b/src/pp5/contacts.py
@@ -16,7 +16,7 @@
 import pandas as pd
 from Bio.PDB import NeighborSearch
 from Bio.PDB.Atom import Atom
-from Bio.PDB.Residue import Residue
+from Bio.PDB.Residue import Residue, DisorderedResidue
 
 import pp5
 from pp5.codons import ACIDS_3TO1, UNKNOWN_AA
@@ -184,6 +184,12 @@ def __init__(
 
     def assign(self, res: Residue) -> Dict[str, Optional[ResidueContacts]]:
 
+        # In rare cases, a residue may be disordered and contain other residues.
+        # This means there's a point mutation and both original and mutated residues
+        # are present in the crystal. We ignore this and just use the selected residue.
+        if isinstance(res, DisorderedResidue):
+            res = res.disordered_get()
+
         # Get all atoms from within the residue, including side chain atoms
         all_atoms = tuple(res.get_atoms())
 

From b26845b08e626ca8b84eef03546a52e6ebb6eba3 Mon Sep 17 00:00:00 2001
From: Aviv Rosenberg <aviv.rosenberg@gmail.com>
Date: Sun, 3 Mar 2024 06:16:39 +0200
Subject: [PATCH 3/3] prec: handle rare sequence alignment issue

---
 src/pp5/prec.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/pp5/prec.py b/src/pp5/prec.py
index b51ad3e..3f42276 100644
--- a/src/pp5/prec.py
+++ b/src/pp5/prec.py
@@ -1217,6 +1217,14 @@ def _find_unp_alignment(self, pdb_aa_seq: str, unp_aa_seq: str) -> Dict[int, int
         aligner = PairwiseAligner(
             substitution_matrix=BLOSUM80, open_gap_score=-10, extend_gap_score=-0.5
         )
+
+        # In rare cases, there could be unknown letters in the sequences. This causes
+        # the alignment to break. Replace with "X" which the aligner can handle.
+        unknown_aas = set(pdb_aa_seq).union(set(unp_aa_seq)) - set(ACIDS_1TO3)
+        for unk_aa in unknown_aas:  # usually there are none
+            unp_aa_seq = unp_aa_seq.replace(unk_aa, UNKNOWN_AA)
+            pdb_aa_seq = pdb_aa_seq.replace(unk_aa, UNKNOWN_AA)
+
         multi_alignments = aligner.align(pdb_aa_seq, unp_aa_seq)
         alignment = sorted(multi_alignments, key=lambda a: a.score)[-1]
         LOGGER.info(f"{self}: PDB to UNP sequence alignment score={alignment.score}")