From 7f5c4f56d59662dbc9a041aafd78d539ebe62d78 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 21 May 2024 09:45:20 +0300 Subject: [PATCH 01/13] pairwise_alignment_map: move logic out of prec and into pp5.align --- src/pp5/align.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/src/pp5/align.py b/src/pp5/align.py index a58eb40..284a734 100644 --- a/src/pp5/align.py +++ b/src/pp5/align.py @@ -12,7 +12,7 @@ import warnings import contextlib import subprocess -from typing import Any, Dict, Tuple, Union, Iterable, Optional +from typing import Any, Dict, List, Tuple, Union, Iterable, Optional from pathlib import Path from datetime import datetime, timedelta @@ -21,11 +21,12 @@ from tqdm import tqdm from Bio.Seq import Seq +from pp5.codons import ACIDS_1TO3, UNKNOWN_AA from pp5.external_dbs.pdb import PDB_RCSB with warnings.catch_warnings(): warnings.simplefilter("ignore", BiopythonExperimentalWarning) - from Bio.Align import substitution_matrices + from Bio.Align import substitution_matrices, PairwiseAligner from Bio.AlignIO import MultipleSeqAlignment as MSA from Bio.SeqRecord import SeqRecord @@ -57,6 +58,68 @@ BLOSUM90 = substitution_matrices.load("BLOSUM90") +def pairwise_alignment_map( + src_seq: str, + tgt_seq: str, + open_gap_score: float = -10, + extend_gap_score: float = -0.5, +) -> Tuple[float, Dict[int, int]]: + """ + Aligns between two AA sequences and produces a map from the indices of + the source sequence to the indices of the target sequence. + Uses biopython's PairwiseAligner with BLOSUM80 matrix. + + :param src_seq: Source AA sequence to align. + :param tgt_seq: Target AA sequence to align. + :param open_gap_score: Penalty for opening a gap in the alignment. + :param extend_gap_score: Penalty for extending a gap by one residue. + :return: A tuple with two elements: + - The alignment score, higher is better. + - A dict mapping from an index in the source sequence to the corresponding + index in the target sequence. + """ + aligner = PairwiseAligner( + substitution_matrix=BLOSUM80, + open_gap_score=open_gap_score, + extend_gap_score=extend_gap_score, + ) + + # In rare cases, there could be unknown letters in the sequences. This causes + # the alignment to break. Replace with "X" which the aligner can handle. + unknown_aas = set(src_seq).union(set(tgt_seq)) - set(ACIDS_1TO3) + for unk_aa in unknown_aas: # usually there are none + tgt_seq = tgt_seq.replace(unk_aa, UNKNOWN_AA) + src_seq = src_seq.replace(unk_aa, UNKNOWN_AA) + + multi_alignments = aligner.align(src_seq, tgt_seq) + alignment = sorted(multi_alignments, key=lambda a: a.score)[-1] + score = alignment.score + # LOGGER.info(f"{self}: PDB to UNP sequence alignment score={alignment.score}") + + # Alignment contains two tuples each of length N (for N matching sub-sequences) + # ( + # ((t_start1, t_end1), (t_start2, t_end2), ..., (t_startN, t_endN)), + # ((q_start1, q_end1), (q_start2, q_end2), ..., (q_startN, q_endN)) + # ) + src_to_tgt: List[Tuple[int, int]] = [] + src_subseqs, tgt_subseqs = alignment.aligned + assert len(src_subseqs) == len(tgt_subseqs) + for i in range(len(src_subseqs)): + src_start, src_end = src_subseqs[i] + tgt_start, tgt_end = tgt_subseqs[i] + assert src_end - src_start == tgt_end - tgt_start + + for j in range(src_end - src_start): + if src_seq[src_start + j] != tgt_seq[tgt_start + j]: + # There are mismatches included in the match sequence (cases + # where a similar AA is not considered a complete mismatch). + # We are stricter: require exact match. + continue + src_to_tgt.append((src_start + j, tgt_start + j)) + + return score, dict(src_to_tgt) + + def multiseq_align( seqs: Iterable[SeqRecord] = None, in_file=None, out_file=None, **clustal_kw ) -> MSA: From d891dd4f5454649a437b8103878c04e708efe57b Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 21 May 2024 09:58:01 +0300 Subject: [PATCH 02/13] prec: missing residue detection now based on alignment to canonical PDB sequence --- src/pp5/prec.py | 107 ++++++++++++++---------------------------------- 1 file changed, 31 insertions(+), 76 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 283d21f..ccae6bf 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -33,7 +33,7 @@ from Bio.PDB.Polypeptide import Polypeptide import pp5 -from pp5.align import BLOSUM80 +from pp5.align import BLOSUM80, pairwise_alignment_map from pp5.utils import ProteinInitError, filelock_context from pp5.codons import ( ACIDS_1TO3, @@ -885,49 +885,45 @@ def __init__( assert len(pdb_aa_seq) == len(residues) - # Add un-modelled residues by aligning to UNP: - # In case of non-modelled residues in the structure, they will be missing - # from the pdb_aa_seq and residues list, since they don't appear in - # biopython's structure. We will add them back using alignment to Uniprot. - # Find the alignment between the PDB AA sequence and the Uniprot AA sequence. - pdb_to_unp_idx = self._find_unp_alignment(pdb_aa_seq, self.unp_rec.sequence) - - for curr_pdb_idx in pdb_to_unp_idx.keys(): + # Add un-modelled residues by aligning to canonical PDB sequence (from + # structure metadata). Un-modelled residues will be missing from the pdb_aa_seq + # and residues list, as they don't appear in biopython's structure. + # We will add them back using alignment to the canonical sequence. + pdb_aa_seq_meta = self.pdb_meta.entity_sequence[ + self.pdb_meta.chain_entities[self.pdb_chain_id] + ] + _, pdb_to_meta_idx = pairwise_alignment_map(pdb_aa_seq, pdb_aa_seq_meta) + + # Find gaps in the alignment, and add missing residues within each gap + for curr_pdb_idx in pdb_to_meta_idx.keys(): next_pdb_idx = curr_pdb_idx + 1 - if next_pdb_idx not in pdb_to_unp_idx: + if next_pdb_idx not in pdb_to_meta_idx: # Either this is the last one, or the next one is not in the # alignment (can happen if it's a non-standard AA) continue - curr_unp_idx = pdb_to_unp_idx[curr_pdb_idx] - next_unp_idx = pdb_to_unp_idx[next_pdb_idx] + curr_meta_idx = pdb_to_meta_idx[curr_pdb_idx] + next_meta_idx = pdb_to_meta_idx[next_pdb_idx] - # Detect a gap in the alignment to UNP - gap_len = next_unp_idx - curr_unp_idx - 1 + # Detect a gap in the alignment to the canonical sequence + gap_len = next_meta_idx - curr_meta_idx - 1 if gap_len == 0: continue # Here, we have two adjacent residues in the pdb structure are aligned to - # non-adjacent residues in the unp sequence. It means that there are - # missing residues in the structure, i.e. the pdb sequence we created is - # not complete. + # non-adjacent residues in the canonical sequence. It means that there are + # missing (un-modelled) residues in the structure. curr_residue = residues[curr_pdb_idx] curr_residue_seq_idx = curr_residue.get_id()[1] - for j, gap_unp_idx in enumerate(range(curr_unp_idx + 1, next_unp_idx)): - missing_res_name_single = self.unp_rec.sequence[gap_unp_idx] + for j, gap_meta_idx in enumerate(range(curr_meta_idx + 1, next_meta_idx)): + missing_res_name_single = pdb_aa_seq_meta[gap_meta_idx] missing_res_name = ACIDS_1TO3[missing_res_name_single] - gap_pdb_idx = next_pdb_idx + j gap_pdb_seq_idx = curr_residue_seq_idx + j + 1 - - residues.insert( # inserts BEFORE the given index - gap_pdb_idx, - Residue( - (" ", gap_pdb_seq_idx, ICODE_MISSING_RESIDUE), - missing_res_name, - 0, - ), + missing_residue = Residue( + (" ", gap_pdb_seq_idx, ICODE_MISSING_RESIDUE), missing_res_name, 0 ) + residues.insert(gap_pdb_idx, missing_residue) # inserts BEFORE index # Update PDB sequence to include any missing residues we added pdb_aa_seq = str.join( @@ -965,6 +961,12 @@ def __init__( pdb_dict=self._pdb_dict, ) + # Align PDB sequence to UNP + unp_alignment_score, pdb_to_unp_idx = pairwise_alignment_map( + pdb_aa_seq, self.unp_rec.sequence + ) + LOGGER.info(f"{self}: PDB to UNP alignment score={unp_alignment_score}") + # Create a ResidueRecord holding all data we need per residue residue_recs = [] for i in range(n_residues): @@ -978,7 +980,7 @@ def __init__( r_prev: Optional[Residue] = residues[i - 1] if i > 0 else None r_next: Optional[Residue] = residues[i + 1] if i < n_residues - 1 else None - # Alignment to UNP + # Get corresponding UNP index unp_idx: Optional[int] = pdb_to_unp_idx.get(i, None) # Secondary structure annotation @@ -1241,53 +1243,6 @@ def save(self, out_dir=pp5.data_subdir("prec"), tag=None): LOGGER.info(f"Wrote {self} to {filepath}") return filepath - def _find_unp_alignment(self, pdb_aa_seq: str, unp_aa_seq: str) -> Dict[int, int]: - """ - Aligns between this prec's AA sequence (based on the PDB structure) and the - Uniprot sequence. - :param pdb_aa_seq: AA sequence from PDB to align. - :param unp_aa_seq: AA sequence from UNP to align. - :return: A dict mapping from an index in the PDB sequence to the - corresponding index in the UNP sequence. - """ - aligner = PairwiseAligner( - substitution_matrix=BLOSUM80, open_gap_score=-10, extend_gap_score=-0.5 - ) - - # In rare cases, there could be unknown letters in the sequences. This causes - # the alignment to break. Replace with "X" which the aligner can handle. - unknown_aas = set(pdb_aa_seq).union(set(unp_aa_seq)) - set(ACIDS_1TO3) - for unk_aa in unknown_aas: # usually there are none - unp_aa_seq = unp_aa_seq.replace(unk_aa, UNKNOWN_AA) - pdb_aa_seq = pdb_aa_seq.replace(unk_aa, UNKNOWN_AA) - - multi_alignments = aligner.align(pdb_aa_seq, unp_aa_seq) - alignment = sorted(multi_alignments, key=lambda a: a.score)[-1] - LOGGER.info(f"{self}: PDB to UNP sequence alignment score={alignment.score}") - - # Alignment contains two tuples each of length N (for N matching sub-sequences) - # ( - # ((t_start1, t_end1), (t_start2, t_end2), ..., (t_startN, t_endN)), - # ((q_start1, q_end1), (q_start2, q_end2), ..., (q_startN, q_endN)) - # ) - pdb_to_unp: List[Tuple[int, int]] = [] - pdb_subseqs, unp_subseqs = alignment.aligned - assert len(pdb_subseqs) == len(unp_subseqs) - for i in range(len(pdb_subseqs)): - pdb_start, pdb_end = pdb_subseqs[i] - unp_start, unp_end = unp_subseqs[i] - assert pdb_end - pdb_start == unp_end - unp_start - - for j in range(pdb_end - pdb_start): - if pdb_aa_seq[pdb_start + j] != unp_aa_seq[unp_start + j]: - # There are mismatches included in the match sequence (cases - # where a similar AA is not considered a complete mismatch). - # We are more strict: require exact match. - continue - pdb_to_unp.append((pdb_start + j, unp_start + j)) - - return dict(pdb_to_unp) - def _find_dna_alignment( self, pdb_aa_seq: str, max_ena: int ) -> Tuple[Optional[str], Optional[str], Dict[int, Dict[str, int]]]: From 626bc2ff20a99212c2bf702226ae9db178316c20 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Thu, 23 May 2024 15:08:48 +0300 Subject: [PATCH 03/13] WIP --- src/pp5/prec.py | 151 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 100 insertions(+), 51 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index ccae6bf..3634770 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -868,11 +868,11 @@ def __init__( dihedral_est_name, dihedral_est_args ) - pdb_aa_seq, residues = "", [] - # Extract the residues from the PDB structure + # Extract the residues from the PDB structure: these are the modelled residues curr_res: Residue - chain: Chain = self.pdb_rec[0][self.pdb_auth_chain_id] - for curr_res in chain.get_residues(): + modelled_residues: List[Residue] = [] + struct_chain: Chain = self.pdb_rec[0][self.pdb_auth_chain_id] + for curr_res in struct_chain.get_residues(): # Skip water molecules if curr_res.resname == "HOH": continue @@ -880,57 +880,104 @@ def __init__( # For non-standard AAs, we place an 'X' in the sequence, but we still # keep these residues. res_name_single = ACIDS_3TO1.get(curr_res.get_resname(), UNKNOWN_AA) - pdb_aa_seq += res_name_single - residues.append(curr_res) + modelled_residues.append(curr_res) - assert len(pdb_aa_seq) == len(residues) + # Sequence of modelled residues from the PDB structure + pdb_modelled_aa_seq: str = str.join( + "", + [ACIDS_3TO1.get(r.get_resname(), UNKNOWN_AA) for r in modelled_residues], + ) + assert len(pdb_modelled_aa_seq) == len(modelled_residues) + + # The canonical AA sequence from the structure metadata + pdb_meta_aa_seq = self.pdb_meta.entity_sequence[ + self.pdb_meta.chain_entities[self.pdb_chain_id] + ] # Add un-modelled residues by aligning to canonical PDB sequence (from # structure metadata). Un-modelled residues will be missing from the pdb_aa_seq # and residues list, as they don't appear in biopython's structure. # We will add them back using alignment to the canonical sequence. - pdb_aa_seq_meta = self.pdb_meta.entity_sequence[ - self.pdb_meta.chain_entities[self.pdb_chain_id] - ] - _, pdb_to_meta_idx = pairwise_alignment_map(pdb_aa_seq, pdb_aa_seq_meta) - - # Find gaps in the alignment, and add missing residues within each gap - for curr_pdb_idx in pdb_to_meta_idx.keys(): - next_pdb_idx = curr_pdb_idx + 1 - if next_pdb_idx not in pdb_to_meta_idx: - # Either this is the last one, or the next one is not in the - # alignment (can happen if it's a non-standard AA) - continue + # _, pdb_to_meta_idx = pairwise_alignment_map(pdb_aa_seq, pdb_aa_seq_meta) + # + # # Find gaps in the alignment, and add missing residues within each gap + # for curr_pdb_idx in pdb_to_meta_idx.keys(): + # next_pdb_idx = curr_pdb_idx + 1 + # if next_pdb_idx not in pdb_to_meta_idx: + # # Either this is the last one, or the next one is not in the + # # alignment (can happen if it's a non-standard AA) + # continue + # + # curr_meta_idx = pdb_to_meta_idx[curr_pdb_idx] + # next_meta_idx = pdb_to_meta_idx[next_pdb_idx] + # + # # Detect a gap in the alignment to the canonical sequence + # gap_len = next_meta_idx - curr_meta_idx - 1 + # if gap_len == 0: + # continue + # + # # Here, we have two adjacent residues in the pdb structure are aligned to + # # non-adjacent residues in the canonical sequence. It means that there are + # # missing (un-modelled) residues in the structure. + # curr_residue = residues[curr_pdb_idx] + # curr_residue_seq_idx = curr_residue.get_id()[1] + # for j, gap_meta_idx in enumerate(range(curr_meta_idx + 1, next_meta_idx)): + # missing_res_name_single = pdb_aa_seq_meta[gap_meta_idx] + # missing_res_name = ACIDS_1TO3[missing_res_name_single] + # gap_pdb_idx = next_pdb_idx + j + # gap_pdb_seq_idx = curr_residue_seq_idx + j + 1 + # missing_residue = Residue( + # (" ", gap_pdb_seq_idx, ICODE_MISSING_RESIDUE), missing_res_name, 0 + # ) + # residues.insert(gap_pdb_idx, missing_residue) # inserts BEFORE index + + # Add unmodelled residues: new approach + _, meta_to_struct_idx = pairwise_alignment_map( + pdb_meta_aa_seq, pdb_modelled_aa_seq + ) + matching_residues: List[Residue] = [] # residues in modelled and in meta + missing_residues: List[Residue] = [] # residues only in meta + for curr_meta_seq_idx in range(len(pdb_meta_aa_seq)): + + if curr_meta_seq_idx in meta_to_struct_idx: + # This residue is one of the modelled residues in the structure + modelled_seq_idx = meta_to_struct_idx[curr_meta_seq_idx] + curr_residue = modelled_residues[modelled_seq_idx] + else: + # This residue is not modelled (missing from the structure), need to add + missing_res_name_single = pdb_meta_aa_seq[curr_meta_seq_idx] + missing_res_name = ACIDS_1TO3[missing_res_name_single] + curr_residue = Residue( + (" ", curr_meta_seq_idx, ICODE_MISSING_RESIDUE), missing_res_name, 0 + ) + missing_residues.append(curr_residue) + matching_residues.append(curr_residue) - curr_meta_idx = pdb_to_meta_idx[curr_pdb_idx] - next_meta_idx = pdb_to_meta_idx[next_pdb_idx] + # Sanity check + matching_aa_seq = str.join( + "", [ACIDS_3TO1[r.get_resname()] for r in matching_residues] + ) + assert pdb_meta_aa_seq == matching_aa_seq - # Detect a gap in the alignment to the canonical sequence - gap_len = next_meta_idx - curr_meta_idx - 1 - if gap_len == 0: - continue + # Detect any residues that are modelled but not in the canonical sequence, + # these would usually be ligands and non-standard AAs. + matching_modelled_residues = set(matching_residues) - set(missing_residues) + extra_modelled_residues = set(modelled_residues) - matching_modelled_residues - # Here, we have two adjacent residues in the pdb structure are aligned to - # non-adjacent residues in the canonical sequence. It means that there are - # missing (un-modelled) residues in the structure. - curr_residue = residues[curr_pdb_idx] - curr_residue_seq_idx = curr_residue.get_id()[1] - for j, gap_meta_idx in enumerate(range(curr_meta_idx + 1, next_meta_idx)): - missing_res_name_single = pdb_aa_seq_meta[gap_meta_idx] - missing_res_name = ACIDS_1TO3[missing_res_name_single] - gap_pdb_idx = next_pdb_idx + j - gap_pdb_seq_idx = curr_residue_seq_idx + j + 1 - missing_residue = Residue( - (" ", gap_pdb_seq_idx, ICODE_MISSING_RESIDUE), missing_res_name, 0 - ) - residues.insert(gap_pdb_idx, missing_residue) # inserts BEFORE index + # Sort all residues by their sequence index, first matching, then others + all_residues = sorted( + [*matching_residues, *extra_modelled_residues], key=lambda r: r.get_id()[1] + ) + all_aa_seq = str.join( + "", [ACIDS_3TO1.get(r.get_resname(), UNKNOWN_AA) for r in all_residues] + ) # Update PDB sequence to include any missing residues we added - pdb_aa_seq = str.join( - "", [ACIDS_3TO1.get(r.resname, UNKNOWN_AA) for r in residues] - ) - assert len(pdb_aa_seq) == len(residues) - n_residues = len(pdb_aa_seq) + # pdb_modelled_aa_seq = str.join( + # "", [ACIDS_3TO1.get(r.resname, UNKNOWN_AA) for r in modelled_residues] + # ) + # assert len(pdb_modelled_aa_seq) == len(modelled_residues) + n_residues = len(all_residues) # Find the best matching DNA for our AA sequence via pairwise alignment # between the PDB AA sequence and translated DNA sequences. @@ -939,7 +986,7 @@ def __init__( idx_to_codons = {} if with_codons: self.ena_id, self._dna_seq, idx_to_codons = self._find_dna_alignment( - pdb_aa_seq, max_ena + all_aa_seq, max_ena ) # Calculate contacts if requested @@ -963,22 +1010,24 @@ def __init__( # Align PDB sequence to UNP unp_alignment_score, pdb_to_unp_idx = pairwise_alignment_map( - pdb_aa_seq, self.unp_rec.sequence + all_aa_seq, self.unp_rec.sequence ) LOGGER.info(f"{self}: PDB to UNP alignment score={unp_alignment_score}") # Create a ResidueRecord holding all data we need per residue residue_recs = [] for i in range(n_residues): - r_curr: Residue = residues[i] + r_curr: Residue = all_residues[i] relative_location = (i + 1) / n_residues # Sanity check - assert pdb_aa_seq[i] == ACIDS_3TO1.get(r_curr.get_resname(), UNKNOWN_AA) + assert all_aa_seq[i] == ACIDS_3TO1.get(r_curr.get_resname(), UNKNOWN_AA) # Get the residues before and after this one - r_prev: Optional[Residue] = residues[i - 1] if i > 0 else None - r_next: Optional[Residue] = residues[i + 1] if i < n_residues - 1 else None + r_prev: Optional[Residue] = all_residues[i - 1] if i > 0 else None + r_next: Optional[Residue] = ( + all_residues[i + 1] if i < n_residues - 1 else None + ) # Get corresponding UNP index unp_idx: Optional[int] = pdb_to_unp_idx.get(i, None) @@ -1009,7 +1058,7 @@ def __init__( ) residue_recs.append(rr) - self._protein_seq = pdb_aa_seq + self._protein_seq = all_aa_seq self._residue_recs: Dict[str, ResidueRecord] = { rr.res_id: rr for rr in residue_recs } From af101439dc9ad68a9d844370a7e110ae5290f4b2 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 26 May 2024 00:14:06 +0300 Subject: [PATCH 04/13] prec: clean up missing residue detection --- src/pp5/prec.py | 65 ++++++------------------------------------------- 1 file changed, 8 insertions(+), 57 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 3634770..cc49a53 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -868,19 +868,12 @@ def __init__( dihedral_est_name, dihedral_est_args ) - # Extract the residues from the PDB structure: these are the modelled residues - curr_res: Residue - modelled_residues: List[Residue] = [] + # Extract the residues from the PDB structure: these are the modelled + # residues. We ignore water molecules. struct_chain: Chain = self.pdb_rec[0][self.pdb_auth_chain_id] - for curr_res in struct_chain.get_residues(): - # Skip water molecules - if curr_res.resname == "HOH": - continue - - # For non-standard AAs, we place an 'X' in the sequence, but we still - # keep these residues. - res_name_single = ACIDS_3TO1.get(curr_res.get_resname(), UNKNOWN_AA) - modelled_residues.append(curr_res) + modelled_residues: List[Residue] = [ + res for res in struct_chain.get_residues() if res.resname != "HOH" + ] # Sequence of modelled residues from the PDB structure pdb_modelled_aa_seq: str = str.join( @@ -894,48 +887,11 @@ def __init__( self.pdb_meta.chain_entities[self.pdb_chain_id] ] - # Add un-modelled residues by aligning to canonical PDB sequence (from - # structure metadata). Un-modelled residues will be missing from the pdb_aa_seq - # and residues list, as they don't appear in biopython's structure. - # We will add them back using alignment to the canonical sequence. - # _, pdb_to_meta_idx = pairwise_alignment_map(pdb_aa_seq, pdb_aa_seq_meta) - # - # # Find gaps in the alignment, and add missing residues within each gap - # for curr_pdb_idx in pdb_to_meta_idx.keys(): - # next_pdb_idx = curr_pdb_idx + 1 - # if next_pdb_idx not in pdb_to_meta_idx: - # # Either this is the last one, or the next one is not in the - # # alignment (can happen if it's a non-standard AA) - # continue - # - # curr_meta_idx = pdb_to_meta_idx[curr_pdb_idx] - # next_meta_idx = pdb_to_meta_idx[next_pdb_idx] - # - # # Detect a gap in the alignment to the canonical sequence - # gap_len = next_meta_idx - curr_meta_idx - 1 - # if gap_len == 0: - # continue - # - # # Here, we have two adjacent residues in the pdb structure are aligned to - # # non-adjacent residues in the canonical sequence. It means that there are - # # missing (un-modelled) residues in the structure. - # curr_residue = residues[curr_pdb_idx] - # curr_residue_seq_idx = curr_residue.get_id()[1] - # for j, gap_meta_idx in enumerate(range(curr_meta_idx + 1, next_meta_idx)): - # missing_res_name_single = pdb_aa_seq_meta[gap_meta_idx] - # missing_res_name = ACIDS_1TO3[missing_res_name_single] - # gap_pdb_idx = next_pdb_idx + j - # gap_pdb_seq_idx = curr_residue_seq_idx + j + 1 - # missing_residue = Residue( - # (" ", gap_pdb_seq_idx, ICODE_MISSING_RESIDUE), missing_res_name, 0 - # ) - # residues.insert(gap_pdb_idx, missing_residue) # inserts BEFORE index - - # Add unmodelled residues: new approach + # Add un-modelled residues by aligning to the canonical PDB sequence. _, meta_to_struct_idx = pairwise_alignment_map( pdb_meta_aa_seq, pdb_modelled_aa_seq ) - matching_residues: List[Residue] = [] # residues in modelled and in meta + matching_residues: List[Residue] = [] # residues both in modelled and in meta missing_residues: List[Residue] = [] # residues only in meta for curr_meta_seq_idx in range(len(pdb_meta_aa_seq)): @@ -951,6 +907,7 @@ def __init__( (" ", curr_meta_seq_idx, ICODE_MISSING_RESIDUE), missing_res_name, 0 ) missing_residues.append(curr_residue) + matching_residues.append(curr_residue) # Sanity check @@ -971,12 +928,6 @@ def __init__( all_aa_seq = str.join( "", [ACIDS_3TO1.get(r.get_resname(), UNKNOWN_AA) for r in all_residues] ) - - # Update PDB sequence to include any missing residues we added - # pdb_modelled_aa_seq = str.join( - # "", [ACIDS_3TO1.get(r.resname, UNKNOWN_AA) for r in modelled_residues] - # ) - # assert len(pdb_modelled_aa_seq) == len(modelled_residues) n_residues = len(all_residues) # Find the best matching DNA for our AA sequence via pairwise alignment From bec4f39d712e431b993024c32de9195dd4e07525 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 26 May 2024 00:36:11 +0300 Subject: [PATCH 05/13] prec: update unmodeled residue icode --- src/pp5/prec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index cc49a53..7a15517 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -88,8 +88,8 @@ backbone_only=True, unit_cell=None, isotropic=True, scale_as_bfactor=True ) -# Special insertion code to mark a residue that was missing from the structure. -ICODE_MISSING_RESIDUE = "M" +# Special insertion code to mark a residue that is unmodeled the structure. +ICODE_UNMODELED_RES = "UNMODELED" class ResidueRecord(object): @@ -904,7 +904,7 @@ def __init__( missing_res_name_single = pdb_meta_aa_seq[curr_meta_seq_idx] missing_res_name = ACIDS_1TO3[missing_res_name_single] curr_residue = Residue( - (" ", curr_meta_seq_idx, ICODE_MISSING_RESIDUE), missing_res_name, 0 + (" ", curr_meta_seq_idx, ICODE_UNMODELED_RES), missing_res_name, 0 ) missing_residues.append(curr_residue) From 9d9763a60a01a6dd1d33b4b441dafea7a049eef6 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 26 May 2024 19:58:02 +0300 Subject: [PATCH 06/13] prec: fix wrong assignment of indices for unmodeled residues --- src/pp5/align.py | 10 ++++------ src/pp5/prec.py | 40 ++++++++++++++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/src/pp5/align.py b/src/pp5/align.py index 284a734..d27e649 100644 --- a/src/pp5/align.py +++ b/src/pp5/align.py @@ -26,7 +26,7 @@ with warnings.catch_warnings(): warnings.simplefilter("ignore", BiopythonExperimentalWarning) - from Bio.Align import substitution_matrices, PairwiseAligner + from Bio.Align import substitution_matrices, PairwiseAligner, Alignment from Bio.AlignIO import MultipleSeqAlignment as MSA from Bio.SeqRecord import SeqRecord @@ -63,7 +63,7 @@ def pairwise_alignment_map( tgt_seq: str, open_gap_score: float = -10, extend_gap_score: float = -0.5, -) -> Tuple[float, Dict[int, int]]: +) -> Tuple[Alignment, Dict[int, int]]: """ Aligns between two AA sequences and produces a map from the indices of the source sequence to the indices of the target sequence. @@ -74,7 +74,7 @@ def pairwise_alignment_map( :param open_gap_score: Penalty for opening a gap in the alignment. :param extend_gap_score: Penalty for extending a gap by one residue. :return: A tuple with two elements: - - The alignment score, higher is better. + - The alignment object produced by the aligner. - A dict mapping from an index in the source sequence to the corresponding index in the target sequence. """ @@ -93,8 +93,6 @@ def pairwise_alignment_map( multi_alignments = aligner.align(src_seq, tgt_seq) alignment = sorted(multi_alignments, key=lambda a: a.score)[-1] - score = alignment.score - # LOGGER.info(f"{self}: PDB to UNP sequence alignment score={alignment.score}") # Alignment contains two tuples each of length N (for N matching sub-sequences) # ( @@ -117,7 +115,7 @@ def pairwise_alignment_map( continue src_to_tgt.append((src_start + j, tgt_start + j)) - return score, dict(src_to_tgt) + return alignment, dict(src_to_tgt) def multiseq_align( diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 7a15517..9ff2870 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -888,9 +888,13 @@ def __init__( ] # Add un-modelled residues by aligning to the canonical PDB sequence. - _, meta_to_struct_idx = pairwise_alignment_map( + meta_to_struct_seq_alignment, meta_to_struct_idx = pairwise_alignment_map( pdb_meta_aa_seq, pdb_modelled_aa_seq ) + LOGGER.info( + f"{self}: Canonical to structure seq alignment:\n" + f"{str(meta_to_struct_seq_alignment).strip()}" + ) matching_residues: List[Residue] = [] # residues both in modelled and in meta missing_residues: List[Residue] = [] # residues only in meta for curr_meta_seq_idx in range(len(pdb_meta_aa_seq)): @@ -903,8 +907,33 @@ def __init__( # This residue is not modelled (missing from the structure), need to add missing_res_name_single = pdb_meta_aa_seq[curr_meta_seq_idx] missing_res_name = ACIDS_1TO3[missing_res_name_single] + + # We need to determine the residue sequence index for the missing + # residue. It needs to be consistent with the sequence index of the + # modelled residues. + if len(matching_residues) > 0: + # We have previous matching residues, so we can infer the index + # based on the last matching residue. + missing_idx = matching_residues[-1].get_id()[1] + 1 + else: + # We have no matching residues yet. We need to determine how far + # we are from the first one, take its index and subtract the + # distance. + first_matching_meta_idx = next(iter(meta_to_struct_idx)) + dist_to_first_matching = first_matching_meta_idx - curr_meta_seq_idx + assert dist_to_first_matching >= 0 + first_matching_struct_idx = meta_to_struct_idx[ + first_matching_meta_idx + ] + first_matching_modelled_res = modelled_residues[ + first_matching_struct_idx + ] + missing_idx = ( + first_matching_modelled_res.get_id()[1] - dist_to_first_matching + ) + curr_residue = Residue( - (" ", curr_meta_seq_idx, ICODE_UNMODELED_RES), missing_res_name, 0 + (" ", missing_idx, ICODE_UNMODELED_RES), missing_res_name, 0 ) missing_residues.append(curr_residue) @@ -960,10 +989,10 @@ def __init__( ) # Align PDB sequence to UNP - unp_alignment_score, pdb_to_unp_idx = pairwise_alignment_map( + unp_alignment, pdb_to_unp_idx = pairwise_alignment_map( all_aa_seq, self.unp_rec.sequence ) - LOGGER.info(f"{self}: PDB to UNP alignment score={unp_alignment_score}") + LOGGER.info(f"{self}: PDB to UNP alignment score={unp_alignment.score}") # Create a ResidueRecord holding all data we need per residue residue_recs = [] @@ -1306,8 +1335,7 @@ def _find_dna_alignment( f"{self}: Translated DNA to PDB alignment " f"(norm_score=" f"{best_alignments.score / len(pdb_aa_seq):.2f}, " - f"num={len(best_alignments)})\n" - f"{str(best_alignment).strip()}" + f"num={len(best_alignments)})" ) # Map each AA to a dict of (codon->count) From 8bd4d6a60057fdeba4d535fdddf223846f39c571 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 2 Jun 2024 10:07:19 +0300 Subject: [PATCH 07/13] prec: assign separate index for unmodelled residues For unmodelled residues, we use the seq_idx from the previous residue, but assign a new index which is part of the insertion code. This prevents various issues which arise due to the inconsistency of the sequence index. --- src/pp5/prec.py | 73 ++++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 9ff2870..8a64d4e 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -89,7 +89,7 @@ ) # Special insertion code to mark a residue that is unmodeled the structure. -ICODE_UNMODELED_RES = "UNMODELED" +ICODE_UNMODELED_RES = "U" class ResidueRecord(object): @@ -892,7 +892,7 @@ def __init__( pdb_meta_aa_seq, pdb_modelled_aa_seq ) LOGGER.info( - f"{self}: Canonical to structure seq alignment:\n" + f"{self}: Canonical (target) to structure (query) sequence alignment:\n" f"{str(meta_to_struct_seq_alignment).strip()}" ) matching_residues: List[Residue] = [] # residues both in modelled and in meta @@ -905,36 +905,41 @@ def __init__( curr_residue = modelled_residues[modelled_seq_idx] else: # This residue is not modelled (missing from the structure), need to add - missing_res_name_single = pdb_meta_aa_seq[curr_meta_seq_idx] - missing_res_name = ACIDS_1TO3[missing_res_name_single] + unmodelled_res_name_single = pdb_meta_aa_seq[curr_meta_seq_idx] + unmodelled_res_name = ACIDS_1TO3[unmodelled_res_name_single] + unmodelled_count = 0 # number of consecutive unmodelled residues # We need to determine the residue sequence index for the missing # residue. It needs to be consistent with the sequence index of the # modelled residues. if len(matching_residues) > 0: # We have previous matching residues, so we can infer the index - # based on the last matching residue. - missing_idx = matching_residues[-1].get_id()[1] + 1 + # based on the last matching residue (LMR). + _, lmr_seq_idx, lmr_icode = matching_residues[-1].get_id() + + # One or more unmodelled residues will be inserted after the + # previous matching residue using the same sequence index. We use + # the icode to disambiguate them. Each consecutive unmodelled + # residue will have an icode of the form U_i, where i is a counter. + unmodelled_seq_idx = lmr_seq_idx + if lmr_icode.strip().startswith(ICODE_UNMODELED_RES): + unmodelled_count = int(lmr_icode.split("_")[1]) + 1 else: - # We have no matching residues yet. We need to determine how far - # we are from the first one, take its index and subtract the - # distance. - first_matching_meta_idx = next(iter(meta_to_struct_idx)) - dist_to_first_matching = first_matching_meta_idx - curr_meta_seq_idx - assert dist_to_first_matching >= 0 - first_matching_struct_idx = meta_to_struct_idx[ - first_matching_meta_idx - ] - first_matching_modelled_res = modelled_residues[ - first_matching_struct_idx - ] - missing_idx = ( - first_matching_modelled_res.get_id()[1] - dist_to_first_matching - ) - - curr_residue = Residue( - (" ", missing_idx, ICODE_UNMODELED_RES), missing_res_name, 0 - ) + # We have no matching residues yet. We'll insert the unmodelled + # residue(s) before the first matching residue (FMR), using a + # smaller sequence index. + fmr_meta_idx: int = next(iter(meta_to_struct_idx)) + fmr_struct_idx: int = meta_to_struct_idx[fmr_meta_idx] + fmr_modelled: Residue = modelled_residues[fmr_struct_idx] + _, fmr_seq_idx, fmr_icode = fmr_modelled.get_id() + unmodelled_seq_idx = fmr_seq_idx - 1 + # Sanity check: the first unmodelled residue should be inserted + # before a modelled one. We should only be in this 'else' once. + assert not fmr_icode.startswith(ICODE_UNMODELED_RES) + + unmodelled_res_icode = f"{ICODE_UNMODELED_RES}_{unmodelled_count:04d}" + missing_residue_id = (" ", unmodelled_seq_idx, unmodelled_res_icode) + curr_residue = Residue(missing_residue_id, unmodelled_res_name, 0) missing_residues.append(curr_residue) matching_residues.append(curr_residue) @@ -950,14 +955,26 @@ def __init__( matching_modelled_residues = set(matching_residues) - set(missing_residues) extra_modelled_residues = set(modelled_residues) - matching_modelled_residues - # Sort all residues by their sequence index, first matching, then others + # Sort all residues by their sequence index, then icode. Include the extra + # residues at the end. + def _residue_sort_key(r: Residue) -> Tuple[int, str, str]: + _het_flag, _seq_idx, _icode = r.get_id() + # Ensure everything has the expected type + _het_flag = str(_het_flag or "").strip() + _icode = str(_icode or "").strip() + _seq_idx = int(_seq_idx) + return _seq_idx, _icode, _het_flag + all_residues = sorted( - [*matching_residues, *extra_modelled_residues], key=lambda r: r.get_id()[1] + [*matching_residues, *extra_modelled_residues], key=_residue_sort_key ) + n_residues = len(all_residues) + + # Obtain single-letter AA sequence with all residues (including unmodelled), + # and with non-standard AAs or ligands represented by 'X'. all_aa_seq = str.join( "", [ACIDS_3TO1.get(r.get_resname(), UNKNOWN_AA) for r in all_residues] ) - n_residues = len(all_residues) # Find the best matching DNA for our AA sequence via pairwise alignment # between the PDB AA sequence and translated DNA sequences. From 402db933efadbe589c28f32640ba3bc40ef484df Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 2 Jun 2024 10:25:46 +0300 Subject: [PATCH 08/13] prec: Refactor AA sequence property --- notebooks/pp5_demo.ipynb | 4 ++-- src/pp5/prec.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/notebooks/pp5_demo.ipynb b/notebooks/pp5_demo.ipynb index c555bce..73fa51d 100644 --- a/notebooks/pp5_demo.ipynb +++ b/notebooks/pp5_demo.ipynb @@ -186,8 +186,8 @@ ], "source": [ "# Sequence of AAs as a SeqRecord\n", - "seq_rec = prec.protein_seq\n", - "print(f\"{seq_rec.seq!s}, len={len(seq_rec)}\")" + "aa_seq = prec.aa_seq\n", + "print(f\"{aa_seq}, len={len(aa_seq)}\")" ] }, { diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 8a64d4e..944b98a 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -1055,7 +1055,7 @@ def _residue_sort_key(r: Residue) -> Tuple[int, str, str]: ) residue_recs.append(rr) - self._protein_seq = all_aa_seq + self._aa_seq = all_aa_seq self._residue_recs: Dict[str, ResidueRecord] = { rr.res_id: rr for rr in residue_recs } @@ -1103,15 +1103,15 @@ def dna_seq(self) -> Optional[SeqRecord]: return SeqRecord(Seq(self._dna_seq), self.ena_id, "", "") @property - def protein_seq(self) -> SeqRecord: + def aa_seq(self) -> str: """ - :return: Protein sequence as 1-letter AA names. Based on the - residues found in the associated PDB structure. - Note that the sequence might contain the letter 'X' denoting an - unknown AA. This happens if the PDB entry contains non-standard AAs - and we chose to ignore such AAs. + :return: Protein sequence as 1-letter AA names. + Based on the residues found in the associated PDB structure, including those + which are not modelled. + Note that the sequence might contain the letter 'X' denoting a non-standard + AA or a ligand. """ - return SeqRecord(Seq(self._protein_seq), self.pdb_id, "", "") + return self._aa_seq @property def seq_gaps(self) -> Sequence[Tuple[str, str]]: From e85cdfe43b38d21f4637de72edb61454a59a1dba Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 2 Jun 2024 10:26:06 +0300 Subject: [PATCH 09/13] prec: Add test for structures with unmodelled residues --- tests/test_prec.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test_prec.py b/tests/test_prec.py index a0a75b0..cf186c4 100644 --- a/tests/test_prec.py +++ b/tests/test_prec.py @@ -16,7 +16,7 @@ Arpeggio, ) from pp5.external_dbs import unp -from pp5.external_dbs.pdb import PDB_DOWNLOAD_SOURCES +from pp5.external_dbs.pdb import PDB_DOWNLOAD_SOURCES, split_id @pytest.fixture( @@ -148,6 +148,19 @@ def test_contacts(self, pdb_id, with_altlocs, contact_method): assert res_id in prec assert res_contacts.contact_smax > 0 + @pytest.mark.parametrize("pdb_id", ["1914:A", "2B3P:A", "4M2Q:A", "3P8D:A"]) + def test_unmodelled_residues(self, pdb_id): + _, chain_id = split_id(pdb_id) + prec = ProteinRecord.from_pdb(pdb_id) + + pdb_meta = prec.pdb_meta + canonical_seq = pdb_meta.entity_sequence[pdb_meta.chain_entities[chain_id]] + + # The sequence in our prec structure should be identical to the canonical + # sequence in the PDB file, except where there are ligands. + aa_seq_no_ligands = prec.aa_seq.replace(UNKNOWN_AA, "") + assert aa_seq_no_ligands == canonical_seq + class TestFromUnp: def test_from_unp_default_selector(self): From f2b03994afe95b5154bb19b43a25c8b1fb2144d1 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 2 Jun 2024 10:56:47 +0300 Subject: [PATCH 10/13] prec: Improve test for structures with unmodelled residues --- tests/test_prec.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_prec.py b/tests/test_prec.py index cf186c4..c9ae004 100644 --- a/tests/test_prec.py +++ b/tests/test_prec.py @@ -161,6 +161,14 @@ def test_unmodelled_residues(self, pdb_id): aa_seq_no_ligands = prec.aa_seq.replace(UNKNOWN_AA, "") assert aa_seq_no_ligands == canonical_seq + # Make sure UNP alignment is correct and includes both modelled and + # unmodelled residues + unp_seq = prec.unp_rec.sequence + assert not all(r.unp_idx is None for r in prec) + assert all( + [unp_seq[r.unp_idx] == r.name for r in prec if r.unp_idx is not None] + ) + class TestFromUnp: def test_from_unp_default_selector(self): From 9e06d37ff02edabbd0e8b85b2ee511dff7fadabe Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 2 Jun 2024 19:15:35 +0300 Subject: [PATCH 11/13] pairwise_alignment_map: Update to choose alignment with smallest number of gaps --- src/pp5/align.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/pp5/align.py b/src/pp5/align.py index d27e649..4ea659d 100644 --- a/src/pp5/align.py +++ b/src/pp5/align.py @@ -68,6 +68,8 @@ def pairwise_alignment_map( Aligns between two AA sequences and produces a map from the indices of the source sequence to the indices of the target sequence. Uses biopython's PairwiseAligner with BLOSUM80 matrix. + In case there is more than one alignment option, the one with the highest score + and smallest number of gaps will be chosen. :param src_seq: Source AA sequence to align. :param tgt_seq: Target AA sequence to align. @@ -91,8 +93,15 @@ def pairwise_alignment_map( tgt_seq = tgt_seq.replace(unk_aa, UNKNOWN_AA) src_seq = src_seq.replace(unk_aa, UNKNOWN_AA) + # The aligner returns multiple alignment options multi_alignments = aligner.align(src_seq, tgt_seq) - alignment = sorted(multi_alignments, key=lambda a: a.score)[-1] + + # Choose alignment with maximal score and minimum number of gaps + def _align_sort_key(a: Alignment) -> Tuple[int, int]: + _n_gaps = a.coordinates.shape[1] + return -a.score, _n_gaps + + alignment = sorted(multi_alignments, key=_align_sort_key)[0] # Alignment contains two tuples each of length N (for N matching sub-sequences) # ( From d31b0acde7d72bf0e5373022aede1aa00e55b63b Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Thu, 13 Jun 2024 23:03:47 +0300 Subject: [PATCH 12/13] prec: count number of unmodelled residues --- src/pp5/prec.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 944b98a..23a5a81 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -1184,6 +1184,40 @@ def num_altlocs(self) -> int: """ return sum(r.num_altlocs > 1 for r in self._residue_recs.values()) + @property + def num_unmodelled(self) -> Tuple[int, int, int]: + """ + Counts number of unmodelled residues in this chain. + + :return: A tuple of three integers: + - Number of unmodelled residues before the first modelled residue (N-terminus). + - Number of unmodelled residues between modelled residues. + - Number of unmodelled residues after the last modelled residue (C-terminus). + """ + count_pre, count_mid, count_post = 0, 0, 0 + modelled_idx = [ + i + for i, res in enumerate(self) + if not res.res_icode.startswith(ICODE_UNMODELED_RES) + and res.name in ACIDS_1TO3 + ] + + if len(modelled_idx) > 1: + first_modelled_idx, *_, last_modelled_idx = modelled_idx + count_pre, count_mid, count_post = 0, 0, 0 + + res: ResidueRecord + for i, res in enumerate(self): + if res.res_icode.startswith(ICODE_UNMODELED_RES): + if i < first_modelled_idx: + count_pre += 1 + elif i > last_modelled_idx: + count_post += 1 + else: + count_mid += 1 + + return count_pre, count_mid, count_post + def to_dataframe(self): """ :return: A Pandas dataframe where each row is a ResidueRecord from From 7ec2019fd1af4a3240267ab2fbafb0b63fa91df5 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Thu, 13 Jun 2024 23:11:48 +0300 Subject: [PATCH 13/13] collect-prec: add number of unmodelled residues as metadata columns --- src/pp5/collect.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/pp5/collect.py b/src/pp5/collect.py index 231786a..17e08e7 100644 --- a/src/pp5/collect.py +++ b/src/pp5/collect.py @@ -58,6 +58,7 @@ COL_PDB_SOURCE = "pdb_source" COL_REJECTED_BY = "rejected_by" COL_NUM_ALTLOCS = "num_altlocs" +COL_NUM_UNMODELLED = "num_unmodelled" COLLECTION_METADATA_FILENAME = "meta.json" ALL_STRUCTS_FILENAME = "meta-structs_all" @@ -1145,6 +1146,12 @@ def _collect_single_structure( COL_SEQ_LEN: seq_len, COL_SEQ_GAPS: str.join(";", [f"{s}-{e}" for (s, e) in prec.seq_gaps]), COL_NUM_ALTLOCS: prec.num_altlocs, + **{ + f"{COL_NUM_UNMODELLED}_{suffix}": n + for n, suffix in zip( + prec.num_unmodelled, ["nterm", "inter", "cterm"] + ) + }, COL_PDB_SOURCE: pdb_source, **meta.as_dict(chain_id=chain_id, seq_to_str=True), }