Skip to content
This repository has been archived by the owner on Jan 26, 2024. It is now read-only.

Commit

Permalink
code to get conservation data from bioprodict file
Browse files Browse the repository at this point in the history
  • Loading branch information
Coos Baakman committed Aug 10, 2021
1 parent d898e39 commit 377d722
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 0 deletions.
5 changes: 5 additions & 0 deletions deeprank/models/conservation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@



class ConservationTable:
    """Placeholder type for conservation data.

    No attributes or methods are defined yet; the docstring serves as the
    class body so that the module is syntactically valid and importable.
    """

61 changes: 61 additions & 0 deletions deeprank/operate/conservation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pandas

from deeprank.models.residue import Residue
from deeprank.config.chemicals import AA_codes_1to3


# >>> pandas.read_hdf("GPCR_variantsv2_increased_coverage.hdf5", "conservation")
# sequence_residue_number amino_acid alignment_position alignment_name sub_sequencecount ... sub_consv_W sub_consv_X sub_consv_Y sub_consv_Z sub_consv_gap
# accession ...
# 4DAJC 1 T NaN NaN NaN ... NaN NaN NaN NaN NaN
# 4DAJC 2 I 101.0 sf_v2.adh_gpcr 1337.0 ... 0.005234 0.0 0.005985 0.0 0.225830
# 4DAJC 3 W 102.0 sf_v2.adh_gpcr 1357.0 ... 0.510742 0.0 0.067810 0.0 0.214233
# 4DAJC 4 Q 103.0 sf_v2.adh_gpcr 1360.0 ... 0.090454 0.0 0.001471 0.0 0.212524
# 4DAJC 5 V 104.0 sf_v2.adh_gpcr 1369.0 ... 0.000000 0.0 0.002192 0.0 0.207275
# ... ... ... ... ... ... ... ... ... ... ... ...
# 4N6HA 404 R 375.0 sf_v2.adh_gpcr 3889.0 ... 0.032135 0.0 0.003857 0.0 0.432373
# 4N6HA 405 K 263.0 pdb_v3.016122 1583.0 ... 0.001264 0.0 0.008209 0.0 0.638184
# 4N6HA 406 P 264.0 pdb_v3.016122 1586.0 ... 0.009460 0.0 0.010719 0.0 0.637207
# 4N6HA 407 C 265.0 pdb_v3.016122 1584.0 ... 0.030304 0.0 0.118713 0.0 0.637695
# 4N6HA 408 G 266.0 pdb_v3.016122 1584.0 ... 0.084595 0.0 0.000631 0.0 0.637695


# >>> pandas.read_hdf("/home/cbaakman/deeprank-datasets/GPCR_variantsv2_increased_coverage.hdf5", "pdbs")
# pdbnumber pdb_x pdb_y pdb_z pdb_accessibility pdb_bvalue pdb_residuenumber alignment_position
# accession
# 5TZYA 2 -8.789062 8.476562 65.312500 154.37500 133.2500 1.0 NaN
# 5TZYA 3 -10.773438 10.515625 62.750000 37.46875 196.7500 2.0 NaN
# 5TZYA 4 -11.843750 14.164062 63.500000 70.00000 216.2500 3.0 NaN
# 5TZYA 5 -15.531250 15.117188 62.843750 64.87500 201.8750 4.0 NaN
# 5TZYA 6 -14.312500 18.312500 61.031250 112.81250 62.6875 5.0 NaN
# ... ... ... ... ... ... ... ... ...
# 6IGLA 398 30.125000 54.125000 7.976562 105.50000 82.3125 465.0 230.0
# 6IGLA 399 27.562500 54.968750 10.679688 69.43750 82.8125 466.0 231.0
# 6IGLA 400 26.687500 51.312500 11.312500 26.21875 81.8125 467.0 232.0
# 6IGLA 401 30.375000 50.312500 11.453125 115.00000 87.8750 468.0 233.0
# 6IGLA 402 31.484375 53.031250 13.882812 169.25000 100.8125 469.0 234.0


def get_conservation_from_bioprodict(pdb_dataframe, conservation_dataframe, pdb_accession, pdb_chain_id):
    """Build a per-residue conservation table for one chain.

    Args:
        pdb_dataframe: pandas DataFrame indexed by accession code
            (``pdb_accession`` followed by ``pdb_chain_id``), with the columns
            "pdbnumber" and "pdb_residuenumber".
        conservation_dataframe: pandas DataFrame indexed by the same accession
            code, with the columns "sequence_residue_number", "amino_acid"
            (a key of ``AA_codes_1to3``) and one "sub_consv_<letter>" column
            for every key of ``AA_codes_1to3``.
        pdb_accession: first part of the accession code.
        pdb_chain_id: chain identifier; appended to ``pdb_accession`` to form
            the accession code and passed on to every ``Residue``.

    Returns:
        dict mapping ``Residue(pdb_number, amino_acid, pdb_chain_id)`` to a
        dict of ``{amino acid letter: value of "sub_consv_<letter>"}``.

    Raises:
        KeyError: if the accession code is missing from either dataframe, if
            a "sequence_residue_number" has no matching "pdb_residuenumber",
            or if an expected column or amino acid code is missing.
    """
    accession_code = "%s%s" % (pdb_accession, pdb_chain_id)

    # Select with a list of labels: `.loc[label]` collapses to a Series when
    # the label matches exactly one row, and a Series cannot be iterated
    # row by row. `.loc[[label]]` always yields a DataFrame.
    pdb_rows = pdb_dataframe.loc[[accession_code]]

    # Map "pdb_residuenumber" -> "pdbnumber". The columns are zipped directly
    # instead of using iterrows(), which would upcast every value of a row to
    # one common dtype and so turn integer pdb numbers into floats.
    pdb_translation = dict(zip(pdb_rows["pdb_residuenumber"], pdb_rows["pdbnumber"]))

    conservation_rows = conservation_dataframe.loc[[accession_code]]

    conservation_table = {}
    for _, row in conservation_rows.iterrows():
        amino_acid = AA_codes_1to3[row["amino_acid"]]
        # NOTE(review): raises KeyError when the pdb dataframe has no row for
        # this sequence number - confirm whether such residues should be
        # skipped instead.
        pdb_number = pdb_translation[row["sequence_residue_number"]]

        residue_id = Residue(pdb_number, amino_acid, pdb_chain_id)
        conservation_table[residue_id] = {
            amino_acid_letter: row["sub_consv_%s" % amino_acid_letter]
            for amino_acid_letter in AA_codes_1to3
        }

    return conservation_table

0 comments on commit 377d722

Please sign in to comment.