Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
stivalaa committed Aug 26, 2021
1 parent c18cc40 commit db9346d
Show file tree
Hide file tree
Showing 147 changed files with 24,906 additions and 0 deletions.
14 changes: 14 additions & 0 deletions scripts/build_betagrasp_query.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/sh
#
# Build the tableau + distance matrix query input for tsrchd_sparse for the
# beta-grasp query (as the 4 largest strands and 1 alpha helix in the
# ubiquitin structure).
#
# Written to stdout, in order: the name of the tableaux+distmatrix db
# file, the options line, and then the output of pytableaucreate.py for
# ${HOME}/pdb/d1ubia_.ent.
#
# Requires the pytableaucreate.py script in PATH.
#
# $Id: build_betagrasp_query.sh 2908 2009-11-06 05:33:18Z astivala $

# tableaux+distmatrix db file
TABLEAUX_DB=${HOME}/tableauxdistmatrixdb.ascii

echo "${TABLEAUX_DB}"
echo "T T F"  # options: type,order,output
# The path is quoted so that a HOME containing whitespace is still passed
# to pytableaucreate.py as a single argument.
pytableaucreate.py -bf -35 -tdssp -p none -i BGRASP -s2,1,8,5,3 "${HOME}/pdb/d1ubia_.ent"
65 changes: 65 additions & 0 deletions scripts/build_cops_db.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/sh
#
# File:    build_cops_db.sh
# Author:  Alex Stivala
# Created: May 2010
#
#
# build_cops_db.sh - build tableaux database for COPS benchmark data set
#
# Usage: build_cops_db.sh querydir dbfile
#
#     querydir is directory to put query input tableaux into
#              (created if it does not exist)
#     dbfile   is basename of tableaux database to create, will create
#              dbfile.tableaux.pickle, dbfile.distmatrix.pickle and
#              dbfile.tableauxdb.ascii
#
# Builds tableaux for queries and database for the COPS benchmark data set
# (Frank et al. 2010 "COPS Benchmark: interactive analysis of database
# search methods" Bioinformatics 26(4):574-575) available from
# http://benchmark.services.came.sbg.ac.at/
#
# Requires the buildtableauxdb.py and pytableaucreate.py and convdb2.py
# scripts in PATH.
#
# WARNING: dbfile and files in querydir are overwritten if they exist.
#
# $Id: build_cops_db.sh 3632 2010-05-12 02:07:26Z alexs $

COPS_ROOT=${HOME}/cops-benchmark-2009-6-full
COPS_PDB_QUERIES=${COPS_ROOT}/queries/pdb
COPS_PDB_DB=${COPS_ROOT}/database/pdb

if [ $# -ne 2 ]; then
    echo "Usage: $0 querydir dbfile" >&2
    exit 1
fi

QUERYDIR=$1
DBFILE=$2

# Options shared by pytableaucreate.py and buildtableauxdb.py.
# NB: $OPTIONS is deliberately expanded unquoted below so that it is
# split into separate arguments.
OPTIONS="-p none -35 -t dssp"

if [ ! -d "$QUERYDIR" ]; then
    # -p also creates missing parent directories; give up if the query
    # directory cannot be created since every later write would fail.
    mkdir -p "$QUERYDIR" || exit 1
fi

tableaux_pickle=${DBFILE}.tableaux.pickle
distmatrix_pickle=${DBFILE}.distmatrix.pickle
tableauxdb=${DBFILE}.tableauxdb.ascii

# One query input file per query structure: the database file name, the
# options line, then the output of pytableaucreate.py for that structure.
for query in "${COPS_PDB_QUERIES}"/*.pdb
do
    # If no .pdb file matches, the shell leaves the pattern itself as the
    # loop value; skip it rather than process a nonexistent file.
    [ -f "$query" ] || continue
    qid=`basename "$query" .pdb`
    qfile=${QUERYDIR}/${qid}.input
    echo "$tableauxdb" > "$qfile"
    echo "T T F" >> "$qfile"  # options: type, order, output
    pytableaucreate.py -f -b $OPTIONS "$query" >> "$qfile"
done


# Build the two pickles (the second with -d), then convert them into the
# single ASCII database file.
buildtableauxdb.py $OPTIONS "$COPS_PDB_DB" "$tableaux_pickle"
buildtableauxdb.py -d $OPTIONS "$COPS_PDB_DB" "$distmatrix_pickle"

convdb2.py "$tableaux_pickle" "$distmatrix_pickle" > "$tableauxdb"

138 changes: 138 additions & 0 deletions scripts/build_fastscopdominfo_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env python
###############################################################################
#
# build_fastscopdominfo_cache.py - build pickle file for cached SCOP info
#
# File: build_fastscopdominfo_cache.py
# Author: Alex Stivala
# Created: March 2010
#
# $Id: scopdominfo.py 3009 2009-12-08 03:01:48Z alexs $
#
###############################################################################

"""
Build cache (Python pickled dictionary) of information on the folds
and superfamilies of SCOP domains, keyed by SCOP domain identifier (sid).
See usage in docstring for main()
SCOP and ASTRAL data is obtained using the Bio.SCOP library (Casbon et
al 2006 'A high level interface to SCOP and ASTRAL implemented in
Python' BMC Bioinformatics 7:10) and depends on having the data
downloaded, in SCOP_DIR (defined below).
Downloaded SCOP files from
http://scop.mrc-lmb.cam.ac.uk/scop/parse/index.html
and ASTRAL files (in scopseq-1.73) from
http://astral.berkeley.edu/scopseq-1.73.html
The files downloaded are:
/local/charikar/SCOP/:
dir.cla.scop.txt_1.73
dir.des.scop.txt_1.73
dir.hie.scop.txt_1.73
/local/charikar/SCOP/scopseq-1.73:
astral-scopdom-seqres-all-1.73.fa
astral-scopdom-seqres-sel-gs-bib-95-1.73.id
Other files there are indices built by Bio.SCOP when first used.
"""

import sys,os
import pickle

from Bio.SCOP import *

from pathdefs import SCOP_DIR,SCOP_VERSION

#-----------------------------------------------------------------------------
#
# Function definitions
#
#-----------------------------------------------------------------------------


def build_scopdominfo_dict(scop):
    """
    Build dictionary with information about the superfamily and fold
    of all SCOP domains.

    Parameters:
       scop - previously built Bio.SCOP Scop instance

    Return value:
       dict {sid: (superfamily_sccs, superfamily_description,
                   fold_sccs, fold_description)}
       where
         superfamily_sccs is SCOP sccs identifying the superfamily for
                          the domain
         superfamily_description is SCOP description of the superfamily
         fold_sccs is SCOP sccs identifying the fold the domain is in
         fold_description is the SCOP description of the fold the domain
                          is in
    """
    scopdominfo_dict = {}
    for scop_dom in scop.getDomains():
        # Only the superfamily and fold ancestors are stored, so no other
        # level of the hierarchy is looked up.
        scop_superfamily = scop_dom.getAscendent('superfamily')
        scop_fold = scop_dom.getAscendent('fold')
        scopdominfo_dict[scop_dom.sid] = (scop_superfamily.sccs,
                                          scop_superfamily.description,
                                          scop_fold.sccs,
                                          scop_fold.description)

    return scopdominfo_dict


#-----------------------------------------------------------------------------
#
# Main
#
#-----------------------------------------------------------------------------

def usage(progname):
    """
    Write the usage message to stderr and terminate with exit status 1.

    Parameters:
       progname - program name to show in the usage message
    """
    message = "".join(("Usage: ", progname, " cachefile\n"))
    sys.stderr.write(message)
    sys.exit(1)


def main():
    """
    main for build_fastscopdominfo_cache.py

    Usage: build_fastscopdominfo_cache.py cachefile

      cachefile is the file to create the pickled domain info dictionary as
      WARNING: overwritten if it exists

    Progress messages are written to stderr.
    """
    if len(sys.argv) != 2:
        usage(os.path.basename(sys.argv[0]))

    pickle_filename = sys.argv[1]

    sys.stderr.write("Reading SCOP Data...")
    scop = Scop(dir_path=SCOP_DIR, version=SCOP_VERSION)
    sys.stderr.write("done\n")

    sys.stderr.write("Building domain info cache...")
    scopdominfo_dict = build_scopdominfo_dict(scop)
    sys.stderr.write("done. Got %d domain descriptions\n" %
                     len(scopdominfo_dict))

    sys.stderr.write("Writing cache to file %s...\n" % pickle_filename)
    # Pickle output is a byte stream, so the file must be opened in binary
    # mode: text mode fails outright on Python 3 and can corrupt the data
    # through newline translation on platforms that distinguish the modes.
    fh = open(pickle_filename, "wb")
    # try/finally (rather than 'with') keeps this usable on old Python 2
    # interpreters while still closing the file if the dump fails.
    try:
        pickle.dump(scopdominfo_dict, fh)
    finally:
        fh.close()
    sys.stderr.write("done\n")



if __name__ == "__main__":
    main()

76 changes: 76 additions & 0 deletions scripts/build_fischer_cm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/bin/sh
#
# File:    build_fischer_cm.sh
# Author:  Alex Stivala
# Created: September 2008
#
# build_fischer_cm.sh - build contact maps for Fischer data set
#
# Usage: build_fischer_cm.sh outdir
#
#     outdir is name of directory which is created, and each contact map
#            in ASCII format for use with MSVNS4MaxCMO (Pelta et al 2008)
#            or other program using this format of contact matrix is
#            created as a separate file in that directory, in format for input
#            for use with msvns4maxcmo_allall.py for example
#
# builds contact maps, using pconpy.py,
# for the Fischer data set (Fischer et al 1996 Pac. Symp. Biocomput. 300-318)
# This allows all-against-all (including redundant, so for n (=68)
# there are n*n (=4624) total comparisons) with e.g. msvns4maxcmo_allall.py,
#
#

# root of divided PDB hierarchy
PDBROOT=/local/charikar/pdb/pdb


# List of probe PDB ids from Fischer 1996 Table I
# Note several PDB ids obsoleted, so change to the replacements
# (kept for reference: only FISCHER_LIST is iterated by this script)
FISCHER_S="1mdc 1mup 1npx 1cpc_l 1onc 2ak3_a 1osa 1atn_a 1pfc 1arb 2cmd 2pia 2pna 3rub_l 1bbh_a 2sar_a 1c2r_a 3cd4 1chr_a 1aep 1dxt_b 2mnr 2fbj_l 1lts_d 1gky 2gbp 1hip 1bbt_1 2sas 2mta_c 1fc1_a 1tah_a 2hpd_a 1rcb 1aba 1sac_a 1eaf 1dsb_a 2sga 1stf_i 2hhm_a 2afn_a 1aaj 1fxi_a 5fd1 1bge_b 1isu_a 3hla_b 1gal 3chy 1cau_b 2aza_a 1hom 1cew 1tlk 1cid 2omf 1crl 1lga_a 2sim 1mio_c 1ten 4sbv_a 1tie 8i1b 2snv 1hrh_a 1gp1_a"


# List of target fold PDB ids from Fischer 1996 Table I
# Note several PDB ids obsoleted, so change to the replacements
# this list corresponds to FISCHER_S ie FISCHER_P[i] is the target fold
# for probe FISCHER_S[i] for 0 <= i <= 67
# (kept for reference: only FISCHER_LIST is iterated by this script)
FISCHER_P="1ifc 1rbp 3grs 1col_a 7rsa 1gky 4cpv 1atr 3hla_b 5ptp 6ldh 1fnb 1sha_a 6xia 2ccy_a 9rnt 1ycc 2rhe 2mnr 256b_a 1hbg 4enl 8fab_b 1bov_a 3adk 2liv 2hip_a 2plv1 2scp_a 1ycc 2fb4_h 1tca 2cpp 2gmf_a 1ego 2ayh 4cla 2trx_a 5ptp 1mol_a 1fbp_a 1aoz_a 1paz 1ubq 1iqz 2gmf_a 2hip_a 2rhe 3cox 2fox 1cau_a 1paz 1lfb 1mol_a 2rhe 2rhe 2por 1ede 2cyp 1nsb_a 2min_b 3hhr_b 2tbv_a 4fgf 4fgf 5ptp 1rnh 2trx_a"


# List of 68 probe sequences from Fischer 1996 Table II
# Note several PDB ids obsoleted, so change to the replacements
# NOTE(review): this list has 1gpl_a (letter 'l') where FISCHER_S has
# 1gp1_a (digit '1'); all other ids are common to both lists. Possibly a
# typo here - confirm against Fischer 1996 before changing, since it
# determines which PDB file is processed.
FISCHER_LIST="1dxt_b 1cpc_l 1c2r_a 2mta_c 1bbh_a 1bge_b 1rcb 1aep 1osa 2sas 1hom 1lga_a 2hpd_a 1chr_a 2mnr 3rub_l 1crl 1tah_a 1aba 1dsb_a 1gpl_a 1atn_a 1hrh_a 3chy 2ak3_a 1gky 2cmd 1eaf 2gbp 1mio_c 2pia 1gal 1npx 2hhm_a 1hip 1isu_a 1fc1_a 2fbj_l 1cid 1pfc 1ten 1tlk 3cd4 3hla_b 1aaj 2afn_a 2aza_a 4sbv_a 1bbt_1 1sac_a 1lts_d 1tie 8i1b 1arb 2sga 2snv 1mdc 1mup 2sim 1cau_b 2omf 1fxi_a 1cew 1stf_i 2pna 2sar_a 1onc 5fd1"

# Exactly one argument (the output directory) is required.
if [ $# -ne 1 ]; then
    # usage message belongs on stderr, not stdout
    echo "Usage: $0 outdir" >&2
    exit 1
fi
outdir=$1

if [ ! -d "${outdir}" ]; then
    # give up if the directory cannot be created: every output file
    # written below would fail otherwise
    mkdir -p "${outdir}" || exit 1
fi

# pconpy.py options
# NB: ${pconpyopts} and ${chainopt} are deliberately expanded unquoted
# below so that they split into separate arguments (or vanish when empty).
threshold=7.0
pconpyopts="--cmaplist --threshold=${threshold} --seq_separation=2"

# Build one contact map file per identifier in FISCHER_LIST.
# case/cut are used instead of 'expr index'/'expr substr', which are GNU
# extensions and not available in every /bin/sh environment.
for i in $FISCHER_LIST
do
    pdb=`echo "$i" | tr 'A-Z' 'a-z'`
    # the first 4 characters are the PDB code itself
    pdbcode=`echo "$pdb" | cut -c1-4`
    case "$pdb" in
        *_*)
            # get chainid from e.g. 1byo_b (6th character)
            chainid=`echo "$pdb" | cut -c6`
            chainopt="--chains=$chainid"
            pdbid=${pdbcode}_${chainid}
            ;;
        *)
            chainopt=""
            pdbid=${pdbcode}
            ;;
    esac
    # divided PDB hierarchy: subdirectory is characters 2-3 of the PDB code
    div=`echo "$pdbcode" | cut -c2-3`
    pdbfile=${PDBROOT}/${div}/pdb${pdbcode}.ent.gz
    # output file names use the upper-case identifier
    pdbid=`echo "$pdbid" | tr 'a-z' 'A-Z'`
    pconpy.py ${pconpyopts} ${chainopt} --pdb="${pdbfile}" --output="${outdir}/${pdbid}.cm_a${threshold}"
done

85 changes: 85 additions & 0 deletions scripts/build_fischer_db.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/bin/sh
#
# File:    build_fischer_db.sh
# Author:  Alex Stivala
# Created: September 2008
#
# build_fischer_db.sh - build tableaux database for Fischer data set
#
# Usage: build_fischer_db.sh outdir
#
#     outdir is name of directory which is created, and each tableau
#            in ASCII format for use with tsrchd_sparse etc. is
#            created as a separate file in that directory, in format for input
#            for use with qptabmatch_allpairs.py for example
#
# To stdout is written the ASCII format db of all the tableaux+dist matrices
# (just all the ones written to outdir concatenated together with
# blank line between each).
#
# builds database of tableaux, using pytableaucreate.py,
# for the Fischer data set (Fischer et al 1996 Pac. Symp. Biocomput. 300-318)
# This allows all-against-all (including redundant, so for n (=68)
# there are n*n (=4624) total comparisons) with e.g. qptabmatch_allall.py,
#
#

# root of divided PDB hierarchy
PDBROOT=/local/charikar/pdb/pdb


# List of probe PDB ids from Fischer 1996 Table I
# Note several PDB ids obsoleted, so change to the replacements
# (kept for reference: only FISCHER_LIST is iterated by this script)
FISCHER_S="1mdc 1mup 1npx 1cpc_l 1onc 2ak3_a 1osa 1atn_a 1pfc 1arb 2cmd 2pia 2pna 3rub_l 1bbh_a 2sar_a 1c2r_a 3cd4 1chr_a 1aep 1dxt_b 2mnr 2fbj_l 1lts_d 1gky 2gbp 1hip 1bbt_1 2sas 2mta_c 1fc1_a 1tah_a 2hpd_a 1rcb 1aba 1sac_a 1eaf 1dsb_a 2sga 1stf_i 2hhm_a 2afn_a 1aaj 1fxi_a 5fd1 1bge_b 1isu_a 3hla_b 1gal 3chy 1cau_b 2aza_a 1hom 1cew 1tlk 1cid 2omf 1crl 1lga_a 2sim 1mio_c 1ten 4sbv_a 1tie 8i1b 2snv 1hrh_a 1gp1_a"


# List of target fold PDB ids from Fischer 1996 Table I
# Note several PDB ids obsoleted, so change to the replacements
# this list corresponds to FISCHER_S ie FISCHER_P[i] is the target fold
# for probe FISCHER_S[i] for 0 <= i <= 67
# (kept for reference: only FISCHER_LIST is iterated by this script)
FISCHER_P="1ifc 1rbp 3grs 1col_a 7rsa 1gky 4cpv 1atr 3hla_b 5ptp 6ldh 1fnb 1sha_a 6xia 2ccy_a 9rnt 1ycc 2rhe 2mnr 256b_a 1hbg 4enl 8fab_b 1bov_a 3adk 2liv 2hip_a 2plv1 2scp_a 1ycc 2fb4_h 1tca 2cpp 2gmf_a 1ego 2ayh 4cla 2trx_a 5ptp 1mol_a 1fbp_a 1aoz_a 1paz 1ubq 1iqz 2gmf_a 2hip_a 2rhe 3cox 2fox 1cau_a 1paz 1lfb 1mol_a 2rhe 2rhe 2por 1ede 2cyp 1nsb_a 2min_b 3hhr_b 2tbv_a 4fgf 4fgf 5ptp 1rnh 2trx_a"


# List of 68 probe sequences from Fischer 1996 Table II
# Note several PDB ids obsoleted, so change to the replacements
# NOTE(review): this list has 1gpl_a (letter 'l') where FISCHER_S has
# 1gp1_a (digit '1'); all other ids are common to both lists. Possibly a
# typo here - confirm against Fischer 1996 before changing, since it
# determines which PDB file is processed.
FISCHER_LIST="1dxt_b 1cpc_l 1c2r_a 2mta_c 1bbh_a 1bge_b 1rcb 1aep 1osa 2sas 1hom 1lga_a 2hpd_a 1chr_a 2mnr 3rub_l 1crl 1tah_a 1aba 1dsb_a 1gpl_a 1atn_a 1hrh_a 3chy 2ak3_a 1gky 2cmd 1eaf 2gbp 1mio_c 2pia 1gal 1npx 2hhm_a 1hip 1isu_a 1fc1_a 2fbj_l 1cid 1pfc 1ten 1tlk 3cd4 3hla_b 1aaj 2afn_a 2aza_a 4sbv_a 1bbt_1 1sac_a 1lts_d 1tie 8i1b 1arb 2sga 2snv 1mdc 1mup 2sim 1cau_b 2omf 1fxi_a 1cew 1stf_i 2pna 2sar_a 1onc 5fd1"

# Exactly one argument (the output directory) is required.
if [ $# -ne 1 ]; then
    # usage message must go to stderr: stdout carries the database output
    echo "Usage: $0 outdir" >&2
    exit 1
fi
outdir=$1

if [ ! -d "${outdir}" ]; then
    # give up if the directory cannot be created: every output file
    # written below would fail otherwise
    mkdir -p "${outdir}" || exit 1
fi

# pytableaucreate.py options
# NB: ${tabopts} and ${chainopt} are deliberately expanded unquoted below
# so that they split into separate arguments (or vanish when empty).
tabopts="-35 -f -t dssp -p none"

# For each identifier in FISCHER_LIST write the tableau followed by the
# distance matrix both to stdout and to a per-structure file in outdir.
# case/cut are used instead of 'expr index'/'expr substr', which are GNU
# extensions and not available in every /bin/sh environment.
first=1
for i in $FISCHER_LIST
do
    pdb=`echo "$i" | tr 'A-Z' 'a-z'`
    # the first 4 characters are the PDB code itself
    pdbcode=`echo "$pdb" | cut -c1-4`
    case "$pdb" in
        *_*)
            # get chainid from e.g. 1byo_b (6th character)
            chainid=`echo "$pdb" | cut -c6`
            chainopt="-c $chainid"
            pdbid=${pdbcode}_${chainid}
            ;;
        *)
            chainopt=""
            pdbid=${pdbcode}
            ;;
    esac
    # divided PDB hierarchy: subdirectory is characters 2-3 of the PDB code
    div=`echo "$pdbcode" | cut -c2-3`
    pdbfile=${PDBROOT}/${div}/pdb${pdbcode}.ent.gz
    outfile=${outdir}/${pdbid}.tableaudistmatrix
    # blank line between entries on stdout, but not before the first one
    if [ $first -eq 0 ]; then
        echo
    else
        first=0
    fi
    pytableaucreate.py ${tabopts} ${chainopt} "${pdbfile}" | tee "${outfile}"
    # append distance matrix, removing identifier on first line
    pytableaucreate.py -d ${tabopts} ${chainopt} "${pdbfile}" | awk 'NR > 1' | tee -a "${outfile}"
done

Loading

0 comments on commit db9346d

Please sign in to comment.