diff --git a/scripts/build_betagrasp_query.sh b/scripts/build_betagrasp_query.sh new file mode 100755 index 0000000..03d56ed --- /dev/null +++ b/scripts/build_betagrasp_query.sh @@ -0,0 +1,14 @@ +#!/bin/sh +# +# build the tableau + distance matrix for tsrchd_sparse for the +# beta-grasp query (as the 4 largest strands and 1 alpha hexli in +# ubiquitin structure) +# +# $Id: build_betagrasp_query.sh 2908 2009-11-06 05:33:18Z astivala $ + +# tableaux+distmatrix db file +TABLEAUX_DB=${HOME}/tableauxdistmatrixdb.ascii + +echo "${TABLEAUX_DB}" +echo "T T F" # options: type,order,output +pytableaucreate.py -bf -35 -tdssp -p none -i BGRASP -s2,1,8,5,3 ${HOME}/pdb/d1ubia_.ent diff --git a/scripts/build_cops_db.sh b/scripts/build_cops_db.sh new file mode 100755 index 0000000..6f96ddc --- /dev/null +++ b/scripts/build_cops_db.sh @@ -0,0 +1,65 @@ +#!/bin/sh +# +# File: build_cops_db.sh +# Author: Alex Stivala +# Created: May 2010 +# +# +# build_cops_db.sh - build tableaux database for COPS benchmark data set +# +# Usage: build_cops_db.sh querydir dbfile +# +# querydir is directory to put query input tableaux into +# dbfile is basename of tableaux database to create, will create +# dbfile.tableaux.pickle, dbfile.distmatrix.pickle and +# dbfile.tableauxdistmatrixdb.ascii +# +# Builds tableaux for queries and database for the COPS benchmark data set +# (Frank et al. 1999 "COPS Benchmark: interactive analysis of database +# search methods" Bioinformatics 26(4):574-575) available from +# http://benchmark.services.came.sbg.ac.at/ +# +# Requires the buildtableauxdb.py and pytableaucreate.py and convdb2.py +# scripts in PATH. +# +# WARNING: dbfile and files in querydir are overwritten if they exist. +# +# $Id: build_cops_db.sh 3632 2010-05-12 02:07:26Z alexs $ + +COPS_ROOT=${HOME}/cops-benchmark-2009-6-full +COPS_PDB_QUERIES=${COPS_ROOT}/queries/pdb +COPS_PDB_DB=${COPS_ROOT}/database/pdb + +if [ $# -ne 2 ]; then + echo "Usage: $0 querydir dbfile" >&2 + exit 1 +fi + +QUERYDIR=$1 +DBFILE=$2 + +OPTIONS="-p none -35 -t dssp" + +if [ ! -d $QUERYDIR ]; then + mkdir $QUERYDIR +fi + +tableaux_pickle=${DBFILE}.tableaux.pickle +distmatrix_pickle=${DBFILE}.distmatrix.pickle +tableauxdb=${DBFILE}.tableauxdb.ascii + +for query in ${COPS_PDB_QUERIES}/*.pdb +do + qid=`basename $query .pdb` + qfile=${QUERYDIR}/${qid}.input + echo $tableauxdb > $qfile + echo "T T F" >> $qfile # options: type, order, output + pytableaucreate.py -f -b $OPTIONS $query >> $qfile +done + + +buildtableauxdb.py $OPTIONS $COPS_PDB_DB $tableaux_pickle +buildtableauxdb.py -d $OPTIONS $COPS_PDB_DB $distmatrix_pickle + +convdb2.py $tableaux_pickle $distmatrix_pickle > $tableauxdb + diff --git a/scripts/build_fastscopdominfo_cache.py b/scripts/build_fastscopdominfo_cache.py new file mode 100755 index 0000000..29e178b --- /dev/null +++ b/scripts/build_fastscopdominfo_cache.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python +############################################################################### +# +# build_fastscopdominfo_cache.py - build pickle file for cached SCOP info +# +# File: build_fastscopdominfo_cache.py +# Author: Alex Stivala +# Created: March 2010 +# +# $Id: scopdominfo.py 3009 2009-12-08 03:01:48Z alexs $ +# +############################################################################### + +""" +Build cache (Python pickled dictionary) of information on the folds +and superfamilies SCOP domain identifiers (sids). 
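+
+The cache maps each sid to a tuple
+(superfamily_sccs, superfamily_description, fold_sccs, fold_description).
+A minimal sketch of loading and using the cache (the filename and the
+sccs codes shown are illustrative only):
+
+    import pickle
+    scopdominfo = pickle.load(open('scopdominfo.pickle'))
+    (sf_sccs, sf_desc, fold_sccs, fold_desc) = scopdominfo['d1ubia_']
+    # e.g. sf_sccs 'd.15.1' (Ubiquitin-like), fold_sccs 'd.15' (beta-grasp)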
+ +See usage in docstring for main() + +SCOP and ASTRAL data is obtained using the Bio.SCOP library (Casbon et +al 2006 'A high level interface to SCOP and ASTRAL implemented in +Python' BMC Bioinformatics 7:10) and depends on having the data +downloaded, in SCOP_DIR (defined below). + +Downloaded SCOP files from + +http://scop.mrc-lmb.cam.ac.uk/scop/parse/index.html + +and ASTRAL files (in scopseq-1.73) from + +http://astral.berkeley.edu/scopseq-1.73.html + +The files downlaoded are: + +/local/charikar/SCOP/: +dir.cla.scop.txt_1.73 +dir.des.scop.txt_1.73 +dir.hie.scop.txt_1.73 + +/local/charikar/SCOP/scopseq-1.73: +astral-scopdom-seqres-all-1.73.fa +astral-scopdom-seqres-sel-gs-bib-95-1.73.id + +Other files there are indices built by Bio.SCOP when first used. +""" + +import sys,os +import pickle + +from Bio.SCOP import * + +from pathdefs import SCOP_DIR,SCOP_VERSION + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def build_scopdominfo_dict(scop): + """ + Build dictionary with + information about superfamily and class of all SCOP domains + + Parameters: + scop - previously built Bio.SCOP Scop instance + + Return value: + dict {sid: (superfamily_sccs, superfamily_description, fold_sccs,fold_description)} + where + superfamily_sccs is SCOP sccs identifying the superfamily for the domain + superamily_description is SCOP dessription of the superfamily + fold_description is the SCOP descriptino of the fold the domain is in + """ + scopdominfo_dict = {} + for scop_dom in scop.getDomains(): + sid = scop_dom.sid + scop_superfamily = scop_dom.getAscendent('superfamily') + scop_fold = scop_dom.getAscendent('fold') + scop_class = scop_dom.getAscendent('class') + scopdominfo_dict[sid] = (scop_superfamily.sccs, + scop_superfamily.description, + scop_fold.sccs, + scop_fold.description) + + return scopdominfo_dict + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " cachefile\n") + sys.exit(1) + + +def main(): + """ + main for scomdominfo.py + + Usage: scomdominfo.py cachefile + + cachefile is the file to create the pickled domain info dictionary as + WARNING: overwritten if it exists + """ + if len(sys.argv) != 2: + usage(os.path.basename(sys.argv[0])) + + pickle_filename = sys.argv[1] + + sys.stderr.write("Reading SCOP Data...") + scop = Scop(dir_path=SCOP_DIR,version=SCOP_VERSION) + sys.stderr.write("done\n") + + sys.stderr.write("Building domain info cache...") + scopdominfo_dict = build_scopdominfo_dict(scop) + sys.stderr.write("done. 
Got %d domain descriptions\n" % + len(scopdominfo_dict)) + + sys.stderr.write("Writing cache to file %s...\n" % pickle_filename) + fh = open(pickle_filename, "w") + pickle.dump(scopdominfo_dict, fh) + fh.close() + sys.stderr.write("done\n") + + + +if __name__ == "__main__": + main() + diff --git a/scripts/build_fischer_cm.sh b/scripts/build_fischer_cm.sh new file mode 100755 index 0000000..4fdefee --- /dev/null +++ b/scripts/build_fischer_cm.sh @@ -0,0 +1,76 @@ +#!/bin/sh +# +# File: build_fischer_cm.sh +# Author: Alex Stivala +# Created: September 2008 +# +# build_fischer_cm.sh - build contact maps for Fischer data set +# +# Usage: build_fischer_cm.sh outdir +# +# outdir is name of diretory which is created, and each contact map +# in ASCII format for use with MSVNS4MaxCMO (Pelta et al 2008) +# or other program using this format of contact matrix is +# created as a separate file in that directory, in format for input +# for use with msvns4maxcmo_allall.py for example +# +# builds contact maps, using pconpy.py, +# for the Fischer data set (Fischer et al 1996 Pac. Symp. Biocomput. 300-318)) +# This allows all-against-all (including redundant, so for n (=68) +# there are n*n (=4624) total comparions) with e.g. msvns4maxcmo_allall.py, +# +# + +# root of divided PDB hierarchy +PDBROOT=/local/charikar/pdb/pdb + + +# List of probe PDB ids from Fischer 1996 Table I +# Note several PDB ids obsoleted, so change to the replacments +FISCHER_S="1mdc 1mup 1npx 1cpc_l 1onc 2ak3_a 1osa 1atn_a 1pfc 1arb 2cmd 2pia 2pna 3rub_l 1bbh_a 2sar_a 1c2r_a 3cd4 1chr_a 1aep 1dxt_b 2mnr 2fbj_l 1lts_d 1gky 2gbp 1hip 1bbt_1 2sas 2mta_c 1fc1_a 1tah_a 2hpd_a 1rcb 1aba 1sac_a 1eaf 1dsb_a 2sga 1stf_i 2hhm_a 2afn_a 1aaj 1fxi_a 5fd1 1bge_b 1isu_a 3hla_b 1gal 3chy 1cau_b 2aza_a 1hom 1cew 1tlk 1cid 2omf 1crl 1lga_a 2sim 1mio_c 1ten 4sbv_a 1tie 8i1b 2snv 1hrh_a 1gp1_a" + + +# List of target fold PDB ids from Fischer 1996 Table I +# Note several PDB ids obsoleted, so change to the replacments +# this list corresponds to FISCHER_S ie FISCHER_P[i] is the target fold +# for probe FISCHER_S[i] for 0 < i < 67 +FISCHER_P="1ifc 1rbp 3grs 1col_a 7rsa 1gky 4cpv 1atr 3hla_b 5ptp 6ldh 1fnb 1sha_a 6xia 2ccy_a 9rnt 1ycc 2rhe 2mnr 256b_a 1hbg 4enl 8fab_b 1bov_a 3adk 2liv 2hip_a 2plv1 2scp_a 1ycc 2fb4_h 1tca 2cpp 2gmf_a 1ego 2ayh 4cla 2trx_a 5ptp 1mol_a 1fbp_a 1aoz_a 1paz 1ubq 1iqz 2gmf_a 2hip_a 2rhe 3cox 2fox 1cau_a 1paz 1lfb 1mol_a 2rhe 2rhe 2por 1ede 2cyp 1nsb_a 2min_b 3hhr_b 2tbv_a 4fgf 4fgf 5ptp 1rnh 2trx_a" + + +# List of 68 probe sequences from Fischer 1996 Table II +# Note several PDB ids obsoleted, so change to the replacments +FISCHER_LIST="1dxt_b 1cpc_l 1c2r_a 2mta_c 1bbh_a 1bge_b 1rcb 1aep 1osa 2sas 1hom 1lga_a 2hpd_a 1chr_a 2mnr 3rub_l 1crl 1tah_a 1aba 1dsb_a 1gpl_a 1atn_a 1hrh_a 3chy 2ak3_a 1gky 2cmd 1eaf 2gbp 1mio_c 2pia 1gal 1npx 2hhm_a 1hip 1isu_a 1fc1_a 2fbj_l 1cid 1pfc 1ten 1tlk 3cd4 3hla_b 1aaj 2afn_a 2aza_a 4sbv_a 1bbt_1 1sac_a 1lts_d 1tie 8i1b 1arb 2sga 2snv 1mdc 1mup 2sim 1cau_b 2omf 1fxi_a 1cew 1stf_i 2pna 2sar_a 1onc 5fd1" + +if [ $# -ne 1 ]; then + echo "Usage: $0 outdir" 2>&1 + exit 1 +fi +outdir=$1 + +if [ ! -d ${outdir} ]; then + mkdir ${outdir} +fi + +# pconpy.py options +threshold=7.0 +pconpyopts="--cmaplist --threshold=${threshold} --seq_separation=2" + +for i in $FISCHER_LIST +do + pdb=`echo $i | tr A-Z a-z` + if [ `expr index $pdb _` -ne 0 ]; then + # get chainid from e.g. 
1BYO_B + chainid=`expr substr $pdb 6 1` + chainopt="--chains=$chainid" + pdbid=`expr substr $pdb 1 4`_${chainid} + else + chainopt="" + pdbid=`expr substr $pdb 1 4` + fi + pdb=`expr substr $pdb 1 4` + div=`expr substr $pdb 2 2` + pdbfile=${PDBROOT}/${div}/pdb${pdb}.ent.gz + pdbid=`echo $pdbid | tr '[a-z]' '[A-Z]'` + pconpy.py ${pconpyopts} ${chainopt} --pdb=${pdbfile} --output=${outdir}/${pdbid}.cm_a${threshold} +done + diff --git a/scripts/build_fischer_db.sh b/scripts/build_fischer_db.sh new file mode 100755 index 0000000..c786271 --- /dev/null +++ b/scripts/build_fischer_db.sh @@ -0,0 +1,85 @@ +#!/bin/sh +# +# File: build_fischer_db.sh +# Author: Alex Stivala +# Created: September 2008 +# +# build_fischer_db.sh - build tableaux database for Fischer data set +# +# Usage: build_fischer_db.sh outdir +# +# outdir is name of diretory which is created, and each tableau +# in ASCII format for use with tsrchd_sparse etc. is +# created as a separate file in that directory, in format for input +# for use with qptabmatch_allpairs.py for example +# +# To stdout is written the ASCII format db of all the tableaux+dist matrices +# (just all the ones written to outdir concatenated together with +# blank line between each). +# +# builds database of tableaux, using pytableaycreate.py, +# for the Fischer data set (Fischer et al 1996 Pac. Symp. Biocomput. 300-318)) +# This allows all-against-all (including redundant, so for n (=68) +# there are n*n (=4624) total comparions) with e.g. qptabmatch_allall.py, +# +# + +# root of divided PDB hierarchy +PDBROOT=/local/charikar/pdb/pdb + + +# List of probe PDB ids from Fischer 1996 Table I +# Note several PDB ids obsoleted, so change to the replacments +FISCHER_S="1mdc 1mup 1npx 1cpc_l 1onc 2ak3_a 1osa 1atn_a 1pfc 1arb 2cmd 2pia 2pna 3rub_l 1bbh_a 2sar_a 1c2r_a 3cd4 1chr_a 1aep 1dxt_b 2mnr 2fbj_l 1lts_d 1gky 2gbp 1hip 1bbt_1 2sas 2mta_c 1fc1_a 1tah_a 2hpd_a 1rcb 1aba 1sac_a 1eaf 1dsb_a 2sga 1stf_i 2hhm_a 2afn_a 1aaj 1fxi_a 5fd1 1bge_b 1isu_a 3hla_b 1gal 3chy 1cau_b 2aza_a 1hom 1cew 1tlk 1cid 2omf 1crl 1lga_a 2sim 1mio_c 1ten 4sbv_a 1tie 8i1b 2snv 1hrh_a 1gp1_a" + + +# List of target fold PDB ids from Fischer 1996 Table I +# Note several PDB ids obsoleted, so change to the replacments +# this list corresponds to FISCHER_S ie FISCHER_P[i] is the target fold +# for probe FISCHER_S[i] for 0 < i < 67 +FISCHER_P="1ifc 1rbp 3grs 1col_a 7rsa 1gky 4cpv 1atr 3hla_b 5ptp 6ldh 1fnb 1sha_a 6xia 2ccy_a 9rnt 1ycc 2rhe 2mnr 256b_a 1hbg 4enl 8fab_b 1bov_a 3adk 2liv 2hip_a 2plv1 2scp_a 1ycc 2fb4_h 1tca 2cpp 2gmf_a 1ego 2ayh 4cla 2trx_a 5ptp 1mol_a 1fbp_a 1aoz_a 1paz 1ubq 1iqz 2gmf_a 2hip_a 2rhe 3cox 2fox 1cau_a 1paz 1lfb 1mol_a 2rhe 2rhe 2por 1ede 2cyp 1nsb_a 2min_b 3hhr_b 2tbv_a 4fgf 4fgf 5ptp 1rnh 2trx_a" + + +# List of 68 probe sequences from Fischer 1996 Table II +# Note several PDB ids obsoleted, so change to the replacments +FISCHER_LIST="1dxt_b 1cpc_l 1c2r_a 2mta_c 1bbh_a 1bge_b 1rcb 1aep 1osa 2sas 1hom 1lga_a 2hpd_a 1chr_a 2mnr 3rub_l 1crl 1tah_a 1aba 1dsb_a 1gpl_a 1atn_a 1hrh_a 3chy 2ak3_a 1gky 2cmd 1eaf 2gbp 1mio_c 2pia 1gal 1npx 2hhm_a 1hip 1isu_a 1fc1_a 2fbj_l 1cid 1pfc 1ten 1tlk 3cd4 3hla_b 1aaj 2afn_a 2aza_a 4sbv_a 1bbt_1 1sac_a 1lts_d 1tie 8i1b 1arb 2sga 2snv 1mdc 1mup 2sim 1cau_b 2omf 1fxi_a 1cew 1stf_i 2pna 2sar_a 1onc 5fd1" + +if [ $# -ne 1 ]; then + echo "Usage: $0 outdir" 2>&1 + exit 1 +fi +outdir=$1 + +if [ ! 
-d ${outdir} ]; then + mkdir ${outdir} +fi + +# pytableaucreate.py options +tabopts="-35 -f -t dssp -p none" + +first=1 +for i in $FISCHER_LIST +do + pdb=`echo $i | tr A-Z a-z` + if [ `expr index $pdb _` -ne 0 ]; then + # get chainid from e.g. 1BYO_B + chainid=`expr substr $pdb 6 1` + chainopt="-c $chainid" + pdbid=`expr substr $pdb 1 4`_${chainid} + else + chainopt="" + pdbid=`expr substr $pdb 1 4` + fi + pdb=`expr substr $pdb 1 4` + div=`expr substr $pdb 2 2` + pdbfile=${PDBROOT}/${div}/pdb${pdb}.ent.gz + if [ $first -eq 0 ]; then + echo + else + first=0 + fi + pytableaucreate.py ${tabopts} ${chainopt} ${pdbfile} | tee ${outdir}/${pdbid}.tableaudistmatrix + # append distance matrix, removing identifier on first line + pytableaucreate.py -d ${tabopts} ${chainopt} ${pdbfile} | awk 'NR > 1'| tee -a ${outdir}/${pdbid}.tableaudistmatrix +done + diff --git a/scripts/build_irtableau_query_input.sh b/scripts/build_irtableau_query_input.sh new file mode 100755 index 0000000..5a9cf2a --- /dev/null +++ b/scripts/build_irtableau_query_input.sh @@ -0,0 +1,38 @@ +#!/bin/sh +# +# File: build_irtableau_query_input.sh +# Author: Alex Stivala +# Created: March 2010 +# +# build_irtableau_query_input.sh - +# build irtableau input files for a list of query ids +# +# Usage: build_irtableau_query_input.sh outdir &1 + exit 1 +fi +outdir=$1 + +if [ ! -d ${outdir} ]; then + mkdir ${outdir} +fi + +while read scopsid +do + echo "${TABLEAUX_DB}" > ${outdir}/${scopsid}.input + fgrep ${scopsid} $ALL_VECTOR_DB >> ${outdir}/${scopsid}.input +done + diff --git a/scripts/build_nh3d_cm.sh b/scripts/build_nh3d_cm.sh new file mode 100755 index 0000000..aa70ed7 --- /dev/null +++ b/scripts/build_nh3d_cm.sh @@ -0,0 +1,45 @@ +#!/bin/sh +# +# File: build_nh3d_cm.sh +# Author: Alex Stivala +# Created: September 2008 +# +# build_nh3d_cm.sh - build contact maps for Nh3D data set +# +# Usage: build_nh3d_cm.sh outdir +# +# outdir is name of diretory which is created, and each contact map +# in ASCII format for use with MSVNS4MaxCMO (Pelta et al 2008) +# or other program using this format of contact matrix is +# created as a separate file in that directory, in format for input +# for use with msvns4maxcmo_allall.py for example +# +# builds contact maps, using pconpy.py, +# for the Nh3D data set (Thiruv et al 2005 BMC Struct Biol 5:12) +# +# + +# location of Nh3D data set, PDB format files +NH3D_PDB_DIR=/local/charikar/Nh3D/v3.0 + + +if [ $# -ne 1 ]; then + echo "Usage: $0 outdir" 2>&1 + exit 1 +fi +outdir=$1 + +if [ ! -d ${outdir} ]; then + mkdir ${outdir} +fi + +# pconpy.py options +threshold=7.0 +pconpyopts="--cmaplist --threshold=${threshold} --seq_separation=2" + +for pdbfile in ${NH3D_PDB_DIR}/*.pdb +do + cathid=`basename ${pdbfile} .pdb` + pconpy.py ${pconpyopts} --pdb=${pdbfile} --output=${outdir}/${cathid}.cm_a${threshold} +done + diff --git a/scripts/build_nh3d_db.sh b/scripts/build_nh3d_db.sh new file mode 100755 index 0000000..d14cc9e --- /dev/null +++ b/scripts/build_nh3d_db.sh @@ -0,0 +1,57 @@ +#!/bin/sh +# +# File: build_nh3d_db.sh +# Author: Alex Stivala +# Created: September 2008 +# +# build_nh3d_db.sh - build tableaux database for Nh3D data set +# +# Usage: build_nh3d_db.sh outdir +# +# outdir is name of diretory which is created, and each tableau +# in ASCII format for use with tsrchd_sparse etc. 
is +# created as a separate file in that directory, in format for input +# for use with qptabmatch_allpairs.py for example +# +# To stdout is written the ASCII format db of all the tableaux+dist matrices +# (just all the ones written to outdir concatenated together with +# blank line between each). +# +# builds database of tableaux, using pytableaycreate.py, +# for the Nh3D data set (Thiruv et al 2005 BMC Struct. Biol. 5:12) +# + +# location of Nh3D data set, PDB format files +NH3D_PDB_DIR=/local/charikar/Nh3D/v3.0 + + +if [ $# -ne 1 ]; then + echo "Usage: $0 outdir" 2>&1 + exit 1 +fi +outdir=$1 + +if [ ! -d ${outdir} ]; then + mkdir ${outdir} +fi + +# pytableaucreate.py options +tabopts="-35 -f -t dssp -p none" + +first=1 +for pdbfile in ${NH3D_PDB_DIR}/*.pdb +do + cathid=`basename ${pdbfile} .pdb` + # dodgy: remove periods so that CATH id fits in 8 chars... + # hopefully will get no duplicates... + cathid=`echo ${cathid} | tr -d .` + if [ $first -eq 0 ]; then + echo + else + first=0 + fi + pytableaucreate.py ${tabopts} -i ${cathid} ${pdbfile} | tee ${outdir}/${cathid}.tableaudistmatrix + # append distance matrix, removing identifier on first line + pytableaucreate.py -d ${tabopts} -i ${cathid} ${pdbfile} | awk 'NR > 1' |tee -a ${outdir}/${cathid}.tableaudistmatrix +done + diff --git a/scripts/build_query_input.sh b/scripts/build_query_input.sh new file mode 100755 index 0000000..9446bf4 --- /dev/null +++ b/scripts/build_query_input.sh @@ -0,0 +1,56 @@ +#!/bin/sh +# +# File: build_query_input.sh +# Author: Alex Stivala +# Created: November 2008 +# +# build_query_input.sh - build tsrchd input files for a list of query ids +# +# Usage: build_query_input.sh outdir &1 + exit 1 +fi +outdir=$1 + +if [ ! -d ${outdir} ]; then + mkdir ${outdir} +fi + +# pytableaucreate.py options +#tabopts="-35 -f -t dssp -p none" +tabopts="-m4 -35 -f -t dssp -p none" +#tabopts="-n -35 -f -t dssp -p none" + +while read scopsid +do + div=`echo $scopsid | cut -c3-4` + pdbfile=${ASTRAL_DIR}/${div}/${scopsid}.ent + echo "${TABLEAUX_DB}" > ${outdir}/${scopsid}.input + echo "T T F" >> ${outdir}/${scopsid}.input # options: type,order,output + pytableaucreate.py ${tabopts} ${pdbfile} >> ${outdir}/${scopsid}.input + # append distance matrix, removing identifier on first line + pytableaucreate.py -d ${tabopts} ${pdbfile} | awk 'NR > 1' >> ${outdir}/${scopsid}.input +done + diff --git a/scripts/build_sheetbc_1qlp_query.sh b/scripts/build_sheetbc_1qlp_query.sh new file mode 100755 index 0000000..6487d42 --- /dev/null +++ b/scripts/build_sheetbc_1qlp_query.sh @@ -0,0 +1,13 @@ +#!/bin/sh +# +# build the tableau + distance matrix for tsrchd_sparse for the +# serpin B/C sheet. +# +# $Id: build_sheetbc_1qlp_query.sh 2105 2009-03-17 01:30:33Z astivala $ + +# tableaux+distmatrix db file +TABLEAUX_DB=/local/charikar/astivala/tableauxdb/astral/tableauxdistmatrixdb.ascii + +echo "${TABLEAUX_DB}" +echo "T T F" # options: type,order,output +pytableaucreate.py -pnone -b -f -35 -i sheetbc -tdssp -s2,26,25,15,14,12,13,18,24 /local/charikar/pdb/pdb/ql/pdb1qlp.ent.gz diff --git a/scripts/build_sheetbc_helixcluster_1qlp_query.sh b/scripts/build_sheetbc_helixcluster_1qlp_query.sh new file mode 100755 index 0000000..d62f136 --- /dev/null +++ b/scripts/build_sheetbc_helixcluster_1qlp_query.sh @@ -0,0 +1,13 @@ +#!/bin/sh +# +# build the tableau + distance matrix for tsrchd_sparse for the +# serpin B/C sheet. 
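+#
+# Same as build_sheetbc_1qlp_query.sh, except that the -s SSE selection
+# below also includes SSEs 3,4,5 (presumably the helix cluster adjacent
+# to the sheet, as the script name suggests) and the query identifier
+# is sheetbch rather than sheetbc.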
+# +# $Id: build_sheetbc_helixcluster_1qlp_query.sh 2108 2009-03-18 05:49:01Z astivala $ + +# tableaux+distmatrix db file +TABLEAUX_DB=/local/charikar/astivala/tableauxdb/astral/tableauxdistmatrixdb.ascii + +echo "${TABLEAUX_DB}" +echo "T T F" # options: type,order,output +pytableaucreate.py -pnone -b -f -35 -i sheetbch -tdssp -s2,3,4,5,26,25,15,14,12,13,18,24 /local/charikar/pdb/pdb/ql/pdb1qlp.ent.gz diff --git a/scripts/build_tops_files.sh b/scripts/build_tops_files.sh new file mode 100755 index 0000000..c2d3a30 --- /dev/null +++ b/scripts/build_tops_files.sh @@ -0,0 +1,54 @@ +#!/bin/sh +# +# File: build_tops_files.sh +# Author: Alex Stivala +# Created: March 2009 +# +# build_tops_files.sh - build TOPS files from hierarchy of ASTRAL files +# +# Usage: build_tops_files.sh astral_root outdir +# +# outdir is name of diretory which is created, and a .tops file +# created as a separate file in that directory, for each .ent file +# in the ASTRAL SCOP hierarchy. +# +# astral_root is the root of the ASTRAL SCOP pdbstyle hierarchy. +# +# +# $Id: build_tops_files.sh 3617 2010-05-05 06:13:27Z alexs $ + +# location of TOPS directory, contains tops.def etc. +# Note all the .dssp and .tops files are temporarily created here, +# (tops.def has these specifications) +TOPS_ROOT=$HOME/Tops + +if [ $# -ne 2 ]; then + echo "Usage: $0 astral_root outdir" 2>&1 + exit 1 +fi + +astral_root=$1 +outdir=$2 + +if [ ! -d ${outdir} ]; then + mkdir ${outdir} +fi + +cd $TOPS_ROOT + +for ent in `find $astral_root -name \*.ent` +do + # TOPS can only cope with 4 letter PDB codes, so we have to name + # input files that way + sid=`basename $ent .ent` + pdbcode=`echo $sid | cut -c2-5` + cp $ent pdb${pdbcode}.ent + dssp pdb${pdbcode}.ent > ${pdbcode}.dssp + # the -C ALL options ensures all chains in one file, otherwise we get + # multiple files for genetic domain style e.g. d1pid.1 files (multi chains + # in 1 domain) + ${TOPS_ROOT}/bin/Tops -C ALL $pdbcode + mv ${pdbcode}ALL.tops ${outdir}/${sid}.tops + rm ${pdbcode}.dssp + rm pdb${pdbcode}.ent +done diff --git a/scripts/build_tops_files_pdb.sh b/scripts/build_tops_files_pdb.sh new file mode 100755 index 0000000..8b6f340 --- /dev/null +++ b/scripts/build_tops_files_pdb.sh @@ -0,0 +1,49 @@ +#!/bin/sh +# +# File: build_tops_files_pdb.sh +# Author: Alex Stivala +# Created: April 2010 +# +# build_tops_files_pdb.sh - build TOPS files from hierarchy of PDB files +# +# Usage: build_tops_files_pdb.sh pdb_root outdir +# +# outdir is name of diretory which is created, and a .tops file +# created as a separate file in that directory, for each .ent file +# in the PDB hierarchy. +# +# pdb_root is the root of the PDB hierarchy. +# +# +# $Id: build_tops_files_pdb.sh 3575 2010-04-22 00:35:26Z alexs $ + +# location of TOPS directory, contains tops.def etc. +# Note all the .dssp and .tops files are temporarily created here, +# (tops.def has these specifications) +TOPS_ROOT=$HOME/Tops + +if [ $# -ne 2 ]; then + echo "Usage: $0 pdb_root outdir" 2>&1 + exit 1 +fi + +pdb_root=$1 +outdir=$2 + +if [ ! 
-d ${outdir} ]; then + mkdir ${outdir} +fi + +cd $TOPS_ROOT + +for ent in `find $pdb_root -name \*.ent` +do + pdbfile=`basename $ent` + pdbcode=`expr substr $pdbfile 4 4` + cp $ent $pdbfile + dssp $pdbfile > ${pdbcode}.dssp + ${TOPS_ROOT}/bin/Tops $pdbcode + mv ${pdbcode}.tops ${outdir}/${pdbcode}.tops + rm ${pdbcode}.dssp + rm $pdbfile +done diff --git a/scripts/buildtableauxdb.py b/scripts/buildtableauxdb.py new file mode 100755 index 0000000..78f8dc5 --- /dev/null +++ b/scripts/buildtableauxdb.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python +############################################################################### +# +# buildtableauxdb.py - build a database of protein tableaux +# +# File: buildtableauxdb.py +# Author: Alex Stivala +# Created: May 2008 +# +# $Id: buildtableauxdb.py 3631 2010-05-12 01:20:01Z alexs $ +# +############################################################################### + +""" +Build a database of protein tableaux from either the divided PDB hierarchy +(compressed files e.g. pdb1qlp.ent.gz or uncompressed) +or the ASTRAL pdb-style hierarchy +(uncompressed files e.g. d1qlpa_.ent). + +Also used to build a database of SSE axis midpoint distance matrices. + +See docstring for main() for usage. + +The database format is a hash table mapping PDB identifiers to a list +of tableaux in packed format (only storing one triangle, only tableau +codes, indexed by SSE sequential number so no PTNode objects etc). +that is saved using Python pickle module so can be loaded straight into +another Python program. +Ie the intention is that the whole db sits in memory (as it does when +built here). + +Refer to tableaubuild.py and pttableau.py for details about tableaux. + +It is written in Python and depends on some Python libraries: + +. BioPython (including Bio.PDB) + http://www.biopython.org + + Reference for Bio.PDB is: + Hamelryck and Manderick 2003 "PDB parser and structure class implemented + in Python" Bioinformatics 19:2308-2310 + + which in turn depends on Numeric + http://sourceforge.net/projects/numpy + + +Developed on Linux 2.6.9 (x86_64) with Python 2.5.1 +and BioPython 1.43 with Numeric 24.2 + +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os +import re +import getopt +import pickle + +import numpy.oldnumeric as Numeric + +import ptsecstruct +from ptnode import ptnode_set_verbose +from ptdomain import * +from ptutils import cleanup_tmpdir +import getdomains +from tableaubuild import get_tableaux +from pttableau import PTTableauPacked +from ptversion import get_version + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + + + +def build_db(input_root, + secstruct_program = 'dssp', + domain_program = 'none', + include_310_helices = True, + include_pi_helices = True, + min_sse_len = None, + use_numeric = False, + use_hk = False, + build_dist_matrices = False): + """ + Build the tableaux db in memory. + + Parameters: + input_root - root of PDB or ASTRAL pdbstyle divided hierarchy + secstruct_program - secondary structure definition program + ('stride' or 'dssp' or 'pdb') to use. + domain_progam - domain decompositino method ('ddomain','cath', etc.) 
+ include_310_helices - if True, include 3_10 helices in the graph + include_pi_helices - if True, include pi helices in the graph + min_sse_len - if not None, minimum SSE length + use_numeric - if True build database of Numeric array (Omega matrix) + rather than PTTableauPacked + use_hk - If True build database with HH and KK codes for strands in + same sheet. + build_dist_matrices - If True build database of SSE axis midpoint + distance matrices rather than tableaux. + + Return value: + dict of { pdbid : [PTTableauPacked list] } for use_numeric=False OR + { pdbid : [Numeric.array list ]} for use_numeric=True OR + { pdbid : [Numeric.array list ]} for build_dist_matrices=True + + pdbid is e.g. 1QLP when built from PDB or + e.g. 1qlp1a when built from ASTRAL pdbstyle + + """ + tableau_db = {} # dict of { pdbid : [PTTableauPacked list] }, OR + # { pdbid : [Numeric.array list ]} for use_numeric=True + # or build_dist_matrices=True + # pdbid is e.g. 1QLP when built from PDB or + # e.g. 1qlp1a when built from ASTRAL pdbstyle + + key_count = 0 + tableaux_count = 0 + keyerror_count = 0 + file_count = 0 + for root,dirs,files in os.walk(input_root): + for pdb_filename in [filename for filename in files + if (os.path.splitext(filename)[1] == '.ent' + or os.path.splitext(filename)[1] == '.pdb' + or os.path.splitext(filename)[1] == '.gz')]: + sys.stderr.write('processing ' + pdb_filename + '\n') + file_count += 1 + (pdbid, tableaux_list, sse_string_list) = \ + get_tableaux(os.path.join(root, pdb_filename), + secstruct_program, domain_program, + include_310_helices, include_pi_helices, + None, # sse_id_list + min_sse_len, + use_numeric, + use_hk, + build_dist_matrices) + if not (use_numeric or build_dist_matrices): + tableaux_list = [PTTableauPacked(tableau) for tableau + in tableaux_list] + if tableau_db.has_key(pdbid): + sys.stderr.write("ERROR: duplicate key " + pdbid + "\n") + keyerror_count += 1 + else: + tableau_db[pdbid] = tableaux_list + key_count += 1 + tableaux_count += len(tableaux_list) + sys.stdout.write("processed %d files\n" % file_count) + sys.stdout.write("resulting in %d db entries\n" % key_count) + if build_dist_matrices: + sys.stdout.write(" %d SSE distance matrices\n" % tableaux_count) + else: + sys.stdout.write(" %d tableaux\n" % tableaux_count) + sys.stdout.write("with %d duplicate key errors\n" % keyerror_count) + + return tableau_db + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + + +def usage(progname): + """ + Print usage message and exit + """ + sys.stderr.write("Usage: " + progname + + " [-35knv] [-d] [-m min_sse_len] [-t struct_prog] " + "[-p domain_prog] \n") + sys.stderr.write(" -3 include 3_10 helices\n") + sys.stderr.write(" -5 include pi helices\n") + sys.stderr.write(" -d build SSE distance matrices not tableaux\n") + sys.stderr.write(" -k use HH and KK codes for anti/parallel strands in same sheet\n") + sys.stderr.write(" -m min_sse_len : minimum SSE length\n") + sys.stderr.write(" -n use numeric values (Omega matrix) rather than tableua\n") + sys.stderr.write(" -p domain decomposition method/db\n" + " valid values are none (default), " + "ddomain, cath:cdffile, pdomains:pdomainsfile\n") + sys.stderr.write(" -t struct_prog : use struct_prog define " \ + "secondary structure\n") + sys.stderr.write(" supported is 'pdb' or 'stride' or 'dssp' (default)\n") + sys.stderr.write(" -v print verbose debugging messages to stderr\n") + 
sys.stderr.write("\nWARNING: dbname is overwritten if it exists\n") + sys.exit(1) + + +def main(): + """ + main for buildtableauxdb.py + + Usage: pytableaucreate [-35knv] [-d] [-m min_sse_len ] + [-t structprog] [-p domainprog] + + + + WARNING: dbname is overwritten if it exists + + -3 specifies to include 3_10 helices in the diagram. Default is only + alpha helices. + + -5 specifies to include pi helices in the diagram. Defaul is only + alpha helices. + + -d build SSE distance matrices not tableaux + + -k use the HH and KK codes for respectively antiparallel and parallel + strands in the same sheet, rather than the O, P etc. codes. + + -m min_sse_len : specifies the minimum SSE length to include in tableaux. + + -n use numeric values (Omega matrix) rather than tableau. + + -p specify the domain decomposition method. + Valid values are 'none' (default), 'ddomain', 'cath:cdf_filename'. + + -t specifies the secondary structure assignment program to use. + Currently suppoed is 'pdb' and 'dfh,ssp' and 'stride'. Default 'pdb'. + + -v specifies verbose mode: debugging output is written to stderr. + """ + global verbose + + try: + opts, args = getopt.getopt(sys.argv[1:], "35dknm:p:t:v?") + except getopt.GetoptError: + usage(os.path.basename(sys.argv[0])) + + valid_secstruct_programs = ["dssp", "stride", "pdb"] + valid_domain_programs = getdomains.valid_domain_programs + [r"none"] + valid_domain_programs_re = [ re.compile(re_str) for re_str in + valid_domain_programs ] + + verbose = False # global (python globals are only 'global' to module though) + secstruct_program = "dssp" + include_310_helices = False + include_pi_helices = False + use_hk = False + domain_program = "none" + min_sse_len = None + use_numeric = False + build_dist_matrices = False + + for opt,arg in opts: + if opt == "-3": # include 3_10 helices + include_310_helices = True + elif opt == "-5": # include pi helices + include_pi_helices = True + elif opt == "-d": # build distance matrices not tableaux + build_dist_matrices = True + elif opt == "-k": # use HH and KK codes + use_hk = True + elif opt == "-m": # min sse length + min_sse_len = int(arg) + elif opt == "-n": # use numeric values (Omega matrix) + use_numeric = True + elif opt == "-p": # domain parsing program + domain_program = None + for valid_domarg_re in valid_domain_programs_re: + if valid_domarg_re.match(arg): + domain_program = arg + break + if domain_program == None: + sys.stderr.write("valid values for -p are: " + + str(valid_domain_programs) + "\n") + usage(sys.argv[0]) + elif opt == "-t": + if arg not in valid_secstruct_programs: + sys.stderr.write("valid values for -t are: " + + str(valid_secstruct_programs) + "\n") + usage(sys.argv[0]) + secstruct_program = arg + elif opt == "-v": # verbose + verbose = True # this module only + ptnode_set_verbose(True) # ptnode module + ptsecstruct.ptsecstruct_set_verbose(True) # ptsecstruct module + ptdomain_set_verbose(True) # ptdomain module + else: + usage(sys.argv[0]) + + if len(args) != 2: + usage(os.path.basename(sys.argv[0])) + + if build_dist_matrices: + if use_numeric: + use_numeric = False + sys.stderr.write("WARNING: -n (numeric) ignored for -d (distance matrix)\n") + if use_hk: + sys.stderr.write("-k (use HH and KK) invalid for -d (distance matrix)\n"); + usage(sys.argv[0]) + + if use_numeric and use_hk: + sys.stderr.write("-n (numeric) and -k (use HH and KK codes) are " + "mutually exlusive\n") + usage(sys.argv[0]) + + sys.stdout.write(sys.argv[0] + ': version is: ' + get_version() + '\n') + 
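+    # also record the command-line options used, so a run can be reproduced from its log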
sys.stdout.write(sys.argv[0] + ': options are: ' + str(sys.argv[1:]) + '\n') + + input_root = args[0] + output_filename = args[1] + + fh = open(output_filename, 'w') + tableau_db = build_db(input_root, secstruct_program, domain_program, + include_310_helices, include_pi_helices, + min_sse_len, use_numeric, use_hk, build_dist_matrices) + if build_dist_matrices: + sys.stdout.write('writing SSE distance matrix db to ' + output_filename + '...\n') + else: + sys.stdout.write('writing tableaux db to ' + output_filename + '...\n') + pickle.dump(tableau_db, fh) + fh.close() + sys.stdout.write('done.\n') + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) + main() + + diff --git a/scripts/cathmap.py b/scripts/cathmap.py new file mode 100644 index 0000000..7700d69 --- /dev/null +++ b/scripts/cathmap.py @@ -0,0 +1,837 @@ +# +# cathmap.py - Map compressed CATH identifiers back to CATH identifiers +# +# File: cathmap.py +# Author: Alex Stivala +# Created: September 2008 +# +# $Id: cathmap.py 1911 2008-09-19 07:47:50Z astivala $ +# + +""" + We only need to map the 'compressed' CATH id back to CATH id because + of (short-sighted on my part) limitation of identifiers to 8 chars + int he FORTRAN code since I was initially using only SCOP and PDB + identifiers; removing all the periods from CATH ids makes it short + enough and (luckily) results in no collisions on the Nh3D data set + (did not want to have to increase the limit in FORTRAN code since + would have to rebuild all datasets, re-run old results etc.) + (Note, of course this dictionary was generated by a shell script + based on build_nh3d_db.sh) +""" + +# It is not really necessary for this to be a dictionary, we jsut iterate +# it anyway, and we don't even need the query id supplied on command line, +# just the CATH class.architecture id (first two components of id), +# but this just fits in with how all the other eval scripts work + +CATHMAP = { +'11010' : '1.10.10', +'110100' : '1.10.100', +'1101000' : '1.10.1000', +'110101' : '1.10.101', +'1101020' : '1.10.1020', +'1101030' : '1.10.1030', +'1101040' : '1.10.1040', +'1101060' : '1.10.1060', +'1101070' : '1.10.1070', +'1101080' : '1.10.1080', +'1101090' : '1.10.1090', +'110110' : '1.10.110', +'1101130' : '1.10.1130', +'1101140' : '1.10.1140', +'1101160' : '1.10.1160', +'1101170' : '1.10.1170', +'11012' : '1.10.12', +'110120' : '1.10.120', +'1101200' : '1.10.1200', +'1101240' : '1.10.1240', +'1101270' : '1.10.1270', +'1101280' : '1.10.1280', +'1101290' : '1.10.1290', +'1101300' : '1.10.1300', +'110132' : '1.10.132', +'1101320' : '1.10.1320', +'1101340' : '1.10.1340', +'110135' : '1.10.135', +'1101350' : '1.10.1350', +'1101360' : '1.10.1360', +'1101370' : '1.10.1370', +'1101380' : '1.10.1380', +'110140' : '1.10.140', +'1101400' : '1.10.1400', +'1101410' : '1.10.1410', +'1101420' : '1.10.1420', +'1101450' : '1.10.1450', +'110150' : '1.10.150', +'1101500' : '1.10.1500', +'1101510' : '1.10.1510', +'1101520' : '1.10.1520', +'1101530' : '1.10.1530', +'110155' : '1.10.155', +'1101580' : '1.10.1580', +'1101610' : '1.10.1610', +'110162' : '1.10.162', +'110164' : '1.10.164', +'1101650' : '1.10.1650', +'110166' : '1.10.166', +'1101660' : '1.10.1660', +'110167' : '1.10.167', +'1101670' : '1.10.1670', +'1101680' : '1.10.1680', +'1101710' : '1.10.1710', +'1101750' : '1.10.1750', +'1101760' : '1.10.1760', +'1101780' : '1.10.1780', +'1101790' : '1.10.1790', +'1101820' : '1.10.1820', +'110183' : '1.10.183', +'1101830' : '1.10.1830', +'1101840' : '1.10.1840', +'1101860' : 
'1.10.1860', +'1101870' : '1.10.1870', +'110189' : '1.10.189', +'1101900' : '1.10.1900', +'11020' : '1.10.20', +'1102000' : '1.10.2000', +'1102080' : '1.10.2080', +'1102090' : '1.10.2090', +'110210' : '1.10.210', +'110220' : '1.10.220', +'110225' : '1.10.225', +'110230' : '1.10.230', +'110238' : '1.10.238', +'110239' : '1.10.239', +'110240' : '1.10.240', +'110245' : '1.10.245', +'110246' : '1.10.246', +'110260' : '1.10.260', +'110274' : '1.10.274', +'110275' : '1.10.275', +'110285' : '1.10.285', +'110286' : '1.10.286', +'110287' : '1.10.287', +'110288' : '1.10.288', +'110290' : '1.10.290', +'11030' : '1.10.30', +'110300' : '1.10.300', +'1103030' : '1.10.3030', +'1103040' : '1.10.3040', +'1103050' : '1.10.3050', +'1103100' : '1.10.3100', +'1103130' : '1.10.3130', +'1103140' : '1.10.3140', +'1103190' : '1.10.3190', +'1103200' : '1.10.3200', +'1103210' : '1.10.3210', +'1103250' : '1.10.3250', +'1103270' : '1.10.3270', +'1103280' : '1.10.3280', +'110340' : '1.10.340', +'110357' : '1.10.357', +'110375' : '1.10.375', +'110390' : '1.10.390', +'11040' : '1.10.40', +'110400' : '1.10.400', +'110405' : '1.10.405', +'110418' : '1.10.418', +'110420' : '1.10.420', +'110422' : '1.10.422', +'110437' : '1.10.437', +'110439' : '1.10.439', +'110440' : '1.10.440', +'110441' : '1.10.441', +'110442' : '1.10.442', +'110443' : '1.10.443', +'11045' : '1.10.45', +'110455' : '1.10.455', +'110460' : '1.10.460', +'110465' : '1.10.465', +'110468' : '1.10.468', +'110472' : '1.10.472', +'110489' : '1.10.489', +'110490' : '1.10.490', +'110494' : '1.10.494', +'110506' : '1.10.506', +'110510' : '1.10.510', +'110520' : '1.10.520', +'110530' : '1.10.530', +'110532' : '1.10.532', +'110533' : '1.10.533', +'110540' : '1.10.540', +'110555' : '1.10.555', +'110565' : '1.10.565', +'110569' : '1.10.569', +'110575' : '1.10.575', +'110579' : '1.10.579', +'110580' : '1.10.580', +'110590' : '1.10.590', +'110599' : '1.10.599', +'11060' : '1.10.60', +'110600' : '1.10.600', +'110606' : '1.10.606', +'110620' : '1.10.620', +'110630' : '1.10.630', +'110640' : '1.10.640', +'110645' : '1.10.645', +'110700' : '1.10.700', +'110710' : '1.10.710', +'110720' : '1.10.720', +'110730' : '1.10.730', +'110740' : '1.10.740', +'110750' : '1.10.750', +'110760' : '1.10.760', +'110790' : '1.10.790', +'1108' : '1.10.8', +'110800' : '1.10.800', +'110820' : '1.10.820', +'110840' : '1.10.840', +'110890' : '1.10.890', +'110910' : '1.10.910', +'110940' : '1.10.940', +'110950' : '1.10.950', +'1201000' : '1.20.1000', +'1201050' : '1.20.1050', +'1201060' : '1.20.1060', +'1201070' : '1.20.1070', +'1201080' : '1.20.1080', +'1201090' : '1.20.1090', +'1201120' : '1.20.1120', +'1201150' : '1.20.1150', +'1201170' : '1.20.1170', +'1201180' : '1.20.1180', +'120120' : '1.20.120', +'1201200' : '1.20.1200', +'1201220' : '1.20.1220', +'1201250' : '1.20.1250', +'1201260' : '1.20.1260', +'1201270' : '1.20.1270', +'1201280' : '1.20.1280', +'1201290' : '1.20.1290', +'1201330' : '1.20.1330', +'1201350' : '1.20.1350', +'1201370' : '1.20.1370', +'120140' : '1.20.140', +'120141' : '1.20.141', +'1201410' : '1.20.1410', +'120142' : '1.20.142', +'1201430' : '1.20.1430', +'120144' : '1.20.144', +'1201460' : '1.20.1460', +'120150' : '1.20.150', +'120190' : '1.20.190', +'120200' : '1.20.200', +'120210' : '1.20.210', +'120225' : '1.20.225', +'120245' : '1.20.245', +'120272' : '1.20.272', +'1205' : '1.20.5', +'12050' : '1.20.50', +'12058' : '1.20.58', +'12059' : '1.20.59', +'12080' : '1.20.80', +'120810' : '1.20.810', +'12082' : '1.20.82', +'120840' : '1.20.840', +'12085' : '1.20.85', +'120870' : 
'1.20.870', +'12089' : '1.20.89', +'12090' : '1.20.90', +'120900' : '1.20.900', +'12091' : '1.20.91', +'120910' : '1.20.910', +'120920' : '1.20.920', +'120930' : '1.20.930', +'120940' : '1.20.940', +'120950' : '1.20.950', +'120970' : '1.20.970', +'120990' : '1.20.990', +'12510' : '1.25.10', +'12520' : '1.25.20', +'12540' : '1.25.40', +'14010' : '1.40.10', +'15010' : '1.50.10', +'15030' : '1.50.30', +'210109' : '2.10.109', +'210150' : '2.10.150', +'21022' : '2.10.22', +'21025' : '2.10.25', +'210260' : '2.10.260', +'210270' : '2.10.270', +'21050' : '2.10.50', +'21055' : '2.10.55', +'21060' : '2.10.60', +'21069' : '2.10.69', +'21070' : '2.10.70', +'21077' : '2.10.77', +'21090' : '2.10.90', +'210010' : '2.100.10', +'210210' : '2.102.10', +'210220' : '2.102.20', +'210510' : '2.105.10', +'211010' : '2.110.10', +'211510' : '2.115.10', +'212010' : '2.120.10', +'213010' : '2.130.10', +'214010' : '2.140.10', +'215010' : '2.150.10', +'216010' : '2.160.10', +'216020' : '2.160.20', +'217011' : '2.170.11', +'2170130' : '2.170.130', +'2170150' : '2.170.150', +'217016' : '2.170.16', +'2170160' : '2.170.160', +'2170170' : '2.170.170', +'2170190' : '2.170.190', +'2170200' : '2.170.200', +'2170220' : '2.170.220', +'2170230' : '2.170.230', +'2170240' : '2.170.240', +'2170270' : '2.170.270', +'2170280' : '2.170.280', +'2170290' : '2.170.290', +'217040' : '2.170.40', +'21708' : '2.170.8', +'21709' : '2.170.9', +'220100' : '2.20.100', +'220110' : '2.20.110', +'220120' : '2.20.120', +'22025' : '2.20.25', +'22026' : '2.20.26', +'22028' : '2.20.28', +'22050' : '2.20.50', +'22080' : '2.20.80', +'22090' : '2.20.90', +'230110' : '2.30.110', +'230120' : '2.30.120', +'230130' : '2.30.130', +'230140' : '2.30.140', +'230170' : '2.30.170', +'23018' : '2.30.18', +'230210' : '2.30.210', +'230220' : '2.30.220', +'230230' : '2.30.230', +'23027' : '2.30.27', +'23029' : '2.30.29', +'23030' : '2.30.30', +'23031' : '2.30.31', +'23034' : '2.30.34', +'23037' : '2.30.37', +'23038' : '2.30.38', +'23039' : '2.30.39', +'23040' : '2.30.40', +'23042' : '2.30.42', +'23060' : '2.30.60', +'23070' : '2.30.70', +'24010' : '2.40.10', +'240100' : '2.40.100', +'240110' : '2.40.110', +'240128' : '2.40.128', +'24015' : '2.40.15', +'240150' : '2.40.150', +'240155' : '2.40.155', +'240160' : '2.40.160', +'240170' : '2.40.170', +'240180' : '2.40.180', +'24020' : '2.40.20', +'240200' : '2.40.200', +'240220' : '2.40.220', +'240230' : '2.40.230', +'240240' : '2.40.240', +'240260' : '2.40.260', +'240280' : '2.40.280', +'240290' : '2.40.290', +'24030' : '2.40.30', +'240300' : '2.40.300', +'240310' : '2.40.310', +'24033' : '2.40.33', +'240340' : '2.40.340', +'24037' : '2.40.37', +'24040' : '2.40.40', +'24050' : '2.40.50', +'24070' : '2.40.70', +'25010' : '2.50.10', +'25020' : '2.50.20', +'26011' : '2.60.11', +'260110' : '2.60.110', +'260120' : '2.60.120', +'260130' : '2.60.130', +'26015' : '2.60.15', +'260175' : '2.60.175', +'26020' : '2.60.20', +'260200' : '2.60.200', +'260210' : '2.60.210', +'260220' : '2.60.220', +'260240' : '2.60.240', +'260250' : '2.60.250', +'260260' : '2.60.260', +'260270' : '2.60.270', +'260290' : '2.60.290', +'26030' : '2.60.30', +'260320' : '2.60.320', +'260330' : '2.60.330', +'26034' : '2.60.34', +'260340' : '2.60.340', +'260350' : '2.60.350', +'260360' : '2.60.360', +'260390' : '2.60.390', +'26040' : '2.60.40', +'260410' : '2.60.410', +'260420' : '2.60.420', +'26060' : '2.60.60', +'26090' : '2.60.90', +'26098' : '2.60.98', +'270100' : '2.70.100', +'270130' : '2.70.130', +'270160' : '2.70.160', +'270170' : '2.70.170', +'270180' : 
'2.70.180', +'27020' : '2.70.20', +'270220' : '2.70.220', +'270240' : '2.70.240', +'270250' : '2.70.250', +'27040' : '2.70.40', +'27050' : '2.70.50', +'27070' : '2.70.70', +'2709' : '2.70.9', +'27098' : '2.70.98', +'28010' : '2.80.10', +'29010' : '2.90.10', +'31010' : '3.10.10', +'310100' : '3.10.100', +'310105' : '3.10.105', +'310110' : '3.10.110', +'310120' : '3.10.120', +'310129' : '3.10.129', +'310130' : '3.10.130', +'310150' : '3.10.150', +'310170' : '3.10.170', +'310180' : '3.10.180', +'310196' : '3.10.196', +'31020' : '3.10.20', +'310200' : '3.10.200', +'31025' : '3.10.25', +'310250' : '3.10.250', +'310260' : '3.10.260', +'310270' : '3.10.270', +'31028' : '3.10.28', +'310290' : '3.10.290', +'310300' : '3.10.300', +'310310' : '3.10.310', +'310320' : '3.10.320', +'310330' : '3.10.330', +'310340' : '3.10.340', +'310390' : '3.10.390', +'310400' : '3.10.400', +'310440' : '3.10.440', +'310450' : '3.10.450', +'310460' : '3.10.460', +'310490' : '3.10.490', +'31050' : '3.10.50', +'310010' : '3.100.10', +'31510' : '3.15.10', +'31520' : '3.15.20', +'32010' : '3.20.10', +'320100' : '3.20.100', +'320110' : '3.20.110', +'320120' : '3.20.120', +'320130' : '3.20.130', +'320140' : '3.20.140', +'32016' : '3.20.16', +'32019' : '3.20.19', +'32020' : '3.20.20', +'32070' : '3.20.70', +'32080' : '3.20.80', +'32090' : '3.20.90', +'33010' : '3.30.10', +'3301010' : '3.30.1010', +'3301020' : '3.30.1020', +'3301030' : '3.30.1030', +'3301040' : '3.30.1040', +'3301050' : '3.30.1050', +'3301060' : '3.30.1060', +'330110' : '3.30.110', +'3301110' : '3.30.1110', +'3301120' : '3.30.1120', +'3301130' : '3.30.1130', +'3301150' : '3.30.1150', +'3301160' : '3.30.1160', +'3301180' : '3.30.1180', +'3301220' : '3.30.1220', +'3301230' : '3.30.1230', +'3301240' : '3.30.1240', +'3301270' : '3.30.1270', +'3301280' : '3.30.1280', +'3301300' : '3.30.1300', +'3301310' : '3.30.1310', +'3301330' : '3.30.1330', +'3301340' : '3.30.1340', +'3301360' : '3.30.1360', +'3301370' : '3.30.1370', +'3301380' : '3.30.1380', +'3301390' : '3.30.1390', +'3301400' : '3.30.1400', +'3301430' : '3.30.1430', +'3301440' : '3.30.1440', +'3301450' : '3.30.1450', +'3301460' : '3.30.1460', +'3301480' : '3.30.1480', +'3301490' : '3.30.1490', +'3301500' : '3.30.1500', +'3301520' : '3.30.1520', +'3301530' : '3.30.1530', +'3301540' : '3.30.1540', +'3301560' : '3.30.1560', +'3301570' : '3.30.1570', +'3301590' : '3.30.1590', +'330160' : '3.30.160', +'3301600' : '3.30.1600', +'3301620' : '3.30.1620', +'3301650' : '3.30.1650', +'3301660' : '3.30.1660', +'3301670' : '3.30.1670', +'3301690' : '3.30.1690', +'330170' : '3.30.170', +'3301700' : '3.30.1700', +'3301720' : '3.30.1720', +'3301750' : '3.30.1750', +'3301760' : '3.30.1760', +'3301770' : '3.30.1770', +'3301780' : '3.30.1780', +'330190' : '3.30.190', +'33020' : '3.30.20', +'330200' : '3.30.200', +'330210' : '3.30.210', +'330230' : '3.30.230', +'330240' : '3.30.240', +'330250' : '3.30.250', +'330280' : '3.30.280', +'33030' : '3.30.30', +'330300' : '3.30.300', +'330310' : '3.30.310', +'330350' : '3.30.350', +'330360' : '3.30.360', +'330365' : '3.30.365', +'330370' : '3.30.370', +'330379' : '3.30.379', +'330380' : '3.30.380', +'330386' : '3.30.386', +'330387' : '3.30.387', +'330390' : '3.30.390', +'33040' : '3.30.40', +'330410' : '3.30.410', +'330413' : '3.30.413', +'330420' : '3.30.420', +'330428' : '3.30.428', +'330429' : '3.30.429', +'33043' : '3.30.43', +'330430' : '3.30.430', +'33044' : '3.30.44', +'330450' : '3.30.450', +'330457' : '3.30.457', +'33046' : '3.30.46', +'330460' : '3.30.460', +'330465' : 
'3.30.465', +'330470' : '3.30.470', +'330479' : '3.30.479', +'330497' : '3.30.497', +'330499' : '3.30.499', +'33050' : '3.30.50', +'330500' : '3.30.500', +'330505' : '3.30.505', +'330519' : '3.30.519', +'330530' : '3.30.530', +'330538' : '3.30.538', +'330540' : '3.30.540', +'330559' : '3.30.559', +'33056' : '3.30.56', +'330560' : '3.30.560', +'330565' : '3.30.565', +'330572' : '3.30.572', +'330590' : '3.30.590', +'33060' : '3.30.60', +'33063' : '3.30.63', +'33066' : '3.30.66', +'33067' : '3.30.67', +'33070' : '3.30.70', +'330700' : '3.30.700', +'330710' : '3.30.710', +'330740' : '3.30.740', +'330750' : '3.30.750', +'330760' : '3.30.760', +'330830' : '3.30.830', +'330870' : '3.30.870', +'3309' : '3.30.9', +'330900' : '3.30.900', +'330920' : '3.30.920', +'330930' : '3.30.930', +'330950' : '3.30.950', +'330990' : '3.30.990', +'3401000' : '3.40.1000', +'3401010' : '3.40.1010', +'3401030' : '3.40.1030', +'3401050' : '3.40.1050', +'3401060' : '3.40.1060', +'3401080' : '3.40.1080', +'340109' : '3.40.109', +'3401090' : '3.40.1090', +'3401120' : '3.40.1120', +'3401130' : '3.40.1130', +'3401140' : '3.40.1140', +'3401160' : '3.40.1160', +'3401170' : '3.40.1170', +'3401180' : '3.40.1180', +'3401190' : '3.40.1190', +'340120' : '3.40.120', +'3401210' : '3.40.1210', +'3401230' : '3.40.1230', +'3401280' : '3.40.1280', +'3401310' : '3.40.1310', +'3401340' : '3.40.1340', +'3401350' : '3.40.1350', +'3401360' : '3.40.1360', +'3401370' : '3.40.1370', +'3401380' : '3.40.1380', +'3401390' : '3.40.1390', +'340140' : '3.40.140', +'3401400' : '3.40.1400', +'3401410' : '3.40.1410', +'3401420' : '3.40.1420', +'3401440' : '3.40.1440', +'3401450' : '3.40.1450', +'3401470' : '3.40.1470', +'3401490' : '3.40.1490', +'3401500' : '3.40.1500', +'3401510' : '3.40.1510', +'3401520' : '3.40.1520', +'3401530' : '3.40.1530', +'3401540' : '3.40.1540', +'3401550' : '3.40.1550', +'3401560' : '3.40.1560', +'340190' : '3.40.190', +'340192' : '3.40.192', +'340198' : '3.40.198', +'34020' : '3.40.20', +'340210' : '3.40.210', +'340220' : '3.40.220', +'340225' : '3.40.225', +'340228' : '3.40.228', +'340250' : '3.40.250', +'34030' : '3.40.30', +'340309' : '3.40.309', +'34033' : '3.40.33', +'34035' : '3.40.35', +'340350' : '3.40.350', +'340366' : '3.40.366', +'340367' : '3.40.367', +'340390' : '3.40.390', +'340395' : '3.40.395', +'340420' : '3.40.420', +'340430' : '3.40.430', +'340440' : '3.40.440', +'340449' : '3.40.449', +'340462' : '3.40.462', +'34047' : '3.40.47', +'340470' : '3.40.470', +'34050' : '3.40.50', +'340532' : '3.40.532', +'340570' : '3.40.570', +'340580' : '3.40.580', +'340600' : '3.40.600', +'340605' : '3.40.605', +'340630' : '3.40.630', +'340640' : '3.40.640', +'340710' : '3.40.710', +'340718' : '3.40.718', +'340720' : '3.40.720', +'34080' : '3.40.80', +'340800' : '3.40.800', +'340810' : '3.40.810', +'340830' : '3.40.830', +'340850' : '3.40.850', +'34091' : '3.40.91', +'340920' : '3.40.920', +'340930' : '3.40.930', +'340950' : '3.40.950', +'340960' : '3.40.960', +'340970' : '3.40.970', +'340980' : '3.40.980', +'35020' : '3.50.20', +'35030' : '3.50.30', +'3504' : '3.50.4', +'35050' : '3.50.50', +'3507' : '3.50.7', +'35070' : '3.50.70', +'35080' : '3.50.80', +'35510' : '3.55.10', +'35520' : '3.55.20', +'35530' : '3.55.30', +'36010' : '3.60.10', +'360100' : '3.60.100', +'360110' : '3.60.110', +'360120' : '3.60.120', +'360130' : '3.60.130', +'360140' : '3.60.140', +'36015' : '3.60.15', +'36020' : '3.60.20', +'36021' : '3.60.21', +'36040' : '3.60.40', +'36070' : '3.60.70', +'3609' : '3.60.9', +'36090' : '3.60.90', +'36510' : 
'3.65.10', +'37010' : '3.70.10', +'37510' : '3.75.10', +'38010' : '3.80.10', +'39010' : '3.90.10', +'3901000' : '3.90.1000', +'3901010' : '3.90.1010', +'3901020' : '3.90.1020', +'390105' : '3.90.105', +'3901070' : '3.90.1070', +'390110' : '3.90.110', +'3901140' : '3.90.1140', +'3901150' : '3.90.1150', +'3901160' : '3.90.1160', +'3901170' : '3.90.1170', +'3901180' : '3.90.1180', +'390120' : '3.90.120', +'3901200' : '3.90.1200', +'3901210' : '3.90.1210', +'3901230' : '3.90.1230', +'3901240' : '3.90.1240', +'3901260' : '3.90.1260', +'3901280' : '3.90.1280', +'3901290' : '3.90.1290', +'3901300' : '3.90.1300', +'3901310' : '3.90.1310', +'390132' : '3.90.132', +'3901320' : '3.90.1320', +'3901330' : '3.90.1330', +'3901340' : '3.90.1340', +'3901350' : '3.90.1350', +'3901390' : '3.90.1390', +'3901430' : '3.90.1430', +'3901470' : '3.90.1470', +'3901480' : '3.90.1480', +'39015' : '3.90.15', +'3901520' : '3.90.1520', +'3901530' : '3.90.1530', +'3901550' : '3.90.1550', +'3901570' : '3.90.1570', +'3901580' : '3.90.1580', +'3901600' : '3.90.1600', +'3901630' : '3.90.1630', +'3901640' : '3.90.1640', +'390170' : '3.90.170', +'390175' : '3.90.175', +'390176' : '3.90.176', +'390180' : '3.90.180', +'390182' : '3.90.182', +'390190' : '3.90.190', +'390198' : '3.90.198', +'39020' : '3.90.20', +'390209' : '3.90.209', +'390210' : '3.90.210', +'390215' : '3.90.215', +'390220' : '3.90.220', +'390226' : '3.90.226', +'390228' : '3.90.228', +'390230' : '3.90.230', +'390245' : '3.90.245', +'390249' : '3.90.249', +'39025' : '3.90.25', +'390260' : '3.90.260', +'390280' : '3.90.280', +'390310' : '3.90.310', +'390320' : '3.90.320', +'390330' : '3.90.330', +'390340' : '3.90.340', +'390350' : '3.90.350', +'390370' : '3.90.370', +'390380' : '3.90.380', +'39039' : '3.90.39', +'390390' : '3.90.390', +'390400' : '3.90.400', +'390420' : '3.90.420', +'390440' : '3.90.440', +'39045' : '3.90.45', +'390450' : '3.90.450', +'390460' : '3.90.460', +'390470' : '3.90.470', +'390480' : '3.90.480', +'39050' : '3.90.50', +'390510' : '3.90.510', +'390540' : '3.90.540', +'39055' : '3.90.55', +'390550' : '3.90.550', +'390570' : '3.90.570', +'390580' : '3.90.580', +'390600' : '3.90.600', +'390640' : '3.90.640', +'390660' : '3.90.660', +'390670' : '3.90.670', +'39070' : '3.90.70', +'390700' : '3.90.700', +'390730' : '3.90.730', +'390740' : '3.90.740', +'39075' : '3.90.75', +'39076' : '3.90.76', +'390770' : '3.90.770', +'39078' : '3.90.78', +'390780' : '3.90.780', +'39079' : '3.90.79', +'39080' : '3.90.80', +'390800' : '3.90.800', +'390840' : '3.90.840', +'390850' : '3.90.850', +'390870' : '3.90.870', +'390900' : '3.90.900', +'390910' : '3.90.910', +'390920' : '3.90.920', +'390930' : '3.90.930', +'390940' : '3.90.940', +'390950' : '3.90.950', +'390960' : '3.90.960', +'390970' : '3.90.970', +'390980' : '3.90.980', +'41010' : '4.10.10', +'4101020' : '4.10.1020', +'4101070' : '4.10.1070', +'4101080' : '4.10.1080', +'4101090' : '4.10.1090', +'410110' : '4.10.110', +'410160' : '4.10.160', +'410220' : '4.10.220', +'410260' : '4.10.260', +'410270' : '4.10.270', +'410280' : '4.10.280', +'410372' : '4.10.372', +'410375' : '4.10.375', +'410410' : '4.10.410', +'410420' : '4.10.420', +'410450' : '4.10.450', +'410470' : '4.10.470', +'410480' : '4.10.480', +'410490' : '4.10.490', +'410520' : '4.10.520', +'410530' : '4.10.530', +'410540' : '4.10.540', +'410550' : '4.10.550', +'41070' : '4.10.70', +'410740' : '4.10.740', +'410790' : '4.10.790', +'4108' : '4.10.8', +'410800' : '4.10.800', +'410870' : '4.10.870', +'41091' : '4.10.91', +'41093' : '4.10.93', +'410940' 
: '4.10.940', +'41095' : '4.10.95', +'410950' : '4.10.950', +'410960' : '4.10.960', +'410990' : '4.10.990' +} + + diff --git a/scripts/convdb2.py b/scripts/convdb2.py new file mode 100755 index 0000000..3a7e985 --- /dev/null +++ b/scripts/convdb2.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python +############################################################################### +# +# convdb2.py - Convert PTTableauPacked tableaux db plus Numeri distance matrix +# to single ASCII format file. +# +# File: convdb2.py +# Author: Alex Stivala +# Created: August 2008 +# +# +# Usage: +# convdb2.py [-l] [-s] inputtableauxdb inputdistmatrixdb [< inputlist] > outputfile +# +# (output is to stdout) +# Input on stdin is list (one per line) of identifiers to include in output, +# if -l i specified. +# Sort the db by tableau size if -s is specified. +# +# Requires the Numeric library, as well as the pttableau module. +# +# $Id: convdb2.py 3496 2010-03-19 01:30:36Z alexs $ +# +############################################################################### + +import sys +import getopt +import pickle +import numpy.oldnumeric as Numeric +from pttableau import PTTableauPacked +from ptutils import isNaN + + +""" +This script converts tableaux database in pickled PTTableauPacked format +and distance matrix database in pickled Numeric.array format +built by buildtableauxdb.py to a simple fixed field width ASCII +format useful for parsing by other programs (especially FORTRAN). +There must be both a tableau and distance matrix for each entry (i.e. +the two input databases contain data for same identifiers). + +The format of the tableau input is pickled PTTableauPacked objects built by +buildtableauxdb.py, and the distance matrix input is pickled Numeric.array +objects built by buildtableauxdb.py (with -d option). + + +The format of the 'database' is a text file with an entry for each +structure. +The first line of an entry is the identifier and +order of tableau (i.e. dimension of square array), then +each subsequent row is a row of the tableau, lower triangle +only (since it is symmetric). +The diagonal entries are meaningless (self-angle) in tableaux, +and are included instead to specify the SSE type, with +the following codes: + +e beta strand +xa alpha helix +xi pi helix +xg 3_10 helix + +Width of identifier is 8 chars, blank padded on right, +width of order is 4 digits, blank padded on left. +There is a single space between identifier and order. +Each entry in tableau is two characters, with a space betwen +each on a line, and one line +per row of matrix. + +Following the tableau is the distance matrix. +Each row is a row of the distance matrix, lower triangle +only (since it is symmetric). +The diagonal entries are meaningless (self-distance) +and are included instead to specify the SSE type, with +the following codes: + +0.000 beta strand +1.000 alpha helix +2.000 pi helix +3.000 3_10 helix + +Each entry in matrix is in Angstroms format +F6.3 with a space between each on a line, and one line +per row of matrix. +NB any NaN values are converted to 0.000 in the output. 
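+
+As a rough sanity check on this layout, a single entry (like the one in the
+example below) could be read back with something along these lines (an
+illustrative sketch only, not used by this script; it relies on the
+whitespace-separated fields described above):
+
+    def read_entry(fh):
+        # header line: identifier and order (number of SSEs)
+        name, n = fh.readline().split()
+        n = int(n)
+        # lower-triangle tableau: row i has i+1 two-character codes
+        tableau = [fh.readline().split() for i in range(n)]
+        # lower-triangle distance matrix: row i has i+1 distances (Angstroms)
+        distmat = [[float(x) for x in fh.readline().split()] for i in range(n)]
+        return (name, tableau, distmat)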
+ + +E.g.: + +/local/charikar/astivala/tableauxdb/astral/tableauxdb.ascii + T F +D1UBIA_ 8 +e +OT e +LE RT xa +PD OS RD xg +RT LE RT LS e +LE RD LE LS OT e +RT LS LS RD PE OS xg +PE RT LE RD OT PE RT e + 0.000 + 4.501 0.000 + 1.662 10.386 1.000 +16.932 17.644 9.779 3.000 +10.588 13.738 11.815 10.527 0.000 +15.025 18.692 17.143 15.341 6.466 0.000 +15.298 17.276 16.276 20.075 13.264 11.610 3.000 + 7.549 11.072 12.248 12.446 4.583 9.903 15.689 0.000 + +There is a blank line between each entry. + +""" + + + +def sizecmp(dbent1, dbent2): + """ + Comparison function for (pdbid,dbtablist) + tuples used to sort database by size + """ + tab1 = dbent1[1][0] + tab2 = dbent2[1][0] + if len(tab1) < len(tab2): + return -1 + elif len(tab1) > len(tab2): + return 1 + else: + return 0 + + +def usage(prog): + """ + print usage message and exit + """ + sys.stderr.write("Usage: " + prog + " [-ls] inputtableauxdb inputdistmatrixdb[< inputlist] > outputfile\n") + sys.stderr.write(" -l : read list of SIDs on stdin\n") + sys.stderr.write(" -s : sort database by size\n") + sys.exit(1) + + +def main(): + """ + main for convdb2.py - load PTTableauPacked pickle and Numeric.array + distance matrix pickle and + output as ascii + """ + use_sidlist = False + do_sort = False + + try: + opts, args = getopt.getopt(sys.argv[1:], "ls") + except getopt.GetoptError: + usage(sys.argv[0]) + + for opt,arg in opts: + if opt == "-l": # read list of identifeirs on stdin + use_sidlist = True + elif opt == "-s": # sort db by size + do_sort = True + else: + usage(sys.argv[0]) + + if len(args) != 2: + usage(sys.argv[0]) + + dbfile = args[0] + distmatrixdbfile = args[1] + + if use_sidlist: + # make dictionary of identifiers from list on stdin + sid_dict = dict([(x.strip(), None) for x in sys.stdin.readlines()]) + + sys.stderr.write('loading tableaux...\n') + db = pickle.load(open(dbfile)) + sys.stderr.write('loading distance matrices...\n') + distmatrixdb = pickle.load(open(distmatrixdbfile)) + sys.stderr.write('writing ASCII format tableaux+distmatrices...\n') + first = True + count = 0 + total_count = 0 + dblist = [ (pdbid,dbtablist) for (pdbid,dbtablist) in db.iteritems() + if len(dbtablist) > 0] # remove those with no tableaux + if do_sort: + sys.stderr.write("sorting database...\n") + dblist.sort(cmp=sizecmp) + + for pdbid,dbtablist in dblist: + if use_sidlist and not sid_dict.has_key(pdbid): + continue + tabnum = 0 + while tabnum < len(dbtablist): + tableau = dbtablist[tabnum] + n = len(tableau) + name = pdbid + if len(dbtablist) > 1: + name += str(tabnum) + try: + distmatrix = distmatrixdb[pdbid][tabnum] + except KeyError: + sys.stderr.write('ERROR: no distance matrix for id ' + + pdbid + ' - skipped\n') + tabnum += 1 + continue + if len(distmatrix) != len(tableau): + sys.stderr.write('ERROR: dist matrix order ' + + str(len(distmatrix)) + ' but tableau order ' + + str(len(tableau)) + ' for id ' + pdbid + + ' - skipped\n') + tabnum += 1 + continue + if not first: + sys.stdout.write('\n') + else: + first = False + sys.stdout.write('%6s %4d\n' % (name, n)) + for i in xrange(n): + for j in xrange(i+1): + sys.stdout.write(tableau[(i,j)] + ' ') + sys.stdout.write('\n') + for i in xrange(n): + for j in xrange(i+1): + if isNaN(distmatrix[i,j]): + dist = 0.0 + else: + dist = distmatrix[i,j] + sys.stdout.write('%6.3f ' % dist) + sys.stdout.write('\n') + total_count += 1 + tabnum += 1 + count += 1 + sys.stderr.write('wrote %d tableaux+distmatrices for %d entries\n' + % (total_count, count)) + + +if __name__ == "__main__": + main() diff --git 
a/scripts/convdbnumeric2ascii.py b/scripts/convdbnumeric2ascii.py new file mode 100755 index 0000000..7630bfe --- /dev/null +++ b/scripts/convdbnumeric2ascii.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +############################################################################### +# +# convdbnumeric2ascii.py - Convert numeric tableaux db to ASCII format +# +# File: convdbnumeric2ascii.py +# Author: Alex Stivala +# Created: July 2008 +# +# +# Usage: +# convdbnumeric2ascii.py inputdb > outputfile +# +# (output is to stdout) +# +# This script is standalone, ie does not import any of the ptgraph +# etc. modules. It requires only the Numeric python library. +# +# $Id: convdbnumeric2ascii.py 2703 2009-07-27 06:01:05Z astivala $ +# +############################################################################### + +import sys +import pickle +import numpy.oldnumeric as Numeric + + +""" +This script converts a Numeric (Omega matrix) tableaux database as +built by buildtableauxdb.py -n to a simple fixed field width ASCII +format useful for parsing by other programs (especially FORTRAN). + +The format of the input is pickled Numeric.array values saved by +buildtableauxdb.py. + +The format of the output 'database' is a text file with an entry for each +structure. The first line of an entry is the identifier and +order of tableau (i.e. dimension of square Omega matrix), then +each subsequent row is a row of the Omega matrix, lower triangle +only (since it is symmetric). +The diagonal entries are meaningless (self-angle) in tableaux, +and are included instead to specify the SSE type, with +the following codes: + +0.000 beta strand +1.000 alpha helix +2.000 pi helix +3.000 3_10 helix + +Width of identifier is 8 chars, blank padded on right, +width of order is 4 digits, blank padded on left. +There is a single space between identifier and order. +Each entry in Omega matrix is in radians in [-pi, pi] format +F6.3 with a space between each on a line, and one line +per row of matrix. + +E.g.: + +D1UBIA_ 6 + 0.000 + 2.650 0.000 +-1.170 2.150 1.000 + 2.040 -1.140 2.080 0.000 +-1.260 1.560 -1.110 2.990 0.000 +-0.590 2.100 -1.230 2.570 -0.720 0.000 + +There is a blank line between each entry. + +Note any NaN values are converted to 0.0 in the output. +""" + +def isNaN(x): + """ + Test if supplied float is an IEEE not-a-number (NaN). + For some reason Python does not hav a function to do this, + and nor does Numeric (although numpy and scipy have support for it). + + Parameters: + x - float to test for NaN + + Return value: + True if x is NaN, else False. 
+ """ + # NaN is the only float value that is not equal to itself (IEEE + # standard) + if x != x: + return True + else: + return False + + +def usage(prog): + """ + print usage message and exit + """ + sys.stderr.write("Usage: " + prog + " inputdb > outputfile\n") + sys.exit(1) + + +def main(): + """ + main for convdbnumeric2ascii.py - load Numeric pickle and output as ascii + """ + if len(sys.argv) != 2: + usage(sys.argv[0]) + + dbfile = sys.argv[1] + db = pickle.load(open(dbfile)) + for pdbid,dbtablist in db.iteritems(): + tabnum = 0 + while tabnum < len(dbtablist): + omega = dbtablist[tabnum] + n = Numeric.shape(omega)[0] + name = pdbid + if len(dbtablist) > 1: + name += str(tabnum) + sys.stdout.write('%6s %4d\n' % (name, n)) + for i in xrange(n): + for j in xrange(i+1): + if isNaN(omega[i,j]): + angle = 0.0 + else: + angle = omega[i,j] + sys.stdout.write('%6.3f ' % angle) + sys.stdout.write('\n') + sys.stdout.write('\n') + tabnum += 1 + + +if __name__ == "__main__": + main() diff --git a/scripts/convdbpacked2ascii.py b/scripts/convdbpacked2ascii.py new file mode 100755 index 0000000..d07fd8d --- /dev/null +++ b/scripts/convdbpacked2ascii.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +############################################################################### +# +# convdbpacked2ascii.py - Convert PTTableauPacked tableaux db to ASCII format +# +# File: convdbpacked2ascii.py +# Author: Alex Stivala +# Created: July 2008 +# +# +# Usage: +# convdbpacked2ascii.py inputdb > outputfile +# +# (output is to stdout) +# +# $Id: convdbpacked2ascii.py 1701 2008-07-18 00:15:06Z astivala $ +# +############################################################################### + +import sys +import pickle +from pttableau import PTTableauPacked + + +""" +This script converts tableaux database in pickled PTTableauPacked format +built by buildtableauxdb.py to a simple fixed field width ASCII +format useful for parsing by other programs (especially FORTRAN). + +The format of the input is pickled PTTableauPacked objects built by +buildtableauxdb.py. + + +The format of the 'database' is a text file with an entry for each +structure. +The first line of an entry is the identifier and +order of tableau (i.e. dimension of square array), then +each subsequent row is a row of the tableau, lower triangle +only (since it is symmetric). +The diagonal entries are meaningless (self-angle) in tableaux, +and are included instead to specify the SSE type, with +the following codes: + +e beta strand +xa alpha helix +xi pi helix +xg 3_10 helix + +Width of identifier is 8 chars, blank padded on right, +width of order is 4 digits, blank padded on left. +There is a single space between identifier and order. +Each entry in tableau is two characters, with a space betwen +each on a line, and one line +per row of matrix. + +E.g.: + +/local/charikar/astivala/tableauxdb/astral/tableauxdb.ascii + T F +D1UBIA_ 6 +e +OT e +LE RT xa +RT LE RT e +LE RD LE OT e +PE RT LE OT PE e + +There is a blank line between each entry. 
+ +""" + +def usage(prog): + """ + print usage message and exit + """ + sys.stderr.write("Usage: " + prog + " inputdb > outputfile\n") + sys.exit(1) + + +def main(): + """ + main for convdbpacked2ascii.py - load PTTableauPacked pickle and + output as ascii + """ + if len(sys.argv) != 2: + usage(sys.argv[0]) + + dbfile = sys.argv[1] + db = pickle.load(open(dbfile)) + for pdbid,dbtablist in db.iteritems(): + tabnum = 0 + while tabnum < len(dbtablist): + tableau = dbtablist[tabnum] + n = len(tableau) + name = pdbid + if len(dbtablist) > 1: + name += str(tabnum) + sys.stdout.write('%6s %4d\n' % (name, n)) + for i in xrange(n): + for j in xrange(i+1): + sys.stdout.write(tableau[(i,j)] + ' ') + sys.stdout.write('\n') + sys.stdout.write('\n') + tabnum += 1 + + +if __name__ == "__main__": + main() diff --git a/scripts/convdbpacked2vector.py b/scripts/convdbpacked2vector.py new file mode 100755 index 0000000..4865ef9 --- /dev/null +++ b/scripts/convdbpacked2vector.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python +############################################################################### +# +# convdbpacked2ascii.py - Convert PTTableauPacked tableaux db to vector form +# +# File: convdbpacked2vector.py +# Author: Alex Stivala +# Created: March2010 +# +# +# Usage: +# convdbpacked2vector.py inputdb > outputfile +# +# (output is to stdout) +# +# $Id: convdbpacked2vector.py 3528 2010-03-25 05:33:30Z alexs $ +# +############################################################################### + +import sys +import pickle +import numpy +from pttableau import PTTableauPacked + + +""" +This script converts tableaux database in pickled PTTableauPacked format +built by buildtableauxdb.py to the vector feature format used +FORTRAN-77 implementation of IR Tableau, tableau searching by +vector cosine similarity as described by: + +Zhang, Bailey, Konagurthu, Ramamohanarao 2010 'A fast indexing approach +to protein structure comparison' BMC Bioinformatics 11(Suppl 1):S46D +8th Asia-Pacific Bioinformatics Conference (APBC 2010) + +The format of the input is pickled PTTableauPacked objects built by +buildtableauxdb.py. + +The output 'database' is an ASCII file where each line contains +the structure identifier and the 32-element integer vector representing +the tableau for that structure reduced to the 'bag of words' vector +as described in the paper cited above. + + +The output is to stdout. + +""" + + +# TYPE_CODE_DICT gives an index 0..32 for each combinatino of a pair +# of SSE type codes ('ee' is two strands, 'ex' is strand/helix, etc.) 
+# and tableau code +TYPE_CODE_DICT = \ +{ + ('xx','PE') : 0, + ('xx','PD') : 1, + ('xx','RD') : 2, + ('xx','RT') : 3, + ('xx','OT') : 4, + ('xx','OS') : 5, + ('xx','LS') : 6, + ('xx','LE') : 7, + + ('xe','PE') : 8, + ('xe','PD') : 9, + ('xe','RD') : 10, + ('xe','RT') : 11, + ('xe','OT') : 12, + ('xe','OS') : 13, + ('xe','LS') : 14, + ('xe','LE') : 15, + + ('ex','PE') : 16, + ('ex','PD') : 17, + ('ex','RD') : 18, + ('ex','RT') : 19, + ('ex','OT') : 20, + ('ex','OS') : 21, + ('ex','LS') : 22, + ('ex','LE') : 23, + + ('ee','PE') : 24, + ('ee','PD') : 25, + ('ee','RD') : 26, + ('ee','RT') : 27, + ('ee','OT') : 28, + ('ee','OS') : 29, + ('ee','LS') : 30, + ('ee','LE') : 31 +} + +def tableau_to_feature_vector(tableau): + """ + Given a tableau in PTTableau format, convert to feature vector + (numpy.array vector of integers) of dimension 32, in which each + element is the count of the number of occurrences of one of the + 32 (4 different SSE combinatinos and 8 differet two-character + tableau codes) possible types, as described by the Zhou et al 2010 + IR Tableau paper. + + Paramteters: + tableau - tableau in PTTableau format + + Return value: + numpy.array shape (32) vector of integers + + Uses global constant TYPE_CODE_DICT to get index for the type/tabcode + combination + """ + fvec = numpy.zeros(32, dtype='i') + for i in xrange(len(tableau)): + for j in xrange(i+1,len(tableau)): + if tableau[(i,i)][0] == 'e': + ssetypecode = 'e' + else: + ssetypecode = 'x' + if tableau[(j,j)][0] == 'e': + ssetypecode += 'e' + else: + ssetypecode += 'x' + tabcode = tableau[(i,j)] + if tabcode == '??': # sometimes cannot get angle, ignore + continue + vecindx = TYPE_CODE_DICT[(ssetypecode, tabcode)] + fvec[vecindx] += 1 + return fvec + + +def usage(prog): + """ + print usage message and exit + """ + sys.stderr.write("Usage: " + prog + " inputdb > outputfile\n") + sys.exit(1) + + +def main(): + """ + main for convdbpacked2vector.py - load PTTableauPacked pickle and + output feature vectors in ASCII + """ + if len(sys.argv) != 2: + usage(sys.argv[0]) + + dbfile = sys.argv[1] + sys.stderr.write('loading tableaux... ') + db = pickle.load(open(dbfile)) + sys.stderr.write('done\n') + for pdbid,dbtablist in db.iteritems(): + tabnum = 0 + while tabnum < len(dbtablist): + tableau = dbtablist[tabnum] + n = len(tableau) + name = pdbid + if len(dbtablist) > 1: + name += str(tabnum) + sys.stdout.write('%s ' % name) + fvec = tableau_to_feature_vector(tableau) + sys.stdout.write(" ".join(str(x) for x in fvec)) + sys.stdout.write('\n') + tabnum += 1 + + +if __name__ == "__main__": + main() diff --git a/scripts/daliliteout2col.py b/scripts/daliliteout2col.py new file mode 100755 index 0000000..abf450c --- /dev/null +++ b/scripts/daliliteout2col.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +# +# File: daliliteout2col.sh +# Author: Alex Stivala +# Created: March 2010 +# +# daliliteout2col.sh - Convert DaliLite .dccp output format to 2 column +# format as output by tsrchd_sparse etc. which can +# be processed with tsevalfn.py etc. +# +# Usage: daliliteout2col.sh < dccpfile +# +# The input file is read fomr stdin +# +# Output has two columns, database id and DaliLite Z-score +# +# Output is to stdout. +# +# +# We also convert pdbid with chain format e.g. 1atnA to format +# for Fischer evaluation e.g. 
1atn_a +# and those that did not have chain specified in Fischer data set, +# do now have for DaliLite, so remove them (so we need the Fischer +# pdb list) +# +# $Id: daliliteout2col.py 3575 2010-04-22 00:35:26Z alexs $ +# + + +import sys,os +from itertools import groupby + +from fischer_tables import FISCHER_ID_FOLD_DICT + +def daliid_to_fischerid(daliid): + """ + Convert a DaliLite id with chain e.g. 1atnA + to format for Fischer dat set e.g. 1atn_a, those without + chains in Fischer data set + e.g. 1cew have chain removed i.e. 1cewA becomes 1cew + NB all inputs (Dali ids) have chain + + Parameters: + daliid -DaliLite identifier, with chain on end (no dliemiter) + + Return value: + PDB identifier with chain after _ + """ + if FISCHER_ID_FOLD_DICT.has_key(daliid[:4]): + return daliid[:4] + else: + return daliid[:4] + '_' + daliid[4] + +def usage(progname): + sys.stderr.write("Usage: " + progname + " < dccpdata\n") + sys.exit(1) + + +if len(sys.argv) != 1: + usage(os.path.basename(sys.argv[0])) + + +querypdbid = None + + +scorelist = [] # list of (targetpdbid,zscore) tuples +for line in sys.stdin: + splitline = line.split() + if len(splitline) > 0 and splitline[0] == 'DCCP': + if len(splitline) == 10: + targetpdbid = splitline[9] + zscore = splitline[5] + if querypdbid == None: + querypdbid = splitline[8] + else: # sometimes fields 2 and 3 get stuck together + targetpdbid = splitline[8] + zscore = splitline[4] + scorelist.append((targetpdbid, zscore)) + +# for reasons I don't understand (and can't find any documentation on the +# dccp file format) there are often two or more entries for the same target +# with differing Z-scores and other values. We will also choose the one +# with highest Z-score +single_scorelist = [] +targetpdbid_group_iter = groupby(sorted(scorelist), lambda t : t[0]) +for (targetpdbid, targetpdbid_iter) in targetpdbid_group_iter: + maxzscore = max([zscore for (pdbid,zscore) in targetpdbid_iter]) + single_scorelist.append((daliid_to_fischerid(targetpdbid), maxzscore)) + + +querypdbid = daliid_to_fischerid(querypdbid) + +sys.stdout.write('# QUERY ID = ' + querypdbid + '\n') +for (targetpdbid, zscore) in single_scorelist: + sys.stdout.write('%s %s\n' % (targetpdbid, zscore)) + diff --git a/scripts/dalilitequery2col.py b/scripts/dalilitequery2col.py new file mode 100755 index 0000000..7b07a40 --- /dev/null +++ b/scripts/dalilitequery2col.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# +# File: dalilitequery2col.sh +# Author: Alex Stivala +# Created: March 2010 +# +# dalilitequery2col.sh - Convert DaliLite .dccp output format to 2 column +# format as output by tsrchd_sparse etc. which can +# be processed with tsevalfn.py etc. +# +# Usage: dalilitequery2col.sh < querysidlist +# +# The input file is read as pdbid.dccp in cwd +# where the pdbid is constructed from +# the query id SCOP sid e.g. d1u6ra2 becomes 1u6rA, for each query sid +# in the querysidlist (one sid per line) on stdin. +# +# Output has two columns, database id and DaliLite Z-score +# +# Output is to stdout. 
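# The .dccp files often contain several DCCP records for the same target;
# as in daliliteout2col.py, only the highest Z-score per target is kept.
# A minimal sketch of that step (the identifiers and scores below are
# invented), comparing the scores numerically:

from itertools import groupby
scores = [("d1abca_", "12.4"), ("d1abca_", "9.7"), ("d2xyzb_", "3.1")]
best = [(t, max(float(z) for (_, z) in grp))
        for (t, grp) in groupby(sorted(scores), lambda r: r[0])]
# best == [("d1abca_", 12.4), ("d2xyzb_", 3.1)]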
+# +# $Id: dalilitequery2col.py 3473 2010-03-16 04:06:23Z alexs $ +# + + +import sys,os +from itertools import groupby + +from Bio.SCOP import * + +from tsevalutils import filter_domains_astral_nrpercent +from pathdefs import SCOP_DIR,SCOP_VERSION +from pdbid2scopsid import pdbid_to_scopsid + + +def usage(progname): + sys.stderr.write("Usage: " + progname + " < querysidlist\n") + sys.exit(1) + + +if len(sys.argv) != 1: + usage(os.path.basename(sys.argv[0])) + + + +# read SCOP and ASTRAL data +sys.stderr.write('reading SCOP data...\n') +scop = Scop(dir_path=SCOP_DIR,version=SCOP_VERSION) +astral = Astral(dir_path=SCOP_DIR,version=SCOP_VERSION,scop=scop) +nrpercent = 95 # Always use 95% nr subset. TODO make this an option +all_domains = scop.getRoot().getDescendents('domain') +if nrpercent != None: + all_domains = filter_domains_astral_nrpercent(all_domains, + scop, astral, + nrpercent) +all_scopsids_dict = dict( [(d.sid,True) for d in all_domains] ) + + +for querysid in sys.stdin: + querysid = querysid.rstrip() + querypdbid = querysid[1:5] + querysid[5].upper() + dccpfilename = querypdbid + ".dccp" + + scorelist = [] # list of (targetsid,zscore) tuples + for line in open(dccpfilename): + splitline = line.split() + if len(splitline) > 0 and splitline[0] == 'DCCP': + if len(splitline) == 10: + targetpdbid = splitline[9] + zscore = splitline[5] + else: # sometimes fields 2 and 3 get stuck together + targetpdbid = splitline[8] + zscore = splitline[4] + targetsid = pdbid_to_scopsid(targetpdbid, all_scopsids_dict) + scorelist.append((targetsid, zscore)) + + # for reasons I don't understand (and can't find any documentation on the + # dccp file format) there are often two or more entries for the same target + # with differing Z-scores and other values. We will also choose the one + # with highest Z-score + single_scorelist = [] + targetsid_group_iter = groupby(sorted(scorelist), lambda t : t[0]) + for (targetsid, targetsid_iter) in targetsid_group_iter: + maxzscore = max([zscore for (sid,zscore) in targetsid_iter]) + single_scorelist.append((targetsid, maxzscore)) + + + sys.stdout.write('# QUERY ID = ' + querysid + '\n') + for (targetsid, zscore) in single_scorelist: + sys.stdout.write('%s %s\n' % (targetsid, zscore)) + diff --git a/scripts/domeval.py b/scripts/domeval.py new file mode 100644 index 0000000..9ad5377 --- /dev/null +++ b/scripts/domeval.py @@ -0,0 +1,459 @@ +############################################################################### +# +# domeval.py - functions to evaluate domain decomposition accuracy +# +# File: domeval.py +# Author: Alex Stivala +# Created: December 2007 +# +# $Id: domeval.py 3236 2010-01-13 02:06:50Z alexs $ +# +############################################################################### + +""" +Functions to evaluate domain decomposition accuracy. + +The accuracy of a domain decomposition is computed as overlap of predicted +(test) and assigned (reference) residues in the domain decomposition. +If a different number of domains is assigned, the decomposition is scored +as failed an no overlap is computed. + +This measure is as defined by Jones et al 1998 'Domain assignment for protein +structure using a consensus approach: Characterization and analysis', +Protein Science 7:233-242. + +""" +import os,sys + +from Bio.PDB import * + +from ptdomain import * +from parsepdom import * +from getdomains import * + + +def permutations(l): + """ + Return all the permutations of the list l. 
+ Obivously this should only be used for extremely small lists (like less + than 9 members). + Paramters: + l - list to find permutations of + Return value: + list of lists, each list a permutation of l. + """ + if len(l) > 0: + return [ [x] + perms for x in l + for perms in permutations([ e for e in l if e != x]) ] + else: + return [[]] + +def compute_overlap_score(test_domlist, ref_domlist): + """ + Compute the overlap score between the two domain decompositions of + the same length represented by test and ref domlist. + + For two domain assignment methods, we can't assume they order + their domains the same way, so we have to consider every possible + mapping between them and use the best score, as dicussed in + Veretnik et al 2004 J. Mol. Biol. 339(3):647-678. + + + Parameters: + test_domlist - list of PTDomain from the test (predicted) decomposition + ref_domlist - list of PTDomain from the reference (gold standard) decomp. + + Return value: + The maximum overlap score over all permutations of domains + """ + assert(len(test_domlist) == len(ref_domlist)) + + # get the lowest and highest residue sequence number in each chain + # of the reference domains and build a dictionary from it. + chain_dict = build_domain_chaindict(ref_domlist) + + # compute the total number of residues spanned by the refernce domain list + total_residues = 0 + for (min_resnum, max_resnum) in chain_dict.itervalues(): + total_residues += max_resnum - min_resnum + 1 + + # verify that the test decompositino is valid + if not verify_domain_disjoint(test_domlist, chain_dict): + sys.stderr.write('ERROR: invalid domain decomposition (not disjoint)\n') + write_domains(sys.stderr, test_domlist) + return 0.0 + + # now compute the score for every possible mapping of the domains + # (so it suffices to use every permutation of one of the lists keeping + # the other fixed). Since we don't expect ever more than 8 domains + # (and usually 4 or fewer) this should be ok, but still expensive. + scores = [compute_overlap_score_ordered(test_permutation, ref_domlist, + chain_dict, total_residues) + for test_permutation in permutations(test_domlist)] + return max(scores) + + +def compute_overlap_score_ordered(test_domlist, ref_domlist, + chain_dict, total_residues): + """ + For two domain lists of the same length, ordered so that + corresponding domains 'line up' compute the overlap score + (discussed above) as the fraction of residues that are assigned to + the same domain. + + Note the ordering requirement. For two domain assignment methods, + we can't assume they order their domains the same way, so this + function has to be called multiple times with different orderings + to find the one with the best score. + + This is a bit more complicated as we handle multiple chains. + + Parameters: + test_domlist - list of PTDomain from the test (predicted) decomposition + ref_domlist - list of PTDomain from the reference (gold standard) decomp. + chain_dict - dict of {chainid : (min_resnum, max_resnum)} built by + caller. + total_resides - total number of residues in protein. + + Return value: + overlap score in [0, 1] + """ + assert(len(test_domlist) == len(ref_domlist)) + + # calculate the overlap score by going through each residue + # and counting 1 for overlap between the two domain decompositions. + # + # TODO: we could probably more efficiently compute this using the + # cut positions and a formula like the one in Emmert-Streib and Mushegian + # 2007 BMC Bioinformatics 8:237 + # but this is easier (if very inefficient). 
Howevr the Emmert-Streib + # equation assumes domains consist of only one segment (since that's + # how their DomainICA algorithm works) so is not general enough for + # our purposes. + overlap_count = 0 + for i in range(len(ref_domlist)): + for (chain, (min_resnum, max_resnum)) in chain_dict.iteritems(): + for resnum in range(min_resnum, max_resnum+1): + if (test_domlist[i].is_in_domain(chain, resnum) and + ref_domlist[i].is_in_domain(chain, resnum)): + overlap_count += 1 + + score = float(overlap_count) / float(total_residues) + return score + + +def domain_eval(test_domlist, ref_domlist): + """ + If the two domain lists are the same length, compute the overlap score + (discussed above) as the fraction of residues that are assigned to + the same domain. + + Otherwise, describe the test decomposition as 'undercut' (fewer + domains than reference) or 'overcut' (more domains than reference). + + Parameters: + test_domlist - list of PTDomain from the test (predicted) decomposition + ref_domlist - list of PTDomain from the reference (gold standard) decomp. + + Return value: + tuple (description, score) where description is + 'undercut', 'overcut' or 'correct' + and score is the overlap score in [0,1] if 'correct' otherwise 0.0 + + """ + if len(test_domlist) < len(ref_domlist): + return ('undercut', 0.0) + elif len(test_domlist) > len(ref_domlist): + return ('overcut', 0.0) + else: + return ('correct', compute_overlap_score(test_domlist, ref_domlist)) + + + +def evaluate_domains(domainlist, eval_domain_program, pdbid, + pdb_filename, pdb_struct, chainid=None): + """ + Evaluate the performance of the domain decmoposiotion reprresented by + the supplied domainlist against the program or database eval_domain_program + + Parmeters: + domain_list - list of PTDomain for our decomposition + eval_domain_program - 'cath:cdf_file_name' (CATH) or other supported + program or database (see ptdomain.py) + pdbid - PDB identifier for the protein + pdb_filename - name of PDB file (needed for DDOMAIN) + pdb_struct - Bio.PDB parsed structure (needed for DDOMAIN) + chainid - (default None). If not None, only use this chain. + + Return value: tuple the (num_domains, description, score) + where num_domains is the number of domains + in the reference (ie from the eval_domain_program) + and decription and score are from thetuple from domain_eval + (see domeval.py) + + """ + ref_domain_list = get_domains(eval_domain_program, + pdbid, pdb_filename, pdb_struct, + chainid) + num_domains = len(ref_domain_list) + if verbose: + print eval_domain_program + write_domains(sys.stdout, ref_domain_list) + + (description, score) = domain_eval(domainlist, ref_domain_list) + return (num_domains, description, score) + + + +def run_on_pdomains_list(pdbid_list, + pdb_root, + pdomains_filename, + print_results, + get_domains_function, + *get_domains_args): + """ + Run the supplied domain decomposition function get_domains_fuction + (with args get_domains_args) over all pDomains benchmark chains in + the specified list of PDB/chain identifiers. + Used for training/testing/crossvalidation for tuning parameters etc. + + Parameters: + pdbid_list - list of PDB/chain identifiers (keys in dict built by + parse_pdomains_file()) + pdb_root - root of the PDB divided hierarchy to find PDB files. + pdomains_filename - the fiename of the pDomains benchmark file + print_results - If True, write results of each chain to stdout. 
+ get_domains_fuction - A function that, given the following args, + in order: + pdbid - PDB identifier + pdb_filename - PDB file name + pdb_struct - Bio.PDB parsed PDB structure + chainid - If not None, chain idnetifier to process + and then those + in get_domains_args, returns a domain decomposition + in the form of a list of PTDomain objects. + get_domains_args - variable args list for get_dmoains_function + + Return value: + tuple (undercut, overcut, correct, avgscore, + num_correct_assign, numdonmains_dict, num_processed) + where undercut, overcut, correct are number of domains that were + undercut (too few domains) overcut (too many domains) or had the + correct nubmer of domains, respectively, and avgscore is the + average score for all chains (scoring 0.0 for undercut/overcut) + and num_correct_assign is number correctly assigned (number of + domains correct and score over threshold) + and numdomains_dict is a dictionary of + { num_domains : (frequency, totalscore, avgscore,undercut,overcut,num_correct_domains,num_correct_assign)} + mapping number of domains to scores for chains with that number of + domains. + + Raises Exceptions: + ValueError if bad return value from evaluate_domains() + + """ + THRESHOLD_SCORE = 0.75 # must be above this to be correct + + total_score = 0.0 + num_undercut = 0 + num_overcut = 0 + num_correct = 0 + num_correct_assign = 0 + num_processed = 0 + numdomains_dict = {} # described in docstring above + for pdbchain in pdbid_list: + if len(pdbchain) > 4: + pdbid = pdbchain[:4] + chainid = pdbchain[4] + else: + pdbid = pdbchain[:4] + chainid = None + pdb_dir = os.path.join(pdb_root, pdbid[1:3].lower()) + pdb_filename = os.path.join(pdb_dir, 'pdb' + pdbid.lower() + '.ent.gz') + + if not os.path.exists(pdb_filename): + sys.stderr.write("WARNING: pdb file " + pdb_filename + + " not found, skipping\n") + continue + + # check for compressed files. We only support gzip (.gz) + # Note we are not using the zlib or GzipFile python modules + # since we are calling to external programs which require the + # file uncompressed themsevles anyway so we'll just run gzip + # to uncompress the file to a temporary directory. 
+ pdb_file_basename = os.path.basename(pdb_filename) + (name,extension) = os.path.splitext(pdb_file_basename) + if extension == '.gz': + TMPDIR = os.tempnam(None, "ptgz") + os.mkdir(TMPDIR) + tmp_pdbfilename = os.path.join(TMPDIR, name) + os.system("gzip " + pdb_filename + " -d -c > " + tmp_pdbfilename) + our_pdb_filename = tmp_pdbfilename + used_tmp_file = True + else: + our_pdb_filename = pdb_filename + used_tmp_file = False + + try: + # parse PDB file + pdb_parser = PDBParser() + pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename) + + # run the domain decomposition method and evaluate results + domainlist = get_domains_function(pdbid, our_pdb_filename, + pdb_struct, + chainid, + *get_domains_args) + if domainlist == None: + if chainid == None: + chainname = '' + else: + chainname = 'chain ' + chainid + sys.stderr.write('WARNING: domain decomposition failed for ' + + pdbid + ' ' + chainname + '\n') + continue + evalresult = evaluate_domains(domainlist, + "pdomains:" + pdomains_filename, + pdbid, + our_pdb_filename, pdb_struct, + chainid) + (num_domains, description, score) = evalresult + num_processed += 1 + finally: + if used_tmp_file: + cleanup_tmpdir(TMPDIR) + + assigndescr = 'incorrect' + if description == 'undercut': + num_undercut += 1 + elif description == 'overcut': + num_overcut += 1 + elif description == 'correct': + num_correct += 1 + if score > THRESHOLD_SCORE: + num_correct_assign += 1 + assigndescr = 'correct' + else: + raise ValueError('unknown description ' + description + + ' from evaluate_domains\n') + if print_results: + sys.stdout.write(pdbchain + '\t' + str(num_domains) + '\t' + + description + '\t' + str(score) + ' ' + + assigndescr + '\n' ) + + total_score += score + if numdomains_dict.has_key(num_domains): + (dfrequency, dtotalscore, davgscore, + dundercut,dovercut,dnum_correct_domains,dnum_correct_assign) = \ + numdomains_dict[num_domains] + else: + dfrequency = 0 + dtotalscore = 0.0 + davgscore = 0.0 + dundercut = 0 + dovercut = 0 + dnum_correct_domains = 0 + dnum_correct_assign = 0 + dfrequency += 1 + dtotalscore += score + if description == 'undercut': + dundercut += 1 + elif description == 'overcut': + dovercut += 1 + elif description == 'correct': + dnum_correct_domains += 1 + if score > THRESHOLD_SCORE: + dnum_correct_assign += 1 + else: + assert(False) + numdomains_dict[num_domains] = (dfrequency, dtotalscore, davgscore, + dundercut,dovercut, + dnum_correct_domains, + dnum_correct_assign) + + for num_domains in numdomains_dict.iterkeys(): + (freq, total, avg, dunder,dover,dnumcd,dnumca) = numdomains_dict[num_domains] + avg = total / float(freq) + numdomains_dict[num_domains] = (freq,total,avg,dunder,dover,dnumcd,dnumca) + + avgscore = total_score / float(num_processed) + return (num_undercut, num_overcut, num_correct, avgscore, + num_correct_assign, numdomains_dict, num_processed) + + + +def run_on_pdomains_file(pdb_root, + pdomains_filename, + print_results, + get_domains_function, + *get_domains_args): + + """ + Run the domain decomposition over all pDomains benchmark chains in + the specified pDomains benchamge file. + Used for training/testing/crossvalidation for tuning parameters etc. + + Parameters: + pdb_root - root of the PDB divided hierarchy to find PDB files. 
+ pdomains_filename - the fiename of the pDomains benchmark file + print_results - If True, print results for each chain to stdout + get_domains_fuction - A function that, given the following args, + in order: + pdbid - PDB identifier + pdb_filename - PDB file name + pdb_struct - Bio.PDB parsed PDB structure + chainid - if not None, chain identnifier of chain + and then those + in get_domains_args, returns a domain decomposition + get_domains_args - variable args for get-domains_function + + Return value: + tuple (undercut, overcut, correct, avgscore, + num_correct_assign, numdomain_dict) + as described in run_on_pdomains_list() + where undercut, overcut, correct are number of domains that were + undercut (too few domains) overcut (too many domains) or had the + correct nubmer of domains, respectively, and avgscore is the + average score for all chains (scoring 0.0 for undercut/overcut) + and num_correct_assign is number assigned correctly (correct domain + number and score over threshold) + and numdonains_dict maps number of domains to scores, + as described in run_on_pdomains_list() + + """ + pdomains = parse_pdomains_file(open(pdomains_filename)) + return run_on_pdomains_list(pdomains.keys(), pdb_root, pdomains_filename, + print_results, + get_domains_function, + *get_domains_args) + + + +def print_scores(num_processed, + num_undercut, num_overcut, num_correct, num_correct_assign, + avgscore, indent=0): + """ + Neatly format the scores to stdout. + Parameters: + num_undercut - number overcut + num_overcut - number undercut + num_correct - number of correctly assigned domain numbers + num_correct_assign - number of correctly assigned domains + avgscore - average score in [0,1] + indent - (default 0) number of spaces to indent + Return value: + None + """ + sys.stdout.write(indent*' ') + sys.stdout.write("number processed: %d\n" % num_processed) + sys.stdout.write(indent*' ') + sys.stdout.write("undercut: %d\n" % num_undercut) + sys.stdout.write(indent*' ') + sys.stdout.write("overcut: %d\n" % num_overcut) + sys.stdout.write(indent*' ') + sys.stdout.write("correct domains: %d\n" % num_correct) + sys.stdout.write(indent*' ') + sys.stdout.write("correct assign: %d\n" % num_correct_assign) + sys.stdout.write(indent*' ') + sys.stdout.write("average score: %3.1f%%\n" % (avgscore*100.0)) + diff --git a/scripts/fakepdb_to_cops.py b/scripts/fakepdb_to_cops.py new file mode 100755 index 0000000..680f5af --- /dev/null +++ b/scripts/fakepdb_to_cops.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# +# File: fakepdb_to_cops.py +# Author: Alex Stivala +# Created: April 2010 +# +# $Id: fakepdb_to_cops.py 3686 2010-05-17 07:31:00Z alexs $ +""" + fakepdb_to_cops.py - Convert fake PDB identifers back to COPS identifiers + + Usage: fakepdb_to_cops.py < DaliteLite-2col-output + + The input file is 2 column from dalilitout2col.py + on stdin. + Output is to stdout. + + + Note that DaliLite ONLY allows 4 char PDB codes, with chain appended, so + SCOP or COPS type codes will not work in cases where there are more than + one with the same pdb code and chain e.g. c1d3uB1 and c1d3uB2 + This is highly invonvenient, we have to get around it by mapping ALL + structures to 'fake' PDB codes with the cops_to_fakepdb.py and back with the + fakepdb_to_cops.py scripts. 
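For illustration only (these identifiers are invented; the real mapping file is
presumably produced by cops_to_fakepdb.py), the mapping file has two
whitespace-separated columns, COPS id then fake PDB id, and is read into a pair
of dictionaries:

example_lines = ["c1d3uB1 9001A", "c1d3uB2 9002A"]   # invented mapping lines
fake2cops = {}
cops2fake = {}
for mapline in example_lines:
    copsid, fakeid = mapline.split()
    fake2cops[fakeid] = copsid
    cops2fake[copsid] = fakeid
# fake2cops["9001A"] == "c1d3uB1"; cops2fake["c1d3uB2"] == "9002A"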
+ +""" + +import sys,os + +from tsevalutils import iter_searchresult + + +COPS_FAKEPDBIDS_FILE = "/home/alexs/phd/qptabsearch/data/COPS/cops.fakepdbids" + + + +def parse_fakepdbids_file(fname): + """ + Parse the fake pdb id file to build dictionary mapping fake pdb id + to COPS Id + + Parameters: + fname - name of file to parse, first col is COPS id, 2nd is fake PDB id + + Return value: + tuple ( fake2cops, cops2fake ) where + fake2cops is dict { fakepdb : copsid } mapping fake PDB id to COPS id + cops2fake is dict { copsid : fakepdb } mapping COPS to fake PDB id + """ + fake2cops_dict = {} + cops2fake_dict = {} + for line in open(fname): + if line[0] == '#': + continue + sline = line.split() + if len(sline) != 2: + sys.stderr.write('bad line: %s\n' % line) + fake2cops_dict[sline[1]] = sline[0] + cops2fake_dict[sline[0]] = sline[1] + return ( fake2cops_dict, cops2fake_dict ) + + +def usage(progname): + sys.stderr.write("Usage: " + progname + " < DaliLite2ColOut\n") + sys.exit(1) + + +def main(): + """ + main for fakepdb_to_cops.py - see usage message at file header + """ + if len(sys.argv) != 1: + usage(os.path.basename(sys.argv[0])) + + FAKE_TO_COPS_DICT = parse_fakepdbids_file(COPS_FAKEPDBIDS_FILE)[0] + + for (score, dbid) in iter_searchresult(sys.stdin,multiquery=False): + sys.stdout.write("%s %f\n" % (FAKE_TO_COPS_DICT[dbid], score)) + + +if __name__ == "__main__": + main() diff --git a/scripts/fastscopdominfo.py b/scripts/fastscopdominfo.py new file mode 100755 index 0000000..5bafea5 --- /dev/null +++ b/scripts/fastscopdominfo.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +############################################################################### +# +# fastscopdominfo.py - Report information folds and classes of a list of SCOP sids +# +# File: fastscopdominfo.py +# Author: Alex Stivala +# Created: March 2010 +# +# $Id: fastscopdominfo.py 3009 2009-12-08 03:01:48Z alexs $ +# +############################################################################### + +""" +Report information on the folds and superfamilies and classes of a list +of SCOP domain identifiers (sids). +scopdominfo.py does this from Bio.SCOP, but this is quite slow to load +so this version uses a cached (pickle dictionary) table built by +build_fastscopdominfo_cache.py + +See usage in docstring for main() +""" + +import sys,os +import pickle + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def write_dom_info(scopsid_list, scopdominfo_dict): + """ + Write information about the list of SCOP sids (domain identifiers) + in the scopsid_list to fh. For each domain write the superfamily sccs, + superfamily description and fold description. + Delimiter is '|' + + + Parameters: + scopsid_list - list of SCOP sids (domain ids) + scopdominfo_dict - + dict {sid: (superfamily_sccs, superfamily_description, fold_sccs, fold_description)} + where + superfamily_sccs is SCOP sccs identifying the superfamily for the domain + superamily_description is SCOP dessription of the superfamily + fold_description is the SCOP descriptino of the fold the domain is in + Return value: + None. 
+ """ + for sid in scopsid_list: + entry = scopdominfo_dict[sid] + sf_sccs = entry[0] + sf_desc = entry[1] + fold_sccs =entry[2] + fold_desc = entry[3] + sys.stdout.write("%s | %s | %s | %s\n" %(sid, sf_sccs, sf_desc, fold_desc)) + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + + "cachefile < domainidlist\n") + sys.exit(1) + + +def main(): + """ + main for fastscopdominfo.py + + Usage: fastscopdominfo.py cachefile < domainidlist + + cachefile is the filename of the cache (pickled) file built by + build_fastscopdominfo_cache.py + + The list of SCOP domain ids (sids) is read from stdin + Output is written to stdout. + """ + if len(sys.argv) != 2: + usage(os.path.basename(sys.argv[0])) + + pickle_filename = sys.argv[1] + scopdominfo_dict = pickle.load(open(pickle_filename)) + scopsid_list = sys.stdin.read().split('\n')[:-1] + write_dom_info(scopsid_list, scopdominfo_dict) + + +if __name__ == "__main__": + main() diff --git a/scripts/fischer_tables.py b/scripts/fischer_tables.py new file mode 100644 index 0000000..71fd2de --- /dev/null +++ b/scripts/fischer_tables.py @@ -0,0 +1,212 @@ +# +# File: fischer_tables.py +# Author: Alex Stivala +# Created: September 2008 +# +# +# $Id: fischer_tables.py 3575 2010-04-22 00:35:26Z alexs $ +# +# + +""" +Fischer data set (Fischer et al 1996 Pac. Symp. Biocomput. 300-318)) +as per Pelta et all 2008 BMC Bioinformatics 9:161 +""" + + +#----------------------------------------------------------------------------- +# +# Constants +# +#----------------------------------------------------------------------------- + + +# The 68 probe sequences from Fischer 1996 Table II +# Note several PDB ids obsoleted, so change to the replacments + +# map id to name of fold +FISCHER_ID_FOLD_DICT = { + '1dxt_b' : 'globin-like', + '1cpc_l' : 'globin-like', + '1c2r_a' : 'cytochrome', + '2mta_c' : 'cytochrome', + '1bbh_a' : 'helical bundle', + '1bge_b' : 'helical bundle', + '1rcb' : 'helical bundle', + '1aep' : 'helical bundle', + '1osa' : 'ef-hand', + '2sas' : 'ef-hand', + '1hom' : 'other alpha', + '1lga_a' : 'other alpha', + '2hpd_a' : 'other alpha', + '1chr_a' : 'tim barrel', + '2mnr' : 'tim barrel', + '3rub_l' : 'tim barrel', + '1crl' : 'hydrolase', + '1tah_a' : 'hydrolase', + '1aba' : 'thieredoxin', + '1dsb_a' : 'thieredoxin', + '1gpl_a' : 'thieredoxin', + '1atn_a' : 'ribonuclease', + '1hrh_a' : 'ribonuclease', + '3chy' : 'open sheet', + '2ak3_a' : 'open sheet', + '1gky' : 'open sheet', + '2cmd' : 'open sheet', + '1eaf' : 'open sheet', + '2gbp' : 'open sheet', + '1mio_c' : 'open sheet', + '2pia' : 'open sheet', + '1gal' : 'open sheet', + '1npx' : 'open sheet', + '2hhm_a' : 'mixed', + '1hip' : 'small', + '1isu_a' : 'small', + '1fc1_a' : 'ig', + '2fbj_l' : 'ig', + '1cid' : 'ig-like', + '1pfc' : 'ig-like', + '1ten' : 'ig-like', + '1tlk' : 'ig-like', + '3cd4' : 'ig-like', + '3hla_b' : 'ig-like', + '1aaj' : 'copredoxin', + '2afn_a' : 'copredoxin', + '2aza_a' : 'copredoxin', + '4sbv_a' : 'virus', + '1bbt_1' : 'virus', + '1sac_a' : 'lectin-like', + '1lts_d' : 'ob-fold', + '1tie' : 'trefoil', + '8i1b' : 'trefoil', + '1arb' : 'trypsin', + '2sga' : 'trypsin', + '2snv' : 'trypsin', + '1mdc' : 'lipocalin', + '1mup' : 'lipocalin', + '2sim' : 'propeller', + '1cau_b' : 'other beta', + '2omf' : 'other beta', + '1fxi_a' : 'ub fold', + '1cew' : 'cystatin', + 
'1stf_i' : 'cystatin', + '2pna' : 'sh2', + '2sar_a' : 'other alpha+beta', + '1onc' : 'other alpha+beta', + '5fd1' : 'other alpha+beta' +} + +# map name of fold to list of ids +FISCHER_FOLD_IDLIST_DICT = { + 'globin-like' : ['1dxt_b','1cpc_l'], + 'cytochrome' : ['1c2r_a','2mta_c'], + 'helical bundle' : ['1bbh_a','1bge_b','1rcb','1aep'], + 'ef-hand' : ['1osa','2sas'], + 'other alpha' : ['1hom','1lga_a','2hpd_a'], + 'tim barrel' : ['1chr_a','2mnr','3rub_l'], + 'hydrolase' : ['1crl','1tah_a'], + 'thieredoxin' : ['1aba','1dsb_a','1gpl_a'], + 'ribonuclease' : ['1atn_a','1hrh_a'], + 'open sheet' : ['3chy','2ak3_a','1gky','2cmd','1eaf','2gbp','1mio_c','2pia','1gal','1npx'], + 'mixed' : ['2hhm_a'], + 'small' : ['1hip','1isu_a'], + 'ig' : ['1fc1_a','2fbj_l'], + 'ig-like' : ['1cid','1pfc','1ten','1tlk','3cd4','3hla_b'], + 'copredoxin' : ['1aaj','2afn_a','2aza_a'], + 'virus' : ['4sbv_a','1bbt_1'], + 'lectin-like' : ['1sac_a'], + 'ob-fold' : ['1lts_d'], + 'trefoil' : ['1tie','8i1b'], + 'trypsin' : ['1arb','2sga','2snv'], + 'lipocalin' : ['1mdc','1mup'], + 'propeller' : ['2sim'], + 'other beta' : ['1cau_b','2omf'], + 'ub fold' : ['1fxi_a'], + 'cystatin' : ['1cew','1stf_i'], + 'sh2' : ['2pna'], + 'other alpha+beta': ['2sar_a','1onc','5fd1'] +} + + + +# map id to name of class +FISCHER_ID_CLASS_DICT = { + '1dxt_b' : 'alpha', + '1cpc_l' : 'alpha', + '1c2r_a' : 'alpha', + '2mta_c' : 'alpha', + '1bbh_a' : 'alpha', + '1bge_b' : 'alpha', + '1rcb' : 'alpha', + '1aep' : 'alpha', + '1osa' : 'alpha', + '2sas' : 'alpha', + '1hom' : 'alpha', + '1lga_a' : 'alpha', + '2hpd_a' : 'alpha', + '1chr_a' : 'alpha/beta', + '2mnr' : 'alpha/beta', + '3rub_l' : 'alpha/beta', + '1crl' : 'alpha/beta', + '1tah_a' : 'alpha/beta', + '1aba' : 'alpha/beta', + '1dsb_a' : 'alpha/beta', + '1gpl_a' : 'alpha/beta', + '1atn_a' : 'alpha/beta', + '1hrh_a' : 'alpha/beta', + '3chy' : 'alpha/beta', + '2ak3_a' : 'alpha/beta', + '1gky' : 'alpha/beta', + '2cmd' : 'alpha/beta', + '1eaf' : 'alpha/beta', + '2gbp' : 'alpha/beta', + '1mio_c' : 'alpha/beta', + '2pia' : 'alpha/beta', + '1gal' : 'alpha/beta', + '1npx' : 'alpha/beta', + '2hhm_a' : 'other', + '1hip' : 'other', + '1isu_a' : 'other', + '1fc1_a' : 'beta', + '2fbj_l' : 'beta', + '1cid' : 'beta', + '1pfc' : 'beta', + '1ten' : 'beta', + '1tlk' : 'beta', + '3cd4' : 'beta', + '3hla_b' : 'beta', + '1aaj' : 'beta', + '2afn_a' : 'beta', + '2aza_a' : 'beta', + '4sbv_a' : 'beta', + '1bbt_1' : 'beta', + '1sac_a' : 'beta', + '1lts_d' : 'beta', + '1tie' : 'beta', + '8i1b' : 'beta', + '1arb' : 'beta', + '2sga' : 'beta', + '2snv' : 'beta', + '1mdc' : 'beta', + '1mup' : 'beta', + '2sim' : 'beta', + '1cau_b' : 'beta', + '2omf' : 'beta', + '1fxi_a' : 'alpha+beta', + '1cew' : 'alpha+beta', + '1stf_i' : 'alpha+beta', + '2pna' : 'alpha+beta', + '2sar_a' : 'alpha+beta', + '1onc' : 'alpha+beta', + '5fd1' : 'alpha+beta' +} + +# map name of class to list of ids +FISCHER_CLASS_IDLIST_DICT = { + 'alpha' : ['1dxt_b','1cpc_l','1c2r_a','2mta_c', '1bbh_a','1bge_b','1rcb','1aep','1osa','2sas', '1hom','1lga_a','2hpd_a'], + 'alpha/beta' : ['1chr_a','2mnr','3rub_l','1crl','1tah_a','1aba','1dsb_a','1gpl_a', '1atn_a','1hrh_a','3chy','2ak3_a','1gky','2cmd','1eaf','2gbp','1mio_c','2pia','1gal','1npx'], + 'other' : ['2hhm_a','1hip','1isu_a'], + 'beta' : ['1fc1_a','2fbj_l', '1cid','1pfc','1ten','1tlk','3cd4','3hla_b', '1aaj','2afn_a','2aza_a','4sbv_a','1bbt_1', '1sac_a','1lts_d', '1tie','8i1b', '1arb','2sga','2snv', '1mdc','1mup', '2sim', '1cau_b','2omf'], + 'alpha+beta' : ['1fxi_a', '1cew','1stf_i', '2pna', '2sar_a','1onc','5fd1'] +} 
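# A minimal sketch (not part of this module) of how these tables might be used
# when evaluating a ranked search result: label each hit according to whether
# it shares the query's fold. 'query_id' and 'ranked_hits' are hypothetical
# arguments.
def same_fold_labels(query_id, ranked_hits):
    query_fold = FISCHER_ID_FOLD_DICT[query_id]
    return [FISCHER_ID_FOLD_DICT.get(hit) == query_fold for hit in ranked_hits]

# e.g. same_fold_labels('1cew', ['1stf_i', '1mup']) == [True, False]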
+ diff --git a/scripts/generate_pml_pdb_for_top_scores.sh b/scripts/generate_pml_pdb_for_top_scores.sh new file mode 100755 index 0000000..0426f1f --- /dev/null +++ b/scripts/generate_pml_pdb_for_top_scores.sh @@ -0,0 +1,175 @@ +#!/bin/bash +############################################################################### +# +# generate_pml_pdb_for_top_scores.sh - Generate PDB and PML files to show +# alignment in PyMOL of top scoring +# hits from QP tableau search output +# +# File: generate_pml_pdb_for_top_scores.sh +# Author: Alex Stivala +# Created: November 2008 +# +# For the top 10 (or other) hits according to QP tableau search output +# on stdin, runs the QP tableau matching program on the query and the +# hit. For each hit, writes a PyMOL script (.pml file) containing a +# visualzation of the matches (maximally similar substructures +# coloured) with the match socre as a comment, and a writes a PDB file +# with structures superimposed. +# Also writes a loadhits_querydomid.pml file to load all superimposed +# strutures in PyMOL +# +# Optionally also use MUSTANG to generate multiple alignment of all +# the structures. +# +# Usage: +# generate_pml_pdb_for_top_scores.sh [-q] [-e sse_num_list] [-n num_hits] +# [-m] query_domid out_dir +# +# -q: do not use the ordering constraint (allow nonsequential matchings) +# +# -e sse_num_list: list of SSE sequential numbers to select from +# query struct rather than whole structure +# +# -n num_hits : number of top hits to use (default 10) +# +# -m : also generate multiple alignment of structures with MUSTANG +# (may take a long time if many structures) +# +# query_domid is the SCOP sid for the query structure +# +# out_dir is the directory to place output files. It is created +# if it does not exist. +# +# Then in PyMOL use @output.pml to run the PyMOL script. +# +# Runs the qptabmatchstructs.sh script which Uses the Python script +# pytableaucreate.py to create tableaux for input to the FORTRAN +# tsrchd_sparse program, and Python scripts soln2ssemap.py and +# ssemap2pml.py to process the output of tsrchd_sparse into PyMOL +# script. +# +# Identifiers are assumed to be SCOP sids, and locates files based +# on the ASTRAL_ROOT defined in this script. +# +# Input is tsrchd output file on stdin. +# Output .pdb and .pml files are written in out_dir - WARNING: +# overwritten if they exist. +# +# Environment variables: +# +# PATH must contain the location of the Python scripts, ie where this +# script itself is and the ptgraph/ directory with pytableaucreate.py etc., +# and the location of tsrchd_sparse. +# The dssp program must also be in the PATH. +# +# PYTHONPATH must contain the directory containing the ptsecstruct.py +# and other Python modules used by the Python scripts. +# +# $Id: generate_pml_pdb_for_top_scores.sh 2110 2009-03-18 05:58:44Z astivala $ +# +############################################################################### + +# Count chains in a PDB file. Writes to stdout the number of unique +# chain identifiers in the supplied PDB filename. Needed since +# MUSTANG can only handle PDB files with a single chain. + +function count_chains() { + pdbfile=$1 + grep '^ATOM' $pdbfile | cut -c22 | sort | uniq | wc -l + return 0 +} + + +# Count MODELs in a PDB file. Writes to stdout the number of models +# in the supplied PDB filename. Needed since +# MUSTANG can only handle PDB files with a single model. 
+ +function count_models() { + pdbfile=$1 + grep -c '^MODEL' $pdbfile + return 0 +} + + +# Root of ASTRAL divided PDB style hierarchy +ASTRAL_ROOT=/local/charikar/ASTRAL/pdbstyle-1.73 + +use_ordering=1 +sse_num_list='' +sse_num_list_opt='' +num_hits=10 +run_mustang=0 + +while getopts 'sqe:n:m' opt +do + case $opt in + q) use_ordering=0 + ;; + e) sse_num_list="$OPTARG" + sse_num_list_opt="-e ${sse_num_list}" + ;; + n) + num_hits="$OPTARG" + ;; + m) run_mustang=1 + ;; + ?) + echo "Usage: $0 [-q] [-e sse_num_list] [-n num_hits] [-m] query_sid out_dir" >&2 + exit 1 + ;; + esac +done +shift $(($OPTIND - 1)) + + +if [ $# -ne 2 ]; then + echo "Usage: $0 [-q] [-e sse_num_list] [-n num_hits] [-m] query_sid out_dir" >&2 + exit 1 +fi + +qid=$1 +outdir=$2 + + +if [ ! -d ${outdir} ]; then + mkdir ${outdir} +fi + +cd $outdir + +qdiv=`echo $qid | cut -c3-4` +extra_opts="${sse_num_list_opt}" +if [ $use_ordering -eq 0 ]; then + extra_opts="${extra_opts} -q" +fi + +# list of structures (whitespace delimited) for MUSTANG +structlist=${ASTRAL_ROOT}/${qdiv}/${qid}.ent + +# filename of .pml file to load all generated superposition .pdb files +loadfile=loadhits_${qid}.pml + +echo "# Generated by $0 $*" > $loadfile +echo "# on `date`" >> $loadfile +for dom in `grep -v '^#' | sort -k2,2n | head -${num_hits} | cut -d' ' -f1` +do + div=`echo $dom | cut -c3-4` + hit_struct=${ASTRAL_ROOT}/${div}/${dom}.ent + qptabmatchstructs.sh -s ${extra_opts} ${ASTRAL_ROOT}/${qdiv}/${qid}.ent ${hit_struct} > ${qid}_${dom}.pml 2>/dev/null + if [ $run_mustang -ne 0 ]; then + # MUSTANG cannot load PDB files with multiple chains; just skip them + # Also cannot cope with multiple MODELs, skip them too + num_chains=`count_chains ${hit_struct}` + num_models=`count_models ${hit_struct}` + if [ $num_chains -eq 1 -a $num_models -lt 2 ]; then + structlist="${structlist} ${hit_struct}" + else + echo "Skipped ${hit_struct} as it has $num_chains chains and $num_models models" 2>&1 + fi + fi + echo "load ${outdir}/${qid}_`echo ${dom} | tr [a-z] [A-Z]`.pdb" >> $loadfile +done + +if [ $run_mustang -ne 0 ]; then + mustang -i $structlist -o mustang_${qid} +fi diff --git a/scripts/genpermutedqueries.sh b/scripts/genpermutedqueries.sh new file mode 100755 index 0000000..35b8bb0 --- /dev/null +++ b/scripts/genpermutedqueries.sh @@ -0,0 +1,88 @@ +#!/bin/bash +############################################################################### +# +# genpermutedqueries.sh - generate permuted folding pattern queries +# +# File: genpermutedqueries.sh +# Author: Alex Stivala +# Created: March 2009 +# +# +# Generate 5 random permutations of the tableaux of each of the 8 +# structures we use for testing. The idea is then to run tsrchd_sparse +# with the ordering constraint disabled, and check that the actual +# structures are still matched (even though the ordering of SSEs in +# the query structure has been disturbed randomly). +# +# +# Usage: +# genpermutedqueries.sh outdir +# +# outdir is the directory to write the query files to. It is created +# if it does not exist. WARNING: files overwritten if they do exist. +# The files are named d1ubia_.1.input d1ubia_.1.permutation etc. +# where the .permutation file is the output of pytableaucreate, which +# contains the permutation = 8,5,1,7,6,3,2,4 (for example) line +# showing the permutation used (needed to reconstruct alignments later +# if desired with ssepermutationremap.py) and .input is the input +# to tsrchd_sparse containging the database name and options (and +# not the permutation). 
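# As a rough illustration (not part of this script; ssepermutationremap.py is
# what actually remaps alignments), a recorded "permutation = 8,5,1,7,6,3,2,4"
# line can be used to map SSE numbers in the permuted query back to the
# original numbering. In Python, and assuming position i of the list gives the
# original number of the i-th SSE in the permuted tableau:

perm = [8, 5, 1, 7, 6, 3, 2, 4]
matched_permuted = [1, 3, 5]     # 1-based SSE numbers matched in the permuted query
matched_original = [perm[i - 1] for i in matched_permuted]   # -> [8, 1, 6]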
+# +# Uses the Python scripts pytableaucreate.p to create tableaux for input +# to the FORTRAN tsrchd_sparse program +# +# Environment variables: +# +# PATH must contain the location of the Python scripts, ie where this +# script itself is and the ptgraph/ directory with pytableaucreate.py etc., +# and the location of tsrchd_sparse. +# The dssp program must also be in the PATH. +# +# PYTHONPATH must contain the directory containing the ptsecstruct.py +# and other Python modules used by the Python scripts. +# +# $Id: genpermutedqueries.sh 2120 2009-03-21 01:04:39Z astivala $ +# +############################################################################### + + +# Root of ASTRAL divided PDB style hierarchy +ASTRAL_ROOT=/local/charikar/ASTRAL/pdbstyle-1.73 + +# Tableau+distmatrix database file +TABLEAUXDB=/local/charikar/astivala/tableauxdb/astral/tableauxdistmatrixdb.ascii + +# number of permutations of each structure to make +NUM_PERMUTATIONS=5 + +# list of the structures we use as queries +STRUCTS="d1ubia_ d1tttb1 d1ae6h1 d1bhne_ d1h6rb_ d2phlb1 d1tima_ d1f6dc_" + + +if [ $# -ne 1 ]; then + echo "Usage: $0 outdir" >&2 + exit 1 +fi + +outdir=$1 +if [ ! -d ${outdir} ]; then + mkdir ${outdir} +fi + +for struct in $STRUCTS +do + pnum=1 + while [ $pnum -le $NUM_PERMUTATIONS ] + do + div=`echo $struct | cut -c3-4` + pdbfile=${ASTRAL_ROOT}/${div}/${struct}.ent + pfile=${outdir}/${struct}.${pnum}.permutation + qfile=${outdir}/${struct}.${pnum}.input + pytableaucreate.py -u -b -35 -f -t dssp -p none $pdbfile > $pfile + echo $TABLEAUXDB > $qfile + echo "T F F" >> $qfile # options: type,order,output + awk 'NR > 1' < $pfile >> $qfile + pnum=`expr $pnum + 1` + done +done + diff --git a/scripts/genquerylist.py b/scripts/genquerylist.py new file mode 100755 index 0000000..d75304e --- /dev/null +++ b/scripts/genquerylist.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python +############################################################################### +# +# genquerylist.py - Generate a list of SCOP domain ids to query database with +# +# File: genquerylist.py +# Author: Alex Stivala +# Created: November 2008 +# +# $Id: genquerylist.py 3009 2009-12-08 03:01:48Z alexs $ +# +############################################################################### + +""" +Generate a list of SCOP domain ids that are representative for use as +database queries. For a given number of queries to generate, ensure +that each class is represeneted in proportion to its total representatino +in the SCOP database. + +See usage in docstring for main() + +SCOP and ASTRAL data is obtained using the Bio.SCOP library (Casbon et +al 2006 'A high level interface to SCOP and ASTRAL implemented in +Python' BMC Bioinformatics 7:10) and depends on having the data +downloaded, in SCOP_DIR (defined below). + +Downloaded SCOP files from + +http://scop.mrc-lmb.cam.ac.uk/scop/parse/index.html + +and ASTRAL files (in scopseq-1.73) from + +http://astral.berkeley.edu/scopseq-1.73.html + +The files downlaoded are: + +/local/charikar/SCOP/: +dir.cla.scop.txt_1.73 +dir.des.scop.txt_1.73 +dir.hie.scop.txt_1.73 + +/local/charikar/SCOP/scopseq-1.73: +astral-scopdom-seqres-all-1.73.fa +astral-scopdom-seqres-sel-gs-bib-95-1.73.id + +Other files there are indices built by Bio.SCOP when first used. 
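The number of queries drawn from each SCOP class is proportional to the number
of folds in that class; a minimal sketch of the allocation used below (the fold
counts here are invented, the real counts come from Bio.SCOP):

folds_per_class = [284, 174, 147, 376]   # invented fold counts, one per class
num_queries = 50
total_folds = sum(folds_per_class)
quota = [int(round(float(n) / total_folds * num_queries)) for n in folds_per_class]
# quota == [14, 9, 7, 19]; one fold is sampled per quota slot and a single
# domain is then chosen at random from each sampled fold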
+""" + +import sys,os +import getopt +import random + +from Bio.SCOP import * + +from pathdefs import SCOP_DIR,SCOP_VERSION + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def generate_query_list(num_queries, use_nonredundant, scop, astral): + """ + Return a list of randomly chosen num_queries SCOP sids for folds that are + representative of the distribution of folds in the SCOP classes + all-alpha, all-beta, alpha+beta, alpha/beta, multidomain, membrane/cell + surface and small proteins. + Only one (randomly) chosen domain per fold is returned, i.e. each + fold in the list is uniquely represented. + + + Parameters: + num_queries - number of domain ids to generate + use_nonredundant - if True, use only from ASTRAL 95% nr subset not + whole SCOP. + scop - previously built Bio.SCOP Scop instance + astral - previously build Bio.SCOP Astral instance + + Return value: + list of SCOP sids representing domains. + """ + class_sunids = [ 46456, # all alpha + 48724, # all beta + 51349, # alpha/beta + 53931, # alpha+beta + ] +# 56572, # multi-domain (alpha and beta) +# 56835, # membrane and cell surface proteins +# 56992 ] # small proteins + + num_folds_per_class = [] # correspdoning to class_sunids, num folds in class + for class_id in class_sunids: + scop_class = scop.getNodeBySunid(class_id) + assert(scop_class.type == 'cl') + fold_count = len(scop_class.getDescendents('fold')) + if verbose: + sys.stderr.write('%d folds in class %d (%s)\n' % + (fold_count, class_id, scop_class.description)) + num_folds_per_class.append(fold_count) + total_folds = sum(num_folds_per_class) + + if num_queries > total_folds: + sys.stderr.write( + 'WARNING: There are only %d folds, num_queries changed to %d\n' + % (total_folds, total_folds)) + num_queries = total_folds + + num_folds_per_class = [ int(round((float(n)/float(total_folds)) * + num_queries)) + for n in num_folds_per_class ] + if verbose: + for i in xrange(len(class_sunids)): + sys.stderr.write('%d folds will be represented for class %d (%s)\n' + % (num_folds_per_class[i], class_sunids[i], + scop.getNodeBySunid(class_sunids[i]).description)) + + fold_list = [] + for i in xrange(len(class_sunids)): + scop_class = scop.getNodeBySunid(class_sunids[i]) + folds = random.sample(scop_class.getDescendents('fold'), + num_folds_per_class[i]) + fold_list += folds + + domain_sids = [] + for fold in fold_list: + if use_nonredundant: + domain_list = [ dom for dom in fold.getDescendents('domain') + if astral.isDomainInId(dom, 95) ] + else: + domain_list = fold.getDescendents('domain') + if len(domain_list) > 0: + domain = random.choice(domain_list) + domain_sids.append(domain.sid) + + return domain_sids[:num_queries] + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-vn] " + " \n") + sys.stderr.write(' -v verbose messages to stderr\n') + sys.stderr.write(' -n use ASTRAL 95% nonredundant subset not all SCOP\n') + sys.exit(1) + + +def main(): + """ + main for genquerylist.py + + Usage: genquerylist.py [-vn] + + + -v turns on debug output to stderr + + -f use only the ASTRAL 95% sequence identity subset not all of SCOP + + The list of SCOP domain ids (sids) is printed to stdout. 
+ """ + global verbose + verbose = False + + use_nonredundant = False + + try: + opts,args = getopt.getopt(sys.argv[1:], "vn?") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-v": # verbose + verbose = True # this module only + elif opt == "-n": # use ASTRAL 95% nr subset + use_nonredundant = True + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 1: + usage(os.path.basename(sys.argv[0])) + + num_queries = int(args[0]) + + # read SCOP and ASTRAL data + if verbose: + sys.stderr.write('Reading SCOP data...\n') + scop = Scop(dir_path=SCOP_DIR,version=SCOP_VERSION) + astral = Astral(dir_path=SCOP_DIR,version=SCOP_VERSION,scop=scop) + + sid_list = generate_query_list(num_queries, use_nonredundant, scop, astral) + for sid in sid_list: + sys.stdout.write(sid + '\n') + + +if __name__ == "__main__": + main() diff --git a/scripts/gentopsstringqueries.sh b/scripts/gentopsstringqueries.sh new file mode 100755 index 0000000..c146c4f --- /dev/null +++ b/scripts/gentopsstringqueries.sh @@ -0,0 +1,73 @@ +#!/bin/bash +############################################################################### +# +# gentopsstringqueries.sh - generate TOPS string queries +# +# File: gentopsstringqueries.sh +# Author: Alex Stivala +# Created: March 2009 +# +# +# Generate TOPS strings for each of the 8 +# structures we use for testing. +# +# +# Usage: +# gentopsstringqueries.sh outdir +# +# outdir is the directory to write the query files to. It is created +# if it does not exist. WARNING: files overwritten if they do exist. +# +# $Id: gentopsstringqueries.sh 2122 2009-03-23 22:39:40Z astivala $ +# +############################################################################### + + +# location of TOPS directory, contains tops.def etc. +# Note all the .dssp and .tops files are temporarily created here, +# (tops.def has these specifications) +TOPS_ROOT=/local/charikar/astivala/biosoftware/Tops + +# location of tops_comparison directory, contains jars/translation.jar etc. +TOPS_COMPARISON_ROOT=/local/charikar/astivala/biosoftware/tops_comparison + +# Root of ASTRAL divided PDB style hierarchy +ASTRAL_ROOT=/local/charikar/ASTRAL/pdbstyle-1.73 + +# list of the structures we use as queries +STRUCTS="d1ubia_ d1tttb1 d1ae6h1 d1bhne_ d1h6rb_ d2phlb1 d1tima_ d1f6dc_" + + + +if [ $# -ne 1 ]; then + echo "Usage: $0 outdir" >&2 + exit 1 +fi + +outdir=$1 + +if [ ! -d ${outdir} ]; then + mkdir ${outdir} +fi + +cd $TOPS_ROOT + +for struct in $STRUCTS +do + div=`echo $struct | cut -c3-4` + pdbfile=${ASTRAL_ROOT}/${div}/${struct}.ent + # TOPS can only cope with 4 letter PDB codes, so we have to name + # input files that way + sid=$struct + pdbcode=`echo $sid | cut -c2-5` + cp ${pdbfile} pdb${pdbcode}.ent + dssp ${pdbfile} > ${pdbcode}.dssp + ${TOPS_ROOT}/bin/Tops $pdbcode + topsfile=${outdir}/${sid}.tops + mv ${pdbcode}.tops ${topsfile} + rm ${pdbcode}.dssp + rm pdb${pdbcode}.ent + + java -cp ${TOPS_COMPARISON_ROOT}/jars/translation.jar tops.translation.Tops2String $topsfile $sid > ${outdir}/${sid}.topsstring +done + diff --git a/scripts/geometry.py b/scripts/geometry.py new file mode 100644 index 0000000..7d54819 --- /dev/null +++ b/scripts/geometry.py @@ -0,0 +1,110 @@ +############################################################################### +# +# geometry.py - geometry (3D) functions. +# +# File: geometry +# Author: Alex Stivala +# Created: November 2007 +# +# $Id: geometry.py 2705 2009-07-27 06:18:22Z astivala $ +# +# Utility functions for geometry in 3D space. 
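The two helpers defined below, LineLineIntersect() and ProjectPointOntoLine(), can be sanity-checked on a hand-computable case. A hypothetical usage sketch follows; it assumes this module is importable as geometry and that Bio.PDB is installed (LineLineIntersect() returns None when the input lines are degenerate or parallel):

    from Bio.PDB import Vector
    from geometry import LineLineIntersect, ProjectPointOntoLine

    p1, p2 = Vector(0, 0, 0), Vector(1, 0, 0)    # the x axis
    p3, p4 = Vector(0, 1, 1), Vector(0, -1, 1)   # line parallel to y at z = 1
    pa, pb, mua, mub = LineLineIntersect(p1, p2, p3, p4)
    # expected: pa = (0,0,0), pb = (0,0,1), mua = 0.0, mub = 0.5,
    # i.e. the shortest segment between the two lines has length 1

    Q = ProjectPointOntoLine(Vector(0, 0, 0), Vector(1, 0, 0), Vector(3, 4, 0))
    # expected: Q = (3,0,0), the foot of the perpendicular from (3,4,0)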
+# +############################################################################### + +from numpy import * +from Bio.PDB import Vector + +def LineLineIntersect(p1, p2, p3, p4): + """ + Calculate the line segment PaPb that is the shortest route between + two lines P1P2 and P3P4. This line is perpendicular to both P1P2 and P3P4. + Calculate also the values of mua and mub + where + PA = P1 + mua (P2 - P1) + Pb = P3 + mub (P4 - p3) + + This function is just a simple Python implementation of the + algorithm described and implememted in C by Paul Bourke: + + http://local.wasp.uwa.edu.au/~pbourke/geometry/lineline3d/ + + Parameters: + p1 - Vector of first point for first line + p2 - Vector of second point for first line + p3 - Vector of first point of second line + p4 - Vector of second point of second line + + + Return value: + tuple (pa, pb, mua, mub) where + pa is Vector of Pa, first point on shortest line + pb is Vector of Pb, second point on shortest line + mua is the value s.t. Pa = P1 + mua (P2 - P1) + mub is the value s.t. Pb = P3 + mub (P4 - P4) + or None if there is no solution + """ + EPS = 1.0e-08 + + p13 = p1 - p3 + p43 = p4 - p3 + + if alltrue(less(abs(p43.get_array()), EPS)): + return None + p21 = p2 - p1 + if alltrue(less(abs(p21.get_array()), EPS)): + return None + + d1343 = p13[0] * p43[0] + p13[1] * p43[1] + p13[2] * p43[2] + d4321 = p43[0] * p21[0] + p43[1] * p21[1] + p43[2] * p21[2] + d1321 = p13[0] * p21[0] + p13[1] * p21[1] + p13[2] * p21[2] + d4343 = p43[0] * p43[0] + p43[1] * p43[1] + p43[2] * p43[2] + d2121 = p21[0] * p21[0] + p21[1] * p21[1] + p21[2] * p21[2] + + denom = d2121 * d4343 - d4321 * d4321 + if abs(denom) < EPS: + return None + + numer = d1343 * d4321 - d1321 * d4343 + mua = numer / denom + mub = (d1343 + d4321 * mua) / d4343 + + # Note using Numeric.array '*' operator here for element-wise multiplication + # as Bio.PDB.Vector '*' operator is vector dot product. + # (Note also must have Vector + int * array and NOT + # int * array + Vector due to Python type coercion rules). + pa = p1 + mua * p21.get_array() + pb = p3 + mub * p43.get_array() + + return (pa, pb, mua, mub) + + +def ProjectPointOntoLine(A, B, P): + """ + Project a point (in 3D), P, onto a line (in 3D) defined by the two + points A and B. + This point Q is the point on line AB such that the line PQ is orthogonal + to AB (PQ is the shortest line between the point P and the line AB) + + Parameters: + A - Vector for point A on the line AB + B - Vector for point B on the line AB + P - Vector for point P to project onto the line AB + + Return value: + Vector representing the point Q on line AB such that PQ is the shortest + line from P to line AB. + """ + # this basically involves finding the point Q such that + # (P - Q) * (B - A) = 0 + # where * is dot product. + # This is done by solving for u after substituting the equation for line AB: + # Q = A + u(B - A) + # into the first equation. 
+ # See http://local.wasp.uwa.edu.au/~pbourke/geometry/pointline/ + + # Note * here is just scalar multiplication (float) + u =((P[0]-A[0])*(B[0]-A[0])+(P[1]-A[1])*(B[1]-A[1])+(P[2]-A[2])*(B[2]-A[2]))\ + / (B - A).normsq() + Q = A + u*(B - A).get_array() # this * is scalar*vector (Numeric.array *) + return Q diff --git a/scripts/get_all_sf_found.sh b/scripts/get_all_sf_found.sh new file mode 100755 index 0000000..96027a4 --- /dev/null +++ b/scripts/get_all_sf_found.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# +# File: get_all_sf_found.sh +# Author: Alex Stivala +# Created: Mach 2009 +# +# get_all_sf_found.sh - make list of all superfamilies found by all methods +# +# Usage: get_all_sf_found.sh +# +# Output is to the scopsid.allsffound files in the cwd +# WARNING: these files are overwritten if they exist +# Must be run from other_results directory after make has been run +# to build .sflist files in all subsidiary directories that are used. +# +# +# $Id: get_all_sf_found.sh 2169 2009-03-30 05:33:43Z astivala $ +# + +RESULTS_DIRS="ProSMoS SSM tops/folds ../results" +SF_LISTS="d1ubia_.sflist d1tttb1.sflist d1ae6h1.sflist d1bhne_.sflist d1h6rb_.sflist d2phlb1.sflist d1tima_.sflist d1f6dc_.sflist" + + + +for sflist in ${SF_LISTS} ; do + scopsid=`basename ${sflist} .sflist` + tmpfile=/var/tmp/gsf$$.${scopsid} + cat /dev/null > ${tmpfile} + for resdir in ${RESULTS_DIRS} ; do + cat ${resdir}/${sflist} >> ${tmpfile} + done + sort ${tmpfile} | uniq > ${scopsid}.allsffound + rm ${tmpfile} +done + diff --git a/scripts/get_superfamily_list.sh b/scripts/get_superfamily_list.sh new file mode 100755 index 0000000..a487d5a --- /dev/null +++ b/scripts/get_superfamily_list.sh @@ -0,0 +1,67 @@ +#!/bin/bash +############################################################################### +# +# get_superfamily_list.sh - Get list of superfamilies in top scoring hits +# +# File: get_superfamily_list.sh +# Author: Alex Stivala +# Created: March 2009 +# +# For the top hits according to structural search output +# on stdin (in two column name score format), get list of unique superfamilies. +# +# Usage: +# get_superfamily_list.sh [-n num_hits] +# +# -n num_hits : number of top hits to use (default all) +# +# Input is tsrchd output file on stdin. +# Output is list of SCOP superfamily sccs identifier (e.g. d.15.1) on stdout. +# +# Environment variables: +# +# PATH must contain the location of the Python scripts, ie where this +# script itself is and the ptgraph/ directory with scopdominfo.py etc. +# +# PYTHONPATH must contain the directory containing the ptsecstruct.py +# and other Python modules used by the Python scripts. +# +# +# Uses GNU utilities options (and bash) +# +# Relies on the output format generated by scopdominfo.py +# +# $Id: get_superfamily_list.sh 2104 2009-03-16 06:47:45Z astivala $ +# +############################################################################### + +num_hits=0 + +while getopts 'n:' opt +do + case $opt in + n) + num_hits="$OPTARG" + ;; + ?) 
+ echo "Usage: $0 [-n num_hits]" >&2 + exit 1 + ;; + esac +done +shift $(($OPTIND - 1)) + + +if [ $# -ne 0 ]; then + echo "Usage: $0 [-n num_hits" >&2 + exit 1 +fi + +if [ $num_hits -gt 0 ]; then + filter="head -n${num_hits}" +else + filter="cat" +fi + +grep -v '^#' | sort -k2,2n | $filter | cut -d' ' -f1 | scopdominfo.py | \ + grep '^d' | cut -d\( -f2 | cut -d \) -f1 | sort | uniq diff --git a/scripts/getdomains.py b/scripts/getdomains.py new file mode 100644 index 0000000..824a106 --- /dev/null +++ b/scripts/getdomains.py @@ -0,0 +1,222 @@ +############################################################################### +# +# getdomains.py - functions to get domain decomposition from multiple methods +# +# File: getdomains.py +# Author: Alex Stivala +# Created: December 2007 +# +# $Id: getdomains.py 3236 2010-01-13 02:06:50Z alexs $ +# +############################################################################### + +""" +Functions to get domain decompositions via external programs or databases. + +""" + +from ptdomain import * +from parsepdom import * + +#----------------------------------------------------------------------------- +# +# Constants +# +#----------------------------------------------------------------------------- + +# regular expression to match each supported method +valid_domain_programs = [r"ddomain", r"cath:.*", r"pdomains:.*", r"none"] + +#----------------------------------------------------------------------------- +# +# Module globals +# +#----------------------------------------------------------------------------- + +# dictionary of domains from pDomains file built by parse_pdomains_file() +# the first time something is requested from pDomains, the file is parsed +# and dictionary stored here, subsequently it is looked up in this dictionary +pdomains_dict = None + +#----------------------------------------------------------------------------- +# +# Class definitions +# +#----------------------------------------------------------------------------- + + +# +# Empty classes for exceptions +# + +class Unsupported_Exception(Exception): # unsupported domain method found + pass + + +class NotInpDomains_Exception(Exception): # ident not found in pDomains file + pass + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def get_domains(domain_program, pdbid, pdb_filename, pdb_struct, chainid=None): + """ + Get domain decomposition in the form of list of PTDomain objects + from the nominated domain program or database. + + Parmeters: + domain_program - 'cath:cdf_file_name' (CATH) or other supported + program or database + pdbid - PDB identifier for the protein + pdb_filename - name of PDB file + pdb_struct - Bio.PDB parsed structure (needed for DDOMAIN) + chainid - (default None). If not None, the chainid for single chain + to use. 
+ + Return value: list of domains according to the chosen method + + Raises exceptions: + Unsupported_Exception for unsupported domain program/db + NotInpDomains_Exception for identifier not in pDomains database + """ + global pdomains_dict + + if domain_program[0:5] == "cath:": + cdf_filename = domain_program[5:] + try: + domain_list = read_domains_from_cath_cdf_file(cdf_filename, pdbid, + chainid) + except NotInCATH_Exception: + sys.stderr.write('WARNING: PDB identifier ' + pdbid + + ' not found in CDF file.') + sys.stderr.write(' Treating as single domain.\n') + return [PTDomain(None, None)] # one-domain protein, no further info + elif domain_program == "ddomain": + domain_list = read_domains_from_ddomain(pdb_filename, + pdb_struct[0], + chainid) + # Sometimes DDOMAIN seems to give domain decompositions that do not + # make sense, i.e. have domains nested one inside the other. + # This happens for example with 2RH1. We will check for this and + # if it happens just ignore the decomposition, making a single domain. + domain_cd = build_domain_chaindict(domain_list) + if not verify_domain_disjoint(domain_list, domain_cd): + sys.stderr.write('WARNING: DDOMAIN domain decomposition is ' + + 'inconsistent. Treating as single domain.\n') + domain_list = [PTDomain(None, None)] + # NOTE: if there is only one domain, we will make it a list + # with a single PTDomain with all data None, signifying a + # single domain protein with no further information. This is + # mainly because of when there are multiple chains, in which + # case the single domain is reported by DDOMAIN as having a + # different chain id for start and end. If there is a single + # domain we really don't want to do anything special, so it is + # better to just have it as a special case where no domain + # processing is done. + if len(domain_list) == 1: + domain_list = [PTDomain(None, None)] + elif len(domain_list) == 0: + # This happens if DDomain crashes for example (e.g. on 1PPJ) + sys.stderr.write("WARNING: no domain decomposition from DDOMAIN." + " Treating as single domain.\n") + domain_list = [PTDomain(None, None)] + + elif domain_program[0:9] == "pdomains:": + if pdomains_dict == None: + # Build the pDomains dictionary, subsequently look up in it + pdomains_dict = parse_pdomains_file(open(domain_program[9:])) + # TODO: we will just always use the STERNBERG ('AUTHORS') entry for now + if chainid != None: + pdbid += chainid + try: + domain_list = pdomains_dict[pdbid]['STERNBERG'] + except KeyError: + raise NotInpDomains_Exception(pdbid) + elif domain_program == "none": + # The 'none' method is the baseline method of just always assigning + # everything to a single domain + return [PTDomain(None, None)] # one-domain protein, no further info + else: + raise Unsupported_Exception("unsupported domain program/db " + + domain_program) + + return domain_list + + +def write_domains(fh, domain_list): + """ + output the domain decomposition in a more or less conventional format + + Parmeters: + fh - open for write filehandle to write domain decomposition to + domain_list - list of PTDomain objects + Return value: + None + """ + fh.write(str(len(domain_list)) + ' domains:\n') + for i in range(len(domain_list)): + fh.write(str(domain_list[i])) + if i < len(domain_list) - 1: + fh.write('/') + fh.write('\n') + + +def verify_domain_disjoint(test_domlist, chaindict): + """ + Check that the supplied domain decomposition is valid in that + no residue is in more than one domain. 
+ + Parameters: + test_domlist - list of PTDomain from the test (predicted) decomposition + chain_dict - dict of { chainid : (min_resnum, max_resnum) } + + Return value: + True if valid domain decomposition else False (not disjoint) + """ + for (chain, (min_resnum, max_resnum)) in chaindict.iteritems(): + for resnum in range(min_resnum, max_resnum+1): + num_domains_with_residue = 0 + for domain in test_domlist: + if (domain.is_in_domain(chain, resnum)): + num_domains_with_residue += 1 + if (num_domains_with_residue > 1): + sys.stderr.write('ERROR: chain ' + chain + ' residue ' + + str(resnum) + ' is in ' + + str(num_domains_with_residue) + ' domains.\n') + return False + return True + + + +def build_domain_chaindict(domlist): + """ + Build the diction of min and max residue sequence numbers for each + chain in the supplied domain decompositions (list of PTDomain objects). + + Parameters: + domlist - list of PTDomain objects represetning a domain decomp. + + Return value: + dict of { chainid : (min_resnum, max_resnum) } + """ + # get the lowest and highest residue sequence number in each chain + # of the reference domains and build a dictionary from it. + chain_dict = {} # dict of { chainid : (min_resnum, max_resnum) } + for domain in domlist: + for chainid in domain.get_chainids(): + (min_resnum,max_resnum)=domain.get_minmax_res_seq_in_chain(chainid) + if chain_dict.has_key(chainid): + (oldmin,oldmax) = chain_dict[chainid] + if min_resnum < chain_dict[chainid][0]: + chain_dict[chainid] = (min_resnum, oldmax) + if max_resnum > chain_dict[chainid][1]: + chain_dict[chainid] = (oldmin, max_resnum) + else: + chain_dict[chainid] = (min_resnum, max_resnum) + return chain_dict + diff --git a/scripts/getdomainsinsf.py b/scripts/getdomainsinsf.py new file mode 100755 index 0000000..1eee9d2 --- /dev/null +++ b/scripts/getdomainsinsf.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python +############################################################################### +# +# getdomainsinsf.py - Get a domain in each superfamily specfied on stdin. +# +# File: getdomainsinsf.py +# Author: Alex Stivala +# Created: February 2010 +# +# $Id: getdomainsinsf.py 3322 2010-02-11 05:46:13Z alexs $ +# +############################################################################### + +""" +For each SCOP sccs superfamily string (e.g. 'd.58.1') read from stdin, +output a domain in the 95% nr ASTRAL subset. + +See usage in docstring for main() + +SCOP and ASTRAL data is obtained using the Bio.SCOP library (Casbon et +al 2006 'A high level interface to SCOP and ASTRAL implemented in +Python' BMC Bioinformatics 7:10) and depends on having the data +downloaded, in SCOP_DIR (defined below). + +Downloaded SCOP files from + +http://scop.mrc-lmb.cam.ac.uk/scop/parse/index.html + +and ASTRAL files (in scopseq-1.73) from + +http://astral.berkeley.edu/scopseq-1.73.html + +The files downlaoded are: + +/local/charikar/SCOP/: +dir.cla.scop.txt_1.73 +dir.des.scop.txt_1.73 +dir.hie.scop.txt_1.73 + +/local/charikar/SCOP/scopseq-1.73: +astral-scopdom-seqres-all-1.73.fa +astral-scopdom-seqres-sel-gs-bib-95-1.73.id + +Other files there are indices built by Bio.SCOP when first used. 
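The getdomains.py module above dispatches on the method string matched against valid_domain_programs: 'ddomain', 'cath:<cdf_file>', 'pdomains:<pdomains_file>' or 'none'. A hypothetical call is sketched here; it assumes the DDOMAIN program and the ptdomain/parsepdom modules (not shown in this patch) are available, and the PDB file name is only a placeholder:

    import sys
    from Bio.PDB import PDBParser
    from getdomains import get_domains, write_domains

    pdb_filename = 'pdb1ubq.ent'                       # placeholder
    pdb_struct = PDBParser().get_structure('1UBQ', pdb_filename)
    domains = get_domains('ddomain', '1UBQ', pdb_filename, pdb_struct)
    write_domains(sys.stdout, domains)                 # e.g. "1 domains: ..."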
+""" + +import sys,os +import getopt +import random + +from Bio.SCOP import * + +from pathdefs import SCOP_DIR,SCOP_VERSION + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def get_domain_for_each_sf(sccs_list, scop, astral): + """ + For each superfamily named by sccs in the sccs_list, + return a domain sid in that + superfamily in the 95% nr ASTRAL subset. + + Parameters: + sf_list - list of Bio.SCOP superfamily objects + scop - previously built Bio.SCOP Scop instance + astral - previously build Bio.SCOP Astral instance + + Return value: + list of SCOP sids, one for each superfamily. + """ + + # Bio.SCOP actually doesn't seem to have a facility to look up by + # sccs so we'll build a dictionary ourselves of all superfamilies + # keyed by sccs + all_superfamilies = scop.getRoot().getDescendents('sf') + sccs_dict = dict([(sf.sccs, sf) for sf in all_superfamilies]) + + domain_sids = [] + for sccs in sccs_list: + sf = sccs_dict[sccs] + domain_list = [ dom for dom in sf.getDescendents('domain') + if astral.isDomainInId(dom, 95) ] +# sys.stderr.write('xxx ' + str(domain_list)) + if len(domain_list) > 0: + domain = random.choice(domain_list) + domain_sids.append(domain.sid) + + return domain_sids + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + "\n") + sys.exit(1) + + +def main(): + """ + main for getdomainsinsf.py + + Usage: getdomainsinsf.py + + + List of SCOP superfamily ids (sccs) is read from stdin. + The list of SCOP domain ids (sids) is printed to stdout. + """ + global verbose + verbose = False + + use_nonredundant = False + + if len(sys.argv) != 1: + usage(os.path.basename(sys.argv[0])) + + + # read SCOP and ASTRAL data + scop = Scop(dir_path=SCOP_DIR,version=SCOP_VERSION) + astral = Astral(dir_path=SCOP_DIR,version=SCOP_VERSION,scop=scop) + + sccs_list = sys.stdin.read().split('\n')[:-1] + sid_list = get_domain_for_each_sf(sccs_list, scop, astral) + for sid in sid_list: + sys.stdout.write(sid + '\n') + + +if __name__ == "__main__": + main() diff --git a/scripts/getrank.sh b/scripts/getrank.sh new file mode 100755 index 0000000..9982f05 --- /dev/null +++ b/scripts/getrank.sh @@ -0,0 +1,41 @@ +#!/bin/sh +# +# getrank.sh - get rank of a hit from the output file +# +# $Id: getrank.sh 1790 2008-08-04 02:17:59Z astivala $ +# +# Usage: getrank.sh query_sid tabsearch.outputfile +# +# query_sid is the SCOP id (e.g. 'd1ubia_') of the query domain +# +# tabsearch.outputfile is the output from tabsearchqpml / tsrchn / tsrchd +# which is a text file where each line is identifier then whitespace +# then score, sorted by score from most negative to least negative e.g. +# +# d1xksa_ -35.99999999 +# d3sila_ -35.99999999 +# .... +# d2mhua_ -0.499999999 +# +# ie this means the best hit as the top of the file, and worst at bottom. +# +# Output is rank in file as fraction of line number/total line and percentage +# e.g. +# +# 944/15174 (6%) +# +# Uses GNU sort options. 
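The script body below computes the rank with grep -n on the numerically sorted hit list and converts it to a percentage with bc; the same calculation expressed in Python, purely as a cross-check sketch of what the shell pipeline does:

    def get_rank(query_sid, resultsfile):
        """Rank of query_sid in a 'dbid score' results file, counting from
        the best (most negative) score; returns (rank, total, percentage),
        e.g. (944, 15174, 6.2) for the example quoted above."""
        scores = [(float(s), d) for (d, s) in
                  (line.split()[:2] for line in open(resultsfile)
                   if not line.startswith('#') and line.strip())]
        scores.sort()                                  # best hit first
        ids = [d for (_s, d) in scores]
        rank = ids.index(query_sid) + 1                # 1-based position
        return rank, len(ids), 100.0 * rank / len(ids)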
+ +if [ $# -ne 2 ]; then + echo "Usage: $0 query_sid tabsearch_outputfile" >&2 + exit 1 +fi + + +query_sid=$1 +resultsfile=$2 +totallines=`grep -v '^#' ${resultsfile} | wc -l` +line=`grep -v '^#' ${resultsfile} | sort -k2,2n | grep -n ${query_sid} | cut -d: -f1` +percentage=`echo "(${line} / ${totallines}) * 100" | bc -l` +printf "%d/%d (%.0f%%)\n" $line $totallines $percentage + diff --git a/scripts/lock2out2col.sh b/scripts/lock2out2col.sh new file mode 100755 index 0000000..3141b4d --- /dev/null +++ b/scripts/lock2out2col.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# +# File: LOCK2out2col.sh +# Author: Alex Stivala +# Created: March 2010 +# +# locks2out2col.sh - Convert LOCK2 default output format to same +# format as output by tsrchd_sparse etc. which can +# be processed with tsevalfn.py etc. +# +# Usage: LOCK2out2col.sh [-q] < LOCK2output.LOCK2.out +# +# -q : add QUERY ID = line +# +# Output has two columns, database id and LOCK2 score. +# The query id is put in a comment line at top of file, it is assumed +# to be the same in every line of LOCK2 -A output since that mode +# runs one query against a db. +# +# Output is to stdout. +# +# Uses the output format from LOCK2 in the FoldMiner package +# http://motif.stanford.edu/distributions/foldminer/FoldMinerDistribution.tar.gz +# +# +# $Id: lock2out2col.sh 3522 2010-03-24 05:50:47Z alexs $ +# + +outputqueryid=0 +if [ $# -gt 1 ]; then + echo "usage: $0 [-q] < LOCK2output" >&2 + exit 1 +elif [ $# -eq 1 ]; then + if [ $1 = "-q" ]; then + outputqueryid=1 + else + echo "usage: $0 [-q] < LOCK2output" >&2 + exit 1 + fi +fi + +awk -v outputqueryid=$outputqueryid ' + /^\*\* Target =/ { path = $4; + splitlen = split(path, splitpath, "/"); + entname = splitpath[splitlen]; + target = substr(entname, 1, 7) + } + /^final score:/ { score = $3; + print target, score; + } + /^\*\* Query =/ { path = $4; + if (outputqueryid == 1 && !donequery) { + splitlen = split(path, splitpath, "/"); + entname = splitpath[splitlen]; + queryid = substr(entname, 1, 7) + printf("# QUERY ID = %s\n", queryid); + donequery = 1; + } + } + + ' diff --git a/scripts/mergeoutput.sh b/scripts/mergeoutput.sh new file mode 100755 index 0000000..7f52574 --- /dev/null +++ b/scripts/mergeoutput.sh @@ -0,0 +1,57 @@ +#!/bin/bash +############################################################################### +# +# mergeoutput.sh - merge two sets of tableau search output into single file +# +# File: mergeoutput.sh +# Author: Alex Stivala +# Created: December 2009 +# +# +# For all the .out files in outdir1 and outdir2, assumed to be named +# with as queryid.out e.g. d4ubpb_.out and with output in two-column +# +# dbid score +# +# format, create a single merged output with 3 columns +# +# queryid dbid score1 score2 +# +# where score1 and score2 are score for matching queryid with dbid +# according to output in dir1 and dir2 respectively. +# Used for large-scale comparison of differences in scores between two +# methods (in R etc.) +# +# Output is to stdout. 
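The shell loop below performs this merge with sort and join on temporary files. An equivalent Python sketch for one pair of per-query .out files (the query id is taken from the file name, as in the script; only ids present in both files survive, matching join, and ERROR lines are dropped):

    import os

    def merge_pair(qpfile, safile):
        """Merge two 'dbid score' files for the same query into
        'queryid dbid score1 score2' rows."""
        def read_scores(fname):
            d = {}
            for line in open(fname):
                if line.startswith('#') or 'ERROR' in line or not line.strip():
                    continue
                dbid, score = line.split()[:2]
                d[dbid] = score
            return d
        queryid = os.path.basename(qpfile)[:-len('.out')]
        s1, s2 = read_scores(qpfile), read_scores(safile)
        return ['%s %s %s %s' % (queryid, dbid, s1[dbid], s2[dbid])
                for dbid in sorted(s1) if dbid in s2]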
+# +# Usage: +# mergeoutput.sh outdir1 outdir2 +# +# $Id: mergeoutput.sh 3007 2009-12-04 06:31:37Z alexs $ +# +############################################################################### + +if [ $# -ne 2 ]; then + echo Usage: $0 outdir1 outdir2 >&2 + exit 1 +fi + +dir1=$1 +dir2=$2 + +for qpfile in ${dir1}/*.out +do + queryid=`basename $qpfile .out` + safile=${dir2}/${queryid}.out + sortedqpfile=`mktemp` + sortedsafile=`mktemp` + grep -v '^#' $qpfile | sort > $sortedqpfile + grep -v '^#' $safile | sort > $sortedsafile + join $sortedqpfile $sortedsafile \ + | grep -v ERROR \ + | awk "{print \"$queryid\", \$0}" + rm $sortedqpfile + rm $sortedsafile +done + + diff --git a/scripts/mkauctabrow.sh b/scripts/mkauctabrow.sh new file mode 100755 index 0000000..7022db1 --- /dev/null +++ b/scripts/mkauctabrow.sh @@ -0,0 +1,39 @@ +#!/bin/sh +# +# File: mkauctabrow.sh +# Author: Alex Stivala +# Created: March 2009 +# +# mkauctabrow.sh - make a row in the LaTeX table of AUC values for +# QP tableau search and other methods on large data set +# +# +# Usage: mkauctabrow statsfiename +# +# statsfilename is the .stats file generated by the rocauc.r R program +# from the .slrtab file generated fby the +# rocrfischer.py script (all this is done from Makefile) +# +# Output is to stdout +# +# $Id: mkauctabrow.sh 3612 2010-05-05 01:32:37Z alexs $ +# + +if [ $# -ne 1 ]; then + echo "Usage: $0 statsfilename" 2>&1 + exit 1 +fi + +statsfile=$1 + + +#auc=`fgrep 'RROC AUC' ${statsfile} | cut -d= -f2` +auc=`fgrep 'Hanley-McNeil AUC' ${statsfile} | cut -d= -f2` +stderror=`fgrep 'std. error' ${statsfile} | cut -d= -f2` +interval95=`fgrep '95% CI' ${statsfile} | cut -d= -f2` +low95=`echo "${interval95}" | cut -d, -f1` +high95=`echo "${interval95}" | cut -d, -f2` + +printf "%6.3f & %5.3f & %6.3f & %6.3f " $auc $stderror $low95 $high95 +echo \\\\ + diff --git a/scripts/mkdiscretetab.sh b/scripts/mkdiscretetab.sh new file mode 100755 index 0000000..82d08fc --- /dev/null +++ b/scripts/mkdiscretetab.sh @@ -0,0 +1,43 @@ +#!/bin/sh +# +# File: mkdiscretetab.sh +# Author: Alex Stivala +# Created: August 2008 +# +# mkdiscretetab.sh - make table of AUC and time for tableaux on some folds +# +# Usage: mkdiscretetab.sh +# +# Output is to stdout. +# +# Each row contains: +# +# Fold & SCOP sid & \# SSEs & AUC & time\\ +# e.g. +# Immunoglobulin & \texttt{d1ae6h1} & 13 & 0.93 & 9:20 \\ +# +# The mktabrow.sh script is called to make each row. +# The table is sorted by #SSEs +# +# Uses options specific to GNU sort +# +# $Id: mkdiscretetab.sh 1790 2008-08-04 02:17:59Z astivala $ +# + +RTABS_DISCRETE="d1ubia_.tsrchd.tt.rtab d1ae6h1.tsrchd.tt.rtab d1bhne_.tsrchd.tt.rtab d1h6rb_.tsrchd.tt.rtab d1tttb1.tsrchd.tt.rtab d1tima_.tsrchd.tt.rtab d2phlb1.tsrchd.tt.rtab d1f6dc_.tsrchd.tt.rtab" + +cat <&1 + exit 1 +fi +dataset=$1 +level=$2 +for statsdir in results/${dataset} results/${dataset}/norm? maxcmo_results/${dataset} maxcmo_results/${dataset}/norm? 
+do + lastpart=`basename ${statsdir}` + if [ `expr substr ${lastpart} 1 4` = "norm" ]; then + normtype=${lastpart} + else + normtype="None" + fi + resdir=`echo ${statsdir} | cut -d/ -f1` + if [ ${resdir} = "results" ]; then + method="QP tableau search" + elif [ ${resdir} = "maxcmo_results" ]; then + method="MSVNS3" + else + echo "unknown directory ${resdir}" 2>&1 + fi + for statsfile in ${statsdir}/${level}.stats + do + level=`basename ${statsfile} .stats` + printf '%-20s & %-5s &' "${method}" "${normtype}" + mkfischer3dtabrow.sh ${statsfile} + done +done | sort -t'&' -k1,1 -k2,2 + diff --git a/scripts/mkfischer3dtabrow.sh b/scripts/mkfischer3dtabrow.sh new file mode 100755 index 0000000..5822152 --- /dev/null +++ b/scripts/mkfischer3dtabrow.sh @@ -0,0 +1,37 @@ +#!/bin/sh +# +# File: mkfischer3dtabrow.sh +# Author: Alex Stivala +# Created: October 2008 +# +# mkfischer3dtabrow.sh - make a row in the LaTeX table of results for MSVNS and +# QP tableau search on Fischer and Nh3D data sets +# +# +# Usage: mkfischer3dtabrow statsfiename +# +# statsfilename is the .stats file generated by the rocauc.r R program +# from the .slrtab file generated from MSVNS or tsrchd output by the +# rocrfischer.py script (all this is done from Makefile) +# +# Output is to stdout +# +# $Id: mkfischer3dtabrow.sh 3686 2010-05-17 07:31:00Z alexs $ +# + +if [ $# -ne 1 ]; then + echo "Usage: $0 statsfilename" 2>&1 + exit 1 +fi + +statsfile=$1 + +auc=`fgrep 'Hanley-McNeil AUC' ${statsfile} | cut -d= -f2` +stderror=`fgrep 'std. error' ${statsfile} | cut -d= -f2` +interval95=`fgrep '95% CI' ${statsfile} | cut -d= -f2` +low95=`echo "${interval95}" | cut -d, -f1` +high95=`echo "${interval95}" | cut -d, -f2` + +printf "%6.3f & %5.3f & %6.3f & %6.3f " $auc $stderror $low95 $high95 +echo '\\' + diff --git a/scripts/mkfischer3dtabs_all.sh b/scripts/mkfischer3dtabs_all.sh new file mode 100755 index 0000000..4866dfd --- /dev/null +++ b/scripts/mkfischer3dtabs_all.sh @@ -0,0 +1,48 @@ +#!/bin/sh +# +# File: mkfischer3dtabs_all.sh +# Author: Alex Stivala +# Created: October 2008 +# +# mkfischer3dtab.sh - make all LaTeX tables of results for MSVNS and +# QP tableau search on Fischer and Nh3D data sets +# +# Uses the mkfischer3dtab.sh script to make ecah table +# Muast be run at directoryu containg results/ and maxcmo_results/ dirs +# +# Usage: mkfischer3dtabs_all.sh +# +# $Id: mkfischer3dtabs_all.sh 2996 2009-11-30 05:25:05Z alexs $ +# + +if [ $# -ne 0 ]; then + echo "Usage: $0 " 2>&1 + exit 1 +fi + +tmpfile=/var/tmp/f3tab.$$ + +for dataset in fischer nh3d +do + for level in fold class arch + do + if [ \( $dataset = "fischer" -a $level = "arch" \) -o \( $dataset = "nh3d" -a $level = "fold" \) ]; then + continue + fi + outfile=results/${dataset}.${level}.textab + cat <${outfile} +\begin{tabular}{llrrrr} +\hline + & & & standard & \multicolumn{2}{c}{95\% confidence interval} \\\\ +Method & Normalization & AUC & error & lower & upper \\\\ +\hline +EOF + mkfischer3dtab.sh $dataset $level > $tmpfile + tabcolmax.sh $tmpfile 3 >> ${outfile} + cat <>${outfile} +\hline +\end{tabular} +EOF + done +done +rm ${tmpfile} diff --git a/scripts/mkfoldstab2.sh b/scripts/mkfoldstab2.sh new file mode 100755 index 0000000..a33ebde --- /dev/null +++ b/scripts/mkfoldstab2.sh @@ -0,0 +1,66 @@ +#!/bin/sh +# +# File: mkfoldstab2.sh +# Author: Alex Stivala +# Created: September 2008 +# +# mkfoldstab2.sh - make table of AUC and time for tableaux on some folds +# for two variations of method (with+without distance +# difference constraints) +# +# Usage: 
mkfoldstab2.sh +# +# Output is to stdout. +# +# Each row contains: +# +# Fold & SCOP sid & \# SSEs & AUC (without dist) & time (without dist) & AUC (with dist) & time (with dist) \\ +# e.g. +# $\beta$-grasp & \texttt{d1ubia\_} & 8 & 0.80 & 0 h 47 m & 0.92 & 0 h 32 m +# +# The mktabrow.sh script is called to make each row. +# The table is sorted by #SSEs +# +# Uses options specific to GNU sort +# Note may have problems with +# multiple backslash escapes used to format LaTeX table +# - this works on room0219pos09.cs.mu.oz.au +# but not charikar.cs.mu.oz.au (both Linux)! +# +# $Id: mkfoldstab2.sh 2204 2009-04-06 00:23:10Z astivala $ +# + +RTABS_DISCRETE="d1ubia_.tsrchd.tt.rtab d1ae6h1.tsrchd.tt.rtab d1bhne_.tsrchd.tt.rtab d1h6rb_.tsrchd.tt.rtab d1tttb1.tsrchd.tt.rtab d1tima_.tsrchd.tt.rtab d2phlb1.tsrchd.tt.rtab d1f6dc_.tsrchd.tt.rtab" + +RTABS_NODISTMATRIX_DISCRETE="d1ubia_.nodistmatrix.tsrchd.tt.rtab d1ae6h1.nodistmatrix.tsrchd.tt.rtab d1bhne_.nodistmatrix.tsrchd.tt.rtab d1h6rb_.nodistmatrix.tsrchd.tt.rtab d1tttb1.nodistmatrix.tsrchd.tt.rtab d1tima_.nodistmatrix.tsrchd.tt.rtab d2phlb1.nodistmatrix.tsrchd.tt.rtab d1f6dc_.nodistmatrix.tsrchd.tt.rtab" + +ftab_discrete_tmpfile=/tmp/ftab$$ +ftab_nodistmatrix_discrete_tmpfile=/tmp/ftabnd$$ + +cat < ${ftab_discrete_tmpfile} + +for fold in $RTABS_NODISTMATRIX_DISCRETE ; do + mktabrow.sh `basename ${fold} .rtab` +done | sort -t '&' -k2,2 > ${ftab_nodistmatrix_discrete_tmpfile} + +join -t\& -12 -22 ${ftab_nodistmatrix_discrete_tmpfile} ${ftab_discrete_tmpfile} | sed 's/\\\\//g' |awk -F\& '{print $2 "&" $1 "&" $3 "&" $4 "&" $5 "&" $8 "&" $9 "\\\\"}' | sort -t '&' -k3,3n + +cat <&1 + exit 1 +fi + +# cat <&1 + exit 1 +fi + +cat < 0: + sys.stderr.write("(queryid %s): set score to %f for %d domains\n" % (queryid, bottom_score, lowscore_domains)) + + if skip_self_query: + # skipping query against itelf, need to remove it from gold standrd + goldstd_domains = [d for d in goldstd_domains + if d.sid.lower() != queryid.lower()] + + if do_slrtab: + sys.stdout.write('# QUERY ID = ' + queryid + '\n') + auc = write_slrtab(sorted(allscores,reverse=True), + goldstd_domains, + use_nonredundant, scop, astral) + else: + auc = compute_auc_mann_whitney( + sorted(allscores,reverse=True), + goldstd_domains, + use_nonredundant, scop, astral) + total += auc + num += 1 + sys.stdout.write('%s %4.3f\n' % (queryid.lower(), auc)) + + avg = total / num + + if not do_slrtab: + sys.stdout.write('# AVERAGE = %f\n' % (avg)) + + +if __name__ == "__main__": + main() diff --git a/scripts/mksfcounttab.sh b/scripts/mksfcounttab.sh new file mode 100755 index 0000000..6979558 --- /dev/null +++ b/scripts/mksfcounttab.sh @@ -0,0 +1,138 @@ +#!/bin/sh +# +# File: mksfcounttab.sh +# Author: Alex Stivala +# Created: Mach 2009 +# +# mksfcounttab.sh - make table of superfamily counts +# +# Usage: mksfcounttab.sh +# +# Output is to stdout. +# Must be run from other_results directory after make has been run +# to build .sflist files in all subsidiary directories that are used. +# +# Each row contains: +# +# & & Number of superfamilies \\ +# Fold & SCOP sid & ProSMoS & SSM & TOPS & QP tabsearch & SCOP & P/R & S/R & T/R & Q/R \\ +# e.g. +# Immunoglobulin & \texttt{d1ae6h1} & 27 & 1 & 2 & 3 & & & & & \\ +# +# Note the SCOP column (number explicitly mentioned in SCOP ) and following +# depend on the .scopmentioned files WHICH MUST BE GENERATED MANUALLY +# (see README), so if any of the .sflist files actally change, +# the .scopmentioned files must be manually updated. 
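The evaluation loop earlier in this patch averages per-query AUC values obtained from compute_auc_mann_whitney(sorted(allscores, reverse=True), goldstd_domains, ...); that function's definition is not part of this patch. The quantity it names is the standard Mann-Whitney formulation of ROC AUC, which can be illustrated with the self-contained sketch below (not the library's actual implementation):

    def auc_mann_whitney(pos_scores, neg_scores):
        """ROC AUC via the Mann-Whitney U statistic: the probability that a
        randomly chosen positive (gold-standard) score exceeds a randomly
        chosen negative one, counting ties as 1/2."""
        wins = ties = 0
        for p in pos_scores:
            for n in neg_scores:
                if p > n:
                    wins += 1
                elif p == n:
                    ties += 1
        npairs = len(pos_scores) * len(neg_scores)
        return (wins + 0.5 * ties) / float(npairs)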
+# +# $Id: mksfcounttab.sh 2181 2009-03-31 03:17:24Z astivala $ +# + +RESULTS_DIRS="ProSMoS SSM tops/folds ../results" +SF_LISTS="d1ubia_.sflist d1tttb1.sflist d1ae6h1.sflist d1bhne_.sflist d1h6rb_.sflist d2phlb1.sflist d1tima_.sflist d1f6dc_.sflist" + + + +# write common lines between two files to stdount +# Parameters: +# file1 - filename of first file +# file2 - filename of secondfile +commonlines() { + file1=$1 + file2=$2 + tmpfile1=/var/tmp/msf$$.1 + tmpfile2=/var/tmp/msf$$.2 + sort $file1 > $tmpfile1 + sort $file2 > $tmpfile2 + comm -12 $tmpfile1 $tmpfile2 + rm ${tmpfile1} ${tmpfile2} +} + + +cat < $tmpufile + cat $others | sort | uniq > $tmpothers + comm -23 $tmpufile $tmpothers + rm $tmpufile $tmpothers +} + + +# write common lines between two files to stdout +# Parameters: +# file1 - filename of first file +# file2 - filename of secondfile +commonlines() { + file1=$1 + file2=$2 + tmpfile1=/var/tmp/msf$$.1 + tmpfile2=/var/tmp/msf$$.2 + sort $file1 > $tmpfile1 + sort $file2 > $tmpfile2 + comm -12 $tmpfile1 $tmpfile2 + rm ${tmpfile1} ${tmpfile2} +} + +# +# write abbreviation for method name based on results directory +# Paramters: +# results directory name +getmethodabbrev() { + if [ `basename ${resdir}` = "results" ]; then + method="QP tableau search" + else + method=`echo ${resdir} | cut -d/ -f1` + if [ "${method}" != "TableauSearch" -a "${method}" != "ProSMoS" ]; then + method=`echo "${method}" | tr a-z A-Z` + fi + fi + case "${method}" in + ProSMoS) + methodabbrev='P' + ;; + SSM) + methodabbrev='S' + ;; + TOPS) + methodabbrev='T' + ;; + "QP tableau search") + methodabbrev='Q' + ;; + esac + echo $methodabbrev +} + + +############################################################################# +# +# Main +# +############################################################################# + +cat < $tmpuniquesf + uniquesfinscop=${sid}.${methodabbrev}.uniquesfinscop + commonlines ${tmpuniquesf} ${scopmentioned} > $uniquesfinscop + rm $tmpuniquesf + done + + # count the total number of superfamilies that are uniquely + # found for each method, and which is also mentioned in SCOP: + # this is just the total number in all .uniquesfinscop files + # since they are disjoint sets (an entry in one cannot be in any + # other since they are those found ONLY by each method) + uniquescopcount=`wc -l ${sid}.*.uniquesfinscop | grep total | awk '{print $1}'` + printf '& %3d ' $uniquescopcount + + # now count the unique superfamilies and unique superfamilies mentioned + # in scop in the files just generated + # for each method and write them in the table + for resdir in ${RESULTS_DIRS} ; do + methodabbrev=`getmethodabbrev ${resdir}` + uniquesfinscop=${sid}.${methodabbrev}.uniquesfinscop + uniquesf_in_r_count=`wc -l ${uniquesfinscop} | awk '{print $1}'` + printf '& %s ' "${uniquesf_in_r_count}/${uniquescopcount}" + done + + echo '\\\' +done + +cat <&1 + exit 1 +fi + +basefilename=$1 + +dotindex=`expr index ${basefilename} '.'` +dotindex=`expr ${dotindex} - 1` +scopsid=`expr substr ${basefilename} 1 ${dotindex}` + +case ${scopsid} in + d1ubia_) + fold='$\beta$-grasp' + ;; + d1ae6h1) + fold='Immunoglobulin' + ;; + d1tima_) + fold='Tim-barrel' + ;; + d1bhne_) + fold='Plait (ferredoxin)' + ;; + d1h6rb_) + fold='GFP-like' + ;; + d1tttb1) + fold='Key-barrel' + ;; + d2phlb1) + fold='Jelly-roll' + ;; + d1f6dc_) + fold='NAD-binding fold' + ;; + *) + fold=`echo ${scopsid} | sed 's/_/\\\_/g'` +esac + +rtabfile=${RESULTSDIR}/${basefilename}.rtab +inputfile=${INPUTDIR}/${scopsid}.input 
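The elapsed-time handling just below copes with the two formats /usr/bin/time emits on Linux, H:MM:SS for runs over an hour and MM:SS.ss otherwise, using expr string surgery. The same parse expressed with a regular expression, as a hypothetical cross-check (like the shell code, it rounds minutes up when there are 30 or more seconds):

    import re

    def parse_elapsed(elapsed):
        """Parse '14:45:02' or '47:10.00' (the value before 'elapsed' in
        /usr/bin/time output) into (hours, minutes)."""
        m = re.match(r'(?:(\d+):)?(\d+):(\d+(?:\.\d+)?)$', elapsed)
        hours = int(m.group(1)) if m.group(1) else 0
        mins = int(m.group(2))
        secs = float(m.group(3))
        if secs >= 30:
            mins += 1
        return hours, mins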
+errfile=${RESULTSDIR}/${basefilename}.err + +auc=`tail ${rtabfile} | grep 'AUC = ' | cut -d' ' -f4` +numsses=`grep -i ${scopsid} ${inputfile} | awk '{print $2}'` + +# assume times are in this sort of format, as generated by /usr/bin/time +# on Linux: +#49576.89user 2907.19system 14:45:02elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k +#0inputs+0outputs (0major+504890689minor)pagefaults 0swaps +# or (for less than an hour): +#2603.51user 217.97system 47:10.00elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k +#0inputs+0outputs (0major+26943522minor)pagefaults 0swaps + +elapsed=`grep elapsed ${errfile} | awk '{print $3}' | sed 's/elapsed//'` +dotindex=`expr index ${elapsed} '.'` +if [ ${dotindex} -ne 0 ]; then + # less than an hour + colonindex=`expr index ${elapsed} ':'` + colonindex=`expr $colonindex - 1` + hours=0 + mins=`expr substr ${elapsed} 1 ${colonindex}` + secindex=`expr $colonindex + 2` + secs=`expr substr ${elapsed} ${secindex} 2` +else + colonindex=`expr index ${elapsed} ':'` + colonindex=`expr $colonindex - 1` + hours=`expr substr ${elapsed} 1 ${colonindex}` + next=`expr $colonindex + 2` + rest=`expr substr ${elapsed} $next 999` + colonindex=`expr index ${rest} ':'` + colonindex=`expr $colonindex - 1` + mins=`expr substr ${rest} 1 $colonindex` + next=`expr $colonindex + 2` + rest=`expr substr ${rest} $next 999` + secs=$rest +fi +if [ $secs -ge 30 ]; then + mins=`expr $mins + 1` +fi + +scopsid=`echo ${scopsid} | sed 's/_/\\\_/g'` +printf '%-20s & %-20s & %3d & %4.2f & %d h %02d m ' "${fold}" "\texttt{${scopsid}}" ${numsses} ${auc} ${hours} ${mins} +echo '\\\' + diff --git a/scripts/mktimertab.py b/scripts/mktimertab.py new file mode 100755 index 0000000..20e4c10 --- /dev/null +++ b/scripts/mktimertab.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# +# File: mktimertab.py +# Author: Alex Stivala +# Created: November 2009 +# +# +# mktimertab.sh - make table for R read.table(,header=TRUE) with CPU times +# from tsrchd_* -t output +# +# +# Usage: mktimertab.sh tsrdcd-t_outputfilename +# +# The input file is the output from tsrchd_* -t (3 columns: +# queryid, score, cputime), with QUERY ID and DBFILE In comments) +# from which we extract the scopsid as the first component (e.g. 
d1ubia_) +# and use ../data/queryid.input and the database file +# +# Output is to stdout, with each row containing +# +# queryid dbid querysses dbsses score cputime +# +# +# (Note that queryid and querysses will actually be the same for every row) +# +# $Id: mktimertab.py 2898 2009-11-02 04:14:06Z astivala $ +# + + +import os,sys + +# location of .input files (used for getting number of SSEs and db filename) +INPUTDIR = os.getenv("HOME") + "/phd/qptabsearch/data" + +if len(sys.argv) != 2: + sys.stderr.write("Usage: " + sys.argv[0] + " tsrchd-t_filename\n") + sys.exit(1) + +tsrchdfile = sys.argv[1] + + +sys.stdout.write('#' + ' '.join(sys.argv) + '\n') #identifying info about us +sys.stdout.write('# results from:\n') +firsttime = True +for line in open(tsrchdfile): + if line[:12] == "# QUERY ID =": + sys.stdout.write("# " + line) + queryid = line.split("=")[1].lstrip().rstrip().lower() + inputfile = os.path.join(INPUTDIR, queryid + ".input") + for infline in open(inputfile): + if infline[:len(queryid)].lower() == queryid: + querysses = infline.split()[1] + break + elif line[:10] == "# DBFILE =": + sys.stdout.write("# " + line) + dbfile = line.split("=")[1].lstrip().rstrip() + elif line[0] == "#" or len(line) == 0: + sys.stdout.write("# " + line) + else: + if firsttime: + sys.stdout.write("queryid dbid querysses dbsses score cputime\n") + firsttime = False + # build dict of number of SSEs in each database structure + dbnumsse_dict = {} # dict of {scopid : numsses} + for dbline in open(dbfile): + if dbline[0] == "d": + splitdbline = dbline.split() + dbnumsse_dict[splitdbline[0]] = splitdbline[1] + splitline = line.split() + dbid = splitline[0] + score = splitline[1] + cputime = splitline[2] + dbsses = dbnumsse_dict[dbid] + sys.stdout.write("%s %s %s %s %s %s\n" + % (queryid, dbid, querysses, dbsses, score, cputime)) diff --git a/scripts/normalize_all.sh b/scripts/normalize_all.sh new file mode 100755 index 0000000..a91cdb8 --- /dev/null +++ b/scripts/normalize_all.sh @@ -0,0 +1,117 @@ +#!/bin/sh +# +# File: normalize_all.sh +# Author: Alex Stivala +# Created: October 2008 +# +# normalize_all.sh - build all normalized scores files +# +# Usage: normalize_all.sh +# +# Run from the qptabsearch/ directory: processes files under +# results/fischer, results/nh3d, maxcmo_results/fishcer, maxcmo_results/nh3d +# etc., makeing norm1/ norm2/ norm3/ subdirectories under those, +# using also tableaux and contact map input files under data/ +# via scripts normalize_tabmatch.py and normalize_msvns4maxcmo.py +# +# The scripts/ directory (containing this scrpipt and the abovementioned ones) +# must be in the PATH +# +# $Id: normalize_all.sh 2092 2009-03-09 23:19:14Z astivala $ + +TABMATCH_FISCHER_DIR=results/fischer +TABMATCH_NH3D_DIR=results/nh3d +MSVNS_FISCHER_DIR=maxcmo_results/fischer +MSVNS_NH3D_DIR=maxcmo_results/nh3d + +TABLEAUX_FISCHER_DIR=data/fischer_db +TABLEAUX_NH3D_DIR=data/nh3d +CM_FISCHER_DIR=data/fischer_cm +CM_NH3D_DIR=data/nh3d_cm + +TABMATCH_QUERY200_DIR=results/query200 +TABLEAUXDB=/local/charikar/astivala/tableauxdb/astral/tableauxdistmatrixdb.full.ascii + +TABSEARCH_QUERY200_DIR=other_results/TableauSearch/query200 +TABSEARCHDB=/local/charikar/TableauSearchDB + +NORMTYPES="1 2 3" + +for norm in ${NORMTYPES} +do + echo "normalizing QP tableau search Fischer results with norm ${norm}..." + outdir=${TABMATCH_FISCHER_DIR}/norm${norm} + if [ ! 
-d ${outdir} ]; then + mkdir ${outdir} + fi + for infile in ${TABMATCH_FISCHER_DIR}/*.out + do + qid=`basename ${infile} .out` + outfile=${outdir}/${qid}.out + normalize_tabmatch.py ${norm} ${qid} ${TABLEAUX_FISCHER_DIR} < ${infile} > ${outfile} + done + echo "normalizing QP tableau search Nh3D results with norm ${norm}..." + outdir=${TABMATCH_NH3D_DIR}/norm${norm} + if [ ! -d ${outdir} ]; then + mkdir ${outdir} + fi + for infile in ${TABMATCH_NH3D_DIR}/*.out + do + qid=`basename ${infile} .out` + outfile=${outdir}/${qid}.out + # dodgy: remove periods so that CATH id fits in 8 chars... + # luckily we get no duplicates... + cathid=`echo ${qid} | tr -d .` + normalize_tabmatch.py ${norm} ${cathid} ${TABLEAUX_NH3D_DIR} < ${infile} > ${outfile} + done + + echo "normalizing MSVNS Fischer results with norm ${norm}..." + outdir=${MSVNS_FISCHER_DIR}/norm${norm} + if [ ! -d ${outdir} ]; then + mkdir ${outdir} + fi + for infile in ${MSVNS_FISCHER_DIR}/*.out + do + qid=`basename ${infile} .out` + outfile=${outdir}/${qid}.out + normalize_msvns4maxcmo.py ${norm} ${qid} ${CM_FISCHER_DIR} < ${infile} > ${outfile} + done + echo "normalizing MSVNS Nh3D results with norm ${norm}..." + outdir=${MSVNS_NH3D_DIR}/norm${norm} + if [ ! -d ${outdir} ]; then + mkdir ${outdir} + fi + for infile in ${MSVNS_NH3D_DIR}/*.out + do + qid=`basename ${infile} .out` + outfile=${outdir}/${qid}.out + # dodgy: remove periods so that CATH id fits in 8 chars... + # luckily we get no duplicates... + cathid=`echo ${qid} | tr -d .` + normalize_msvns4maxcmo.py ${norm} ${cathid} ${CM_NH3D_DIR} < ${infile} > ${outfile} + done + echo "normalizing QP tableau search query200 results with norm ${norm}..." + outdir=${TABMATCH_QUERY200_DIR}/norm${norm} + if [ ! -d ${outdir} ]; then + mkdir ${outdir} + fi + for infile in ${TABMATCH_QUERY200_DIR}/*.out + do + qid=`basename ${infile} .out` + outfile=${outdir}/${qid}.out +# echo $qid + normalize_tabmatch.py ${norm} ${qid} ${TABLEAUXDB} < ${infile} > ${outfile} + done + echo "normalizing TableauSearch query200 results with norm ${norm}..." + outdir=${TABSEARCH_QUERY200_DIR}/norm${norm} + if [ ! -d ${outdir} ]; then + mkdir ${outdir} + fi + for infile in ${TABSEARCH_QUERY200_DIR}/*.scores + do + qid=`basename ${infile} .scores` + outfile=${outdir}/${qid}.tabsearch.out + tableausearchout2col.py < ${infile} | normalize_tabmatch.py ${norm} ${qid} ${TABSEARCHDB} > ${outfile} + done +done + diff --git a/scripts/normalize_cops.sh b/scripts/normalize_cops.sh new file mode 100755 index 0000000..44f590e --- /dev/null +++ b/scripts/normalize_cops.sh @@ -0,0 +1,42 @@ +#!/bin/sh +# +# File: normalize_cops_multiquery.sh +# Author: Alex Stivala +# Created: October 2008 +# +# normalize_cops.sh - build all normalized scores files for COPS db +# +# Usage: normalize_cops.sh +# +# Run from the cops/ subdirectory: processes files under there +# makeing norm1/ norm2/ norm3/ subdirectories, +# using also tableaux files for COPS benchmark data set +# via script normalize_tabmatch.py +# +# The scripts/ directory (containing this scrpipt and the abovementioned ones) +# must be in the PATH +# +# $Id: normalize_cops.sh 3635 2010-05-12 06:48:14Z alexs $ + +TABMATCH_COPS_DIR=. + +TABLEAUXDB=/home/alexs/tableauxdb/COPS/COPS.tableaux_db_and_queries.ascii + + +NORMTYPES="1 2 3" + +for norm in ${NORMTYPES} +do + echo "normalizing tableau search COPS results with norm ${norm}..." + outdir=${TABMATCH_COPS_DIR}/norm${norm} + if [ ! 
-d ${outdir} ]; then + mkdir ${outdir} + fi + for infile in ${TABMATCH_COPS_DIR}/*.out + do + qid=`basename ${infile} .out` + outfile=${outdir}/${qid}.out + normalize_tabmatch.py ${norm} ${qid} ${TABLEAUXDB} < ${infile} > ${outfile} + done +done + diff --git a/scripts/normalize_fischer.sh b/scripts/normalize_fischer.sh new file mode 100644 index 0000000..e11d72e --- /dev/null +++ b/scripts/normalize_fischer.sh @@ -0,0 +1,42 @@ +#!/bin/sh +# +# File: normalize_fischer.sh +# Author: Alex Stivala +# Created: October 2008 +# +# normalize_fischer.sh - build all normalized scores files for Fischer db +# +# Usage: normalize_fischer.sh +# +# Run from the fischer/ subdirectory: processes files under there +# makeing norm1/ norm2/ norm3/ subdirectories, +# using also tableaux and contact map input files under data/ +# via script normalize_tabmatch.py +# +# The scripts/ directory (containing this scrpipt and the abovementioned ones) +# must be in the PATH +# +# $Id: normalize_fischer.sh 3565 2010-04-19 05:09:31Z alexs $ + +TABMATCH_FISCHER_DIR=. + +TABLEAUX_FISCHER_DIR=${HOME}/phd/qptabsearch/data/fischer_db + + +NORMTYPES="1 2 3" + +for norm in ${NORMTYPES} +do + echo "normalizing QP tableau search Fischer results with norm ${norm}..." + outdir=${TABMATCH_FISCHER_DIR}/norm${norm} + if [ ! -d ${outdir} ]; then + mkdir ${outdir} + fi + for infile in ${TABMATCH_FISCHER_DIR}/*.out + do + qid=`basename ${infile} .out` + outfile=${outdir}/${qid}.out + normalize_tabmatch.py ${norm} ${qid} ${TABLEAUX_FISCHER_DIR} < ${infile} > ${outfile} + done +done + diff --git a/scripts/normalize_fischer_multiquery.sh b/scripts/normalize_fischer_multiquery.sh new file mode 100755 index 0000000..179d12c --- /dev/null +++ b/scripts/normalize_fischer_multiquery.sh @@ -0,0 +1,42 @@ +#!/bin/sh +# +# File: normalize_fischer_multiquery.sh +# Author: Alex Stivala +# Created: October 2008 +# +# normalize_fischer.sh - build all normalized scores files for Fischer db +# +# Usage: normalize_fischer.sh +# +# Run from the fischer/ subdirectory: processes files under there +# makeing norm1/ norm2/ norm3/ subdirectories, +# using also tableaux and contact map input files under data/ +# via script normalize_tabmatch.py +# +# The scripts/ directory (containing this scrpipt and the abovementioned ones) +# must be in the PATH +# +# $Id: normalize_fischer_multiquery.sh 3571 2010-04-19 06:44:53Z alexs $ + +TABMATCH_FISCHER_DIR=. + +TABLEAUX_FISCHER_DIR=${HOME}/phd/qptabsearch/data/fischer_db + + +NORMTYPES="1 2 3" + +for norm in ${NORMTYPES} +do + echo "normalizing tableau search Fischer results with norm ${norm}..." + outdir=${TABMATCH_FISCHER_DIR}/norm${norm} + if [ ! -d ${outdir} ]; then + mkdir ${outdir} + fi + for infile in ${TABMATCH_FISCHER_DIR}/*.out + do + qid=`basename ${infile} .out` + outfile=${outdir}/${qid}.out + normalize_tabmatch.py ${norm} -m ${TABLEAUX_FISCHER_DIR} < ${infile} > ${outfile} + done +done + diff --git a/scripts/normalize_msvns4maxcmo.py b/scripts/normalize_msvns4maxcmo.py new file mode 100755 index 0000000..bac79ae --- /dev/null +++ b/scripts/normalize_msvns4maxcmo.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +############################################################################### +# +# normalize_msvns4maxcmo.py - normalize MSVNS4MACMO output scores by size +# +# File: normalize_msvns4maxcmo.py +# Author: Alex Stivala +# Created: September 2008 +# +# +# Given list of scores from MSVNS4MAXCMO (Pelta et al 2008) +# run with msvns4maxcmo_allall.py etc. +# normalize by protein size. 
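Both this script and normalize_tabmatch.py further down import norm1, norm2 and norm3 from a norms module that is not included in this patch; each takes (score, size1, size2) and rescales the raw match score by the sizes of the two structures. Purely to illustrate that shape (these are NOT the actual norms.py definitions), such functions might look like:

    # Hypothetical placeholders only -- not the norms.py used by these scripts.
    def norm_by_min(score, size1, size2):
        """Raw score divided by the size of the smaller structure."""
        return float(score) / min(size1, size2)

    def norm_by_mean(score, size1, size2):
        """Raw score divided by the mean of the two sizes."""
        return 2.0 * float(score) / (size1 + size2)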
+# +# Usage: +# normalize_msvns4maxcmo.py normtype queryid db_directory < scoresfile > matrixfile +# +# normtype is the normalization function: 1,2, or 3 +# queryid is the identifier for the query structure, all the scores are +# assumed to be for matching this to the identifier on each line of the input. +# Input is from stdin, tab-delimited in the format +# +# pdbid score +# +# Also reads .cm_a7.0 files from db_directory, as used by MSVNS4MAXCMO +# to get number of contacts in the contact map. +# +# Output is to stdout, in the same format as input i.e. +# +# pdbid normscore +# +# where normscore is the normalized score. +# +# $Id: normalize_msvns4maxcmo.py 1964 2008-10-08 06:20:53Z astivala $ +# +############################################################################### + +import sys,os,glob + +from norms import norm1,norm2,norm3 + +def usage(progname): + """ + Print usage message and exit + """ + sys.stderr.write("Usage: " + progname + " \n") + sys.exit(1) + + +def main(): + """ + main for normalize_tabmatch.py + """ + if len(sys.argv) != 4: + usage(os.path.basename(sys.argv[0])) + + normtype = int(sys.argv[1]) + queryid = sys.argv[2] + db_directory = sys.argv[3] + if normtype < 1 or normtype > 3: + sys.stderr.write('normtype must be 1,2, or 3\n') + usage(sys.argv[0]) + + # get numbers of SSEs from .tableauxdistmatrix files in db_diurectory + # build dict of {name : size} + size_dict = {} + input_list = glob.glob(os.path.join(db_directory, '*.cm_a7.0')) + for dbfile in input_list: + splitfile = os.path.basename(dbfile).split('.') + qid = reduce(lambda a,b:a+b, splitfile[:len(splitfile)-2]).upper() + if not qid.isdigit(): # if not 'compressed' CATH id (periods removed) + qid = reduce(lambda a,b:a+'.'+b, splitfile[:len(splitfile)-2]).upper() + firstline = open(dbfile).readline() + num_residues = int(firstline) + num_contacts = 0 + lineno = 0 + fh = open(dbfile) + for line in fh: + if lineno == 0: + lineno += 1 + continue # first line is number of residues + if line.lstrip().rstrip() == '': + lineno += 1 + continue + sline = line.split() + if sline[0].isdigit() and sline[1].isdigit(): + num_contacts += 1 + lineno += 1 + fh.close() + # size is number of contacts in contact map, not number of residues + size_dict[qid] = num_residues + # FIXME NB this is SUPPOSED to be num_contacts, but actually + # num_residues works better and is closer to Pelta et al 2008 + # published results when measured for accuracy with AUC +# size_dict[qid] = num_contacts + + # build list of (name , score) + scorelist = [] + commentlines = [] + for line in sys.stdin: + if line[0] == '#': + commentlines.append(line) + continue + s = line.split() + scorelist.append((s[0], float(s[1]))) + + querysize = size_dict[queryid.upper()] + + sys.stdout.write('# normalization type ' + str(normtype) + '\n') + sys.stdout.write('# generated by ' + os.path.basename(sys.argv[0]) +'\n') + sys.stdout.write('# from:\n') + for line in commentlines: + sys.stdout.write(line) + sys.stdout.write('#\n') + + for (pdbid, score) in scorelist: + dsize = size_dict[pdbid.upper()] + if normtype == 1: + normscore = norm1(score, querysize, dsize) + elif normtype == 2: + normscore = norm2(score, querysize, dsize) + elif normtype == 3: + normscore = norm3(score, querysize, dsize) + else: + raise ValueError('unknown norm type ' + str(normtype) + '\n') + sys.stdout.write('%s\t%20.8f\n' % (pdbid, normscore)) + + +if __name__ == "__main__": + main() + diff --git a/scripts/normalize_multiquery200.sh b/scripts/normalize_multiquery200.sh new file mode 
100755 index 0000000..0aa3267 --- /dev/null +++ b/scripts/normalize_multiquery200.sh @@ -0,0 +1,39 @@ +#!/bin/sh +# +# File: normalize_multiquery200.sh +# Author: Alex Stivala +# Created: February 2010 +# +# normalize_multiquery200.sh - build normalized scores for query200 set +# +# Usage: normalize_multiquery200.sh +# +# Run from the query200 results directory: processes files under there +# makeing norm1/ norm2/ norm3/ subdirectories, +# using also tableaux and contact map input files under data/ +# via scripts normalize_tabmatch.py +# +# The scripts/ directory (containing this scrpipt and the abovementioned ones) +# must be in the PATH +# +# $Id: normalize_multiquery200.sh 3345 2010-02-16 03:42:05Z alexs $ + +TABMATCH_QUERY200_DIR=. +TABLEAUXDB=/home/alexs/tableauxdb/ASTRAL-1.75/tableauxdistmatrixdb.ascii + +NORMTYPES="1 2 3" + +for norm in ${NORMTYPES} +do + echo "normalizing query200 results with norm ${norm}..." + outdir=${TABMATCH_QUERY200_DIR}/norm${norm} + if [ ! -d ${outdir} ]; then + mkdir ${outdir} + fi + for infile in ${TABMATCH_QUERY200_DIR}/*.out + do + outfile=${outdir}/`basename $infile .out`.out + normalize_tabmatch.py ${norm} -m ${TABLEAUXDB} < ${infile} > ${outfile} + done +done + diff --git a/scripts/normalize_query200.sh b/scripts/normalize_query200.sh new file mode 100755 index 0000000..136b7e2 --- /dev/null +++ b/scripts/normalize_query200.sh @@ -0,0 +1,41 @@ +#!/bin/sh +# +# File: normalize_query200.sh +# Author: Alex Stivala +# Created: October 2008 +# +# normalize_query200.sh - build all normalized scores files for query200 set +# +# Usage: normalize_query200.sh +# +# Run from the query200 results directory: processes files under there +# makeing norm1/ norm2/ norm3/ subdirectories, +# using also tableaux and contact map input files under data/ +# via scripts normalize_tabmatch.py +# +# The scripts/ directory (containing this scrpipt and the abovementioned ones) +# must be in the PATH +# +# $Id: normalize_query200.sh 2092 2009-03-09 23:19:14Z astivala $ + +TABMATCH_QUERY200_DIR=. +TABLEAUXDB=/home/alexs/tableauxdb/ASTRAL-1.75/tableauxdistmatrixdb.ascii + +NORMTYPES="1 2 3" + +for norm in ${NORMTYPES} +do + echo "normalizing QP tableau search query200 results with norm ${norm}..." + outdir=${TABMATCH_QUERY200_DIR}/norm${norm} + if [ ! -d ${outdir} ]; then + mkdir ${outdir} + fi + for infile in ${TABMATCH_QUERY200_DIR}/*.out + do + qid=`basename ${infile} .out` + outfile=${outdir}/${qid}.out +# echo $qid + normalize_tabmatch.py ${norm} ${qid} ${TABLEAUXDB} < ${infile} > ${outfile} + done +done + diff --git a/scripts/normalize_tabmatch.py b/scripts/normalize_tabmatch.py new file mode 100755 index 0000000..63c9be2 --- /dev/null +++ b/scripts/normalize_tabmatch.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python +############################################################################### +# +# normalize_tabmatch.py - normalize tableau match output scores by protein size +# +# File: normalize_tabmatch.py +# Author: Alex Stivala +# Created: September 2008 +# +# +# Given list of scores from QP tableau search (tsrchd_sparse etc.) output, +# normalize the scores by proteni size. +# +# Usage: +# normalize_tabmatch.py normtype +# +# normtype is the normalization function: 1,2, or 3 +# queryid is the identifier for the query structure, all the scores are +# assumed to be for matching this to the identifier on each line of the input. 
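+# As a minimal sketch (hypothetical identifiers, SSE counts and score), the
+# single-query normalization applied here is just a size-corrected rescaling
+# using the functions in norms.py (the -m multiquery mode described below
+# does the same thing once per query):
+#
+#   from norms import norm2
+#   size = {'D1UBIA_': 8, 'D1WIUA_': 10}        # SSE counts parsed from the db
+#   qsize = size['D1UBIA_']                     # query structure is d1ubia_
+#   for dbid, score in [('d1wiua_', -23.0)]:    # "pdbid score" lines from stdin
+#       print '%s\t%20.8f' % (dbid, norm2(score, qsize, size[dbid.upper()]))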
+# If -m is specified instead of a queryid, it is a multiquery input file with +# multiple queries, each with a '# QUERY ID = queyrid' comment line to +# delimit each query: the output file is then also a multiquery file. +# Input is from stdin, tab-delimited in the format +# +# pdbid score +# +# Also reads .tableuxdistmatrix files from db_directory, as used by +# qptabmatch_allparirs.py, tsrchd_sparse etc., to get +# orders of tableaux (number of SSEs) +# required for normalization. +# If db_file is specified instead of db_directory, just reads tableaux/ +# distmatrix db from that file. +# +# Output is to stdout, in the same format as input i.e. +# +# pdbid normscore +# +# where normscore is the normalized score. +# +# $Id: normalize_tabmatch.py 3345 2010-02-16 03:42:05Z alexs $ +# +############################################################################### + +import sys,os,glob +from itertools import groupby +from norms import norm1,norm2,norm3 +from tsevalutils import iter_searchresult + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +def parse_tableauxdb_sizes(fh): + """ + Parse the dimensinos of tableaux from the ascii tableauxdb + + Parameters: + fh - open (read) filehandle for ascii tableauxdb file (numeric or + discrete) + Return value: + dictionary { domid : dim } mapping domain identifiers to tableau + dimension (numer of sses) + """ + dimdict = {} + line = fh.readline() + while line != "": + (domid, sdim) = line.split() + dim = int(sdim) + dimdict[domid.lstrip().rstrip().upper()] = dim + i = 0 + while i < dim * 2: + line = fh.readline() # read through the tableau and dist matrix + i += 1 + line = fh.readline() # read the blank line between entries + line = fh.readline() + return dimdict + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + sys.stderr.write("Usage: " + progname + " \n") + sys.exit(1) + + +def main(): + """ + main for normalize_tabmatch.py + """ + multiquery = False + + if len(sys.argv) != 4: + usage(os.path.basename(sys.argv[0])) + + normtype = int(sys.argv[1]) + queryid = sys.argv[2] + if queryid == "-m": + multiquery = True + db_directory = sys.argv[3] + if normtype < 1 or normtype > 3: + sys.stderr.write('normtype must be 1,2, or 3\n') + usage(sys.argv[0]) + + # get numbers of SSEs from .tableauxdistmatrix files in db_diurectory + # build dict of {name : size} + size_dict = {} + if (not os.path.isdir(db_directory)): + # all tableaux/distmatrix in one db file, not dirctory of them + dbfile = db_directory + size_dict = parse_tableauxdb_sizes(open(dbfile)) + else: + input_list = glob.glob(os.path.join(db_directory, '*.tableaudistmatrix')) + if len(input_list) == 0: + # try .angles files instead + input_list = glob.glob(os.path.join(db_directory, '*.angles')) + for dbfile in input_list: + qid = os.path.splitext(os.path.splitext(os.path.basename(dbfile))[0])[0].upper() + firstline = open(dbfile).readline() + if len(firstline) < 1: + # can happen if no SSEs + # sys.stderr.write('skipped ' + dbfile + '\n') + continue + qsize=int(firstline) + size_dict[qid] = qsize + else: + for dbfile in input_list: + idline = open(dbfile).readline() + if len(idline) < 2: + # can happen if no SSEs + # sys.stderr.write('skipped ' + dbfile + '\n') + 
continue + qid = idline[:8].lstrip().rstrip().upper() + qsize = int(idline[8:]) + size_dict[qid] = qsize + + # build list of (name , score) + scorelist = [] + commentlines = [] + if multiquery: + # get list of iterables each for same queryid. + # iter_searchresult() is isterable of tuples (queryid, score, + # domainid) groupby() requires that the iterable already has + # identical consecutive queryids (first element of tuple) - + # iter_searchresult() should yield this + query_group_iter = groupby(iter_searchresult(sys.stdin,multiquery=True, + skip_self_query=False, + negateflag=False), + lambda t : t[0]) + + else: + for line in sys.stdin: + if line[0] == '#': + commentlines.append(line) + continue + s = line.split() + if len(s) != 2: + sys.stderr.write('skipped line ' + line) + continue + score_str = s[1] + if score_str.lower() == 'nan' or score_str == '********': + sys.stderr.write('skipped line ' + line) + continue + scorelist.append((s[0], float(score_str))) + query_group_iter = [ ( queryid, [ (queryid,score,domainid) + for (domainid,score) in scorelist ] ) ] + + for (queryid, result_iter) in query_group_iter: + querysize = size_dict[queryid.upper()] + + sys.stdout.write('# normalization type ' + str(normtype) + '\n') + sys.stdout.write('# generated by ' + os.path.basename(sys.argv[0]) +'\n') + sys.stdout.write('# from:\n') + for line in commentlines: + sys.stdout.write(line) + sys.stdout.write('#\n') + + if multiquery: + sys.stdout.write('# QUERY ID = ' + queryid + '\n') + + skipcount = 0 + for (queryid, score, pdbid) in result_iter: + try: + dsize = size_dict[pdbid.upper()] + except KeyError: + skipcount += 1 + + if normtype == 1: + normscore = norm1(score, querysize, dsize) + elif normtype == 2: + normscore = norm2(score, querysize, dsize) + elif normtype == 3: + normscore = norm3(score, querysize, dsize) + else: + raise ValueError('unknown norm type ' + str(normtype) + '\n') + sys.stdout.write('%s\t%20.8f\n' % (pdbid, normscore)) + + + +if __name__ == "__main__": + main() + diff --git a/scripts/norms.py b/scripts/norms.py new file mode 100755 index 0000000..d9421bc --- /dev/null +++ b/scripts/norms.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +############################################################################### +# +# norms.py - functions to normalize tableau match score by protein size +# +# File: norms.py +# Author: Alex Stivala +# Created: September 2008 +# +# $Id: norms.py 3585 2010-04-29 03:56:03Z alexs $ +# +############################################################################### +""" + Contains function to normalize tableua matching scores by number + of SSEs, analogous to the normalizatino functions used in Pelta et al 2008 + BMC Bioinformatics 9:161 + + Note comments on these functions refer to #SSEs for normalizing tableau + match score, but actually they can equally well be used for normalizing + MAX-CMO scores where size is number of contacts, as original use + in Pelta et al 2008. + + Also note using these functions with QP tableau search (size is + number of SSES) does not 'normalize' the results into [0,1] either, + since max score is actually n(n-1) where n is #SSEs in smaller structure. + TODO: should have different normalization functions for tableau search + +""" +import sys,os,glob +import numpy.oldnumeric as Numeric + + + +def norm1(score, size1, size2): + """ + Normalization similar to norm1 in Pelta et al 2008 (from Lancia et al + 2006) for MAX-CMO. 
+ + norm1(struct1,struct2) = tabmatch_score(struct1,struct2) / + min(#sses(struct1), #sses(struct2)) + + Parameters: + score - tableau match score for the two structures + size1 - number of SSEs in one structure + size2 - number of SSEs in other structure + + Return value: + normalized score as described above. + """ + return score / float(min(size1, size2)) +# n = min(size1,size2) +# if n < 2: +# n = 2 +# return score / float(n*(n-1)) + + +def norm2(score, size1, size2): + """ + Normalization similar to norm2 in Pelta et al 2008 (from Xie & Sahinidis + 2006) for MAX-CMO. + + norm1(struct1,struct2) = 2*tabmatch_score(struct1,struct2) / + (#sses(struct1) + #sses(struct2)) + + Parameters: + score - tableau match score for the two structures + size1 - number of SSEs in one structure + size2 - number of SSEs in other structure + + Return value: + normalized score as described above. + """ + return 2 * score / float(size1 + size2) + + +def norm3(score, size1, size2): + """ + Normalization similar to norm3 in Pelta et al 2008 for MAX-CMO. + + norm1(struct1,struct2) = 0 if #SSE differnce > 75% + norm1(struct1,struc2) otherwise + + Parameters: + score - tableau match score for the two structures + size1 - number of SSEs in one structure + size2 - number of SSEs in other structure + + Return value: + normalized score as described above. + """ + if float(abs(size1 - size2)) / float(max(size1,size2)) > 0.75: + return 0 + else: + return norm1(score, size1, size2) + + diff --git a/scripts/out2col2html.sh b/scripts/out2col2html.sh new file mode 100755 index 0000000..d324160 --- /dev/null +++ b/scripts/out2col2html.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# +# File: out2col2html.sh +# Author: Alex Stivala +# Created: October 2009 +# +# out2col2html.sh - Convert 2-colum (dbid score) format to HTML format +# +# Usage: out2col2html.sh < qptabsearchoutput +# +# Output from qp tableau search has two columns, database id and matching score. +# This converts to HTML formatted text, with each dbid given link to +# pro-origami webserver database prebuilt cartoon for that ASTRAL SCOP sid, +# and to SCOP entry. +# +# Output is to stdout. +# +# $Id: out2col2html.sh 3090 2009-12-20 06:06:14Z alexs $ +# + +echo '' +echo '' +echo '
' +echo '' +awk '{printf("\n"),$2,$1,$1,$1,$1}' +echo '
%s %sSCOP entry for %s
' +echo '
' +echo '' + + diff --git a/scripts/out2col2htmlscop.py b/scripts/out2col2htmlscop.py new file mode 100644 index 0000000..d82d18a --- /dev/null +++ b/scripts/out2col2htmlscop.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python +############################################################################### +# +# out2col2htmlscop.py - Convert 2 column format (score,domainid) to HTML +# with SCOP information and links to SCOP +# +# See usage comment in docstring for main() +# +# File: out2col2htmlscop.py +# Author: Alex Stivala +# Created: March 2010 +# +# $Id: out2col2htmlscop.py 2038 2008-11-25 05:39:26Z astivala $ +# +############################################################################### + +""" +Converts the 2 column output from qptabsearch, SA tab search, etc. +HTML formatted text, with each dbid given link to +pro-origami webserver database prebuilt cartoon for that ASTRAL SCOP +sid, with the selected SSEs indicated to be highlighted, and link to +SCOP entry, also SCOP superfamily sccs and fold description. + +Output is to stdout. + +The cache file is a python pickle dictionary: + scopdominfo_dict - + dict {sid: (superfamily_sccs, superfamily_description, fold_sccs fold_description)} + where + superfamily_sccs is SCOP sccs identifying the superfamily for the domain + superamily_description is SCOP dessription of the superfamily + fold_sccs is SCOP sccs of the fold it is in + fold_description is the SCOP descriptino of the fold the domain is in + +""" + +import sys,os +import pickle + + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " cachefile\n") + sys.exit(1) + + +def main(): + """ + main for out2col2htmlscop.py + + Usage: out2col2htmlscop.py cachefile + + + cachefile is the filename of the cache (pickled) file built by + build_fastscopdominfo_cache.py + + Input is on stdin, the output of soln2ssemap.py, + format is identifier and score + + Ouput is HTML format text with each dbid given link to pro-origami + webserver database prebuilt cartoon for that ASTRAL SCOP sid, with + the selected SSEs indicated to be highlighted, and link to SCOP + entry, also superfamily sccs id with link to SCOP and fold + description. + + """ + + if len(sys.argv) != 2: + usage(os.path.basename(sys.argv[0])) + + pickle_filename = sys.argv[1] + scopdominfo_dict = pickle.load(open(pickle_filename)) + + + print '' + print '' + print '
' + print '' + + print '' + + for line in sys.stdin: + splitline = line.split() + sid = splitline[0] + score = splitline[1] + entry = scopdominfo_dict[sid] + sf_sccs = entry[0] + sf_desc = entry[1] + fold_sccs = entry[2] + fold_desc = entry[3] + + sys.stdout.write("\n" + % (score, + sid, + sid, sid, + sid, + sf_sccs, sf_sccs, sf_desc, + fold_sccs, fold_desc) + ) + + print '
scorecartoonSCOP entrysuperfamilyfold
%s %s%s%s %s%s
' + print '
' + print '' + + +if __name__ == "__main__": + main() diff --git a/scripts/parsepdom.py b/scripts/parsepdom.py new file mode 100644 index 0000000..8c1f420 --- /dev/null +++ b/scripts/parsepdom.py @@ -0,0 +1,149 @@ +############################################################################### +# +# parsepdom.py - functions to parse the pDomains domain benchmark files +# +# File: parsepdom.py +# Author: Alex Stivala +# Created: December 2007 +# +# $Id: parsepdom.py 871 2007-12-30 05:30:22Z astivala $ +# +############################################################################### + +""" +Functions to parse the pDomains protein domain decomposition benchmark +data files. + +These files are available from http://pdomains.sdsc.edu + +and the data and benhmarks are described in + +Veretnik et al 2004 'Toward Consistent Assignment of Structural Domains in +Proteins' J. Mol. Biol. 339(3):647-678 + +and + +Holland et al 2006 'Partitioning Protein Structures into Domains: Why is it so +Difficult?' J. Mol. Biol. 361:562-590 + +""" + +import os,sys + +from ptdomain import * + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +def parse_pdomains_file(fh): + """ + Parse a a pDomains file in the full (not raw) format, which has results + of multiple methods for each chain. We will build a dictionary + indexed first by chain (pdb and possibly chain identifier) + where each entry is a dictionary indexed by method + (PDP, DomainParser, DALI, NCBI, STERNBERG, CATH, SCOP). The values + for each of these methods is then the domain decomposition in the + form a list of PTDomain (ptdomain.py) objects. + + Note that the STERNBERG method is from what is usually known as the AUTHORS + assignment, from Islam et al 1995 'Identification and analysis of domains + in proteins' Protein Engineering 8(6):513-525. + + This was developed with files downloaded from http://pdomains.sdsc.edu + on 28Dec2007. + + Parameters: + fh - filehandle open for read of a pDomains file (such as + Benchmark_1_467) + Return value: + dictionary as described above + """ + + # Entries look like this: + # + # Chain: 3adk + # + # Method: PDP + # Number of domains 2 + # Domain name: 3adk_a + # Number of fragments in this domain: 2 + # Position of the fragment 1 start:1, end:37 + # Position of the fragment 2 start:76, end:194 + # Domain name: 3adk_b + # Number of fragments in this domain: 1 + # Position of the fragment 1 start:38, end:75 + # + + # NOTE: DALI method tends to often put end residue sequence number 0 + # don't know what to do with that so we will omit DALI. 
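+    # So, as a sketch of the return value (using the 3adk entry above, and
+    # assuming those are all the fragments listed for it), the caller gets
+    # nested dictionaries like:
+    #
+    #   chainid_dict['3ADK']['PDP'] -> [ PTDomain '3adk_a' (fragments 1-37, 76-194),
+    #                                    PTDomain '3adk_b' (fragment 38-75) ]
+    #
+    # i.e. chain identifier (upper case) -> method name -> list of PTDomain objects.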
+ + chainid_dict = {} # { chainid : method_dict } + method_dict = {} # { method string : list of PTDomain objects } + domainlist = [] + domain = None + segment = None + method = None + for line in fh: + if line[:7] == " Chain:": # found a new entry, finish the old one + if segment: + domain.add_segment(segment) + domainlist.append(domain) + if method != "DALI": # have to omit DALI for now + method_dict[method] = domainlist + if method_dict: + chainid_dict[chainid] = dict(method_dict) + chainid = line[8:].lstrip().rstrip('\n').rstrip().upper() + if len(chainid) > 4: + chainchar = chainid[4] + else: + chainchar = 'A' # use chain A by default (for remediated pdb) + domainlist = [] + domain = segment = method = None + elif line[:8] == " Method:": # new method, finish the old one + if segment: + if domain: + domain.add_segment(segment) + domainlist.append(domain) + if method != "DALI": # we'll have to omit DALI for now + method_dict[method] = domainlist + domainlist = [] + domain = segment = None + method = line[9:].lstrip().rstrip('\n').rstrip() + elif line[:12] == "Domain name:": # new domain, finish the old one + if domain: + if segment: + domain.add_segment(segment) + domainlist.append(domain) + domain = PTDomain(line[13:].lstrip().rstrip('\n').rstrip(), []) + segment = None + elif line[:24] == "Position of the fragment": + if segment: + domain.add_segment(segment) # new segment, finish the old one + startline = line[line.index("start:"):] + start_resnum = int(startline[6 : startline.index(',')]) + endline = line[line.index("end:"):] + end_resnum = int(endline[4:].rstrip('\n')) + try: +# print chainid,method + segment = PTSegment(chainchar, start_resnum, end_resnum) + except ValueError: + if method == "DALI": + pass # DALI puts 0 as end res seq num, we'll omit it anyway + else : + sys.stderr.write('WARNING: chainid ' + chainid + ' method ' + + method + ': end before start, ' + 'ignoring fragment\n') + segment = None + + if segment: + domain.add_segment(segment) + domainlist.append(domain) + if method != "DALI": # have to omit DALI for now + method_dict[method] = domainlist + chainid_dict[chainid] = dict(method_dict) + + return chainid_dict diff --git a/scripts/parsessemap.py b/scripts/parsessemap.py new file mode 100644 index 0000000..91d7807 --- /dev/null +++ b/scripts/parsessemap.py @@ -0,0 +1,133 @@ +############################################################################### +# +# parsessemap.py - Functions to parse soln2ssemap.py output +# +# File: parsessemap.py +# Author: Alex Stivala +# Created: June 2008 +# +# $Id: parsessemap.py 3120 2009-12-24 04:42:47Z alexs $ +# +############################################################################### + +""" +Parse output of soln2ssepmap.py +""" +import sys,os + + + +#----------------------------------------------------------------------------- +# +# Class definitions +# +#----------------------------------------------------------------------------- + +class SearchMap: + """ + SearchMap is just a dummy class for containing the search results + with solution vectors, returned by parse_ssemap() + """ + pass + +class QuerySSEMap: + """ + QuerySSEMap is a dummy class for containign result from individual query, + in SearchMap. 
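+    The attributes set by parse_ssemap() below are domid, score and sse_map.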
+ """ + pass + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def parse_ssemap(fh): + """ + Parse the output of soln2ssemap.py; + identifier and score (as per input), then + for each matching a line containing + i and j separated by a space, + one per line (with blank line before next id) e.g.: + + # soln2ssemap.py processed: + # + # TSRCHD LTYPE = T LORDER = F LSOLN = T + # QUERY ID = D1KI9A_ + # DBFILE = /home/astivala/tableauxdb.ascii + # Tue Aug 5 12:31:50 2008 + # + # on 05Aug2008 15:36:05 + # + d1wiua_ -23.0000 + 1 1 + 3 2 + 8 4 + 9 5 + 11 6 + 14 9 + + Note the header 'comment' information is required, we get the QUERY ID + from it. + + Parameters: + fh - open (read) filehandle to parse from + + Return value: + search_maps - dummy class SearchMap containing: + queryid - query identifier parsed from comments + query_ssemap_list - list of dummy class QuerySSEMap containing: + domid - id of domain in db + score - score of queryid against domid + sse_map - list of (i,j) SSE sequential index tuples + comment_lines - list of comment lines read + + """ + search_maps = SearchMap() + query_ssemap_list = [] + query_ssemap = None + search_maps.comment_lines = [] + for line in fh: + if line[0] == '#': + sline = line[1:].split('=') + if sline[0].lstrip().rstrip() == 'QUERY ID': + search_maps.queryid = sline[1].lstrip().rstrip().lower() + search_maps.comment_lines.append(line) + continue + elif len(line) < 2: + continue #skip blank lines + elif not line.split()[0].isdigit(): # new identifier + if query_ssemap: + query_ssemap_list.append(query_ssemap) # finsished with prev one + splitline = line.split() + if len(splitline) != 2: + sys.stderr.write('bad line: ' + line + '\n') + continue + domainid = splitline[0] + score_str = splitline[1] + query_ssemap = QuerySSEMap() + query_ssemap.domid = domainid + query_ssemap.sse_map = [] + query_ssemap.score = float(score_str) + else: + # line should be two SSE indices spearated by space + try: + sline = line.split() + i = int(sline[0]) + j = int(sline[1]) + except ValueError: + sys.stderr.write('bad line: ' + line + '\n') + # skip whole query of a value is bad + if query_ssemap: + sys.stderr.write('skipping ' + query_ssemap.domid + '\n') + query_ssemap = None + continue + query_ssemap.sse_map.append((i,j)) + if query_ssemap: + query_ssemap_list.append(query_ssemap) + + search_maps.query_ssemap_list = query_ssemap_list + return search_maps + diff --git a/scripts/pathdefs.py b/scripts/pathdefs.py new file mode 100644 index 0000000..89c2c39 --- /dev/null +++ b/scripts/pathdefs.py @@ -0,0 +1,52 @@ +############################################################################### +# +# pathdefs.py - Definitions of pathnames: edit for your system. +# +# File: pathdefs.py +# Author: Alex Stivala +# Created: August 2008 +# +# $Id: pathdefs.py 3485 2010-03-17 04:48:13Z alexs $ +# +############################################################################### +""" +Locations of directory hierarchies for ASTRAL SCOP etc. +Edit appropriately for your system. + +SCOP and ASTRAL data is obtained using the Bio.SCOP library (Casbon et +al 2006 'A high level interface to SCOP and ASTRAL implemented in +Python' BMC Bioinformatics 7:10) and depends on having the data +downloaded, in SCOP_DIR (defined below). 
+ +Downloaded SCOP files from + +http://scop.mrc-lmb.cam.ac.uk/scop/parse/index.html + +and ASTRAL files (in scopseq-1.73) from + +http://astral.berkeley.edu/scopseq-1.73.html + +The files downlaoded are: + +/local/charikar/SCOP/: +dir.cla.scop.txt_1.73 +dir.des.scop.txt_1.73 +dir.hie.scop.txt_1.73 + +/local/charikar/SCOP/scopseq-1.73: +astral-scopdom-seqres-all-1.73.fa +astral-scopdom-seqres-sel-gs-bib-95-1.73.id + +Other files there are indices built by Bio.SCOP when first used. + +""" + +# location of ASTRAL PDB-style coordinate files divided hierarchy +ASTRAL_ROOT = "/usr/local/ASTRAL/pdbstyle-1.75" + +# location of SCOP dir files +#SCOP_DIR = "/usr/local/SCOP" +SCOP_DIR = "/home/alexs/SCOP" + +# SCOP version to use +SCOP_VERSION = 1.75 diff --git a/scripts/pdbid2scopsid.py b/scripts/pdbid2scopsid.py new file mode 100755 index 0000000..21c47ab --- /dev/null +++ b/scripts/pdbid2scopsid.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +############################################################################### +# +# pdbid2scopsid.py - Convert PDB identifier with chain id to SCOP sid +# +# File: pdbid2scopsid.py +# Author: Alex Stivala +# Created: March 2010 +# +# $Id: pdbid2scopsid.py 3471 2010-03-15 23:13:09Z alexs $ +# +############################################################################### + +""" +Usage: pdbid2scopsid.py < pdbidlist + +Convert a list of PDB identifiers (one per line) +with chain identifiers as used by DaliLite +e.g. 1qlpA to SCOP sid e.g. d1qlpa_ + +The reverse (SCOP sid to PDB id with chain) can be done with just + + sed 's/.$/\U&/g' + +but this is more complicated since we need to get the right character +on the end (_ or a domain id) and also in the latter case there +is ambiguity that we need +to resolve e.g. 1u6rA could be either d1u6ra1 or d1u6ra2. +A further complication arises with the 'genetic domain' type ASTRAL +structures e.g. d1mtp.1 (g1mtp.1 in the sequence and id SCOP files, another +complication (see the diff to Bio/SCOP/__init__.py for this) +which is 1mtpA and 1mtpB in the PDB DaliLite scheme (2 chains, both in +same domain). +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os +import getopt + + +from Bio.SCOP import * + +from tsevalutils import filter_domains_astral_nrpercent + +from pathdefs import SCOP_DIR,SCOP_VERSION + + +def pdbid_to_scopsid(pdbid, all_scopsids_dict): + """ + Convert a PDB id with chain to SCOP sid, as per description in + module header docstring + + Parameters: + pdbid - pdb id with chain e.g. 1u6rA + all_scopsids_dict - dict (scopsid, True) of all SCOP sids to check + + Return value: + scop sid e.g. 
d1u6ra2 + """ + scopsid = 'd' + pdbid.lower() + '_' + if not all_scopsids_dict.has_key(scopsid): + sid_list = [sid for sid in all_scopsids_dict.keys() if + sid[1:6] == scopsid[1:6]] + if len(sid_list) < 1: + sid_list = [sid for sid in all_scopsids_dict.keys() if + sid[1:5] == scopsid[1:5]] + if len(sid_list) > 0: + scopsid = sorted(sid_list)[0] # always take lowest domain id FIXME + else: + scopsid="UNKNOWN" + return scopsid + + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + + " < pdbidlist\n") + sys.exit(1) + + +def main(): + """ + main for pdbid2scopsid + see usage in module docstring + """ + + + if len(sys.argv) != 1: + usage(os.path.basename(sys.argv[0])) + + # read SCOP and ASTRAL data + sys.stderr.write('reading SCOP data...\n') + scop = Scop(dir_path=SCOP_DIR,version=SCOP_VERSION) + astral = Astral(dir_path=SCOP_DIR,version=SCOP_VERSION,scop=scop) + + nrpercent = 95 # Always use 95% nr subset. TODO make this an option + + all_domains = scop.getRoot().getDescendents('domain') + if nrpercent != None: + all_domains = filter_domains_astral_nrpercent(all_domains, + scop, astral, + nrpercent) + all_scopsids_dict = dict( [(d.sid,True) for d in all_domains] ) + + + for pdbid in sys.stdin: + scopsid = pdbid_to_scopsid(pdbid.rstrip(), all_scopsids_dict) + sys.stdout.write(scopsid + '\n') + +if __name__ == "__main__": + main() diff --git a/scripts/plotrocs_fischer_nh3d.r b/scripts/plotrocs_fischer_nh3d.r new file mode 100644 index 0000000..7f3bcc5 --- /dev/null +++ b/scripts/plotrocs_fischer_nh3d.r @@ -0,0 +1,126 @@ +# plotrocfischer.r - plot ROCs for different method on Fischer and Nh3D data +# +# Alex Stivala, October 2008 +# +# Plot ROC curves for different methods on the Fischer data set +# (Fischer et al 1996) and Nh3D data set (Thiruv et al 2005) +# as used in Pelta et al 2008. +# +# Requires the ROCR package from CRAN (developed with version 1.0-2) +# (ROCR in turn requires gplots, gtools, gdata) +# +# Run this on the output of e.g. 
tsevalfn.py with the -l option, +# it is a table with one column of scores from classifier, and second +# column of true class label (0 or 1) +# +# The citation for the ROCR package is +# Sing et al 2005 "ROCR: visualizing classifier performance in R" +# Bioinformatics 21(20):3940-3941 +# +# +# $Id: plotrocs_fischer_nh3d.r 2376 2009-05-14 01:40:32Z astivala $ + + +library(ROCR) + +# +# globals +# + +colorvec=c('deepskyblue4','brown','red','turquoise','blue','purple','green','cyan','gray20','magenta','darkolivegreen2','midnightblue','magenta3','darkseagreen','violetred3','darkslategray3') +ltyvec=c(1,2,4,5,6,1,2,1,5,6,1,2,4,5,6,1,2) +namevec=c('MSVNS3 norm1','MSVNS3 norm2', 'MSVNS3 norm3', 'QP tableau search norm1', 'QP tableau search norm2', 'QP tableau search norm3') + +fischer_fold_files=c('../maxcmo_results/fischer/norm1/fold.slrtab', '../maxcmo_results/fischer/norm2/fold.slrtab','../maxcmo_results/fischer/norm3/fold.slrtab','fischer/norm1/fold.slrtab','fischer/norm2/fold.slrtab','fischer/norm3/fold.slrtab') +fischer_class_files=c('../maxcmo_results/fischer/norm1/class.slrtab','../maxcmo_results/fischer/norm2/class.slrtab','../maxcmo_results/fischer/norm3/class.slrtab','fischer/norm1/class.slrtab','fischer/norm2/class.slrtab','fischer/norm3/class.slrtab') +nh3d_arch_files=c('../maxcmo_results/nh3d/norm1/arch.slrtab','../maxcmo_results/nh3d/norm2/arch.slrtab','../maxcmo_results/nh3d/norm3/arch.slrtab','nh3d/norm1/arch.slrtab','nh3d/norm2/arch.slrtab','nh3d/norm3/arch.slrtab') +nh3d_class_files=c('../maxcmo_results/nh3d/norm1/class.slrtab','../maxcmo_results/nh3d/norm2/class.slrtab','../maxcmo_results/nh3d/norm3/class.slrtab','nh3d/norm1/class.slrtab','nh3d/norm2/class.slrtab','nh3d/norm3/class.slrtab') + +# +# functions +# + +# +# Return the ROCR performance object for plotting ROC curve +# +# Parameters: +# tab : data frame with score and label columns +# +# Return value: +# ROCR performance object with FPR and TPR for plotting ROC curve +# +compute_perf <- function(tab) +{ + # tab is a data frame with score and label columns + pred <- prediction(tab$score, tab$label) + perfroc <- performance(pred, measure="tpr",x.measure="fpr") + return(perfroc) +} + +# +# main +# + + + +# EPS suitable for inserting into LaTeX + +postscript('rocs_fischer_fold.eps', + onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) +for (i in 1:length(fischer_fold_files)) { + tab <- read.table(fischer_fold_files[i], header=TRUE) + perfroc <- compute_perf(tab) + plot(perfroc, lty=ltyvec[i], col=colorvec[i], add=(i>1), + #main='Fischer data set at fold level' # remove title for paper + ) +} +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) +#lines(c(0,1),c(0,1),type='l',lty=3) +dev.off() + + +postscript('rocs_fischer_class.eps', + onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) +for (i in 1:length(fischer_class_files)) { + tab <- read.table(fischer_class_files[i], header=TRUE) + perfroc <- compute_perf(tab) + plot(perfroc, lty=ltyvec[i], col=colorvec[i], add=(i>1), +# main='Fischer data set at class level' # remove title for paper + ) +} +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) +#lines(c(0,1),c(0,1),type='l',lty=3) +dev.off() + +postscript('rocs_nh3d_arch.eps', + onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) +for (i in 1:length(nh3d_arch_files)) { + tab <- read.table(nh3d_arch_files[i], header=TRUE) + perfroc <- compute_perf(tab) + plot(perfroc, lty=ltyvec[i], col=colorvec[i], add=(i>1), + # main='Nh3D data 
set at architecture level' # remove title for paper + ) +} +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) +#lines(c(0,1),c(0,1),type='l',lty=3) +dev.off() + + +postscript('rocs_nh3d_class.eps', + onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) +for (i in 1:length(nh3d_class_files)) { + tab <- read.table(nh3d_class_files[i], header=TRUE) + perfroc <- compute_perf(tab) + plot(perfroc, lty=ltyvec[i], col=colorvec[i], add=(i>1), + # main='Nh3D data set at class level' # removed title for paper + ) + +} +#lines(c(0,1),c(0,1),type='l',lty=3) +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) +dev.off() + diff --git a/scripts/plotrocs_query200.r b/scripts/plotrocs_query200.r new file mode 100644 index 0000000..1c91435 --- /dev/null +++ b/scripts/plotrocs_query200.r @@ -0,0 +1,77 @@ +# plotrocs_query200.r - plot ROCs for different methods on query200 data set +# +# Alex Stivala, March 2009 +# +# Plot ROC curves for different methods on the query200 query set in +# ASTRAL SCOP 95% sequence nr data set +# +# Requires the ROCR package from CRAN (developed with version 1.0-2) +# (ROCR in turn requires gplots, gtools, gdata) +# +# Run this on the output of e.g. tsevalfn.py with the -l option, +# it is a table with one column of scores from classifier, and second +# column of true class label (0 or 1) +# +# The citation for the ROCR package is +# Sing et al 2005 "ROCR: visualizing classifier performance in R" +# Bioinformatics 21(20):3940-3941 +# +# +# $Id: plotrocs_query200.r 2376 2009-05-14 01:40:32Z astivala $ + + +library(ROCR) + +# +# globals +# + +colorvec=c('deepskyblue4','brown','red','turquoise','blue','purple','green','cyan','gray20','magenta','darkolivegreen2','midnightblue','magenta3','darkseagreen','violetred3','darkslategray3') +ltyvec=c(1,2,4,5,6,1,2,1,5,6,1,2,4,5,6,1,2) +namevec=c('QP tableau search (norm2)','VAST','SHEBA','TableauSearch (norm2)', 'TOPS') +slrtabs=c('query200/norm2/query200_roc.slrtab','../other_results/vast/vast_query200_res/vast_query200_roc.slrtab','../other_results/sheba/query200-pdbstyle-sel-gs-bib-95-1.73/sheba_query200_roc.slrtab','../other_results/TableauSearch/query200/norm2/tabsearch_query200_roc.slrtab','../other_results/tops/query200/tops_query200_roc.slrtab') + +# +# functions +# + +# +# Return the ROCR performance object for plotting ROC curve +# +# Parameters: +# tab : data frame with score and label columns +# +# Return value: +# ROCR performance object with FPR and TPR for plotting ROC curve +# +compute_perf <- function(tab) +{ + # tab is a data frame with score and label columns + pred <- prediction(tab$score, tab$label) + perfroc <- performance(pred, measure="tpr",x.measure="fpr") + return(perfroc) +} + +# +# main +# + + + +# EPS suitable for inserting into LaTeX + +postscript('rocs_query200.eps', + onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) +for (i in 1:length(slrtabs)) { + tab <- read.table(slrtabs[i], header=TRUE) + perfroc <- compute_perf(tab) + plot(perfroc, lty=ltyvec[i], col=colorvec[i], add=(i>1), + downsampling=0.5, +# main='ROC for 200 queries in ASTRAL SCOP 95% sequence identity nonredundant data set' # no title since including in paper with caption + ) +} +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) +#lines(c(0,1),c(0,1),type='l',lty=3) +dev.off() + diff --git a/scripts/plotsearchroc.r b/scripts/plotsearchroc.r new file mode 100644 index 0000000..1e15339 --- /dev/null +++ b/scripts/plotsearchroc.r @@ -0,0 +1,47 @@ +# R script for 
plotting ROC curve for different tableau search methods +# Alex Stivala +# July 2008 +# $Id: plotsearchroc.r 1946 2008-10-04 05:14:43Z astivala $ + + +# +# globals +# + +colorvec=c('deepskyblue4','brown','red','turquoise','blue','purple','green','cyan','gray20','magenta','darkolivegreen2','midnightblue','magenta3','darkseagreen','violetred3','darkslategray3') +ltyvec=c(1,2,4,5,6,1,2,1,5,6,1,2,4,5,6,1,2) +namevec=c('QP numeric', 'QP numeric SSE type', 'QP numeric ordering SSE type', 'QP discrete','QP discrete SSE type', 'QP discrete ordering SSE type') + + +# +# main +# + +qp_numeric <- read.table('d1ubia_.tsrchn.ff.rtab',header=TRUE) +qp_numeric_penalizessetype<-read.table('d1ubia_.tsrchn.tf.rtab',header=TRUE) +qp_numeric_ordering_penalizessetype <- read.table('d1ubia_.tsrchn.tt.rtab', header=TRUE) + +qp_discrete <- read.table('d1ubia_.tsrchd.ff.rtab',header=TRUE) +qp_discrete_penalizessetype<-read.table('d1ubia_.tsrchd.tf.rtab',header=TRUE) +qp_discrete_ordering_penalizessetype <- read.table('d1ubia_.tsrchd.tt.rtab', header=TRUE) + + +# EPS suitable for inserting into LaTeX +postscript('searchroc.eps',onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + +plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3,main='ROC curves for query d1ubia_ against ASTRAL 95% seq id SCOP as truth') + +lines(qp_numeric$fpr, qp_numeric$tpr, col=colorvec[1], lty=ltyvec[1]) +lines(qp_numeric_penalizessetype$fpr, qp_numeric_penalizessetype$tpr, col=colorvec[2], lty=ltyvec[2]) +lines(qp_numeric_ordering_penalizessetype$fpr, qp_numeric_ordering_penalizessetype$tpr, col=colorvec[3], lty=ltyvec[3]) +lines(qp_discrete$fpr, qp_discrete$tpr, col=colorvec[4], lty=ltyvec[4]) +lines(qp_discrete_penalizessetype$fpr, qp_discrete_penalizessetype$tpr, col=colorvec[5], lty=ltyvec[5]) +lines(qp_discrete_ordering_penalizessetype$fpr, qp_discrete_ordering_penalizessetype$tpr, col=colorvec[6], lty=ltyvec[6]) + + +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) + + +dev.off() + diff --git a/scripts/plotsearchroc_betagrasp.r b/scripts/plotsearchroc_betagrasp.r new file mode 100644 index 0000000..7c599ea --- /dev/null +++ b/scripts/plotsearchroc_betagrasp.r @@ -0,0 +1,45 @@ +# R script for plotting ROC curve for different tableau search methods +# Alex Stivala +# July 2008 +# $Id: plotsearchroc_betagrasp.r 1946 2008-10-04 05:14:43Z astivala $ + + +# +# globals +# + +colorvec=c('deepskyblue4','brown','red','turquoise','blue','purple','green','cyan','gray20','magenta','darkolivegreen2','midnightblue','magenta3','darkseagreen','violetred3','darkslategray3') +ltyvec=c(1,2,4,5,6,1,2,1,5,6,1,2,4,5,6,1,2) +namevec=c('QP numeric', 'QP numeric SSE type', 'QP numeric ordering SSE type', 'QP discrete','QP discrete SSE type', 'QP discrete ordering SSE type') + + +# +# main +# + +qp_numeric <- read.table('betagrasp.tsrchn.ff.rtab',header=TRUE) +qp_numeric_penalizessetype<-read.table('betagrasp.tsrchn.tf.rtab',header=TRUE) +qp_numeric_ordering_penalizessetype <- read.table('betagrasp.tsrchn.tt.rtab',header=TRUE) + +qp_discrete <- read.table('betagrasp.tsrchd.ff.rtab',header=TRUE) +qp_discrete_penalizessetype<-read.table('betagrasp.tsrchd.tf.rtab',header=TRUE) +qp_discrete_ordering_penalizessetype <- read.table('betagrasp.tsrchd.tt.rtab', header=TRUE) + +# EPS suitable for inserting into LaTeX +postscript('searchroc_betagrasp.eps',onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + 
+plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3,main='ROC curves for query betagrasp against ASTRAL 95% seq id ProSMoS beta-grasp dataset as truth') + +lines(qp_numeric$fpr, qp_numeric$tpr, col=colorvec[1], lty=ltyvec[1]) +lines(qp_numeric_penalizessetype$fpr, qp_numeric_penalizessetype$tpr, col=colorvec[2], lty=ltyvec[2]) +lines(qp_numeric_ordering_penalizessetype$fpr, qp_numeric_ordering_penalizessetype$tpr, col=colorvec[3], lty=ltyvec[3]) +lines(qp_discrete$fpr, qp_discrete$tpr, col=colorvec[4], lty=ltyvec[4]) +lines(qp_discrete_penalizessetype$fpr, qp_discrete_penalizessetype$tpr, col=colorvec[5], lty=ltyvec[5]) +lines(qp_discrete_ordering_penalizessetype$fpr, qp_discrete_ordering_penalizessetype$tpr, col=colorvec[6], lty=ltyvec[6]) + +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) + + +dev.off() + diff --git a/scripts/plotsearchroc_betagrasp_reduced.r b/scripts/plotsearchroc_betagrasp_reduced.r new file mode 100644 index 0000000..e132496 --- /dev/null +++ b/scripts/plotsearchroc_betagrasp_reduced.r @@ -0,0 +1,43 @@ +# R script for plotting ROC curve for different tableau search methods +# Alex Stivala +# July 2008 +# Reduced version to make clearer in grayscale (showing only those that +# don't overlap too much). +# $Id: plotsearchroc_betagrasp_reduced.r 1946 2008-10-04 05:14:43Z astivala $ + + +# +# globals +# + +colorvec=c('black','black','black','black','black','black') +ltyvec=c(1,2,4,5,6) +namevec=c('QP numeric ordering SSE type', 'QP discrete SSE type', 'QP discrete ordering SSE type') + + +# +# main +# + +qp_numeric <- read.table('betagrasp.tsrchn.ff.rtab',header=TRUE) +qp_numeric_ordering_penalizessetype <- read.table('betagrasp.tsrchn.tt.rtab',header=TRUE) + +qp_discrete <- read.table('betagrasp.tsrchd.ff.rtab',header=TRUE) +qp_discrete_penalizessetype <- read.table('betagrasp.tsrchd.tf.rtab', header=TRUE) +qp_discrete_ordering_penalizessetype <- read.table('betagrasp.tsrchd.tt.rtab', header=TRUE) + +# EPS suitable for inserting into LaTeX +postscript('searchroc_betagrasp_reduced.eps',onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + +plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3,main='ROC curves for query d1ubia_ against ASTRAL 95% seq id ProSMoS dataset as truth') + +lines(qp_numeric_ordering_penalizessetype$fpr, qp_numeric_ordering_penalizessetype$tpr, col=colorvec[1], lty=ltyvec[1]) +lines(qp_discrete_penalizessetype$fpr, qp_discrete_penalizessetype$tpr, col=colorvec[2], lty=ltyvec[2]) +lines(qp_discrete_ordering_penalizessetype$fpr, qp_discrete_ordering_penalizessetype$tpr, col=colorvec[3], lty=ltyvec[3]) + +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) + + +dev.off() + diff --git a/scripts/plotsearchroc_d1tttb1.r b/scripts/plotsearchroc_d1tttb1.r new file mode 100644 index 0000000..c695dd8 --- /dev/null +++ b/scripts/plotsearchroc_d1tttb1.r @@ -0,0 +1,47 @@ +# R script for plotting ROC curve for different tableau search methods +# Alex Stivala +# July 2008 +# $Id: plotsearchroc_d1tttb1.r 1946 2008-10-04 05:14:43Z astivala $ + + +# +# globals +# + +colorvec=c('deepskyblue4','brown','red','turquoise','blue','purple','green','cyan','gray20','magenta','darkolivegreen2','midnightblue','magenta3','darkseagreen','violetred3','darkslategray3') +ltyvec=c(1,2,4,5,6,1,2,1,5,6,1,2,4,5,6,1,2) +namevec=c('QP numeric', 'QP numeric SSE type', 'QP numeric ordering SSE type', 'QP discrete','QP discrete 
SSE type', 'QP discrete ordering SSE type') + + +# +# main +# + +qp_numeric <- read.table('d1tttb1.tsrchn.ff.rtab',header=TRUE) +qp_numeric_penalizessetype<-read.table('d1tttb1.tsrchn.tf.rtab',header=TRUE) +qp_numeric_ordering_penalizessetype <- read.table('d1tttb1.tsrchn.tt.rtab', header=TRUE) + +qp_discrete <- read.table('d1tttb1.tsrchd.ff.rtab',header=TRUE) +qp_discrete_penalizessetype<-read.table('d1tttb1.tsrchd.tf.rtab',header=TRUE) +qp_discrete_ordering_penalizessetype <- read.table('d1tttb1.tsrchd.tt.rtab', header=TRUE) + + +# EPS suitable for inserting into LaTeX +postscript('searchroc_d1tttb1.eps',onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + +plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3,main='ROC curves for query d1tttb1 against ASTRAL 95% seq id SCOP as truth') + +lines(qp_numeric$fpr, qp_numeric$tpr, col=colorvec[1], lty=ltyvec[1]) +lines(qp_numeric_penalizessetype$fpr, qp_numeric_penalizessetype$tpr, col=colorvec[2], lty=ltyvec[2]) +lines(qp_numeric_ordering_penalizessetype$fpr, qp_numeric_ordering_penalizessetype$tpr, col=colorvec[3], lty=ltyvec[3]) +lines(qp_discrete$fpr, qp_discrete$tpr, col=colorvec[4], lty=ltyvec[4]) +lines(qp_discrete_penalizessetype$fpr, qp_discrete_penalizessetype$tpr, col=colorvec[5], lty=ltyvec[5]) +lines(qp_discrete_ordering_penalizessetype$fpr, qp_discrete_ordering_penalizessetype$tpr, col=colorvec[6], lty=ltyvec[6]) + + +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) + + +dev.off() + diff --git a/scripts/plotsearchroc_folds.r b/scripts/plotsearchroc_folds.r new file mode 100644 index 0000000..b03c36a --- /dev/null +++ b/scripts/plotsearchroc_folds.r @@ -0,0 +1,47 @@ +# R script for plotting ROC curve for different tableau search methods +# Alex Stivala +# July 2008 +# $Id: plotsearchroc_folds.r 1946 2008-10-04 05:14:43Z astivala $ + + +# +# globals +# + +colorvec=c('deepskyblue4','brown','red','turquoise','blue','purple','green','cyan','gray20','magenta','darkolivegreen2','midnightblue','magenta3','darkseagreen','violetred3','darkslategray3') +ltyvec=c(1,2,4,5,6,1,2,1,5,6,1,2,4,5,6,1,2) +namevec=c('beta-grasp', 'Immunoglobulin', 'TIM-barrel', 'Plait (ferredoxin)', 'GFP-like', 'Key-barrel', 'Jelly-roll') + + +# +# main +# + +betagrasp <- read.table('d1ubia_.tsrchn.orderpen1.typepen0.rtab', header=TRUE) +immunoglobulin <- read.table('d1ae6h1.tsrchn.orderpen1.typepen0.rtab', header=TRUE) +timbarrel <- read.table('d1tima_.tsrchn.orderpen1.typepen0.rtab', header=TRUE) +plait <- read.table('d1bhne_.tsrchn.orderpen1.typepen0.rtab', header=TRUE) +greekkey <- read.table('d1h6rb_.tsrchn.orderpen1.typepen0.rtab', header=TRUE) +keybarrel <- read.table('d1tttb1.tsrchn.orderpen1.typepen0.rtab', header=TRUE) +jellyroll <- read.table('d2phlb1.tsrchn.orderpen1.typepen0.rtab', header=TRUE) + + +# EPS suitable for inserting into LaTeX +postscript('searchroc_folds.eps',onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + +plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3,main='ROC curves against ASTRAL 95% seq id SCOP as truth') + +lines(betagrasp$fpr, betagrasp$tpr, col=colorvec[1], lty=ltyvec[1]) +lines(immunoglobulin$fpr, immunoglobulin$tpr, col=colorvec[2], lty=ltyvec[2]) +lines(timbarrel$fpr, timbarrel$tpr, col=colorvec[3], lty=ltyvec[3]) +lines(plait$fpr, plait$tpr, col=colorvec[4], lty=ltyvec[4]) +lines(greekkey$fpr, greekkey$tpr, col=colorvec[5], lty=ltyvec[5]) 
+lines(keybarrel$fpr, keybarrel$tpr, col=colorvec[6], lty=ltyvec[6]) +lines(jellyroll$fpr, jellyroll$tpr, col=colorvec[7], lty=ltyvec[7]) + +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) + + +dev.off() + diff --git a/scripts/plotsearchroc_folds_00.r b/scripts/plotsearchroc_folds_00.r new file mode 100644 index 0000000..f79caff --- /dev/null +++ b/scripts/plotsearchroc_folds_00.r @@ -0,0 +1,49 @@ +# R script for plotting ROC curve for different tableau search methods +# Alex Stivala +# July 2008 +# $Id: plotsearchroc_folds_00.r 1946 2008-10-04 05:14:43Z astivala $ + + +# +# globals +# + +colorvec=c('deepskyblue4','brown','red','turquoise','blue','purple','green','cyan','gray20','magenta','darkolivegreen2','midnightblue','magenta3','darkseagreen','violetred3','darkslategray3') +ltyvec=c(1,2,4,5,6,1,2,1,5,6,1,2,4,5,6,1,2) +namevec=c('beta-grasp', 'Immunoglobulin', 'TIM-barrel', 'Plait (ferredoxin)', 'GFP-like', 'Key-barrel', 'Jelly-roll', 'NAD-binding fold') + + +# +# main +# + +betagrasp <- read.table('d1ubia_.tsrchn.tt.rtab', header=TRUE) +immunoglobulin <- read.table('d1ae6h1.tsrchn.orderpen0.typepen0.rtab', header=TRUE) +timbarrel <- read.table('d1tima_.tsrchn.orderpen0.typepen0.rtab', header=TRUE) +plait <- read.table('d1bhne_.tsrchn.orderpen0.typepen0.rtab', header=TRUE) +greekkey <- read.table('d1h6rb_.tsrchn.orderpen0.typepen0.rtab', header=TRUE) +keybarrel <- read.table('d1tttb1.tsrchn.orderpen0.typepen0.rtab', header=TRUE) +jellyroll <- read.table('d2phlb1.tsrchn.orderpen0.typepen0.rtab', header=TRUE) +nadbinding <- read.table('d1f6dc_.tsrchn.orderpen0.typepen0.rtab', header=TRUE) + + +# EPS suitable for inserting into LaTeX +postscript('searchroc_folds.eps',onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + +plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3,main='ROC curves for numeric version against ASTRAL 95% seq id SCOP as truth') + +lines(betagrasp$fpr, betagrasp$tpr, col=colorvec[1], lty=ltyvec[1]) +lines(immunoglobulin$fpr, immunoglobulin$tpr, col=colorvec[2], lty=ltyvec[2]) +lines(timbarrel$fpr, timbarrel$tpr, col=colorvec[3], lty=ltyvec[3]) +lines(plait$fpr, plait$tpr, col=colorvec[4], lty=ltyvec[4]) +lines(greekkey$fpr, greekkey$tpr, col=colorvec[5], lty=ltyvec[5]) +lines(keybarrel$fpr, keybarrel$tpr, col=colorvec[6], lty=ltyvec[6]) +lines(jellyroll$fpr, jellyroll$tpr, col=colorvec[7], lty=ltyvec[7]) +lines(nadbinding$fpr, nadbinding$tpr, col=colorvec[8], lty=ltyvec[8]) + +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) + + +dev.off() + diff --git a/scripts/plotsearchroc_folds_00_reduced.r b/scripts/plotsearchroc_folds_00_reduced.r new file mode 100644 index 0000000..53305b3 --- /dev/null +++ b/scripts/plotsearchroc_folds_00_reduced.r @@ -0,0 +1,45 @@ +# R script for plotting ROC curve for different tableau search methods +# Alex Stivala +# July 2008 +# Reduced version to make clearer in grayscale (showing only those that +# don't overlap too much). 
+# $Id: plotsearchroc_folds_00_reduced.r 1946 2008-10-04 05:14:43Z astivala $ + + +# +# globals +# + +colorvec=c('black','black','black','black','black','black') +ltyvec=c(1,2,5,4,5,6) +namevec=c('beta-grasp', 'Immunoglobulin', 'TIM-barrel', 'Plait (ferredoxin)', 'Key-barrel') + + +# +# main +# + +betagrasp <- read.table('d1ubia_.tsrchn.tt.rtab', header=TRUE) +immunoglobulin <- read.table('d1ae6h1.tsrchn.orderpen0.typepen0.rtab', header=TRUE) +timbarrel <- read.table('d1tima_.tsrchn.orderpen0.typepen0.rtab', header=TRUE) +plait <- read.table('d1bhne_.tsrchn.orderpen0.typepen0.rtab', header=TRUE) +keybarrel <- read.table('d1tttb1.tsrchn.orderpen0.typepen0.rtab', header=TRUE) + + +# EPS suitable for inserting into LaTeX +postscript('searchroc_folds_reduced.eps',onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + +plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3,main='ROC curves for numeric version against ASTRAL 95% seq id SCOP as truth') + +lines(betagrasp$fpr, betagrasp$tpr, col=colorvec[1], lty=ltyvec[1]) +lines(immunoglobulin$fpr, immunoglobulin$tpr, col=colorvec[2], lty=ltyvec[2]) +lines(timbarrel$fpr, timbarrel$tpr, col=colorvec[3], lty=ltyvec[3]) +lines(plait$fpr, plait$tpr, col=colorvec[4], lty=ltyvec[4]) +lines(keybarrel$fpr, keybarrel$tpr, col=colorvec[5], lty=ltyvec[5]) + +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) + + +dev.off() + diff --git a/scripts/plotsearchroc_folds_discrete.r b/scripts/plotsearchroc_folds_discrete.r new file mode 100644 index 0000000..b35a52c --- /dev/null +++ b/scripts/plotsearchroc_folds_discrete.r @@ -0,0 +1,53 @@ +# R script for plotting ROC curve for different tableau search methods +# Alex Stivala +# July 2008 +# $Id: plotsearchroc_folds_discrete.r 2376 2009-05-14 01:40:32Z astivala $ + + +# +# globals +# + +colorvec=c('deepskyblue4','brown','red','turquoise','blue','purple','green','cyan','gray20','magenta','darkolivegreen2','midnightblue','magenta3','darkseagreen','violetred3','darkslategray3') +ltyvec=c(1,2,4,5,6,1,2,1,5,6,1,2,4,5,6,1,2) +namevec=c('beta-grasp', 'Immunoglobulin', 'TIM-barrel', 'Plait (ferredoxin)', 'GFP-like', 'Key-barrel', 'Jelly-roll', 'NAD-binding fold') + + +# +# main +# + +betagrasp <- read.table('d1ubia_.tsrchd.tt.rtab', header=TRUE) +immunoglobulin <- read.table('d1ae6h1.tsrchd.tt.rtab', header=TRUE) +timbarrel <- read.table('d1tima_.tsrchd.tt.rtab', header=TRUE) +plait <- read.table('d1bhne_.tsrchd.tt.rtab', header=TRUE) +greekkey <- read.table('d1h6rb_.tsrchd.tt.rtab', header=TRUE) +keybarrel <- read.table('d1tttb1.tsrchd.tt.rtab', header=TRUE) +jellyroll <- read.table('d2phlb1.tsrchd.tt.rtab', header=TRUE) +nadbinding <- read.table('d1f6dc_.tsrchd.tt.rtab', header=TRUE) + + +# EPS suitable for inserting into LaTeX +postscript('searchroc_folds_discrete.eps',onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + +#plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3,main='ROC curves (using distance information) against ASTRAL 95% seq id SCOP as truth') +# no figure title for BMC Bioinformatics +#plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3) +plot(c(0,1),c(0,1),type='n',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3) + + +lines(betagrasp$fpr, betagrasp$tpr, col=colorvec[1], lty=ltyvec[1]) +lines(immunoglobulin$fpr, immunoglobulin$tpr, 
col=colorvec[2], lty=ltyvec[2]) +lines(timbarrel$fpr, timbarrel$tpr, col=colorvec[3], lty=ltyvec[3]) +lines(plait$fpr, plait$tpr, col=colorvec[4], lty=ltyvec[4]) +lines(greekkey$fpr, greekkey$tpr, col=colorvec[5], lty=ltyvec[5]) +lines(keybarrel$fpr, keybarrel$tpr, col=colorvec[6], lty=ltyvec[6]) +lines(jellyroll$fpr, jellyroll$tpr, col=colorvec[7], lty=ltyvec[7]) +lines(nadbinding$fpr, nadbinding$tpr, col=colorvec[8], lty=ltyvec[8]) + +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) + + +dev.off() + diff --git a/scripts/plotsearchroc_folds_discrete_reduced.r b/scripts/plotsearchroc_folds_discrete_reduced.r new file mode 100644 index 0000000..3d09172 --- /dev/null +++ b/scripts/plotsearchroc_folds_discrete_reduced.r @@ -0,0 +1,45 @@ +# R script for plotting ROC curve for different tableau search methods +# Alex Stivala +# July 2008 +# Reduced version to make clearer in grayscale (showing only those that +# don't overlap too much). +# $Id: plotsearchroc_folds_discrete_reduced.r 1946 2008-10-04 05:14:43Z astivala $ + + +# +# globals +# + +colorvec=c('black','black','black','black','black','black') +ltyvec=c(1,2,5,4,5,6) +namevec=c('beta-grasp', 'Immunoglobulin', 'TIM-barrel', 'Plait (ferredoxin)','Key-barrel') + + +# +# main +# + +betagrasp <- read.table('d1ubia_.tsrchd.tt.rtab', header=TRUE) +immunoglobulin <- read.table('d1ae6h1.tsrchd.tt.rtab', header=TRUE) +timbarrel <- read.table('d1tima_.tsrchd.tt.rtab', header=TRUE) +plait <- read.table('d1bhne_.tsrchd.tt.rtab', header=TRUE) +keybarrel <- read.table('d1tttb1.tsrchd.tt.rtab', header=TRUE) + + +# EPS suitable for inserting into LaTeX +postscript('searchroc_folds_discrete_reduced.eps',onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + +plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3,main='ROC curves for discrete version against ASTRAL 95% seq id SCOP as truth') + +lines(betagrasp$fpr, betagrasp$tpr, col=colorvec[1], lty=ltyvec[1]) +lines(immunoglobulin$fpr, immunoglobulin$tpr, col=colorvec[2], lty=ltyvec[2]) +lines(timbarrel$fpr, timbarrel$tpr, col=colorvec[3], lty=ltyvec[3]) +lines(plait$fpr, plait$tpr, col=colorvec[4], lty=ltyvec[4]) +lines(keybarrel$fpr, keybarrel$tpr, col=colorvec[5], lty=ltyvec[5]) + +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) + + +dev.off() + diff --git a/scripts/plotsearchroc_folds_nodistmatrix_discrete.r b/scripts/plotsearchroc_folds_nodistmatrix_discrete.r new file mode 100644 index 0000000..f765b0b --- /dev/null +++ b/scripts/plotsearchroc_folds_nodistmatrix_discrete.r @@ -0,0 +1,52 @@ +# R script for plotting ROC curve for different tableau search methods +# Alex Stivala +# July 2008 +# $Id: plotsearchroc_folds_nodistmatrix_discrete.r 2376 2009-05-14 01:40:32Z astivala $ + + +# +# globals +# + +colorvec=c('deepskyblue4','brown','red','turquoise','blue','purple','green','cyan','gray20','magenta','darkolivegreen2','midnightblue','magenta3','darkseagreen','violetred3','darkslategray3') +ltyvec=c(1,2,4,5,6,1,2,1,5,6,1,2,4,5,6,1,2) +namevec=c('beta-grasp', 'Immunoglobulin', 'TIM-barrel', 'Plait (ferredoxin)', 'GFP-like', 'Key-barrel', 'Jelly-roll', 'NAD-binding fold') + + +# +# main +# + +betagrasp <- read.table('d1ubia_.nodistmatrix.tsrchd.tt.rtab', header=TRUE) +immunoglobulin <- read.table('d1ae6h1.nodistmatrix.tsrchd.tt.rtab', header=TRUE) +timbarrel <- read.table('d1tima_.nodistmatrix.tsrchd.tt.rtab', header=TRUE) +plait <- read.table('d1bhne_.nodistmatrix.tsrchd.tt.rtab', header=TRUE) 
+greekkey <- read.table('d1h6rb_.nodistmatrix.tsrchd.tt.rtab', header=TRUE) +keybarrel <- read.table('d1tttb1.nodistmatrix.tsrchd.tt.rtab', header=TRUE) +jellyroll <- read.table('d2phlb1.nodistmatrix.tsrchd.tt.rtab', header=TRUE) +nadbinding <- read.table('d1f6dc_.nodistmatrix.tsrchd.tt.rtab', header=TRUE) + + +# EPS suitable for inserting into LaTeX +postscript('searchroc_folds_nodistmatrix_discrete.eps',onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + +#plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3,main='ROC curves (no distance information) against ASTRAL 95% seq id SCOP as truth') +# no figure title for BMC Bioinformatics +#plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3) +plot(c(0,1),c(0,1),type='n',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3) + +lines(betagrasp$fpr, betagrasp$tpr, col=colorvec[1], lty=ltyvec[1]) +lines(immunoglobulin$fpr, immunoglobulin$tpr, col=colorvec[2], lty=ltyvec[2]) +lines(timbarrel$fpr, timbarrel$tpr, col=colorvec[3], lty=ltyvec[3]) +lines(plait$fpr, plait$tpr, col=colorvec[4], lty=ltyvec[4]) +lines(greekkey$fpr, greekkey$tpr, col=colorvec[5], lty=ltyvec[5]) +lines(keybarrel$fpr, keybarrel$tpr, col=colorvec[6], lty=ltyvec[6]) +lines(jellyroll$fpr, jellyroll$tpr, col=colorvec[7], lty=ltyvec[7]) +lines(nadbinding$fpr, nadbinding$tpr, col=colorvec[8], lty=ltyvec[8]) + +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) + + +dev.off() + diff --git a/scripts/plotsearchroc_reduced.r b/scripts/plotsearchroc_reduced.r new file mode 100644 index 0000000..41106a0 --- /dev/null +++ b/scripts/plotsearchroc_reduced.r @@ -0,0 +1,43 @@ +# R script for plotting ROC curve for different tableau search methods +# Alex Stivala +# July 2008 +# Reduced version to make clearer in grayscale (showing only those that +# don't overlap too much). 
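+#
+# Presumably run non-interactively from the directory containing the .rtab
+# files, e.g. with something like (illustrative, not part of the original):
+#
+#   R --vanilla -f plotsearchroc_reduced.r
+#
+# Each .rtab file is assumed to have a header line naming (at least) the
+# columns fpr and tpr; if an area under the curve were wanted, a
+# trapezoidal-rule sketch along these lines could be used:
+#
+#   auc <- function(r) { o <- order(r$fpr)
+#                        sum(diff(r$fpr[o]) *
+#                            (head(r$tpr[o],-1) + tail(r$tpr[o],-1)) / 2) }
+#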
+# $Id: plotsearchroc_reduced.r 1946 2008-10-04 05:14:43Z astivala $ + + +# +# globals +# + +colorvec=c('black','black','black','black','black','black') +ltyvec=c(1,2,4,5,6) +namevec=c('QP numeric ordering SSE type', 'QP discrete SSE type', 'QP discrete ordering SSE type') + + +# +# main +# + +qp_numeric_ordering_penalizessetype <- read.table('d1ubia_.tsrchn.tt.rtab', header=TRUE) + +qp_discrete_penalizessetype<-read.table('d1ubia_.tsrchd.tf.rtab',header=TRUE) +qp_discrete_ordering_penalizessetype <- read.table('d1ubia_.tsrchd.tt.rtab', header=TRUE) + + +# EPS suitable for inserting into LaTeX +postscript('searchroc_reduced.eps',onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + +plot(c(0,1),c(0,1),type='l',xlim=c(0,1),ylim=c(0,1),xlab="False Positive Rate",ylab="True Positive Rate",lty=3,main='ROC curves for query d1ubia_ against ASTRAL 95% seq id SCOP as truth') + +lines(qp_numeric_ordering_penalizessetype$fpr, qp_numeric_ordering_penalizessetype$tpr, col=colorvec[1], lty=ltyvec[1]) +lines(qp_discrete_penalizessetype$fpr, qp_discrete_penalizessetype$tpr, col=colorvec[2], lty=ltyvec[2]) +lines(qp_discrete_ordering_penalizessetype$fpr, qp_discrete_ordering_penalizessetype$tpr, col=colorvec[3], lty=ltyvec[3]) + + +legend('bottomright', col=colorvec, lty=ltyvec, legend=namevec) + + +dev.off() + diff --git a/scripts/plotssehistogram.r b/scripts/plotssehistogram.r new file mode 100644 index 0000000..11816e2 --- /dev/null +++ b/scripts/plotssehistogram.r @@ -0,0 +1,28 @@ +# R script for plotting SSE number histogram +# Alex Stivala +# August 2008 +# $Id: plotssehistogram.r 3001 2009-12-01 05:41:12Z alexs $ +# +# Uses commandArgs() R function to get trailing arguments from R command +# line ie after the --args option. The filename of the .ssenumslist file +# (which is just a list of numbers of SSEs in each domatin, one per line) +# is obtained from --args, and the output file is constructed form it +# eg astral-sel-gs-95-1.73.ssenumslist results in +# astral-sel-gs-95-1.73-ssehistogram.eps with +# +# R --vanilla -f plotssehistogram.r --args astral-sel-gs-95-1.73.ssenumlist +# + +filename <- commandArgs(trailingOnly=TRUE) + +# EPS suitable for inserting into LaTeX +postscript(sub('[.]ssenumslist$','-ssehistogram.eps',filename), + onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) +ssenums <- read.table(filename) + +if (length(grep("1.73",filename)) > 0) ymax <- 1000 else ymax <- 1500 + +hist(ssenums$V1,freq=TRUE,density=20,main=NULL,xlab='Tableau size (number of SSEs in domain)',breaks=100, ylim=c(0,ymax), xlim=c(0,100)) +dev.off() + diff --git a/scripts/prosmos_list_to_query.sh b/scripts/prosmos_list_to_query.sh new file mode 100755 index 0000000..20ac020 --- /dev/null +++ b/scripts/prosmos_list_to_query.sh @@ -0,0 +1,43 @@ +#!/bin/sh +# +# File: prosmos_list_to_query.sh +# Author: Alex Stivala +# Created: March 2009 +# +# prosmst_list_to_query.sh - convert ProSMoS .list file to first go at query +# +# Usage: prosmost_list_to_query.sh listfile +# +# Takes the .list file created by the fetchmatrix.pl script and convert +# it to format for ProSMoS query. All this does is reformatting of the +# data (and removing some) - the .query file MUST be manually edited to +# be useful. 
(Actually, may be simpler to edit the a copy of list file first, +# to remove unwanted SSEs, +# then run this to convert format) +# +# Writes query file format to stdout +# +# $Id: prosmos_list_to_query.sh 2138 2009-03-26 23:54:00Z astivala $ +# + +if [ $# -ne 1 ]; then + echo "Usage: $0 listfile" 2>&1 + exit 1 +fi +listfile=$1 +num_sses=`awk '/^[0-9]+[ ]*\*/ {print}' $listfile | wc -l` +i=1 +while [ $i -le $num_sses ]; do + printf "%d" $i + if [ $i -lt $num_sses ]; then + printf " " + else + printf '\n' + fi + i=`expr $i + 1` +done +awk '/^[0-9]+[ ]*[EH]/{print substr($2,1,1)}' $listfile | tr '\n' ' ' +echo +awk '/^[0-9]+[ ]*\*/{rownum=$1 ; row=""; for (i=1;i<=length($2);i++){row = row substr($2,i,1) " "} ; numspaces = rownum * 2 - 2 ; for(i=0; i&2 + exit 1 +fi + +queryid=$1 +htmlfile=$2 + +tmpfile1=/var/tmp/p1$$ +tmpfile2=/var/tmp/p2$$ + +awk 'BEGIN { i = 1 } /^\* [0-9]*/{print i,$3,$NF;i=i+1}' < $htmlfile > $tmpfile1 +awk 'BEGIN {i=1} / d[0-9]..... $tmpfile2 + +echo "# QUERYID = " $queryid +join $tmpfile1 $tmpfile2 | awk '{print $4,$3}' + +rm $tmpfile1 $tmpfile2 + diff --git a/scripts/ptdistmatrix.py b/scripts/ptdistmatrix.py new file mode 100644 index 0000000..62ab446 --- /dev/null +++ b/scripts/ptdistmatrix.py @@ -0,0 +1,1068 @@ +############################################################################### +# +# ptdistmatrix.py - Protein distance matrices +# +# File: ptdistmatrix.py +# Author: Alex Stivala +# Created: October 2007 +# +# $Id: ptdistmatrix.py 2703 2009-07-27 06:01:05Z astivala $ +# +# +############################################################################### + +from sets import Set # note not using builtin set, so we can use python 2.3 + +import Bio.PDB +import numpy.oldnumeric as Numeric + +from ptnode import * +from ptutils import biopdbresid_to_pdbresseq + +# TODO: quite a lot of wasted computation in building the whole +# residue distance matrix then building the SSE distance matrix from +# it - we don't really need the whole residue distance matrix, really +# only the residues that are part of SSEs. So maybe should compute +# only those and save a fair bit of computation and memory. +# having to do all the mapping from Bio.PDB Residue objects to array +# indices is wasteful/overly complex as well, now that we maintain +# a sequential residue number system in ptgraph2.py (pdb_resid_dict) +# anyway. + +#----------------------------------------------------------------------------- +# +# Module globals +# +#----------------------------------------------------------------------------- + +# +# global variables +# + +# dict of { Residue : bool } just to prevent excessive repeated +# error message in calc_residue_dist() +residue_errmsg_dict = {} + +#----------------------------------------------------------------------------- +# +# Class definitions +# +#----------------------------------------------------------------------------- + +class PTDistMatrix: + """ + + PTDistMatrix gives residue and secondary structure element distance + matrices for a given list of residues and SSEs. + + See Peter Cock's Python programming pages for how easy it is to compute + contact maps with with Bio.PDB and Numeric: + + http://www2.warwick.ac.uk/fac/sci/moac/currentstudents/peter_cock/python/protein_contact_map/ + + We don't actually compute contact maps here, but different kinds + of distance matrices (for residues, and for SSEs), and provide + some methods on them, specifically to find the closet SSE to a given SSE. 
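+
+    A rough usage sketch (illustrative only; the argument names here are
+    assumptions, not taken from any particular caller):
+
+        distmatrix = PTDistMatrix(residue_list, ptnode_list, sheet_dict,
+                                  pdb_struct)
+        dist = distmatrix.get_sse_distance(sse1, sse2)      # Angstroms
+        nearest_sse = distmatrix.get_min_distance_sse(sse1) # closest PTNode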
+ + A third kind of distance map is also used, a sheet/sse distance map, + where the elements are either sheets or helices. This is also a matrix + but it is a bit odd in that since each element is the distance between + (possibly) two sheets or a sheet and a helix, the idea that running + along the matrix index follows sequence no longer applies (since a sheet + consists of several strands, which may be all over the place in terms + of the sequence numbers of the residues that make them up). + So it is a bit unusual, but we still use a matrix for convenenience here, + and it is actually used in ptgraph2 to find the closest sheet or helix + to a given sheet or helix. It is built here from the sse distance + map, which in turn is built from the residue distance map. + """ + + def __init__(self, residue_list, ptnode_list, sheet_dict, pdb_struct): + """ + Construct the PTDistMatrix with the supplied list of residues. + This creates a residue distance matrix and an SSE (PTNode) distance + matrix. Various methods can then be used to get information from + these matrices. + + Parameters: + residue_list - List of Bio.PBD Residue objects + (ie iterable over Resdiue), from the pdb_struct + + ptnode_list - list of PTNode objects (ie iterable of PTNode) + representing the SSEs (helices,strands) we want + a distance matrix for. + + sheet_dict - dictionary of {sheet_id : nodelist} representing + each sheet as list of strand ptnodes. + May be None for no sheet/sse distance map. + + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + + + """ + self.pdb_struct = pdb_struct + + # square symmetric + # matrix (Numeric.array), of dimensions len(ersidue_list) x + # len(residue_list) where each element (Numeric.Float) is + # distance between C-alpha atoms of residues i,j. + self.dist_matrix = None # build by calc_dist_matrix + + + + # dict of { residue : array_index } mapping a Bio.PDB + # residue object to index in dist_matrix + self.index_map = {} # built by calc_dist_matrix + + # list of Bio.PDB residue objects mapping + # index back to Bio.PDB (i.e. reverse of index_map) + self.reverse_index_map = [] # built by calc_dist_matrix + + + # square symmetric Numeric.array matrix of dimensions + # len(ptnode_list) x len(ptnode_list) where each element + # is distance between the two SSEs represented by the + # ptnodes, as defined by calc_sse_dist() (min residue distance) + self.sse_dist_matrix = None # but by calc_sse_dist_matrix + + # dict of { ptnode : array_index } mapping a PTNode SSE object to + # index in sse_dist_matrix + self.sse_index_map = {} # build by calc_sse_dist_matrix + + # list of PTNode objects mapping index back to PTNode i.e. + # reverseof sse_index_map + self.reverse_sse_index_map = [] # built by calc_sse_dist-matrix + + # dict of {(ptnode1, ptnode2) : (residue1, residue2)} + # which for every pair of sses gives the residue + # in each which are closest (used in the distance + # matrix). Note both (ptnode1,ptnode2) and + # (ptnode2,ptnode1) are stored, with residues + # swapped appropriately. + # + self.sse_residue_map = {} # built by calc_sse_dist_matrix + + + # square symmetric matrix (Numeric.array) of dimensions + # n x n where n is the number of sheets + number of helices + # each element is Numeric.Float and is the distance between + # the two sheets (or two helices, or sheet and helix). 
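+        # (Note: calc_sheet_dist_matrix() sets the diagonal self-distance
+        # entries of this matrix to infinity, so argmin finds the nearest
+        # *other* sheet or helix.)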
+        self.sheet_dist_matrix = None # built by calc_sheet_dist_matrix
+
+        # dict of { id : array_index } where id is either a sheet identifier
+        # (a single char 'A' or 'B' etc.) or a helix identifier string
+        # as used in ptgraph2 (string like "HELIX_A_1") etc.
+        # FIXME: This seems a bit hacky/dangerous, maybe sheet should be
+        # a proper object like a PTNode
+        self.sheet_index_map = {} # built by calc_sheet_dist_matrix
+
+        # list of identifiers where an identifier is either sheet id e.g.
+        # 'A' or helix id e.g. "HELIX_A_1" mapping index back to these
+        # identifiers i.e. the inverse of sheet_index_map
+        self.reverse_sheet_index_map = [] # built by calc_sheet_dist_matrix
+
+        # dict of {(sheet_id1, id2) : strand_node}
+        # which for every sheet id and (for id2) object id (sheet
+        # id or ptnodehelix) gives the PTNodeStrand in the sheet
+        # that was the closest (used in the sheet_dist_matrix).
+        self.sheet_strand_map = {} # built by calc_sheet_dist_matrix
+
+
+        self.calc_dist_matrix(residue_list)
+        self.calc_sse_dist_matrix(ptnode_list)
+        if sheet_dict != None:
+            self.calc_sheet_dist_matrix(sheet_dict, ptnode_list)
+
+
+
+    def calc_dist_matrix(self, residue_list):
+        """
+        Compute the matrix of C-alpha distances between residues
+
+        Parameters:
+           residue_list - List of Bio.PDB Residue objects
+
+        Return value:
+           None. (sets data members)
+
+
+        Uses data members (WRITE):
+
+        dist_matrix - square symmetric
+          matrix (Numeric.array), of dimensions len(residue_list) x
+          len(residue_list) where each element (Numeric.Float) is
+          distance between C-alpha atoms of residues i,j.
+
+        index_map - dict of { residue : array_index } mapping a Bio.PDB
+                   residue object to index in dist_matrix
+
+        reverse_index_map - list of Bio.PDB residue objects mapping
+                            index back to Bio.PDB (i.e. reverse of index_map)
+
+        """
+        self.dist_matrix = Numeric.zeros((len(residue_list), len(residue_list)),
+                                         Numeric.Float)
+
+        self.reverse_index_map = len(residue_list) * [ -1 ] # will be in 0..len-1
+        index_maplist = list(enumerate(residue_list))
+        for i in range(len(index_maplist)):
+            row, residue_one = index_maplist[i]
+            self.index_map[residue_one] = row
+            self.reverse_index_map[row] = residue_one
+            for j in range(i+1, len(index_maplist)):
+                col, residue_two = index_maplist[j]
+                dist = calc_residue_dist(residue_one, residue_two)
+                self.dist_matrix[row, col] = dist
+                self.dist_matrix[col, row] = dist
+
+#        print self.dist_matrix
+
+
+    def get_distance(self, residue_one, residue_two):
+        """
+        Get the distance from residue_one to residue_two, from the
+        data members already computed by calc_dist_matrix()
+        Note: calc_residue_dist() actually calculates this distance,
+        this function retrieves the previously calculated value from
+        the matrix.
+
+        Parameters:
+           residue_one - Bio.PDB Residue object
+           residue_two - Bio.PDB Residue object
+
+        Uses data members (readonly):
+           index_map
+           dist_matrix
+
+        Return value:
+           distance (Angstroms) between residue_one and residue_two C-alphas
+        """
+        try:
+            row = self.index_map[residue_one]
+        except KeyError:
+            # this happens when domain decomposition has broken an SSE e.g.
+            # 1CTN with DDomain and DSSP.
+            if not residue_errmsg_dict.has_key(residue_one):
+                sys.stderr.write('WARNING: Residue ' +
+                                 str(residue_one) +
+                                 ' not found,\n probably due to domain '
+                                 'decomposition breaking an SSE.'
+ '\n Distance set to infinity\n') + residue_errmsg_dict[residue_one] = True + return float("inf") + try: + col = self.index_map[residue_two] + except KeyError: + if not residue_errmsg_dict.has_key(residue_two): + sys.stderr.write('WARNING: Residue ' + + str(residue_two) + + ' not found,\n probably due to domain ' + 'decomposition breaking an SSE.' + '\n Distance set to infinity\n') + residue_errmsg_dict[residue_one] = True + return float("inf") + + dist = self.dist_matrix[row, col] + return dist + + + def get_max_distance_residue(self, residue): + """ + Get the residue with maxmum distance from supplied residue, + from the data members already computed by calc_dist_matrix() + + Paremeters: + residue = Bio.PDB residue to get min distance to + + Uses data members (readonly): + index_map + reverse_index_map + dist_matrix + + Return value: + Bio.PDB residue that has max distance from supplied residue + + """ + row = self.index_map[residue] + maxdist_index = Numeric.argmax(self.dist_matrix[row]) + maxdist_residue = self.reverse_index_map[maxdist_index] + return maxdist_residue + + + def calc_sse_dist(self, sse1, sse2): + """ + Calculate the distance between two SSEs (helices or strands, + represented by PTNode objects). + This distance is defined as the smallest distance between any two + residues, one in each of the SSEs, i.e. the distance betwee the + two parts of the SSEs tha are closest. + + This is calculated from the residue distance matrix, i.e. + calc_dist_matrix is assumed to have been already called. + + Parameters: + sse1 - PTNode representing one SSE + sse2 - PTNode representing the other SSE + + Uses data members (readonly): + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + + + Return value: + tuple (distance, residue1, residue2) where + distance is the distance (Angstroms) between the + closest residues, one from each + of the two SSEs. + residue1 is the residue in sse1 used in this min distance + residue2 is the residue in sse2 used in this min distance + """ + assert (isinstance(sse1, PTNodeHelix) or isinstance(sse1, PTNodeStrand) + or isinstance(sse1, PTNodeLoop)) + assert (isinstance(sse2, PTNodeHelix) or isinstance(sse2, PTNodeStrand) + or isinstance(sse2, PTNodeLoop)) + + min_dist = float("inf") + min_res1 = None + min_res2 = None + sse1_residues = sse1.get_residue_list() + sse2_residues = sse2.get_residue_list() + for res1 in sse1_residues: + for res2 in sse2_residues: + dist = self.get_distance(res1, res2) + if dist < min_dist: + min_dist = dist + min_res1 = res1 + min_res2 = res2 + return (min_dist, min_res1, min_res2) + + + + def calc_sse_dist_matrix(self, ptnode_list): + """ + Build the matrix of SSE distance, i.e. min dist between residues + in the SSEs. + + NOTE: the self-distance (i.e. matrix elements [i,i]) are set to + infinity rather than 0, so we can efficiently use argmin + in get_sse_min_distance() to find SSE (not same one) with min + distance - we are wanting to find minimum distances + in ptgraph2, not maximum distances. + + Parameters: + ptnode_list - iterable over PTNode objects represneting SSEs + + + Return value: + None. 
(sets data members) + + + Uses data members (WRITE): + + sse_dist_matrix - square symmetric + Numeric.array matrix of dimensions + len(ptnode_list) x len(ptnode_list) where each + elementis distance between the two SSEs + represented by the ptnodes, + as defined by calc_sse_dist() (min residue distance) + + sse_index_map - dict of { ptnode : array_index } mapping a PTNode + object to index in sse_dist_matrix + + reverse_sse_index_map - list of PTNode objects mapping + index back to PTNode + (i.e. reverse of index_map) + + sse_residue_map - dict of {(ptnode1, ptnode2) : (residue1, residue2)} + which for every pair of sses gives the residue + in each which are closest (used in the distance + matrix). Note both (ptnode1,ptnode2) and + (ptnode2,ptnode1) are stored, with residues + swapped appropriately. + + + """ + self.sse_dist_matrix =Numeric.zeros((len(ptnode_list),len(ptnode_list)), + Numeric.Float) + + # set the self-distances to infinity (see comments above and in + # get_sse_min_distance() + # TODO: maybe if we used NaN instead of inf, this would allow + # both min/max and argmin/argmax rather than just min/argmin + # (as we actualy use) to be useful. I tried it with Python 2.5.1 + # on Linux and it worked (ie NaN is neither max nor min) but + # not really sure how reliable that behaviour is... so sticking + # with inf for now since we only need min/argmin anyway. + for i in range(0, Numeric.size(self.sse_dist_matrix,0)): + self.sse_dist_matrix[i,i] = float("inf") + + self.reverse_sse_index_map = len(ptnode_list) * [ -1 ] #will in 0..len-1 + index_maplist = list(enumerate(ptnode_list)) + for i in range(len(index_maplist)): + row, sse_one = index_maplist[i] + self.sse_index_map[sse_one] = row + self.reverse_sse_index_map[row] = sse_one + for j in range(i+1, len(index_maplist)): + col, sse_two = index_maplist[j] + (dist, res_one, res_two) = self.calc_sse_dist(sse_one, sse_two) + self.sse_dist_matrix[row, col] = dist + self.sse_dist_matrix[col, row] = dist + self.sse_residue_map[sse_one, sse_two] = (res_one, res_two) + self.sse_residue_map[sse_two, sse_one] = (res_two, res_one) + +# print self.sse_dist_matrix + + + def get_sse_distance(self, sse1, sse2): + """ + Get the distance from sse1 to sse2, from the + data members already computed by calc_sse_dist_matrix() + Note: calc_sse_dist() actually calculates this distance, + this function retrieves the previously calculated value from + the matrix. + + Parameters: + sse1 - PTNode for an SSE (helix or strand) + sse2 - PTNode for an SSE (helix or strand) + + Uses data members (readonly): + sse_index_map + sse_dist_matrix + + Return value: + distance (Angstroms) between sse1 and sse1 as defined + by calc_sse_distance() + """ + row = self.sse_index_map[sse1] + col = self.sse_index_map[sse2] + dist = self.sse_dist_matrix[row, col] + return dist + + def get_min_distance_sse(self, ptnode): + """ + Get the SSE with minimum distance from supplied SSE, + from the data members already computed by calc_sse_dist_matrix() + + Optionally, set the element that was found to infinity so that + this routine can be used iteratively to find only elements that + have not already been found. 
+ + Paremeters: + ptnode - PTNode representing an SSE (helix,strand) + + Uses data members (readonly): + sse_index_map + reverse_sse_index_map + sse_dist_matrix + + Return value: + PTNode representing SSE that has min distance from supplied SSE + + """ + # NB: use of argmin depends on having set diagonal (self distance) + # elements to inf instead of 0 in calc_sse_dist_matrix(). + row = self.sse_index_map[ptnode] + mindist_index = Numeric.argmin(self.sse_dist_matrix[row]) + mindist_ptnode = self.reverse_sse_index_map[mindist_index] + return mindist_ptnode + + + def calc_sse_sheet_dist(self, sse, sheet_node_list): + """ + Calculate the distance between an SSE and a sheet, where a sheet + is defined by the suplied list of PTNodes representing strands. + This distance is defined as the smallest distance between the + supplied sse and any of the strands in the sheet, + i.e. the distance between a sheet and some other SSE is the distance + between the SSE and the closest strand to it in the sheet. + + This is calculated from the SSE distance matrix, i.e. + calc_sse_dist_matrix is assumed to have been already called. + + Parameters: + sse - PTNode representing one SSE + sheet_node_list - list of PTNodes of strands in the sheet. + + Return value: + tuple (dist, strand) where + dist is the + distance (Angstroms) between the closest residues, one from each + of the two SSEs and + strand is PTNodeStrand of the strand in the sheet used for this + minimum distance + """ + assert isinstance(sse, PTNodeHelix) or isinstance(sse, PTNodeStrand) + + min_dist = float("inf") + min_dist_strand = None + for strand in sheet_node_list: + assert isinstance(strand, PTNodeStrand) + dist = self.get_sse_distance(sse, strand) + if dist < min_dist: + min_dist = dist + min_dist_strand = strand + return (min_dist, min_dist_strand) + + def calc_sheet_sheet_dist(self, sheet1, sheet2): + """ + Calculate the distance between two sheets, where the sheets + are represnetd by lists of PTNodes represneting the strands in them. + This distance is defined as the distance between the two closest + strands, one from each sheet. + + This is calculated from the SSE distance matrix, i.e. + calc_sse_dist_matrix is assume already to have been called. + + Parameters: + sheet1 - list of PTNodes of strands in the sheet + sheet2 - list of PTNodes of strands in the other sheet + + Return value: + tuple (dist, strand1, strand2) where dist is the + distance (Angstroms) between the two sheets, as defined above + and strand1 is the strand in sheet1 that was used in this + minimum distance calculation, and strand2 is that in sheet2 + """ + min_dist = float("inf") + min_dist_strand1 = None + min_dist_strand2 = None + for strand in sheet1: + assert isinstance(strand, PTNodeStrand) + (dist, strand2) = self.calc_sse_sheet_dist(strand, sheet2) + if dist < min_dist: + min_dist = dist + min_dist_strand1 = strand + min_dist_strand2 = strand2 + return (min_dist, min_dist_strand1, min_dist_strand2) + + + def calc_sheet_dist_matrix(self, sheet_dict, ptnode_list): + """ + Build the matrix of distances between all the helices and sheets. + + NOTE: the self-distance (i.e. matrix elements [i,i]) are set to + infinity rather than 0, so we can efficiently use argmin + in get_sse_min_distance() to find SSE (not same one) with min + distance - we are wanting to find minimum distances + in ptgraph2, not maximum distances. + + Parameters: + ptnode_list - iterable over PTNode objects represneting SSEs, + we only use the helices though. 
+ + sheet_dict - dictionary of {sheet_id : nodelist} representing + each sheet as list of strand ptnodes. + + Uses data members (WRITE): + sheet_matrix - + square symmetric matrix (Numeric.array) of dimensions + n x n where n is the number of sheets + number of helices + each element is Numeric.Float and is the distance between + the two sheets (or two helices, or sheet and helix). + + sheet_index_map - + dict of { id : array_index } where id is either a sheet id + (a single char 'A' or 'B' etc.) or a helix identifier string + as used in ptgraph2 (string like "HELIX_A_1") etc. + FIXME: This seems a bit hacky/dangerous, maybe sheet should be + a proper object like a PTNode + + reverse_sheet_index_map - + list of identifiers where an identifier is either sheet id e.g. + 'A' or helix id e.g. "HELIX_A_1" mapping index back to these + identifiers i.e. the inverse of sheet_index_map + + sheet_strand_map - + dict of {(sheet_id1, id2) : strand_node} + which for every sheet id and (for id2) object id (sheet + id or ptnodehelix) gives the PTNodeStrand in the sheet + that was the closest (use in the sheet_matrix). + + + Return value: + None. (sets data members) + + """ + helix_list = [ ptnode for ptnode in ptnode_list if + isinstance(ptnode, PTNodeHelix) ] + sheet_id_list = sheet_dict.keys() + objlist = helix_list + sheet_id_list + n = len(objlist) # n = number of helices + number of sheets + self.sheet_dist_matrix = Numeric.zeros((n, n), Numeric.Float) + + # set the self-distances to infinity (see comments above and in + # get_sse_min_distance() + # TODO: maybe if we used NaN instead of inf, this would allow + # both min/max and argmin/argmax rather than just min/argmin + # (as we actualy use) to be useful. I tried it with Python 2.5.1 + # on Linux and it worked (ie NaN is neither max nor min) but + # not really sure how reliable that behaviour is... so sticking + # with inf for now since we only need min/argmin anyway. + for i in range(0, n): + self.sheet_dist_matrix[i,i] = float("inf") + + self.reverse_sheet_index_map = n * [ -1 ] # will be in 0..n-1 + index_maplist = list(enumerate(objlist)) + for i in range(len(index_maplist)): + row, obj1 = index_maplist[i] + if isinstance(obj1, PTNode): + obj1_id = obj1.nodeid + else: + obj1_id = obj1 # it is a sheet id e.g. 'A' + assert(obj1.isalpha()) + self.sheet_index_map[obj1_id] = row + self.reverse_sheet_index_map[row] = obj1_id + for j in range(i+1, len(index_maplist)): + col, obj2 = index_maplist[j] + if isinstance(obj2, PTNode): + obj2_id = obj2.nodeid + else: + obj2_id = obj2 # it is a sheet id e.g. 
'A' + assert(obj2.isalpha()) + if isinstance(obj1, PTNode) and isinstance(obj2, PTNode): + # both are helices + dist = self.get_sse_distance(obj1, obj2) + elif isinstance(obj1, PTNode): + # obj1 is a helix, obj2 is a sheet + (dist, strand) = \ + self.calc_sse_sheet_dist(obj1, sheet_dict[obj2_id]) + self.sheet_strand_map[(obj2_id, obj1_id)] = strand + elif isinstance(obj2, PTNode): + # obj1 is a sheet, obj2 is a helix + (dist, strand) = \ + self.calc_sse_sheet_dist(obj2, sheet_dict[obj1_id]) + self.sheet_strand_map[(obj1_id, obj2_id)] = strand + else: + # both are sheets + (dist, strand1, strand2) = \ + self.calc_sheet_sheet_dist(sheet_dict[obj1_id], + sheet_dict[obj2_id]) + self.sheet_strand_map[(obj1_id, obj2_id)] = strand1 + self.sheet_strand_map[(obj2_id, obj1_id)] = strand2 + + self.sheet_dist_matrix[row, col] = dist + self.sheet_dist_matrix[col, row] = dist + +# print self.sheet_strand_map +# print self.reverse_sheet_index_map +# print self.sheet_dist_matrix + + + def get_min_distance_objid(self, objid, not_objid_set, sheets_only=False): + """ + Get the sheet or helix with minimum distance from the supplied + sheet or helix, specified by id (e.g. 'A' for a sheet or + 'HELIX_A_10' for a helix). + + Optionally, set the element that was found to infinity so that + this routine can be used iteratively to find only elements that + have not already been found. + + Paremeters: + objid - sheet id (e.g. 'A') or helix id (e.g. 'HELIX_A_10') + of the object to find the id of the closest object for. + not_objid_set - set of objids that we do NOT want to find. + Used so we can find the nearest element to an + already positioned element that is not itself + an already positioned element. + sheets_only - (Default False) only find sheets, not helices. + + + Uses data members (readonly): + sheet_index_map + reverse_sheet_index_map + sheet_dist_matrix + + Return value: + tuple (id, dist) where + id (as per the objid paramter) of the closest sheet or helix + to the speicfied one and + dist is that smallest distance, and it is not in the + not_objid_set. + + """ + + row = self.sheet_index_map[objid] + mindist_index = Numeric.argmin(self.sheet_dist_matrix[row]) + + # get 1d array of object ids sorted (ascending) by their distance + # from the target objid in the sheet dist matrix + # NB: use of argsort depends on having set diagonal (self distance) + # elements to inf instead of 0 in calc_sse_dist_matrix(). + objids_sorted_by_dist = Numeric.argsort(self.sheet_dist_matrix[row]) + + # find the first (i.e. smallest distance) id that is not in + # the not_objid_set + mindist_index = None + for mindist_index in objids_sorted_by_dist: + mindist_objid = self.reverse_sheet_index_map[mindist_index] + if ( (mindist_objid not in not_objid_set) and + (not sheets_only or len(mindist_objid) == 1) ): + dist = self.sheet_dist_matrix[row, mindist_index] + break + return (mindist_objid, dist) + + + def get_strand_nearest_element(self, sheet_id, element): + """ + Get the PTNodeStrand in the supplied sheet that is nearest + to the supplied object (specified by object id). Uses the + sheet_strand_map built by calc_sheet_dist_matrix() to do this; + the idea is that nearest objects are found with + get_min_distance_objid(), and in the case that an element is + nearest to a sheet, this function is then called to find + the paritcular strand in that sheet that was used as the nearest + element. 
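+
+        For example (illustrative only): if get_min_distance_objid() reports
+        that the closest object to sheet 'A' is some helix node, then
+        get_strand_nearest_element('A', that_helix_node) returns the strand
+        of sheet 'A' that gave that minimum distance (with strand2 = None).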
+ + Parameters: + sheet_id - id of sheet to find strand nearest objid + element - element (sheet id or PTNodeHelix) to find the + strand in the sheet nearest to. + + Return value: + tuple (strand1, strand2) where strand 1 is the + PTNodeStrand for the strand in the sheet that is nearest to element + and strand2 is the PTNodeStrand for the strand in element that + was closest, if element is a sheet id, or None if element is a helix. + + Uses data members (readonly): + sheet_strand_map - + dict of {(sheet_id1, id2) : strand_node} + which for every sheet id and (for id2) object id (sheet + id or ptnodehelix) gives the PTNodeStrand in the sheet + that was the closest (use in the sheet_matrix). + """ + if isinstance(element, PTNodeHelix): + element_objid = element.nodeid + else: + element_objid = element # objid of sheet id is just sheet id + strand1 = self.sheet_strand_map[(sheet_id, element_objid)] + assert isinstance(strand1, PTNodeStrand) + assert strand1.get_sheet_id() == sheet_id + if isinstance(element, PTNodeHelix): + strand2 = None + else: + strand2 = self.sheet_strand_map[(element_objid, sheet_id)] + assert isinstance(strand2, PTNodeStrand) + assert strand2.get_sheet_id() == element_objid +# print 'zzz',sheet_id, str(element),str(strand1),str(strand2) + return (strand1, strand2) + + + def get_nearest_sse_residues(self, sse1, sse2): + """ + Find the residue in each of the two SSEs that are nearest to each + other and were used in building the SSE distance matrix. + Uses the sse_residue_map built by calc_sse_dist_matrix() to do this; + the idea is that nearest SSEs are found with get_min_distance_sse() + or other functions using the SSE distance matrix, then if required + this functino is used to retrieve the particular residues that + were used in calculating the min distance between SSEs. + + Parameters: + sse1 - PTNode for helix/strand 1 + sse2 - PTNode for helix/strand 2 + + Return value: + tuple (res_seq_num_1, res_seq_num_2) where res_seq_num_1 and + res_seq_num_2 are the residue sequence numbers in sse1 and sse2 + respectively that have min distance to each other (of all + residues in sse1 and sse2) + + Uses data members: + sse_residue_map - + dict of {(ptnode1, ptnode2) : (residue1, residue2)} + which for every pair of sses gives the residue + in each which are closest (used in the distance + matrix). Note both (ptnode1,ptnode2) and + (ptnode2,ptnode1) are stored, with residues + swapped appropriately. + + """ + (residue1, residue2) = self.sse_residue_map[sse1, sse2] + # id of a residue in Bio.PDB is tuple (hetatm, resseqnum, icode) + res_seq_num_1 = biopdbresid_to_pdbresseq(residue1.get_id()) + res_seq_num_2 = biopdbresid_to_pdbresseq(residue2.get_id()) + return (res_seq_num_1, res_seq_num_2) + + + def find_nearest_sses_in_sets(self, ref_set, test_set): + """ + Find the nearest sse (helix or strand) in test_set to any + of the sses in the supplied ref_set. + + Parameters: + ref_set - set of PTNodes to find the nearest + element to any of them, that is in the test_test + test_set - set of PTNodes to find neareset + from. + Uses data members (read): + distmatrix - The PTDistMatrix that has been built already + Return value: + tuple (set_element, close_element, dist) + where set_element is an element in the supplied ref_set and + close_element is the + PTNode element + in the test_test which has minimum distance + to set_element + and dist is the distance between the two. 
+ + """ + min_dist = float("inf") + close_element = None + set_element = None + + for ref_node in ref_set: + for test_node in test_set: + this_dist = self.get_sse_distance(ref_node, test_node) + if this_dist < min_dist: + set_element = ref_node + close_element = test_node + min_dist = this_dist + + return (set_element, close_element, min_dist) + + + def find_nearby_sses_in_sets(self, ref_set, test_set, dist_threshold): + """ + Find all sses (helices or strands) in test_set whose distance + from any of the sses in the supplied ref_set is below a + threshold. + + Parameters: + ref_set - set of PTNodes to find nearby + elements to any of them, that is in the test_test + test_set - set of PTNodes to find nearby sses + from. + dist_threshold - threshold (Angstroms) below which SSEs are 'nearby' + Uses data members (read): + distmatrix - The PTDistMatrix that has been built already + Return value: + List of tuples (dist, sse) of SSEs in test_set that are less + than dist_treshold from some SSE in ref_set, and the distance + (dist) that each is from its closest SSE in the ref_set + (Note dist is before sse in tuple to make sorting by dist easy). + + """ + close_dict = {} # dict of {node : dist} for min dist to test node + for ref_node in ref_set: + for test_node in test_set: + this_dist = self.get_sse_distance(ref_node, test_node) + if this_dist < dist_threshold: + if (not close_dict.has_key(test_node) or + this_dist < close_dict[test_node]): + close_dict[test_node] = this_dist + # items() converts dict to list of (node,dist) tuples then + # swap each tuple so we have list of (dist,node) tuples for ease + # of sorting + return [ (dist, node) for (node, dist) in close_dict.items() ] + + + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +def calc_residue_dist(residue_one, residue_two) : + """Returns the C-alpha distance between two residues + + Paramters + residue_one - Bio.PDB Residue object + residue_two - Bio.PDB Residue object + + Return value: + distance in Angstroms between Carbon alpha atoms of residue_one and + residue_two + + Uses globals (read/write): + residue_errmsg_dict - map Residue to bool flagging error msg issued + + Based on Peter Cock's Python programming pages: + + http://www2.warwick.ac.uk/fac/sci/moac/currentstudents/peter_cock/python/protein_contact_map/ + + but of course life is never quite that simple with PDB files... + note this function is almost entirely error handling, only two + lines actually do the calculation, everything else handles exceptions. + """ + try: + res1_ca_coord = residue_one["CA"].coord + except KeyError: # this happens ocassionaly on some PDB files e.g. 1BRD + if not residue_errmsg_dict.has_key(residue_one): + sys.stderr.write('WARNING: no Carbon-alpha atom for residue ' + + str(residue_one) + + '\nDistance in matrix set to infinity\n') + residue_errmsg_dict[residue_one] = True + return float('inf') + try: + res2_ca_coord = residue_two["CA"].coord + except KeyError: + if not residue_errmsg_dict.has_key(residue_two): + sys.stderr.write('WARNING: no Carbon-alpha atom for residue ' + + str(residue_two) + + '. 
Distance in matrix set to infinity\n') + residue_errmsg_dict[residue_two] = True + return float('inf') + + diff_vector = res1_ca_coord - res2_ca_coord + return Numeric.sqrt(Numeric.sum(diff_vector * diff_vector)) + + +def calc_point_residue_dist(residue_one, point) : + """Returns the distance between the C_alpha of a residue and + a point. + + Paramters + residue_one - Bio.PDB Residue object + point - Bio.PDB Vector representatin of a point in 3d space + + Return value: + distance in Angstroms between Carbon alpha atom of residue_one and + point + + Uses globals (read/write): + residue_errmsg_dict - map Residue to bool flagging error msg issued + """ + try: + res1_ca_coord = residue_one["CA"].coord + except KeyError: # this happens ocassionaly on some PDB files e.g. 1BRD + if not residue_errmsg_dict.has_key(residue_one): + sys.stderr.write('WARNING: no Carbon-alpha atom for residue ' + + str(residue_one) + + '\nDistance in matrix set to infinity\n') + residue_errmsg_dict[residue_one] = True + return float('inf') + diff_vector = res1_ca_coord - point.get_array() +# print 'debug pointresdist',res1_ca_coord,point.get_array(),diff_vector + return Numeric.sqrt(Numeric.sum(diff_vector * diff_vector)) + + +def calc_sse_sse_dist(sse1, sse2, pdb_struct): + """ + Calculate the distance between two SSEs (helices or strands, + represented by PTNode objects). + This distance is defined as the smallest distance between any two + residues, one in each of the SSEs, i.e. the distance betwee the + two parts of the SSEs tha are closest. + + Parameters: + sse1 - PTNode representing one SSE + sse2 - PTNode representing the other SSE + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + + + Return value: + tuple (distance, residue1, residue2) where + distance is the distance (Angstroms) between the + closest residues, one from each + of the two SSEs. + residue1 is the residue in sse1 used in this min distance + residue2 is the residue in sse2 used in this min distance + """ + assert (isinstance(sse1, PTNodeHelix) or isinstance(sse1, PTNodeStrand) + or isinstance(sse1, PTNodeLoop)) + assert (isinstance(sse2, PTNodeHelix) or isinstance(sse2, PTNodeStrand) + or isinstance(sse2, PTNodeLoop)) + + min_dist = float("inf") + min_res1 = None + min_res2 = None + sse1_residues = sse1.get_residue_list() + sse2_residues = sse2.get_residue_list() + for res1 in sse1_residues: + for res2 in sse2_residues: + dist = calc_residue_dist(res1, res2) + if dist < min_dist: + min_dist = dist + min_res1 = res1 + min_res2 = res2 + return (min_dist, min_res1, min_res2) + + +def calc_sse_sse_midpoint_dist(sse1, sse2, pdb_struct): + """ + Calculate the midpoint distance between two SSEs (helices or strands, + represented by PTNode objects). + This distance is defined as the distance between the midpoints of + the two SSE axes (as calculated by fit_axis() methods). + + Parameters: + sse1 - PTNode representing one SSE + sse2 - PTNode representing the other SSE + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + + + Return value: + distance (Angstroms) between the midpoints of the axes fitted + to each SSE; or None if no axis could be found. 
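+
+    Note: fit_axis() is assumed to return a (direction_cosines, centroid)
+    tuple (that is how it is unpacked below); only the centroids are used
+    for the midpoint distance.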
+ """ + sse1_axis = sse1.fit_axis(pdb_struct) + sse2_axis = sse2.fit_axis(pdb_struct) + if sse1_axis == None or sse2_axis == None: + return None + + (sse1_dircos, sse1_centroid) = sse1_axis + (sse2_dircos, sse2_centroid) = sse2_axis + + diff_vector = sse1_centroid - sse2_centroid + return Numeric.sqrt(Numeric.sum(diff_vector * diff_vector)) + + +def compute_sse_midpoint_dist_matrix(ptnode_list, pdb_structure): + """ + Return a distance matrix between midpoints of each SSE + by computing axis midpoint distances between all SSEs in the ptnode_list + + Parameters: + ptnode_list - list of PTNode objects (ie iterable of PTNode) + representing the SSEs (helices,strands) the + SSE midpoint distance matrix is for. + pdb_structure - parsed Bio.PDB structure + + Return value: + Numeric.array square symmetric (order length of ptnode_list) where + each entry is distance (Angstroms) between midpoints of SSE axes. + Main diagonal entries set to SSE type (0 strand, 1 alpha helix, + 2 pi helix, 3 3_10 helix). + + """ + n = len(ptnode_list) + dist_array = Numeric.zeros((n, n), Numeric.Float) + for i in range(n): + for j in range(i+1, n): + dist = calc_sse_sse_midpoint_dist(ptnode_list[i], ptnode_list[j], + pdb_structure) + if dist == None: + dist_array[i, j] = float('NaN') + else: + dist_array[i, j] = dist + dist_array[j, i] = dist_array[i, j] + + # set the diagonal as follows: + # 0.00 for strand + # 1.00 for alpha helix + # 2.00 for pi helix + # 3.00 for 3_10 helix + for i in range(n): + if isinstance(ptnode_list[i], PTNodeHelix): + if ptnode_list[i].get_type() == "ALPHA": + v = 1.00 + elif ptnode_list[i].get_type() == "PI": + v = 2.00 + elif ptnode_list[i].get_type() == "310": + v = 3.00 + else: + pass # should not happen + elif isinstance(ptnode_list[i], PTNodeStrand): + v = 0.00 + dist_array[i,i] = v + + return dist_array + + diff --git a/scripts/ptdomain.py b/scripts/ptdomain.py new file mode 100644 index 0000000..a12def2 --- /dev/null +++ b/scripts/ptdomain.py @@ -0,0 +1,855 @@ +############################################################################### +# +# ptdomain.py - object to represent protein domains and functions to +# parse into domains from external programs +# +# File: ptdomain.py +# Author: Alex Stivala +# Created: September 2007 +# +# $Id: ptdomain.py 2011 2008-10-30 01:54:20Z astivala $ +# +# PTDomain is a class representing a protein domain. A domain is represented +# by a list of segments, which are contiguous subsequences of a chain. +# +# Functions are provided to parse domains using different domain parsing +# programs and return the corresponding list of PTDomain objects. +# Supported so far is: +# . DDOMAIN (Zhou et al 2007 Protein Science 16:947-955) program +# . 
CATH (CATH Domall File (CDF) 2.0) file +# +############################################################################### + +import os,sys + +from Bio.PDB import * # only needed for DDomain when segment spans chains +from ptutils import cleanup_tmpdir,get_int_icode +from ptsecstruct import PTSecStruct + +#----------------------------------------------------------------------------- +# +# Module globals +# +#----------------------------------------------------------------------------- + +verbose = False + +#----------------------------------------------------------------------------- +# +# Class definitions +# +#----------------------------------------------------------------------------- + + +# +# Empty classes for exceptions +# + +class NotInCATH_Exception(Exception): # identifier not found in CATH CDF file + pass + +# +# Real classes +# + +class PTSegment: + """ + The PTSegment object represents a segment (contiguous subsequence of + a chain) by the chain identifier, and start and end residue sequence + numbers + + Note residue numbers here are integers, not PDB residue + 'numbers' which are actually strings and may have insertion + codes, are not sequential (may have gaps, run backwards, etc.); + unlike in PTNode, PTGraph2 etc. where we store the PDB residue numbers, + (but use dictionary to get sequence integers when necesary). + Using integers makes everything much + simpler, can do calculations easily in domeval.py etc. + For this to work strictly correctly, should use proper purely sequential + numbering in these classes (PTSegment, PTDomain), such as that assigned + by DSSP or STRIDE, or equivalently by using the index into list of + residues built by Polypeptide Builder or similar from Bio.PDB. + """ + def __init__(self, chainid, start_resnum, end_resnum): + """ + Construct segment with supplied chain identifier and start and + end residue sequence numbers. + + Parameters: + chainid - PDB chain identifier (may be '-' for none) + start_resnum - start residue sequence number + end_resnum - end residue sequence number + + Exceptions: + raises ValueEror if end_resnum < start_resnum + + """ + if end_resnum < start_resnum: + raise ValueError("end residue seqnum " + str(end_resnum) + " < " + + "start residue seqnum " + str(start_resnum)) + self.chainid = chainid + self.start_resnum = start_resnum + self.end_resnum = end_resnum + + + def __str__(self): + """ + Return string representation of segment as 'chainid:start-chainid:end' + """ + return self.chainid + ":" + str(self.start_resnum) + \ + "-" + self.chainid + ":" + str(self.end_resnum) + + + def is_in_segment(self, res_seq_num): + """ + Return True iff the supplied residue sequence number is in the + interval spanned by this segment (assumed to be same chainid) + + Parameters: + res_seq_num - PDB residue sequence number to test + + Return value: + True if res_seq_num is in >=start_resnum and <=end_resnum else False + """ + if res_seq_num >= self.start_resnum and \ + res_seq_num <= self.end_resnum: + return True + else: + return False + + # we will define only the rich comparison operators __eq__ and __ne__ + # (not __le__, __gt__, etc.) to test for equality or non-equality + # of segments only. 
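+    # For example, PTSegment('A', 10, 25) prints via __str__ as "A:10-A:25",
+    # and compares equal only to another PTSegment with the same chainid and
+    # the same start and end residue numbers.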
+ + def __eq__(self, other): + """ + Two segments are equal if they have same chain and same start and + end residues + """ + if (self.chainid == other.chainid and + self.start_resnum == other.start_resnum and + self.end_resnum == other.end_resnum): + return True + else: + return False + + def __ne__(self,other): + """ + Two segments are '!=' (or '<>') exactly when they are not equal. + """ + return not self.__eq__(other) + + + +class PTDomain: + """ + The PTDomain object represents a protein domain by a list of segments. + Segments are contiguous subsequences of a chain, and so each is + represented by a chain identifier, and a start and end residue sequence + number. + + The domain consisting of domainid == None and segment_list == None + is a special domain signifying a single-domain protein. This is used + because we don't want to have to specify multiple segments for multiple + chains in a single domain - a single domain protein should be treated + just as one unit without worrying about dividing anything up. + """ + def __init__(self, domainid, segment_list): + """ + Create new PTDomain with supplied domain identifier and segment list. + + Parameters: + domainid -domain identifier, string + segment_list - list of PTSegment objects + + NOTE if both parameters are None this marks this PTDomain as the + one used as single element in domain list to signify single-domain + protein with no further information. + """ + self.domainid = domainid + self.segment_list = segment_list + + + def __str__(self): + """ + Return a representation of the domain as list of segments + separated by ';' + """ + if self.domainid == None and self.segment_list == None: + return "SINGLE-DOMAIN" + + s = "" + for i in range(len(self.segment_list)): + s += str(self.segment_list[i]) + if i < len(self.segment_list) - 1: + s += ';' + return s + + def is_single(self): + """ + Return True iff this is the domain with no information representing + a single-domain protein + """ + if self.domainid == None and self.segment_list == None: + return True + else: + return False + + def is_in_domain(self, chainid, res_seq_num): + """ + Return True iff the supplied residue specified by chainid and + residue sequence number is in this domain. + + Parameters: + chainid - chainid of the residue to test + res_seq_num - PDB residue sequence number of the residue to test + + Return value: + True if the residue is in this domain, else False. + """ + # Note we just do a linear search, no dictionaries or anything, + # as there is only have a maximum of maybe 5 domains, and usually + # only 1 or 2. + if self.is_single(): + return True # always in the special 'single domain' + else: + for segment in self.segment_list: + if segment.chainid == chainid and \ + segment.is_in_segment(res_seq_num): + return True + return False + + def get_segments_for_chainid(self, chainid): + """ + Return a list of segments of the supplied chain in this domain + + Parameters: + chainid - id of chain to find segments of + Return value: + list of PTSegment objects with supplied chain id + """ + if self.is_single(): + return [] + else: + return [f for f in self.segment_list if f.chainid == chainid] + + def get_minmax_res_seq_in_chain(self, chainid): + """ + Return a tuple with the lowest and highest residue sequence numbers + in the supplied chainid in this domain. 
+ + Parameters: + chainid - chain id to find low and high residue sequence numbers for + + Return value: + tuple (min_res_seq, max_res_seq) of the lowest and highest + respectively residue sequence numbers in the supplied chainid + in this domain. + """ + max_res_seq = 0 + min_res_seq = sys.maxint + for segment in self.get_segments_for_chainid(chainid): + if segment.start_resnum < min_res_seq: + min_res_seq = segment.start_resnum + if segment.end_resnum > max_res_seq: + max_res_seq = segment.end_resnum + return (min_res_seq, max_res_seq) + + def get_chainids(self): + """ + Return a list of all chain identifiers in this domain. + + Parameters: + None. + + Return value: + List of chain identifiers used by segments in this domain. + (Each chain identifier appears only once in list). + """ + chaindict = {} # dict of { chainid : True } (value not used) + if self.segment_list != None: + for segment in self.segment_list: + chaindict[segment.chainid] = True + return chaindict.keys() + + + def add_segment(self, segment): + """ + Add a segment to this domain. + + Parameters: + segment - PTSegement to add to the domain + Return value: None + Modifies member data: segment_list + """ + if self.segment_list == None: + self.segment_list = [segment] + else: + self.segment_list.append(segment) + + + def remove_segment(self, segment): + """ + Remove a segment from this domain. + This may involve either simply removing a segment if there is one + in the domain that corresponds exactly to the supplied segment to + remove, otherwise the range of residues in the segment to remove + must be deleted from some existing segment resulting in a smaller + segment; a more complicated case can arise when the segment to + remove spans two (or more) segments (either entirely or in part). + + Parameters: + segment - PTSegment representing segment (continguous range of + residues in a chain) to remove. 
+ Return value: None + Modifies member data: segment_list, and segments in the list + """ + try: + sindex = self.segment_list.index(segment) + self.segment_list.pop(sindex) + except ValueError: + # no segment equal to supplied one found + # so look for a segment that entirely contains the one to remove + found = False + for cur_seg in self.segment_list: + if (segment.chainid == cur_seg.chainid and + segment.start_resnum >= cur_seg.start_resnum and + segment.end_resnum <= cur_seg.end_resnum): + found = True + break + if found: + if (segment.start_resnum == cur_seg.start_resnum): + cur_seg.start_resnum = segment.end_resnum + 1 + elif (segment.end_resnum == cur_seg.end_resnum): + cur_seg.end_resnum = segment.start_resnum - 1 + else: + # need to split segment in two which we will do + # by shortening existing segment for first part + # and creating new segment for later part + new_seg = PTSegment(cur_seg.chainid, + segment.end_resnum + 1, + cur_seg.end_resnum) + cur_seg.end_resnum = segment.start_resnum - 1 + self.segment_list.append(new_seg) + + else: + # segment is not found at all or extends ouside of a + # segment in the list + for cur_seg in self.segment_list: + if (segment.chainid == cur_seg.chainid): + if (segment.start_resnum >= cur_seg.start_resnum and + segment.start_resnum <= cur_seg.end_resnum): + # extends over end of cur_seg: shorten cur_seg + # to end at start of segment + cur_seg.end_resnum = segment.start_resnum - 1 + elif (segment.end_resnum <= cur_seg.end_resnum and + segment.end_resnum >= cur_seg.start_resnum): + # extends over start of cur_seg: shorten + # cur_seg to start and end of segment + cur_seg.start_resnum = segment.end_resnum + 1 + + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +def read_domains_from_ddomain(pdb_filename, pdb_model, chainid=None): + """ + Use the DDOMAIN program to parse the structure from a PDB file into + domains and return the corresponding list of PTDomain objects. + + DDOMAIN is described in + + Zhou, Xue, Zhou 2007 'DDOMAIN: Dividing structures into domains using a + normalized domain-domain interaction profile' Protein Science 16:947-955. + + It is available as a 64-bit linux executable and FORTRAN-77 source code + from http://sparks.informatics.iupui.edu/Resource_files/DDOMAIN.tar.gz + + Parameters: + pdb_filename - filename of PDB file to run DDOMAIN on + pdb_model - Bio.PDB model struct for this PDB entry. Note that this + is only needed in the case that a DDomain domain has + different chain identifiers for start and end and + is then used just to find last residue number in chain. + chainid - (default None). If not None, only the specified chain + is requested. + + Return value: + List of PTDomain objects, one for each domain. + + NOTE: if there is only one domain, we will return a list with a single + PTDomain with all data None, signifying a single domain protein + with no further information. + This is mainly because of when + there are multiple chains, in which case the single domain is reported + by DDOMAIN as having a different chain id for start and end. If there + is a single domain we really don't want to do anything special, so + it is better to just have it as a special case where no domain processing + is done. + + """ + # DDOMAIN needs the PDB file in its working directory, and it reads + # the PDB code (e.g. 
1QLP for PDB file 1QLP.pdb) from stdin + # (optionaly with chain suffix, which we won't use) + # Note it requires this filename format, so for format like pdb1qlp.ent + # we need to rename the file for DDOMAIN to 1QLP.pdb + + # This is nasty, but otherwise have to modify DDOMAIN FORTRAN-77 source + # so that's even more hassle to have to have a custom version (like we + # did with STRIDE). + # So we'll work in /tmp directory, make a symlink (TODO: only UNIX allows + # this, maybe should actually copy file so works on other platforms) + # and run DDOMAIN there. + oldcwd = os.getcwd() + TMPDIR = os.tempnam(None, "ptdd") + os.mkdir(TMPDIR) + symlink_path = None + try: + pdb_file_basename = os.path.basename(pdb_filename) + (name,extension) = os.path.splitext(pdb_file_basename) + if extension.lower() == '.pdb': # e.g. 1QLP.pdb + pdb_identifier = name + pdb_file_directory = os.path.split(pdb_filename)[0] + symlink_path = os.path.join(TMPDIR, pdb_file_basename) + os.symlink(os.path.abspath(pdb_filename), symlink_path) + elif extension != '.ent' or name[:3].lower() != 'pdb': + sys.stderr.write('WARNING: unknown PDB filename format "' + + pdb_file_basename + '"\n') + sys.stderr.write(' Not running DDomain\n') + domain_list = [PTDomain(None, None)] # one-domain protein, no further info + return domain_list + else: # e.g. pdb1qlp.ent, make a symlink to it in format 1QLP.pdb + pdb_identifier = name[3:7].upper() + symlink_path = os.path.join(TMPDIR, pdb_identifier + '.pdb') + os.symlink(os.path.abspath(pdb_filename), symlink_path) + + os.chdir(TMPDIR) + if verbose: + sys.stderr.write("running DDomain...") + (ddomain_stdin, ddomain_stdout) = os.popen2("DDomain") + if chainid != None: + pdbchainid = pdb_identifier + chainid + else: + pdbchainid = pdb_identifier + ddomain_stdin.write(pdbchainid + '\n') + ddomain_stdin.close() + domain_list = parse_ddomain_output(ddomain_stdout, pdb_model) + ddomain_stdout.close() + if verbose: + sys.stderr.write("done\n") + finally: + if symlink_path: + os.unlink(symlink_path) + os.chdir(oldcwd) + cleanup_tmpdir(TMPDIR) + return domain_list + +def parse_ddomain_output(fh, pdb_model): + """ + Parse the output of the DDOMAIN program. + + DDOMAIN is described in + + Zhou, Xue, Zhou 2007 'DDOMAIN: Dividing structures into domains using a + normalized domain-domain interaction profile' Protein Science 16:947-955. + + It is available as a 64-bit linux executable and FORTRAN-77 source code + from http://sparks.informatics.iupui.edu/Resource_files/DDOMAIN.tar.gz + + Parameters: + fh - filehandle to read DDOMAIN output from (alrady open for read) + pdb_model - Bio.PDB model struct for this PDB entry. Note that this + is only needed in the case that a DDomain domain has + different chain identifiers for start and end and + + Return value: + List of PTDomain objects, one for each domain. + + """ + + domain_list = [] + + # Output looks like this: + # + # AUTHORS-trained parameters + # 1 A 3 A 109 + # 2 B 3 B 109 + # SCOP-trained parameters + # 1 A 3 A 109 + # 2 B 3 B 109 + # CATH-trained parameters + # 1 A 3 A 109 + # 2 B 3 B 109 + # + + # We will use the AUTHORS-trained parameters output (see paper). + + # Note also that DDOMAIN only allows domains to be a single + # continguous subsequence of chain anyway (i.e. not multiple + # segments) (see paper), so we only ever have one segment in a + # domain from this function. + # It can, however, have a segment that includes parts of two chains + # i.e. runs off one chain and includes (part of) another chain. + # E.g. 
1BAR: + # + # AUTHORS-trained parameters + # 1 A 11 B 7 + # 2 B 8 B 138 + + readout = False + for line in fh: + if line.strip()[0:8] == "AUTHORS-": + readout = True + continue + elif line.strip()[0:5] == "SCOP-" or line.strip()[0:5] == "CATH-": + break #finished with output + if readout: + splitline = line.split() + if len(splitline) == 5: + (domain_id,chainid_start,resnum_start,chainid_end,resnum_end) =\ + splitline + elif len(splitline) == 3: + # chain identifier is ' ' (space) so convert to '-' (dash) + domain_id = splitline[0] + chainid_start = '-' + resnum_start = splitline[1] + chainid_end = '-' + resnum_end = splitline[2] + else: + sys.stderr.write( + 'WARNING: error parsing line DDOMAIN output line:\n' + + line) + continue + resnum_start = int(resnum_start) + resnum_end = int(resnum_end) + if resnum_start < 0: + resnum_start = 0 + sys.stderr.write( + 'WARNING: DDomain negative residue start number, set to 0\n') + if resnum_end < 0: + resnum_end = 0 + sys.stderr.write( + 'WARNING: DDomain negative residue end number, set to 0\n') + if chainid_start != chainid_end: + sys.stderr.write('WARNING: DDomain different chainid in domain'+ + ' ' + str(domain_id) + + ': splitting into two segments\n') + # DDomain (as of Sep 2007) cannot have multiple segments in a + # domain but it does sometimes have a different chain id for + # start and end in a domain meaning (I think) that the + # segment consists of the first chain from start residue up + # to C-terminus and second chain from N-terminus up to + # end residue (in that second chain). + # So we make two segments, one in each of the two chains, + # for this domain. + + start_chain = pdb_model[chainid_start] + end_chain = pdb_model[chainid_end] + # id of a residue in Bio.PDB is tuple (hetatm, resseqnum, icode) + startchain_res_seqnum_list = [ res.get_id()[1] for res in + start_chain.get_list() + if res.get_id()[0] == ' ' ] + max_startchain_resnum = max(startchain_res_seqnum_list) + endchain_res_seqnum_list = [ res.get_id()[1] for res in + end_chain.get_list() + if res.get_id()[0] == ' ' ] + min_endchain_resnum = min(endchain_res_seqnum_list) + segment1 = PTSegment(chainid_start, resnum_start, + max_startchain_resnum) + segment2 = PTSegment(chainid_end, min_endchain_resnum, + resnum_end) + domain = PTDomain(domain_id, [segment1, segment2]) + domain_list.append(domain) + else: + if resnum_start > resnum_end: + # This happens e.g. on 1BMV chain 2 (only if chain 2 + # only is requested). Don't know what it means really, + # but let's make sure we don't get an exception anyway + sys.stderr.write('WARNING: DDomain start resnum ' + + str(resnum_start) + ' > end resnum ' + + str(resnum_end) + ', swapping.\n') + tmp = resnum_start + resnum_start = resnum_end + resnum_end = tmp + segment = PTSegment(chainid_start, resnum_start, resnum_end) + domain = PTDomain(domain_id, [segment]) + domain_list.append(domain) + + return domain_list + + + +def read_domains_from_cath_cdf_file(cdf_filename, pdbid, chainid=None): + """ + Read the domain decomposition from the CATH Domall File (CDF) + whose filename is supplied. + + These files and their description can be found at + + ftp://ftp.biochem.ucl.ac.uk/pub/cathdata/v3.1.0 + + specifically the files README.CDF_FORMAT_2.0 for the description and + the actual CDF file CathDomall.v3.1.0 + + See http://www.cathdb.info/ for CATH in general. + + Parameters: + cdf_filename - filename of the CATH CDF file to read + pdbid - pdb identifier to read domains for + chainid - (default None). If not None, get only this chain. 
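+
+    For example, using the sample CDF record for 1chm shown in the comments
+    in the body of this function, read_domains_from_cath_cdf_file(cdf_filename,
+    '1chm') would return two PTDomain objects for chain A, one covering
+    residues 2-156 and the other residues 157-402.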
+ + Return value: + List of PTDomain objects, one for each domain. + + Raises exceptions: + NOTE: if the pdb id is not found in the file, we will return a list + raise the NotInCATH_Exception + + """ + domain_list = [] + issued_warning = False + # This is from README_CDF_FORMAT_2.0: + # + # KEY: + # N = Number of segments + # C = Chain character + # I = Insert character/code ('-' indicates no insert character) + # S = Start PDB number + # E = End PDB number + # NR = number of residues (fragment information only) + # + # 1chmA D02 F00 1 A 2 - A 156 - 1 A 157 - A 402 - + # N |C S I C E I| N |C S I C E I| + # |<----Domain One---->|<-----Domain Two---->| + # |<--Segment One-->| |<--Segment One-->| + # + # This translates to: + # 1chmA01 = Chain A; 2-156 + # 1chmA02 = Chain A; 157-402 + found = False + cdf_fh = open(cdf_filename) + for line in cdf_fh: + line = line.lstrip().upper() + if line[0] == '#': + continue # skip comment lines + cdf_rec = line.split() + # CDF file appears to be sorted by PDB id ascending so we could + # do a binary search or at least shortcut this loop but don't + # really want to depend on it so won't bother. + chain_name = cdf_rec[0] # 5 chars e.g. 1chmA + if ( chain_name[:4] == pdbid.upper() and # matches our PDB identifier + (chainid == None or chain_name[4] == chainid) ): # and chainid + found = True + if cdf_rec[1][0] != 'D' or cdf_rec[2][0] != 'F': + sys.stderr.write('WARNING: bad CDF record ignored: ' + + line + '\n') + continue + num_domains = int(cdf_rec[1][1:]) + num_fragments = int(cdf_rec[2][1:]) + field = 3 # fields 0,1,2 where chainname, domains, fragments + for domnum in range(num_domains): + # CDF records actually have a different line for each + # chain and so each chain is always a new domain in this + # format it would appear. So we will name the domains + # with chain identifier AND domain number so they are unique. + # Though it seems not quite right that a new chain means + # a new domain - domains should be able to contain multiple + # segments of (different) chains but CDF records seem + # to maintain chains as the higher level of hierarchy + # (chain being part of the record identifier ie chainid + # = pdbid + chainid, the first field). + domain_id = (chain_name[4] + # chain identifier + str(domnum + 1)) # number from one not zero + segment_list = [] + num_segments = int(cdf_rec[field]) + field += 1 + for segnum in range(num_segments): + start_chainchar = cdf_rec[field] + field += 1 + start_pdbnum = int(cdf_rec[field]) + field += 1 + insertcode1 = cdf_rec[field] + field += 1 + end_chainchar = cdf_rec[field] + field += 1 + end_pdbnum = int(cdf_rec[field]) + field += 1 + insertcode2 = cdf_rec[field] + field += 1 + + if start_chainchar != end_chainchar or \ + start_chainchar != chain_name[4]: + # TODO: I think this never happens, but should + # do something with it anyway + sys.stderr.write('WARNING: mismatch chain characters '+ + 'in CDF record: ' + line + '\n') + if start_chainchar == '0': # blank chainid in (old) PDB recs + if not issued_warning: + sys.stderr.write('WARNING: '\ + 'CDF record ' + chain_name +\ + ' indicates no chain field '\ + 'in PDB record. '\ + '(Will not work with'\ + ' remediated (2007) PDB files).\n' + ' Changing chain to A.\n') + issued_warning = True + start_chainchar = 'A' + if start_pdbnum > end_pdbnum: + # This happens e.g. on 1BMV chain 2 (only if chain 2 + # only is requested). 
Don't know what it means really, + # but let's make sure we don't get an exception anyway + sys.stderr.write('WARNING: CATH start resnum ' + + str(start_pdbnum) + ' > end resnum ' + + str(end_pdbnum) + ', swapping.\n') + tmp = start_pdbnum + start_pdbnum = end_pdbnum + end_pdbnum = tmp + + segment_list.append(PTSegment( + cdf_chainid_to_stride_chainid(start_chainchar), + start_pdbnum, end_pdbnum)) + + domain_list.append(PTDomain(domain_id, segment_list)) + # we don't do anything with what CDF terms 'fragments' + + if not found: + raise NotInCATH_Exception(pdbid) + + cdf_fh.close() + if len(domain_list) > 0: + return domain_list + else: + return [PTDomain(None, None)] # one-domain protein, no further info + + +def cdf_chainid_to_stride_chainid(cdf_chainid): + """ + Convert a CDF (CATH Domall File) chainid to a STRIDE chainid. + STRIDE uses '-' for a 'blank' chainid while PDB uses ' ' (space) + and CATH (CDF) uses '0' where PDB has a blank (space) chain identfifer. + We use the STRIDE convention ('-') in this program. + So all this does is return the cdf_chainid unless it is '0', then + it returns '-'. + + Parameters: + pdb_chainid - the PDB chain identifier + Return value: + STRIDE chain identifier corresponding to supplied pdb_chainid + """ + if cdf_chainid == '0': + return '-' + else: + return cdf_chainid + + +def ptdomain_set_verbose(verb): + """ + set the module global verbose flag in this module to supplied value + Parameters: verb - True (for verbose output) or False + Return value: None + Uses globals: verbose (in this module) + """ + global verbose + verbose = verb + + +def fixup_crossdomain_sses(secstruct, domain_list): + """ + Find any SSEs that span a domain boundary, and put each entirely + in one domain. + The domain is chosen as the one that contains most of the residues + int the SSE. + + Parameters: + secstruct - PTSecStruct (ptsecstruct.py) object descirbing SSEs + domain_list - list of PTDomain objects representing all the domains + in this protein. + (in/out) NOTE: may be modified by having a segment + removed from a domain if SSE is only partly in + the domain. + Return value: None. + """ + sse_list = ( [(start_chainid, start_resnum, end_chainid, end_resnum) + for (start_chainid, start_resnum, end_chainid, end_resnum) + in secstruct.strand_list] + + [(start_chainid, start_resnum, end_chainid, end_resnum) + for (start_chainid, start_resnum, end_chainid, end_resnum, helix_type) + in secstruct.helix_list] ) + + for (start_chainid, start_resnum, end_chainid, end_resnum) in sse_list: + for domain in domain_list: + if (domain.is_in_domain(start_chainid, + get_int_icode(start_resnum)[0]) + and not domain.is_in_domain(end_chainid, + get_int_icode(end_resnum)[0]) ): + # This really shouldn't happen, but does: domain + # decomposition has determined that this SSE crosses + # a domain boundary (really our SSE decisions don't + # match whatever domain decomposition has done). + # We'll have to assign the SSE to + # a domain, and add the residues it spans into that + # domain. 
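+                # The fix-up below works in three steps: find the second
+                # domain (domain2) that the SSE also overlaps, count how
+                # many of the SSE's residues fall in each of the two
+                # domains, then remove the SSE's residue range from every
+                # domain and add it back to the domain with the larger
+                # count (ties go to the first domain).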
+ + # find domain2 as the other domain the SSE is also in + for domain2 in domain_list: + if domain2 == domain: + continue + if domain2.is_in_domain(end_chainid, + get_int_icode(end_resnum)[0]): + break + + + # find sse_domain as the domain with more residues of the + # SSE in it + + domain_res_count = 0 + domain2_res_count = 0 + # FIXME: this is ignoring insertion codes etc., really + # should convert to proper sequential residue sequence numbers + # to do this + start_resint = get_int_icode(start_resnum)[0] + end_resint = get_int_icode(end_resnum)[0] + for resint in range(start_resint, end_resint+1): + if domain.is_in_domain(start_chainid, resint): + domain_res_count += 1 + elif domain2.is_in_domain(start_chainid, resint): + domain2_res_count += 1 + else: + sys.stderr.write('ERROR: SSE in more than 2 domains\n') + if domain2_res_count > domain_res_count: + sse_domain = domain2 + else: + sse_domain = domain # arbitrarily domain if equal count + + # first remove the segment from where it currently is + seg = PTSegment(start_chainid, + get_int_icode(start_resnum)[0], + get_int_icode(end_resnum)[0]) +# print 'xxx',str(seg) + for dom in domain_list: +# print 'aaa',str(dom) + dom.remove_segment(seg) +# print 'bbb',str(dom) + + + sys.stderr.write('WARNING: SSE ' + start_chainid + ':' + + start_resnum + '-' + end_resnum + + ' crosses domain boundary.\n' + ' Put in domain ' + sse_domain.domainid + + ' (' + str(sse_domain) + ').\n') + sse_domain.add_segment(seg) +# print 'zzz',str(sse_domain) + + break # no need to look at any more domains for this SSE + +# # DEBUG +# for i in range(len(domain_list)): +# sys.stdout.write(str(domain_list[i])) +# if i < len(domain_list) - 1: +# sys.stdout.write('/') +# sys.stdout.write('\n') +# # END DEBUG diff --git a/scripts/ptmfile.py b/scripts/ptmfile.py new file mode 100644 index 0000000..0f2aee0 --- /dev/null +++ b/scripts/ptmfile.py @@ -0,0 +1,194 @@ +############################################################################### +# +# ptmfile.py - Functions to write MATLAB .m file for drawing 3D strand axes +# +# File: ptmfile.py +# Author: Alex Stivala +# Created: November 2007 +# +# $Id: ptmfile.py 2703 2009-07-27 06:01:05Z astivala $ +# +# Utility functions for writing MATLAB .m files to plot 3D strand backbone +# carbon-alpha traces and the axes fit to them. +# +############################################################################### + +import sys +from time import strftime,localtime + +from numpy.oldnumeric import array +from Bio.PDB import Vector + +def mfile_write_prelude(fh): + """ + Write the start of the m-file, which is just intructions to MATLAB + to hold plot etc. + + Parameters: + fh - open filehandle of m-file to write to + Return value: None + """ + timestamp = strftime("%d%b%Y %H:%M:%S", localtime()) + fh.write("% Generated on " + timestamp + + " by ptmfile.py $Revision: 2703 $: " + + " ".join(sys.argv) + "\n") + + fh.write('hold on\n') + + +def mfile_write_strand(fh, c_alpha_veclist, midpoints, centroid, dircos, + nterm_point, cterm_point, label): + """ + Write the data for a single strand to the open m-file. This information + is an array of the carbon-alpha co-ordinates of residues in the strand, + for which a 3D line is plotted, a vector of the centroid, for which + an asterisk is plotted, and a direction cosines vector for the axis, + along which a line is drawn (through the centroid). + A red asterisk is plotted at one end of the axis line segment to indicate + the direction along the line of the direction cosines vector. 
+ A blue plus is plotted at the projection of the most nterminal midpoitn + on the axis, and a red circle at the projection of the most cterminal + midpoint on the axis. + + Parameters: + fh - open filehandle of m-file to write to + c_alpha_veclist - list of Bio.PDB.Vector for each C-alpha atom co-ord + midpoints - array of Bio.PDB.Vectors of each pair of consecutive + C_alpha atoms, or None. + centroid - Bio.PDB.Vector of centroid + dircos - Bio.PDB.Vector of direction cosine of computed strand axis + + Return value: None + """ + coords_str = "A = " + coords_str += str(array([list(vec) for vec in c_alpha_veclist])) + coords_str += ";\n" + + if midpoints != None: + midpoints_str = "M = " + midpoints_str += str(array([list(vec) for vec in midpoints])) + midpoints_str += ";\n" + + centroid_str = "c = " + str(list(centroid)) + ";\n" + dircos_str = "v = " + str(list(dircos)) + ";\n" + + fh.write(coords_str) + fh.write(centroid_str) + fh.write(dircos_str) + + fh.write("plot3(A(:,1),A(:,2),A(:,3),'r');\n") # red c-alpha backbone line + + # cyan line along axis through centroid in direction of direction cosines vector + fh.write("l = [c(1)-30*v(1), c(2)-30*v(2), c(3)-30*v(3); ") + fh.write(" c(1)+30*v(1), c(2)+30*v(2), c(3)+30*v(3)];\n") + fh.write("plot3(l(:,1), l(:,2), l(:,3),'c');\n") + + fh.write("plot3(c(1),c(2),c(3),'b*');\n") # blue asterisk for centroid + + # plot red asterisk at end of axis line segment to indicate direction + fh.write("d = c + 30 * v;\n") + fh.write("plot3(d(1), d(2), d(3), 'r*');\n") + + # plot blue plus at projection of n-terminal midpoint on axis, and + # red circle at projection of c-terminal midpoint on axis + nterm_str = "np = " + str(list(nterm_point)) + ";\n" + cterm_str = "cp = " + str(list(cterm_point)) + ";\n" + fh.write(cterm_str) + fh.write(nterm_str) + fh.write("plot3(np(1),np(2),np(3),'b+');\n") + fh.write("plot3(cp(1),cp(2),cp(3),'ro');\n") + + # write label at centroid + fh.write("text(c(1), c(2), c(3), '" + " " + label + "')\n") + + +def mfile_write_helix(fh, c_alpha_veclist, midpoints, centroid, + dircos, nterm_point, cterm_point, label): + """ + Write the data for a single helix to the open m-file. This information + is an array of the carbon-alpha co-ordinates of residues in the helix, + for which a 3D line is plotted, a vector of midpoints of consectuive + C_alpha triples, for wihch red crosses are plotted, + a vector of the centroid, for which + an asterisk is plotted, and a direction cosines vector for the axis, + along which a line is drawn (through the centroid). + A red asterisk is plotted at one end of the axis line segment to indicate + the direction along the line of the direction cosines vector. + A blue plus is plotted at the projection of the most nterminal midpoitn + on the axis, and a red circle at the projection of the most cterminal + midpoint on the axis. + + Parameters: + fh - open filehandle of m-file to write to + c_alpha_veclist - list of Bio.PDB.Vector for each C-alpha atom co-ord + midpoints - list of Bio.PDB.Vector of midpoints of consectuive + C_alpha triples used to fit axis to. + centroid - Bio.PDB.Vector of centroid + dircos - Bio.PDB.Vector of direction cosine of computed helix axis + nterm_point - Bio.PDB.Vector of project of most N-terminal midpoint onto + axis + cterm_point - Bio.PDB.Vector of projection of most C-terminal midpoint + onto axis. 
+ label - string to write as label for the helix + + Return value: None + """ + coords_str = "A = " + coords_str += str(array([list(vec) for vec in c_alpha_veclist])) + coords_str += ";\n" + + midpoints_str = "M = " + midpoints_str += str(array([list(vec) for vec in midpoints])) + midpoints_str += ";\n" + + centroid_str = "c = " + str(list(centroid)) + ";\n" + dircos_str = "v = " + str(list(dircos)) + ";\n" + + fh.write(coords_str) + fh.write(midpoints_str) + fh.write(centroid_str) + fh.write(dircos_str) + + fh.write("plot3(A(:,1),A(:,2),A(:,3),'m');\n") # magenta c-alpha backbone line + # plot red cross for each midpiont of C_alpha triple + fh.write("for i = 1:size(M,1)\n") + fh.write(" plot3(M(i,1), M(i,2), M(i,3), 'rx');\n") + fh.write("end;\n") + + fh.write("plot3(c(1),c(2),c(3),'k*');\n") # black asterisk for centroid + + # cyan line along axis through centroid in direction of direction cosines vector + fh.write("l = [c(1)-30*v(1), c(2)-30*v(2), c(3)-30*v(3); ") + fh.write(" c(1)+30*v(1), c(2)+30*v(2), c(3)+30*v(3)];\n") + fh.write("plot3(l(:,1), l(:,2), l(:,3),'c');\n") + + # plot red asterisk at end of axis line segment to indicate direction + fh.write("d = c + 30 * v;\n") + fh.write("plot3(d(1), d(2), d(3), 'r*');\n") + + # plot blue plus at projection of n-terminal midpoint on axis, and + # red circle at projection of c-terminal midpoint on axis + nterm_str = "np = " + str(list(nterm_point)) + ";\n" + cterm_str = "cp = " + str(list(cterm_point)) + ";\n" + fh.write(cterm_str) + fh.write(nterm_str) + fh.write("plot3(np(1),np(2),np(3),'b+');\n") + fh.write("plot3(cp(1),cp(2),cp(3),'ro');\n") + + # write label at centroid + fh.write("text(c(1), c(2), c(3), '" + label + "')\n") + + +def mfile_write_conclusion(fh): + """ + Write the concluding commands to the open m-file after all strands + have been writtein with mfile_write_strand(). + This is instructions to turn on grid, etc. + + Parameters: + fh - open filehandle of m-file to write to + + Return value: None + """ + fh.write("grid on\n") + diff --git a/scripts/ptnode.py b/scripts/ptnode.py new file mode 100644 index 0000000..f325585 --- /dev/null +++ b/scripts/ptnode.py @@ -0,0 +1,2639 @@ +############################################################################### +# +# ptnode.py - node (strand, helix, terminus) classes for ptgraph. +# +# File: ptnode.py +# Author: Alex Stivala +# Created: July 2007 +# +# $Id: ptnode.py 2852 2009-10-12 03:18:51Z astivala $ +# +# A PTnode is a node in the protein topology graph. It represents a +# secondary structure which is an alpha helix or beta sheet. +# +# Since these are simple (linear) secondary structures, the +# PTnode will have two +# edges, one from the previous (towards N-terminal) PTnode and one to +# the next (towards C-terminal) PTnode. So we just keep these nodes +# in a list, the ordering of which is sufficient for this purpose. +# +# Specific PTNode types (Helix, Strand, Terminus) are derived from PTNode. 
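+#
+# A minimal illustrative sketch (hypothetical names, not part of this
+# module): because the graph is kept simply as an ordered list of PTNode
+# objects, the implicit sequence edges are just adjacency in that list,
+# e.g.:
+#
+#     for prev_node, next_node in zip(node_list, node_list[1:]):
+#         pass   # prev_node immediately precedes next_node along the chain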
+# +# +############################################################################### + +import sys +from math import pi,atan2,acos + +from numpy.oldnumeric import array +from numpy.oldnumeric.linear_algebra import singular_value_decomposition # part of Numeric +from Bio.PDB import * + +from ptsecstruct import stride_chainid_to_pdb_chainid +from geometry import * +from ptmfile import mfile_write_strand, mfile_write_helix +from ptutils import get_int_icode,biopdbresid_to_pdbresseq,char_if_not_blank + +#----------------------------------------------------------------------------- +# +# Module constants +# +#----------------------------------------------------------------------------- + +ALPHA = 100 # multiplier of dircos to get second point on line +EPSILON = 1e-4 # epsilon for testing closeness to 0 or pi in fit_axis + +#----------------------------------------------------------------------------- +# +# Module globals +# +#----------------------------------------------------------------------------- + +verbose = False +global issued_warning +issued_warning = {} # warning for (chainid,res_seq_num) not found + +#----------------------------------------------------------------------------- +# +# Class definitions +# +#----------------------------------------------------------------------------- + +class PTNode: + """ + A PTnode is a node in the protein topology graph. It represents a + secondary structure which is an alpha helix or beta strand. + + Since these are simple (linear) secnodary structures, the + PTnode will have two + edges, one from the previous (towards N-terminal) PTnode and one to + the next (towards C-terminal) PTnode. So we just keep these nodes + in a list, the ordering of which is sufficient for this purpose. + + Specific PTNode types (Helix, Strand, Terminus) are derived from this + class. + + The 'rich comparison' operators (<, <=, <, etc.) are overridden + so that they are integer comparison on the start residue sequence number. + Note equality and inequality are not overriden. + + """ + def __init__(self, nodeid, seqnum, start_res_seq, end_res_seq, chainid, + domainid, + pdb_residue_list, pdb_resid_dict, + fake_resids = False): + + """ + Construct a PTNode with supplied nodeid and type, and empty + hydrogen bond list. + + Parameters: + nodeid - unique identifier for this node (string) + seqnum - sequence number of this node, note need not be unique + start_res_seq - smallest PDB residue sequence number in the structure + end_res_seq - largest PDB residue sequence number in the structure + Note these residue sequence 'numbers' are strings, may have + insertion codes. + chainid = chain identifier for this SSE + domainid = domain identifier for domain this SSE is in + pdb_residue_list - list of all residues (for all chains) in the protein + pdb_resid_dict - dict of { {chainid,pdb_resseq) : seqindx } + where chainid and pdb_resseq make up + the PDB residue identifier, the pdb_resseq + being string resnum+icode if any e.g. + '60' or '60A', seqindx is the indiex + into sequential list of all residues + pdb_residue_list. + fake_resids - Default False. IF True, do not look up resids in + pdb_resid_dict as they do not really exist; used for + terminus nodes. 
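+
+            As an illustration of the pdb_resid_dict format (hypothetical
+            entry): ('A', '60A') : 59 means that residue 60 with insertion
+            code A in chain A is at index 59 of pdb_residue_list.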
+ + + Exceptions: + raises ValueEror if end_res_seq < start_res_seq + + """ + + if ( not fake_resids ): + try: + start_res_indx = pdb_resid_dict[(chainid, start_res_seq)] + except KeyError: + if not issued_warning.has_key((chainid, start_res_seq)): + sys.stderr.write('WARNING: residue ' + start_res_seq + + ' (chain ' + + chainid + + ') not found. May be HETATM.\n') + issued_warning[(chainid,start_res_seq)] = True + while not pdb_resid_dict.has_key((chainid, start_res_seq)): + start_res_seq = str(get_int_icode(start_res_seq)[0] + 1) + start_res_indx = pdb_resid_dict[(chainid, start_res_seq) ] + try: + end_res_indx = pdb_resid_dict[(chainid, end_res_seq)] + except KeyError: + if not issued_warning.has_key((chainid, end_res_seq)): + sys.stderr.write('WARNING: residue ' + end_res_seq + + ' (chain ' + + chainid + + ') not found. May be HETATM.\n') + issued_warning[(chainid,end_res_seq)] = True + while not pdb_resid_dict.has_key((chainid, end_res_seq)): + end_res_seq = str(get_int_icode(end_res_seq)[0] - 1) + end_res_indx = pdb_resid_dict[(chainid, end_res_seq)] + + if ( end_res_indx < start_res_indx ): + raise ValueError("end residue seqnum " + str(end_res_indx) + " < " + + "start residue seqnum " + str(start_res_indx)) + + self.nodeid = nodeid + self.seqnum = seqnum # assigned sequentially, maybe not unique, + # used so strands can have a sequence number + # and helices also (both start at 1) + self.start_res_seq = start_res_seq # start PDB residue seq num + self.end_res_seq = end_res_seq # end PDB residue seq num + self.chainid = chainid # id of the chain this node is in + self.pdb_residue_list = pdb_residue_list + self.pdb_resid_dict = pdb_resid_dict + self.hydrogen_bond_list = [] # list of (PTnode, r1, r2, dist) tuples + self.sideways = False # True if element to be + # drawn left-right not up-down + # use get/set_sideways() + + self.reversed = False # True if strand is to be drawn in reverse + # direction. This is set based on the + # parallel/antiparaell relationshiup to + # neighbour strand (so once the first + # in a sheet is set to False, all the others + # are set based on this and following the + # bridge relationships and their 'N' or 'P' flag). + # Also used for helices, where the tableau + # information is used i.e. orienation as + # parallel or antiparallel. + # use get/set_reversed() + + self.residue_list = [] # list of Bio.PDB Residue objects in this SSE. + # Built and returned by get_residue_list() + self.resname_list = [] # list of 3 letter residue names in this SSE + # built by build_resname_sequence() + self.resid_list = [] # list of string PDB residue sequence numbers + # built by build_resname_sequence() + + self.residue_ordinal_dict = None + # dict { pdb_resseq : ordinal } mapping + # the PDB sequence number string to integer orinal position + # in sequence. Used to memoize this funtino so fast time it + # is called the dict is built, subsequent calls look up in + # dictionary. + # built get by get_residue_ordinal() + + self.fake_resids = fake_resids + + self.seq_component_id = None # intger sequence connect componented + # id used for the 'fold' color scheme: + # all strands in a sheet that are + # connected in sequence with no + # other SSEs in between have + # the same id + # similar for helices in clusters if + # used. 
+ # use get/set_seq_component_id() + + self.domain_id = domainid # domain identifier for domain this SSE is in + + + # The following are only used by domainfc.py not ptgraph2.py: + + self.closenode_list=[]# list of (node, dist) tuples for + # PTNodes within some distance threshold + # of this node + self.endchain = False # True if first or last in chain. + # use get/set_endchain() + + + + + + def __str__(self): + """ + Return String representation of the node as 'TYPE id' + """ + return "PTNode" + " " + self.nodeid + + def __lt__(self, other): + """ + Compare based on chainid and start residue sequence number + """ + assert(isinstance(other, PTNode)) + return (self.pdb_resid_dict[(self.chainid, self.start_res_seq)] < + self.pdb_resid_dict[(other.chainid,other.start_res_seq)]) + + def __le__(self, other): + """ + Compare based on chainid and start residue sequence number + """ + assert(isinstance(other, PTNode)) + return (self.pdb_resid_dict[(self.chainid, self.start_res_seq)] <= + self.pdb_resid_dict[(other.chainid,other.start_res_seq)]) + + + def __gt__(self, other): + """ + Compare based on chainid and start residue sequence number + """ + assert(isinstance(other, PTNode)) + return (self.pdb_resid_dict[(self.chainid, self.start_res_seq)] > + self.pdb_resid_dict[(other.chainid,other.start_res_seq)]) + + def __ge__(self, other): + """ + Compare based on chainid and start residue sequence number + """ + assert(isinstance(other, PTNode)) + return (self.pdb_resid_dict[(self.chainid, self.start_res_seq)] >= + self.pdb_resid_dict[(other.chainid,other.start_res_seq)]) + + def get_chainid(self): + """ + Return the chain identifier in this structure node + Parameters: None + Uses membe data (readonly): + chainid + Return value: + The chain identifier of this structure node + """ + return self.chainid + + def get_start_res_seq(self): + """ + Return the lowest residue sequence number in this structure node + + Parameters: None + + Uses member data: (readonly) + start_res_seq + + Return value: + The residue sequence number at the start of this structure + NB this is a pdb residue sequence nubmer so it is a string + and may have an insertion code + """ + return self.start_res_seq + + + def get_end_res_seq(self): + """ + Return the highest residue sequence number in this structure node + + Parameters: None + + Uses member data: (readonly) + end_res_seq + + Return value: + The residue sequence number at the end of this structure + NB this is a pdb residue sequence number so it is a astring and + may have an insertion code. + """ + return self.end_res_seq + + + def get_span(self): + """ + Return the length in residues spanned by this structure node + + Parameters: None + + Uses member data: (readonly) + end_res_seq, start_res_seq + + Return value: length in resisudes spanned by this node, ie the + number of residues in this node. + """ + return len(self.get_residue_list()) + + def is_in_interval(self, res_seq_num): + """ + Return True iff the supplied residue sequence number is contained + in the interval spanned by this PTNode. 
+ + Parameters: + res_seq_num - PDB resisude sequence number to test (assumed in + this chain) + + Uses member data (readonly): + chainid,start_res_seq, end_res_seq - start/end res seq num and + chainid of this node + pdb_resid_dict - dict mapping chainid,resnum+icode to sequential + index + + Return value: + True if res_seq_num is >= start_res_seq and <= end seq num + else False + """ + if self.fake_resids: + return False + try: + res_indx = self.pdb_resid_dict[(self.chainid, res_seq_num)] + except KeyError: + if not issued_warning.has_key((self.chainid, res_seq_num)): + sys.stderr.write('WARNING: residue ' + res_seq_num + ' (chain ' + + self.chainid + ') not found. May be HETATM.\n') + issued_warning[(self.chainid,res_seq_num)] = True + return False + return \ + ( res_indx >= self.pdb_resid_dict[(self.chainid,self.start_res_seq)] + and + res_indx <= self.pdb_resid_dict[(self.chainid,self.end_res_seq)] ) + + + def add_hbond(self, other_node, resnum1, resnum2, dist): + """ + Add a hydrogen bond to the list of hydrogen bonds in this node. + The bond is from PDB resdiue number resnum1 (in this node) to + PDB residue number resnum2 in other_node with distance dist (Angstroms). + + Parameters: + other_node - PTNode the bond is to from this node + resnum1 - PDB residue number, must be in this node + resnum2 - PDB residue number, must be in other_node + dist - (float) N..O distance of the bond (Angstroms) + + Uses data members (write): + hydrogen_bond_list - list of (ptnode, resnum1, resnum2, dist) tuples + + Return value: None + """ + assert(isinstance(other_node, PTNode)) + self.hydrogen_bond_list.append((other_node, resnum1, resnum2, dist)) + + + def get_hbond_list(self): + """ + Return list of hydrogen bonds from this node in form of list of + (ptnode, resnum1, resnum2, dist) tuples. + + Parameters: None + Uses member data (readonly): + hydrogen_bond_list - list of (other_node, resnum1, resnum2, dist) + tuples + Return value: + list of (ptnode, resnum1, resnum2, dist) tuples. + The actual list (ref) + in this node, not a new copy. + """ + return self.hydrogen_bond_list + + + def get_residue_ordinal(self, pdb_resseq): + """ + Given a PDB residue sequence number (may have insertino code) string + for a residue in this SSE, return its ordinal position int the + sequence of residues in this SSE (starting at 1). + + Parameters: + pdb_resseq - string PDB residue sequence number e.g. '60' or '60A' + Return value: + integer >=1 ordinal number of the resiude in sequnce of residues + for this SSE (from N to C terminal end) + Uses data members (read/write): + residue_ordinal_dict - dict { pdb_resseq : ordinal } mapping + the PDB sequence number string to integer orinal position + in sequence. Used to memoize this funtino so fast time it + is called the dict is built, subsequent calls look up in + dictionary. + """ + if self.residue_ordinal_dict: + return self.residue_ordinal_dict[pdb_resseq] + self.residue_ordinal_dict = {} + ordinal = 1 + for residue in self.get_residue_list(): + res_seq = biopdbresid_to_pdbresseq(residue.get_id()) + self.residue_ordinal_dict[res_seq] = ordinal + ordinal += 1 + return self.residue_ordinal_dict[pdb_resseq] + + + def get_residue_list(self): + """ + Return the list of Bio.PDB Residue objects for the residues in this + PTNode (strand or helix). + Also store list in residue_list data member, and return stored + value if already there. + + Parameters: + None + Uses data members: + residue_list (read/write). 
+ pdb_residue_list, pdb_resid_dict (readonly) + start_res_seq,end_res_seq,chainid (readonly) + + """ + if self.residue_list: # memoization: return if already computed + return self.residue_list + + start_indx = self.pdb_resid_dict[(self.chainid,self.start_res_seq)] + end_indx = self.pdb_resid_dict[(self.chainid,self.end_res_seq)] + self.residue_list = self.pdb_residue_list[start_indx : end_indx + 1] + return self.residue_list + + + def set_sideways(self, sideways): + """ + Label this strand with the sideways flag. See comments in __init__ + + Parameters: sideways - True/False sideways flag + Return value: None + Uses data members (write): sideways + """ + self.sideways = sideways + + def get_sideways(self): + """ + Get the value of the sideways flag. See comments in __init__ + + Parameters: None. + Return value: True/False value of sideways flag + Uses data members (readnly): sideways + """ + return self.sideways + + + def set_reversed(self, reversed): + """ + Label this node with the reversed flag. See comments in __init__ + + Parameters: reversed - True/False reversed flag + Return value: None + Uses data members (write): reversed + """ + self.reversed = reversed + + def get_reversed(self): + """ + Get the value of the reversed flag. See comments in __init__ + + Parameters: None. + Return value: True/False value of reversed flag + Uses data members (readnly): reversed + """ + return self.reversed + + + + + def is_resnum_nearer_top(self, resnum): + """ + Return True if the supplied residue sequence number is nearer + the 'top' than the 'bottom' of this strand or helix (SSE). + I.e. if it is nearer + to the C- than the N- terminal end of the strand when the + SSE is not reversed (i.e. drawn pointing 'up'). Or, if the + SSE is reversed (drawn pointing 'down'), if it is nearer he + N terminal than the C terminal end of the SSE. + + Parameters: + resnum - residue sequence number, must be in this strand + + Uses data members (readonly) + start_res_seq, end_res_seq, reversed + + Return value: + True iff resnum is nearer C term and strand is not reversed OR + resnum is nearer N terman and strand is reversed + """ + assert(self.is_in_interval(resnum)) + midpoint_resnum = ( self.get_residue_ordinal(self.start_res_seq) + + (self.get_residue_ordinal(self.end_res_seq) + - self.get_residue_ordinal(self.start_res_seq)) + / 2 ) + if resnum > midpoint_resnum: + # closer to C terminal end + return not self.reversed + else: + # closer to N terminal end + return self.reversed + + + + def add_closenode(self, other_node, dist): + """ + Add a (node, dist) tuple + to the list of nodes that are close to this one. + + Parameters: + other_node - PTNode which is below threshold distance from this node. + dist - distance to the other_node (Angstroms) + + Uses member data (write): + closenode_list - the list of nodes close to this one. + + Return value: None + """ + assert(isinstance(other_node, PTNode)) + self.closenode_list.append((other_node, dist)) + + + def get_closenode_list(self): + """ + Return list of (node, dist) tuples for nodes + that are below threshold distance from this one. + + Parameters: None + + Uses member data (readonly): + closenode_list - list of close nodes + + Return value: + List of (ptnode, dist) tuples. The actual list (ref) in this + node, not a new copy. + """ + return self.closenode_list + + + def set_endchain(self, endchain): + """ + Label this node with the endchain flag. 
See comments in __init__ + + Parameters: reversed - True/False endchain flag + Return value: None + Uses data members (write): endchain + """ + self.endchain = endchain + + def get_endchain(self): + """ + Get the value of the endchain flag. See comments in __init__ + + Parameters: None. + Return value: True/False value of endchain flag + Uses data members (readnly): endchain + """ + return self.endchain + + def get_degree(self): + """ + Return the degree of this node (number of edges incident with it). + This is equal to the number of spatial edges + (nubmer of spatially adjacent nodes) plus the number of + sequence edges which are implicit - all nodes have two except + the first and last in a chain which have only one. + + Parameters: + None + Return value: + Degree of the node. + """ + deg = len(self.closenode_list) + if self.endchain: + deg += 1 + else: + deg += 2 + return deg + + + def axis_dihedral_angle(self, SSE1, SSE2, pdb_struct): + """ + We fit an axis to each of the two SSEs, and this SSE, and + compute the dihedral angle between the two planes defined by + (1) the axis lines SSE1 and self, and (2) SSE2 and self. + + 3 consecutive vectors between 4 points can define a dihedral + angle. Call the points A,B,C,D in order. ABC is a plane and + BCD is a plane and we can calculate the angle between those + planes by the conventional formula (using Bio.PDB Vector + module). + + Here, instead of having this vectors as actual bonds between + atoms (as in hbonds_dihedral_angle()), we are using purely the + abstraction of the SSEs as straight lines. Like + Arun Konagurthu's method in TableauCreator (email 05Oct2007) + we choose the points so that the vector between each of the two + SSE axes and self SSE axis is the shortest line between + the axes (mutual perpendicular). So we choose A and B as the + points on SSE1 axis and self axis respectively such that AB is + the shortest line between the SSE1 axis and this axis. + Similarly C and D are chosen so that CD is the shortest line + between this axis and SSE2 axis. + + + + + SSE1 self SSE2 + + | | | + | | | AB is shortest line between + | | v3 | SSE1 and self, defining + | C *--------->* D vector v1 + | ^ | + | | | CD is shortest line between + | | | self and SSE2, defining + | | v2 | vector v3 + | | | + | (|)theta | v2 is then defined by the + | | | line BC + A *--------->* B | + | v1 | | and the dihedral angle theta + | | | between planes ABC and BCD + | | | is given by: + | | | + |v2|v1 . (v2 x v3) + tan(theta) = -------------------- + (v1 x v2) . (v2 x v3) + + + Parameters: + SSE1 - first SSE node (strand or helix) to test + SSE2 - 2nd SSE node (strand or helix) to test + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + + Return value: + The angle (in (-pi, pi]) between the planes formed between + the axes fitted to the two SSEs with this SSE in common. + or None if no common perpendicular can be found or one + (or more) of the required axes cannot be computed. + + + NOTE: this method is only for STRAND and HELIX nodes, which have + a fit_axis() method. This method is identical in operation between + these types of nodes so is shared, but fit_axis() works differently + so PTNodeHelix and PTNodeStrand each have their own implemetnation + of it. 
+ """ + SSE1_axis = SSE1.fit_axis(pdb_struct) + SSE2_axis = SSE2.fit_axis(pdb_struct) + self_axis = self.fit_axis(pdb_struct) + if SSE1_axis == None or SSE2_axis == None or self_axis == None: + return None + + (SSE1_dircos, SSE1_centroid) = SSE1_axis + (SSE2_dircos, SSE2_centroid) = SSE2_axis + (self_dircos, self_centroid) = self_axis + + # Get pa and pb, endpoints of the shortest line segment between + # SSE1 axis line and self axis line + # Note using Numeric.array '*' operator here for + # element-wise multiplication + # as Bio.PDB.Vector '*' operator is vector dot product. + # (Note also must have Vector + int * array and NOT + # int * array + Vector due to Python type coercion rules). + s1self_line = LineLineIntersect( + SSE1_centroid, SSE1_centroid + + ALPHA * SSE1_dircos.get_array(), + self_centroid, self_centroid + + ALPHA * self_dircos.get_array() ) + if s1self_line == None: + if verbose: + sys.stderr.write('no common perpendicular for axes ' + + SSE1.nodeid + ', ' + self.nodeid + '\n') + return None + else: + (pa, pb, mua, mub) = s1self_line + + # and pc, pd similarly for self and SSE2 + # Note using Numeric.array '*' operator here for + # element-wise multiplication + # as Bio.PDB.Vector '*' operator is vector dot product. + # (Note also must have Vector + int * array and NOT + # int * array + Vector due to Python type coercion rules). + s2self_line = LineLineIntersect( + self_centroid, self_centroid + + ALPHA * self_dircos.get_array(), + SSE2_centroid, SSE2_centroid + + ALPHA * SSE2_dircos.get_array() ) + if s2self_line == None: + if verbose: + sys.stderr.write('no common perpendicular for axes ' + + SSE2.nodeid + ', ' + self.nodeid + '\n') + return None + else: + (pc, pd, muc, mud) = s2self_line + +# angle = calc_dihedral(pa, pb, pc, pd) + + v1 = pb - pa + v2 = pc - pb + v3 = pd - pc + +# print 'xxx',self.nodeid,SSE1.nodeid,SSE2.nodeid,pa,pb,pc,pd + + # Using Bio.PDB.Vector class; v*v is dot product, v**v is cross product + # This is the same result as calling Vector.calc_dihedral() anyway +# theta = atan2( Vector((v2.norm() * v1.get_array())) * (v2 ** v3), +# (v1 ** v2) * (v2 ** v3) ) + +# print 'xxxx',theta, +# theta0 = theta + + # and this is the more elegant way, not using atan2() (as per Arun). + normals_dotprod = (v1 ** v2).normalized() * (v2 ** v3).normalized() + # handle roundoff errors + normals_dotprod = min(normals_dotprod, 1) + normals_dotprod = max(normals_dotprod, -1) + theta = acos(normals_dotprod) + normals_crossprod = (v1 ** v2).normalized() ** (v2 ** v3).normalized() + stp = v2 * normals_crossprod + if stp < 0: + theta = -theta + +# print 'yyyy',theta +# assert(abs(theta0 - theta) < EPSILON) + + + if verbose: + sys.stderr.write('axis_dihedral_angle ' + self.nodeid + ' ' + + SSE1.nodeid + ' ' + + SSE2.nodeid + ' ' + + str(theta) + '\n') + + return theta + + + + def relative_angle(self, SSE1, pdb_struct): + """ + We fit an axis to this SSE and the supplied other SSE (SSE1), and + compute the relative angle (omega) between the two axes. + This is the angle use to define the Tableau + (See Kamat and Lesk 2007 Proteins 66:869-876 and + Konagurthu, Stuckey and Lesk 2008 'Structural search and retrieval using + a tableau representation of protein folding patterns' Bioinformatics + (advance access, to be published Jan 5 2008). + + As per Arun Konagurthu's method in TableauCreator (email + 05Oct2007) we choose the points so that the vector between + each of the two SSE axes is the shortest line between the axes + (mutual perpendicular). 
So we choose A and B as the points on + SSE1 axis and self axis respectively such that AB is the + shortest line between the SSE1 axis and this axis. + + + + SSE1 self + + | | + | | BC is shortest line between + | | SSE1 and self, defining + | | vector v2 + A * * D + | /|\ + | | v1 and v3 are vectors + | | defining the axes of SSE1 + |v1 | v3 and self respectively. + | | + \|/ omega #| The relative angle omega + B *---(-)--->* C (interaxial angle) is the + |# v2 | smallest angle required to + | | reorient v1 to eclipse v3 + | | (or vice versa): + | | + |v2|v1 . (v2 x v3) + tan(omega) = -------------------- + (v1 x v2) . (v2 x v3) + + + Parameters: + SSE1 - The other SSE (helix or strand) to get relative angle to self + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + + Return value: + The angle (in (-pi, pi]) required to reorient self axis + to eclipse axis of SSE1 (or vice versa) 'looking along' the + shortest line (mutual perpendicular) between the two axes. + + + NOTE: this method is only for STRAND and HELIX nodes, which have + a fit_axis() method. This method is identical in operation between + these types of nodes so is shared, but fit_axis() works differently + so PTNodeHelix and PTNodeStrand each have their own implemetnation + of it. + """ + sse1_axis = SSE1.fit_axis(pdb_struct) + self_axis = self.fit_axis(pdb_struct) + if sse1_axis == None or self_axis == None: + return None + + (SSE1_dircos, SSE1_centroid) = sse1_axis + (self_dircos, self_centroid) = self_axis + + pa = SSE1_centroid + ALPHA * SSE1_dircos.get_array() + pd = self_centroid + ALPHA * self_dircos.get_array() + + # Get pb and pc, endpoints of the shortest line segment between + # SSE1 axis line and self axis line + # Note using Numeric.array '*' operator here for + # element-wise multiplication + # as Bio.PDB.Vector '*' operator is vector dot product. + # (Note also must have Vector + int * array and NOT + # int * array + Vector due to Python type coercion rules). + s1self_line = LineLineIntersect(SSE1_centroid, pa, + self_centroid, pd) + if s1self_line == None: + if verbose: + sys.stderr.write('relative_angle: ' + + 'no common perpendicular for axes ' + + str(self) + ',' + str(SSE1) + '\n') + return None + else: + (pb, pc, mub, muc) = s1self_line + +# # DEBUG +# if verbose: +# sys.stderr.write('mutual perpendicular ' + self.nodeid + ' ' +# + SSE1.nodeid + +# ': pb = ' + str(array(list(pb))) + +# '; pc = ' + str(array(list(pc))) + '\n') +# # END DEBUG + + +# omega = calc_dihedral(pa, pb, pc, pd) + + v1 = pb - pa + v2 = pc - pb + v3 = pd - pc + + # Using Bio.PDB.Vector class; v*v is dot product, v**v is cross product + # This is the same result as calling Vector.calc_dihedral() anyway +# omega = atan2( Vector((v2.norm() * v1.get_array())) * (v2 ** v3), +# (v1 ** v2) * (v2 ** v3) ) + +# print 'xxxx',omega, + + # and this is the more elegant way, not using atan2() (as per Arun). 
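+        # The interaxial angle is obtained from the unit normals of the two
+        # planes: omega = acos(n1 . n2), where n1 = (v1 x v2)/|v1 x v2| and
+        # n2 = (v2 x v3)/|v2 x v3|.  The dot product is clamped to [-1, 1]
+        # to guard against floating-point roundoff before calling acos(),
+        # and the sign of omega is taken from the scalar triple product
+        # v2 . (n1 x n2).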
+ normals_dotprod = (v1 ** v2).normalized() * (v2 ** v3).normalized() + # handle roundoff errors + normals_dotprod = min(normals_dotprod, 1) + normals_dotprod = max(normals_dotprod, -1) + omega = acos(normals_dotprod) + normals_crossprod = (v1 ** v2).normalized() ** (v2 ** v3).normalized() + stp = v2 * normals_crossprod + if stp < 0: + omega = -omega + +# print 'yyyy',omega + + +# DEBUG +# if verbose: +# sys.stderr.write('relative_angle ' + self.nodeid + ' ' +# + SSE1.nodeid + ' ' +# + str(omega) + '\n') +# END DEBUG + + return omega + + + def build_resname_sequence(self): + """ + Build list of (3 letter) residue names in sequence for the residues + in this node (SSE). E.g. and matching + list of PDB residue sequence numbers + + Parameters: + None. + + Return value: None + Uses data member (write): resname_list + resid_list + """ + residue_list = self.get_residue_list() + self.resname_list = [residue.get_resname() for residue in residue_list] + # id of a residue in Bio.PDB is tuple (hetatm, resseqnum, icode) + self.resid_list = [str(residue.get_id()[1]) + + char_if_not_blank(residue.get_id()[2]) + for residue in residue_list] + + def set_seq_component_id(self, seqc_id): + """ + Label this node with a sequence connected component identifier + + Parameters: + seqc_id - int seq connected component id + Uses data members: + seq_component_id (write) + Return value: None + """ + self.seq_component_id = seqc_id + + def get_seq_component_id(self): + """ + Return the id of the seq connectec component to which + this node belongs (may be None) + + Parameters: None + Uses data members (readonly): seq_component_id + Return value: Seq cc id set in this node (int) or None + """ + return self.seq_component_id + + +class PTNodeHelix(PTNode): + """ + The PTNodeHelix class is the type of PTNode for an alpha helix. + """ + + def __init__(self, helixtype="ALPHA", *args): + """ + Construct PTNodeHelix with supplied nodeid and type. + Parameters: + helixtype - string "ALPHA" or "PI" or "310". default "ALPHA". + +Variable parameter list: straight to PTNode constructor (q.v.). + + This extends PTNode by adding is_positioned, + and other than that just calls PTNode constructor. + Raises exceptions: + TypeError if helixtype argument is invalid. + """ + if helixtype not in ["ALPHA", "PI", "310"]: + raise TypeError("Invalid helixtype " + helixtype + "\n") + + PTNode.__init__(self, *args) + self.helixtype = helixtype + + # + # Flags used in heuristic helix placement + # + self.is_positioned = False # set to True if helix is already positioned + # (drawn). Used when old_helix_placement + # (i.e. -i option NOT supplied) is in use, + # sometimes as a special case we position + # helices before calling write_helices_svg() + # and this flag is set to mark these as + # already positioned. + # use get/set_is_positioned + self.is_interstrand = False # Set to True to mark special case + # of helix between strands on same + # vert axis in same sheet, treated + # specially when using heuristic helix + # placement to + # force interstand helices beside sheet. + # If this is set then the reversed + # flag of this helix node (in base class) + # is set to reversed flag of + # the first strand N-terminal to it. + # (see write_insterstrand_helices_svg()). + # Use get/set is_interstrand. + self.is_last_in_seq = False # Set to True if this helix is the last + # in a sequence of helices all aligned + # on an axis, AND n-terminal strand + # is on different axis. 
+ # Used so that if + # connector into this helix is on + # another axis then we know to use + # the top (reversed) else bottom + # port instead of what what normally + # be used (reveresed is set as per + # is inter_strand (see comments above). + # Use get/set is_last_in_seq(). + self.cluster_id = None # If use helix 'clustering' then this is + # the intege id of the cluster to which this + # helix belongs (ids assigned sequentially + # starting from 1). + # Use get/set cluster_id() + + self.axis_centroid = None # Bio.PDB.Vector representing the + # centroid of midpoints of consecutive + # c_alpha atoms of consecutive residue triples + # in the helix. Set by fit_axis() + + self.axis_direction_cosines = None # Bio.PDB.Vector representing the + # direction cosines of the axis line + # fitted to this helix. Set by + # fit_axis() + self.axis_nterm_point = None # Bio.PDB.Vector of projection most + # N-terminal point of SSE onto axis. + # Set by fit_axis() + self.axis_cterm_point = None # Bio.PDB.Vector of projection of most + # C-terminal point of SSE onto axis. + # Set by fit_axis() + + + + def __str__(self): + """ + Return String representation of the node as + 'TYPE id [startResNum..endResNum] + """ + return self.helixtype + " " +\ + self.nodeid + "[" + str(self.start_res_seq) + \ + ".." + str(self.end_res_seq) + "]" + + def get_is_positioned(self): + """ + Return True if the helix is marked already positioned + See is_positioned in __init__() + Parameters: None + Return value: True if helix is marked already positioned else False + Uses member data (readonly): is_positioned + """ + return self.is_positioned + + def set_is_positioned(self, is_pos): + """ + Set the is_positioned flag to the supplied boolean value. + See is_positioned in __init__() + Parmeters: + is_pos - True to mark as already positioned, False to unmark + Return value: None + Uses member data (WRITE): is_positioned + """ + self.is_positioned = is_pos + + def get_type(self): + """ + Return the helix type 'ALPHA', 'PI' or '310' + Parameters: None + Return value: helix type as above. + """ + return self.helixtype + + def get_is_interstrand(self): + """ + Return True if the helix is marked already interstrand + See is_interstrand in __init__() + Parameters: None + Return value: True if helix is marked already interstrand else False + Uses member data (readonly): is_interstrand + """ + return self.is_interstrand + + def set_is_interstrand(self, is_intstr): + """ + Set the is_interstrand flag to the supplied boolean value. + See is_interstrand in __init__() + Parmeters: + is_pos - True to mark as already interstrand, False to unmark + Return value: None + Uses member data (WRITE): is_interstrand + """ + self.is_interstrand = is_intstr + + def get_is_last_in_seq(self): + """ + Return True if the helix is marked already last_in_seq + See is_last_in_seq in __init__() + Parameters: None + Return value: True if helix is marked already last_in_seq else False + Uses member data (readonly): is_last_in_seq + """ + return self.is_last_in_seq + + def set_is_last_in_seq(self, is_last): + """ + Set the is_last_in_seq flag to the supplied boolean value. + See is_last_in_seq in __init__() + Parmeters: + is_pos - True to mark as already last_in_seq, False to unmark + Return value: None + Uses member data (WRITE): is_last_in_seq + """ + self.is_last_in_seq = is_last + + def get_cluster_id(self): + """ + Return the cluster id of cluster to which this helix belongs. 
+ See cluster_id in __init__() + Parmaters: None + Return value: cluster id (integer from 1) or None if no cluster + USes member data (readonly): cluster_id + """ + return self.cluster_id + + def set_cluster_id(self, cluster_id): + """ + Set the cluster id of the cluser to which this helix belongs. + Parameters: + cluster_id: integer > 0 or None + Return value: None + Uses member data (write): cluster_id + """ + self.cluster_id = cluster_id + + + def fit_axis(self, pdb_struct, mfile_fh = None): + """ + Approximate this helix as a straight line by fitting a total least + squares line through the midpoints of planes formed by + C-alpha atoms all consecutive residue triples in the helix. + + This is the method used in Arun Konagurthu's TableauCreator program, + this is more or less a re-implementation in Python of the C++ function + used there. + (See Kamat and Lesk 2007 Proteins 66:869-876 and + Konagurthu, Stuckey and Lesk 2008 'Structural search and retrieval using + a tableau representation of protein folding patterns' Bioinformatics + (advance access, to be published Jan 5 2008). + Parameters: + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + mfile_fh - (Default None) + filehandle (open write) to write MATLAB commands + for plotting helix axis data to, or None for no MATLAB. + + Return value: + tuple (direction_cosines, centroid) + direction_coisines is Vector (a,b,c), the direction cosines + of the axis + centroid is Vector (x,y,z) the centroid of the midpoints of + planes formed by C_alpha atoms of consecutive residue triples. + + Uses data members: + axis_centroid (read/write) - Vector of centroid of triplet planes. + axis_direction_cosines (read/write) - Vector of direction cosines + of axis line + + Note: this function is memoized - the first time it is called it + computes the axis and stores it in data members as well as + returning it; subsequent calls return the stored values. + This is because we don't necessarily need this computation + for all helices so don't want to always compute it up front + but also may need it multiple times for some helices. + """ + # return stored values if already computed + if self.axis_direction_cosines != None: + return (self.axis_direction_cosines, self.axis_centroid) + + # We use Numeric rather than newer NumPy since BioPython requires + # Numeric anyway. + + # get list of position vectors of C_alpha atoms in this helix + c_alpha_veclist = [ residue['CA'].get_vector() + for residue in self.get_residue_list() ] + + if len (c_alpha_veclist) < 3: + sys.stderr.write('WARNING: helix ' + str(self) + + ' has only ' + str(len(c_alpha_veclist)) + + ' residues, cannot fit axis\n') + return None + + # build list of position vectors of midpoints of each three + # consecutive C_alpha atoms. 
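+        # For each interior residue i the point appended below is
+        #   C_alpha(i) + ((C_alpha(i-1) - C_alpha(i)) + (C_alpha(i+1) - C_alpha(i))) / 2
+        # which simplifies to the midpoint of C_alpha(i-1) and C_alpha(i+1);
+        # for an ideal helix these points lie close to the helix axis.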
+ midpoints = [] + for i in range(1, len(c_alpha_veclist)-1): + vec1 = c_alpha_veclist[i-1] - c_alpha_veclist[i] + vec2 = c_alpha_veclist[i+1] - c_alpha_veclist[i] + midpoint = (vec1 + vec2) / 2 + # convert to position vector relative to C_alpha of residue i + midpoint = c_alpha_veclist[i] + midpoint + midpoints.append(midpoint) + + # calculate centroid of the midpoints (axis must pass through this) + centroid = Vector([0,0,0]) + for i in range(len(midpoints)): + centroid += midpoints[i] + centroid /= len(midpoints) + + if len(c_alpha_veclist) >= 4: + # build array A where each row is vector from centroid + # to midpoint +# midpoints.reverse() #XXX + A = array([list(mp - centroid) for mp in midpoints]) + + # compute the SVD of the array A, giving singular values on + # diagonal of s and right singular vectors, forming + # an orthonormal basis for the space spanned by rows of A, + # as columns of v + # (i.e. rows of vt, vt = transpose(v)). + # TODO: handle exception LinAlgError and fit axis by other + # method (as when fewer than 4 residues). Never actually seen + # this exception occur here but it could. + (u, s, vt) = singular_value_decomposition(A) + # the direction cosine is the first row of vt, this is the basis + # vector corresponding to first (largest) singular value, + # i.e. the dimension in which variance is greatest, hence this + # must be the major axis of the helix. + dircos = Vector(vt[0,:]) + + # get projection of most N-terminal and most C-terminal midpionts + # onto the axis line + self.axis_nterm_point = ProjectPointOntoLine(centroid, centroid + + ALPHA*dircos.get_array(), + midpoints[0]) + self.axis_cterm_point = ProjectPointOntoLine(centroid, centroid + + ALPHA*dircos.get_array(), + midpoints[-1]) + # The dircos gives the axis of the helix, but there is no + # guarantee that it 'points the right way' i.e. from the N- + # to C- terminus. We test if it does by finding the angle + # between it and the vector pointing frmo nterm_piont to + # cterm_point. This angle must either be 0 or pi, if 0 then the + # axis pionts the right way, otherwise it is pointing the wrong + # way so we reverse it. + + # FIXME: instead of using Vector.angle() should just compute + # ((cterm_point-nterm_point) dotproduct dircos) + # / (cterm_point-nterm_point).norm() * dircos.norm() + # and test for 1 or -1, avoiding arccosine computation. + # (Actually don't need to normalize, could just check +ve or -ve). + # In fact, we probably don't really need to poject points onto + # line at all, could just compute angle between dircos and + # vector through N- and C- terminal midpoints. + angle = (self.axis_cterm_point-self.axis_nterm_point).angle(dircos) + assert (abs(angle) < EPSILON or abs(angle - pi) < EPSILON) + if (abs(angle - pi) < EPSILON): + dircos = Vector(-1 * dircos.get_array()) + if verbose: + sys.stderr.write('reversed axis for ' + str(self) + '\n') + + +# # DEBUG +# print 'helix axis svd ',str(self),':' +# print 'A = ' +# print A +# print 's = ' +# print s +# print 'vt= ' +# print vt +# print 'dircos=' +# print dircos +# # END DEBUG + + if mfile_fh != None: + mfile_write_helix(mfile_fh, c_alpha_veclist, + midpoints, centroid, dircos, + self.axis_nterm_point, + self.axis_cterm_point, + self.nodeid) + else: + # if we have fewer than 4 residues in the helix then we cannot + # compute an axis with SVD using midpoints of consecutive + # residue triples. 
+ # If there are 3 residues then we can compute two midpoints for + # consecutive triples, and will just say the axis is the line + # through these 2 points. Otherwise, for 1 or 2 residues, + # we cannot sensibly compute any axis. + assert(len(c_alpha_veclist) == 3) + mp1 = (c_alpha_veclist[1] - c_alpha_veclist[0]) / 2 + \ + c_alpha_veclist[0] + mp2 = (c_alpha_veclist[2] - c_alpha_veclist[1]) / 2 + \ + c_alpha_veclist[1] + centroid = (mp1 + mp2) / 2 + v = mp2 - mp1 + dircos = v / v.norm() + if mfile_fh != None: + # get projection of most N-terminal and most + # C-terminal C_alpha atom points onto the axis line + self.axis_nterm_point = ProjectPointOntoLine(centroid, + centroid + + ALPHA*dircos.get_array(), + c_alpha_veclist[0]) + self.axis_cterm_point = ProjectPointOntoLine(centroid, + centroid + + ALPHA*dircos.get_array(), + c_alpha_veclist[-1]) + mfile_write_helix(mfile_fh, c_alpha_veclist, + [mp1, mp2], centroid, dircos, + self.axis_nterm_point, + self.axis_cterm_point, + self.nodeid) + + self.axis_direction_cosines = dircos + self.axis_centroid = centroid + return (dircos, centroid) + + +class PTNodeStrand(PTNode): + """ + The PTNodeStrand class is the type of PTNode for a beta strand. + + The additions it has are the bridge_list and methods for using it - + beta strands can have beta bridges to other beta strands, and a beta + sheet label. + """ + def __init__(self, *args): + """ + Construct a PTNodeStrand with supplied nodeid and type, and empty + bridge bond list. + + Parameters: + Variable parameter list: straight to PTNode constructor (q.v.). + + This extends PTNode by adding bridge_list, + and other than that just calls PTNode constructor. + + """ + PTNode.__init__(self, *args) + self.bridge_list = [] # list of (other_node, bdir, side) tuples + # bdir is 'N' or 'P' for antiparallel or + # parallel respectively + # side is '+' or '-' to indicate relative + # side of this strand bridge partners are + # on, as per Westhead et al 1999 + # (see label_strand_bridge_sides()) + # use add_bridge(), remove_bridge(), + # get_bridge_list() + + self.sheet_id = None # id of sheet that this strand belongs to + # single char 'A', 'B', etc. + # use get/set_sheet_id() + + + self.align_pos = 0 # The relative 'vertical' (assuming + # strands are drawn as arrows 'pointing' + # up or down) alignment position of this strand + # on its vertical axis. + # This value is in 'residues' (i.e. number + # of residues from 'top' of neighbour strand + # to 'top' of this strand). + # May be positive or negative (or zero). + # (see build_sheet_constraints() in ptgraph2.py) + # use get/set_align_pos() + self.barrel_edge = False # True if this strand is one of the two + # in a beta-barrel that was opened up + # by breaking the bridge between them + # use get/set_barrel_edge() + + self.axis_centroid = None # Bio.PDB.Vector representing the + # centroid of c_alpha atoms of residues + # in the strand. Set by fit_axis() + + self.axis_direction_cosines = None # Bio.PDB.Vector representing the + # direction cosines of the axis line + # fitted to this strand. Set by + # fit_axis() + self.axis_nterm_point = None # Bio.PDB.Vector of projection of most + # N-terminal point of SSE onto axis. + # Set by fit_axis() + self.axis_cterm_point = None # Bio.PDB.Vector of projection of most + # C-terminal point of SSE onto axis. 
+ # Set by fit_axis() + + + + + def __str__(self): + """ + Return String representation of the node as + 'TYPE id [startResNum..endResNum]' + """ + return "STRAND" + " " + self.nodeid + "[" + str(self.start_res_seq) + \ + ".." + str(self.end_res_seq) + "]" + + + def add_bridge(self, other_node, bdir): + """ + Add a bridge to another strand to the table of bridges in this node. + The bridge is to other_node and bdir is 'N' for antiparallel or + 'P' for parallel. + + Does not add duplicates: if there is already an edge to the supplied + node, this new one is not added; also, ensure that these edges + are undirected by adding the reverse edge at the same time + (this is really the only reason we need to check for duplicates: + it seems that stride sometimes gives a bridge partner from one + strand to another but not back the other way, but not always). + + Bridge side label ('+' or '-') is set as '.', this is actually set + by label_bridge_sides() to be called afterwards. + + Parameters: + other_node - PTNode the bond is to from this node (see NOTE) + bdir - 'N' or 'P' for antiparrallel or parallel resp. + + Uses data members (write): + brdige_list - list of (ptnodestrand, bdir, side) tuples + + NOTE: also modifies other_node by adding an edge in its bridge_list + directly. + + Return value: None + """ + + assert(isinstance(other_node, PTNodeStrand)) + assert(bdir == 'N' or bdir == 'P') + if other_node not in [ node for (node, bdir_unused, side_unused) + in self.bridge_list ]: + self.bridge_list.append((other_node, bdir, '.')) + other_node.bridge_list.append((self, bdir, '.')) + + #----- debug TESTING FOR STRANDS WITH MORE THAN 2 PARTNERS --- + if verbose: + if len(self.bridge_list) > 2: + sys.stderr.write(self.nodeid + " has " \ + + str(len(self.bridge_list)) +\ + " adjacent strands\n") + #----- end ----- + + + def remove_bridge(self, other_node): + """ + Remove the bridge to other_node from the list of bridges in this node. + + Parameters: + other_node - PTNode the bridge is to from this node (see NOTE) + + Uses data members (write): + brdige_list - list of (ptnodestrand, bdir, side) tuples + + + NOTE: also modifies other_node by adding an edge in its bridge_list + directly. + + Return value: None + + Raises exceptions: + KeyError if other_node not found in bridge list + + """ + found = False + for i in range(len(self.bridge_list)): + if self.bridge_list[i][0] == other_node: + found = True + break + if found: + self.bridge_list.pop(i) + # now remove other node's bridge to this one + found = False + for i in range(len(other_node.bridge_list)): + if other_node.bridge_list[i][0] == self: + found = True + break + assert(found) # logic error if there wasn't a matching bridge + other_node.bridge_list.pop(i) + else: + raise KeyError("node not found") + + + + def set_sheet_id(self, sheet_id): + """ + Label this strand with a sheet id to which it belongs. + + Parameters: + sheet_id - single char sheet identifier + Uses data members: + sheet_id - sheet id (write) + Return value: None + """ + self.sheet_id = sheet_id + + def get_sheet_id(self): + """ + Return the id of the sheet to which this strand belongs. + + Parameters: None + Uses data members (readonly): sheet_id + Return value: Sheet id set in this node (single char) + """ + return self.sheet_id + + def set_align_pos(self, align_pos): + """ + Set the (integer) relative alignment position for this strand. 
+ See comments on align_pos in __init__ + + Parmeters: align_pos (integer) + Return value: None + Uses data members (write): align_pos + """ + self.align_pos = align_pos + + def get_align_pos(self): + """ + Get the relative alignment position of this strand. + See comments on align_pos in __init__ + + Paramneters: None + Return value: align_pos value (integer) + Uses data members (Readonly): align_pos + """ + return self.align_pos + + def set_barrel_edge(self, barrel_edge): + """ + Set the barrel_edge flag in this strand to the supplied value. + See comments on barrel_edge in __init__ + Parameters: barrel_edge: True or False + REturn value: None + Uses data members (write): barrel_edge + """ + self.barrel_edge = barrel_edge + + def get_barrel_edge(self): + """ + Return the value of the barrel_edge flag in this node + See comments on barrel_edge in __init__ + Parameter: None + Return value: barrel_edge flag (True or False) + Uses data members (readonly): barrel_edge + """ + return self.barrel_edge + + def get_bridge_list(self): + """ + Return list of bridges from this node in form of list of + (ptnodestand, bdir, side) tuples. + + Parameters: None + Uses member data (readonly): + bridge_list - list of bridge tuples + Return value: + list of (ptnodestrand, bdir, side) tuples. This list is pointer to + the list in this node, not a new copy. + """ + return self.bridge_list + + + def num_neighbours(self): + """ + Return the number of neighbouring strands that this strand has. + This is just the number of elements in the bridge list. + + Parameters: None + Uses member data (Readonly): + bridge_list - list of brdige tuples + Return value: + number of neighbouring strands (length of bridge list) + """ + return len(self.bridge_list) + + def is_neighbour(self, strand): + """ + Return True iff the supplied strand is a neighbour of this one + (i.e. is in the bridge list) + + Parameters: + strand - strand to test for being in the bridge list of this node + Uses data members (readonly): + bridge_list - list of bridge tuples (ptnodestrand, bdir, side) + Return value: + True if strand is in the list of bridges at this node, else False + """ + assert(isinstance(strand, PTNodeStrand)) + for (node, bdir_unused, side_unused) in self.bridge_list: + if node == strand: + return True + return False + + + def label_strand_bridge_sides(self, pdb_struct): + """ + Label bridge edges with '+' or '-' indicating relative side of + the strand the bridge partners are on, using hydrogen bond overlap. + For one strand with two more more neighbours, if there is H bond + overlap (i.e. one residue in the reference strand has more than + one neighbour) then those neighbours must be on opposite sides + of the reference strand. + + Otherwise, they may be on the same side, (but not necessarily). + This has to be determined by geometric criteria, specifically + the dihedral angle (NB this is not the usual phi/psi meaning + of dihedral, but the generic meaning of an angle between planes) + formed between two carbon alpha atoms on the reference strand + and one on each of the other two strands. + + Every PTNodeStrand with neighbour(s) has a bridge edge (tuple + in the bridge_list) going out to the neighbour, and that neighbour + symmetrically has one going back (see PTNodeStrand.add_bridge()). + So for a strand with one neighbour, the label on the bridge edge + is '+'; if there is more than one neighbour, the edges are labelled + '+' for those on one side and '-' on the other. 
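+ For example, if a reference strand has two bridge partners whose + H bonds to it overlap on the same residues, those two partners are + given opposite labels ('+' and '-'); if there is no such overlap, the + dihedral-angle test in strands_on_opposite_sides() decides whether + they share a label.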
+ + This algorithm is described in + Westhead et al 1999 ``Protein structural topology: Automated + analysis and diagrammatic representation'' Protein Science 8:897-904 + (see pp.901-902 and Figures 4 and 5). + + Parameters: + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + + Uses data members + (read/write): + bridge_list - list of bridge (node, bdir, side) tuples + (readonly): + hydrogen_bond_list - list of H-bond (node, resnum1, resnum2, dist) + tuples + + Return value: None + """ + if len(self.bridge_list) == 0: + return # no neighbours, nothing to do here + + # now go through all pairs of neighbour strands (if any) + # in turn, labelling as opposite sides if H-bond nesting criteria + # tells us they must be on different sides of this strand. + for i in range(len(self.bridge_list)): + (node_i, bdir_i, side_i) = self.bridge_list[i] + for j in range(i+1, len(self.bridge_list)): + (node_j, bdir_j, side_j) = self.bridge_list[j] + # FIXME: really has_hbond_strand_overlap() implies + # has_strand_extent_overlap() (or should do), so + # should be able to remove the former. + if (self.has_hbond_strand_overlap(node_i, node_j) or + self.has_strand_extent_overlap(node_i, node_j) ): + # node_j has overlap with node_i on this strand, so + # set its side to opposite of that of node_i + if side_i == '+': + self.bridge_list[j] = (node_j, bdir_j, '-') + elif side_i == '-': + self.bridge_list[j] = (node_j, bdir_j, '+') + else: + self.bridge_list[i] = (node_i, bdir_i, '+') + self.bridge_list[j] = (node_j, bdir_j, '-') + if verbose: + sys.stderr.write('overlap; opposite sides of '+ + self.nodeid + ': ' + + self.bridge_list[i][0].nodeid + ','+ + self.bridge_list[j][0].nodeid + '\n') + + + # now for all pairs where H bond overlap did not manage to set + # a relative side, use geometry test where no overlap + for i in range(len(self.bridge_list)): + (node_i, bdir_i, side_i) = self.bridge_list[i] + for j in range(i+1, len(self.bridge_list)): + (node_j, bdir_j, side_j) = self.bridge_list[j] + if ((side_j == '+' or side_j == '-') and + (side_i == '+' or side_i == '-')): + continue # already set, skip it + if self.strands_on_opposite_sides(node_i, node_j, pdb_struct): + # node_j and node_i are on different sides of this + # strand, so set its side to opposite of that of + # node_i, if side set + if verbose: + sys.stderr.write('OPPOSITESIDES of '+self.nodeid+': '+ + node_i.nodeid+','+node_j.nodeid+'\n') + if side_j != '+' and side_j != '-': + if side_i == '+': + self.bridge_list[j] = (node_j, bdir_j, '-') + elif side_i == '-': + self.bridge_list[j] = (node_j, bdir_j, '+') + else: + self.bridge_list[i] = (node_i, bdir_i, '+') + self.bridge_list[j] = (node_j, bdir_j, '-') + elif side_i != '+' and side_i != '-': + if side_j == '+': + self.bridge_list[i] = (node_i, bdir_i, '-') + elif side_j == '-': + self.bridge_list[i] = (node_i, bdir_i, '+') + else: + self.bridge_list[i] = (node_i, bdir_i, '+') + self.bridge_list[j] = (node_j, bdir_j, '-') + else: + # can't happen since skipped if both sides set + assert(False) + else: + # they must be on the same side of this strand + if verbose: + sys.stderr.write('SAMESIDE of '+self.nodeid+': '+ + node_i.nodeid+','+node_j.nodeid+'\n') + if side_i == '+' or side_i == '-': + self.bridge_list[j] = (node_j, bdir_j, side_i) + elif side_j == '+' or side_j == '-': + self.bridge_list[i] = (node_i, bdir_i, side_j) + else: + self.bridge_list[i] = (node_i, bdir_i, '+') + self.bridge_list[j] = (node_j, bdir_j, '+') + + + def 
strands_on_opposite_sides(self, strand1, strand2, pdb_struct): + """ + Test if the two strands are on opposite sides of this strand + by using a geometric test like that described in Westhead et al 1999. + + This is determined by geometric criteria, specifically + the dihedral angle (NB this is not the usual phi/psi meaning + of dihedral, but the generic meaning of an angle between planes) + between the strands. + + The dihedral angle as calculated by the Bio.PDB + Vector module is in the range (-pi, pi] so if the absolute + value of this angle is larger than pi/2 we say the strands + are on opposite sides of the reference strand, else they are + on the same side. + + If the above method fails because a common perpendicular + cannot be found, the alternative method of averaging the dihedral + angles for all H bonds along the SSEs is used. + + Parameters: + strand1 - first STRAND node to test, a bridge partner of this one + strand2 - 2nd STRAND node to test, a bridge partner of this one + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + + Return value: + True if the geometric criteria between this and + the neighouring strands indicate they (the neighbour strands) + are on opposite sides of this strand. Else + False if it indcates they are on the same side. + """ + angle = self.axis_dihedral_angle(strand1, strand2, pdb_struct) + if angle == None: + angle = self.calc_average_hbonds_dihedral_angle(strand1, strand2, + pdb_struct) + + if abs(angle) > pi/2: + return True + else: + return False + + + + + def calc_average_hbonds_dihedral_angle(self, strand1, strand2, pdb_struct): + """ + Return the average abosolute value of the dihedral angle + formed between two carbon alpha atoms on the reference strand + and one on each of the other two strands, over all such + hydrogen bonds. + + Parameters: + strand1 - first STRAND node to test, a bridge partner of this one + strand2 - 2nd STRAND node to test, a bridge partner of this one + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + + Return value: + The average absolute value of the dihedral angle + between planes formed by hydrogen bonds between the + strands, which is in the range [0, pi]. + """ + + hbonds_to_strand1 = self.get_hbonds_to_neighbour(strand1) + hbonds_to_strand2 = self.get_hbonds_to_neighbour(strand2) + if len(hbonds_to_strand1) == 0 or len(hbonds_to_strand2) == 0: + # sometimes the (modified) stride FA1,FA2 records will + # indicate bridge partners but we won't have any corresponding + # H bonds from DNR/ACC records, so we don't find hbonds + # here. Will just return True (opposite sides) since that's + # what the most common case is. + sys.stderr.write( + 'WARNING: no hbonds available between neighbour strands ' + + strand1.nodeid + ', ' + + self.nodeid + ', ' + + strand2.nodeid +'\n') + return True + + # compute dihedral angle between planes formed by each pair of + # hydrogen bonds, and return average absolute value. 
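+ # (Each calc_dihedral() value is in (-pi, pi], so each absolute value, + # and hence the average, lies in [0, pi]; the caller + # strands_on_opposite_sides() treats an average greater than pi/2 as + # meaning the two strands are on opposite sides.)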
+ sum_abs_angles = 0.0 + num_angles = 0 + for hb1 in hbonds_to_strand1: + for hb2 in hbonds_to_strand2: + angle = self.hbonds_dihedral_angle(hb1, hb2, pdb_struct) + if verbose: + sys.stderr.write('angle '+ self.nodeid + ' ' + + strand1.nodeid + ' ' + str(hb1[1]) + ' ' + + strand2.nodeid + ' ' + str(hb2[1]) + ' ' + + str(angle) + '\n') + num_angles += 1 + sum_abs_angles += abs(angle) + + avg_abs_angle = sum_abs_angles / num_angles + if verbose: + sys.stderr.write('avg abs angle ' + self.nodeid + ' ' + + strand1.nodeid + ' ' + + strand2.nodeid + ' ' + + str(avg_abs_angle) + '\n') + return avg_abs_angle + + + def hbonds_dihedral_angle(self, hbond_to_strand1, hbond_to_strand2, + pdb_struct): + """ + Calculate the dihedral angle formed with hbonds from this + strand to two others. + + 3 consecutive bonds between 4 atoms can define a dihedral + angle. Call the atoms A,B,C,D in order. ABC is a plane and + BCD is a plane and we can calculate the angle between those + planes by the conventional formula (using Bio.PDB Vector + module). We choose B and C as the carbon alpha atoms of + residues on the reference strand (self), of residues that + have H-bonds to residues on the first and second test + strands respectively, choosing their carbon alpha atoms to + be A and D. + + Parameters: + bhond_to_strand1 - hbond tuple (node, resnum1, resnum2, dist) + of H bond to first STRAND node to test, + a bridge partner of this one + hbond_to_strand2 - hbond tuple (node, resnum1, resnum2, dist) + of H bond to 2nd STRAND node to test, + a bridge partner of this one + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + + Return value: + The angle (in (-pi, pi]) between the planes formed between two + C_alpha atoms on the reference (this) strand and respectively + a C_alpha atom on strand1 and strand2 is (absolute value) + """ + + chainid_self = stride_chainid_to_pdb_chainid(self.chainid) + chainid_strand1 = stride_chainid_to_pdb_chainid( + hbond_to_strand1[0].get_chainid()) + chainid_strand2 = stride_chainid_to_pdb_chainid( + hbond_to_strand2[0].get_chainid()) + + pdb_model = pdb_struct[0] # TODO always using model 0 for now + + # hbond is tuple (node, resnum1, resnum2, dist) + # resnum1 is the residue seqnum in self, resnum2 in node + CA_B = pdb_model[chainid_self][hbond_to_strand1[1]]['CA'] + CA_C = pdb_model[chainid_self][hbond_to_strand2[1]]['CA'] + CA_A = pdb_model[chainid_strand1][hbond_to_strand1[2]]['CA'] + CA_D = pdb_model[chainid_strand2][hbond_to_strand2[2]]['CA'] + + angle = calc_dihedral(CA_A.get_vector(), + CA_B.get_vector(), CA_C.get_vector(), + CA_D.get_vector()) + return angle + + + def fit_axis(self, pdb_struct, mfile_fh = None): + """ + Approximate this strand as a straight line by fitting a total least + squares line through the midpoints of consecutive + C-alpha atoms of its residues. + + Parameters: + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + mfile_fh - (Default None) + filehandle (open write) to write MATLAB commands + for plotting strand data to, or None for no MATLAB. + + Return value: + tuple (direction_cosines, centroid) + direction_coisines is Vector (a,b,c), the direction cosines + of the axis + centroid is Vector (x,y,z) the centroid of the midpoints of + consectuvie c_alpha atoms. 
+ + Uses data members: + axis_centroid (read/write) - Vector of centroid of c_alpha atoms + axis_direction_cosines (read/write) - Vector of direction cosines + of axis line + + Note: this function is memoized - the first time it is called it + computes the axis and stores it in data members as well as + returning it; subsequent calls return the stored values. + This is because we don't necessarily need this computation + for all strands so don't want to always compute it up front + but also may need it multiple times for some strands. + """ + # return stored values if already computed + if self.axis_direction_cosines != None: + return (self.axis_direction_cosines, self.axis_centroid) + + # We use Numeric rather than newer NumPy since BioPython requires + # Numeric anyway. + + c_alpha_veclist = [] + for residue in self.get_residue_list(): + c_alpha_vector = residue['CA'].get_vector() + c_alpha_veclist.append(c_alpha_vector) + + + # calculate centroid of the points (axis must pass through this) + centroid = Vector([0,0,0]) + for i in range(len(c_alpha_veclist)): + centroid += c_alpha_veclist[i] + centroid /= len(c_alpha_veclist) + + # find midpoint of each pair of consecutive c_alpha atoms in + # order to reduce error in fitting line due to pleat of beta strand + # (Cohen et al 1981, J. Mol. Biol. 148(3):253-272) + # and build list of vectors from centroid to each midpoint + if len(c_alpha_veclist) > 3: + centroid_mp_veclist = [] + for i in range(len(c_alpha_veclist)-1): + midpoint = ((c_alpha_veclist[i+1] - c_alpha_veclist[i])/2) + \ + c_alpha_veclist[i] + centroid_mp_veclist.append(midpoint - centroid) + + # build array A where each row is vector from centroid + # to midpoint + A = array([list(vect) for vect in centroid_mp_veclist]) + + # compute the SVD of the array A, givng singular values on + # diagonal of s and right singular vectors as columns of v + # (i.e. rows of vt, vt = transpose(v)). + # TODO: handle exception LinAlgError and fit axis by other + # method (as when fewer than 3 residues). Never actually seen + # this exception occur here but it could. + (u, s, vt) = singular_value_decomposition(A) + # the direction cosine is the first row of vt + dircos = Vector(vt[0,:]) + + # get projection of most N-terminal and most C-terminal + # C_alpha astoms onto the axis line + self.axis_nterm_point = ProjectPointOntoLine(centroid, centroid + + ALPHA*dircos.get_array(), + c_alpha_veclist[0]) + self.axis_cterm_point = ProjectPointOntoLine(centroid, centroid + + ALPHA*dircos.get_array(), + c_alpha_veclist[-1]) + # The dircos gives the axis of the strand, but there is no + # guarantee that it 'points the right way' i.e. from the N- + # to C- terminus. We test if it does by finding the angle + # between it and the vector pointing frmo nterm_piont to + # cterm_point. This angle must either be 0 or pi, if 0 then the + # axis pionts the right way, otherwise it is pointing the wrong + # way so we reverse it. + + # FIXME: instead of using Vector.angle() should just compute + # ((cterm_point-nterm_point) dotproduct dircos) + # / (cterm_point-nterm_point).norm() * dircos.norm() + # and test for 1 or -1, avoiding arccosine computation. 
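+ # (A sketch of the simpler sign test suggested in the FIXME above, + # assuming Bio.PDB's Vector '*' operator is the dot product; kept as a + # comment only - the Vector.angle() test below is the code path used.) + # if (self.axis_cterm_point - self.axis_nterm_point) * dircos < 0: + # dircos = Vector(-1 * dircos.get_array())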
+ angle = (self.axis_cterm_point-self.axis_nterm_point).angle(dircos) + assert (abs(angle) < EPSILON or abs(angle - pi) < EPSILON) + if (abs(angle - pi) < EPSILON): + dircos = Vector(-1 * dircos.get_array()) + if verbose: + sys.stderr.write('reversed axis for ' + str(self) + '\n') + + if mfile_fh != None: + mfile_write_strand(mfile_fh, c_alpha_veclist, + centroid_mp_veclist, centroid, dircos, + self.axis_nterm_point, self.axis_cterm_point, + str(self.seqnum)) + # DEBUG + #coords=array([list(vec) for vec in c_alpha_veclist]) + #print str(self),coords,centroid,A,vt,dircos + # END DEBUG + + self.axis_direction_cosines = dircos + self.axis_centroid = centroid + return (dircos, centroid) + else: + # When we have fewer than 3 c-alpha atoms, just use vector + # between the two (if only two) or vector between the two + # midpoints (if three). + if len(c_alpha_veclist) == 3: + mp1 = (c_alpha_veclist[1] - c_alpha_veclist[0]) / 2 + \ + c_alpha_veclist[0] + mp2 = (c_alpha_veclist[2] - c_alpha_veclist[1]) / 2 + \ + c_alpha_veclist[1] + v = mp2 - mp1 + elif len(c_alpha_veclist) == 2: + v = c_alpha_veclist[1] - c_alpha_veclist[0] + else: + # only 1 residue - cannot fix an axis to this strand + sys.stderr.write('WARNING: strand ' + str(self) + + ' has only ' + str(len(c_alpha_veclist)) + + ' residues, cannot fit axis\n') + return None + + dircos = v / v.norm() + + # get projection of most N-terminal and most C-terminal C_alphas + # onto the axis line + self.axis_nterm_point = ProjectPointOntoLine(centroid, centroid + + ALPHA*dircos.get_array(), + c_alpha_veclist[0]) + self.axis_cterm_point = ProjectPointOntoLine(centroid, centroid + + ALPHA*dircos.get_array(), + c_alpha_veclist[-1]) + + # DEBUG + #coords=array([list(vec) for vec in c_alpha_veclist]) + #print str(self),'(short)',coords,centroid,dircos + # END DEBUG + if mfile_fh != None: + mfile_write_strand(mfile_fh, c_alpha_veclist, + None ,centroid, dircos, + self.axis_nterm_point, self.axis_cterm_point, + str(self.seqnum)) + self.axis_direction_cosines = dircos + self.axis_centroid = centroid + return (dircos, centroid) + + + def get_endpoint_projections(self, pdb_struct): + """ + Return the projections of the most N-terminal and most C-terminal + C_alpha midpoints onto the strand axis. + + Parameters: + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + + Return value: tuple (axis_nterm_point, axis_cterm_point) + Bio.PDB Vector of projection of respectively most N-term + and most C-term endpoints onto the axis line. + + Uses data members (read/write): + axis_direction_cosines + axis_centroid + axis_nterm_point + axis_cterm_point + (basically just calls fit_axis() if necessary to compute the points + otherwise just returns them, all the above dat members are + memoizations for fit_axis()). + """ + if self.axis_nterm_point == None: + self.fit_axis(pdb_struct) + return (self.axis_nterm_point, self.axis_cterm_point) + + + def has_hbond_strand_overlap(self, strand1, strand2): + """ + For two strands neighbouring (i.e. having bridge edges + from&to) this one, determine if the H-bonds from those strands + to this strand are interleaved or nested, indicating that the + strands must be on opposite sides of this strand. + + Parameters: + strand1 - a strand with a bridge edge to this strand (i.e. 
+ node is in self.bridge_list) + strand2 - another strand, not strand1, with a bridge to this strand + + Uses data members (readonly): + hydrogen_bond_list - list of H-bond (node, resnum1, resnum2, dist) + tuples + + Return value: + True if H bonds from strand1 and strand2 to this strand are + interleaved. Otherwise False (all bonds from strand1 are + before (or after) all bonds to strand2 on this strand). + + """ + try: + min_resnum_to_strand1 = self.min_resnum_hbond_to_neighbour(strand1)[0] + min_resnum_to_strand2 = self.min_resnum_hbond_to_neighbour(strand2)[0] + max_resnum_to_strand1 = self.max_resnum_hbond_to_neighbour(strand1)[0] + max_resnum_to_strand2 = self.max_resnum_hbond_to_neighbour(strand2)[0] + except ValueError: + # sometimes get no H bonds even to a neighbour strand (at least + # with STRIDE), resulting in ValueError min/max arg is an empty + # sequence. + # We'll just have to return False and let + # geometric criteria try to sort it out + return False + + if verbose: + sys.stderr.write('test hbond overlap ' + self.nodeid + ': ' + + strand1.nodeid + ' ' + str(min_resnum_to_strand1) + + ',' + str(max_resnum_to_strand1) + ' ' + + strand2.nodeid + ' ' + str(min_resnum_to_strand2) + + ',' + str(max_resnum_to_strand2) + '\n') + + + if (max_resnum_to_strand1 >= min_resnum_to_strand2 and + max_resnum_to_strand2 >= min_resnum_to_strand1): + return True + else: + return False + + + def min_resnum_hbond_to_neighbour(self, strand): + """ + Return lowest residue number in this strand that has a hydrogen bond + to a resdiue in the supplied neighbour (bridge partner) strand. + + Parameters: + strand - neighbouring strand to find min resnum of the H bonds to + + Return value: + tuple (resnum_this, resnum_neighbour) where resnum_this is + smallest residue number in this strand with an H-bond to a residue + in the supplied neighbour strand. + + Uses data members (readonly): + hydrogen_bond_list - list of H-bond (node, resnum1, resnum2, dist) + + """ + assert(self.is_neighbour(strand)) + return min([(self.get_residue_ordinal(resnum1), + strand.get_residue_ordinal(resnum2)) + for (node, resnum1, resnum2, dist) in + self.get_hbonds_to_neighbour(strand)]) + + + def max_resnum_hbond_to_neighbour(self, strand): + """ + Return highest residue number in this strand that has a hydrogen bond + to a resdiue in the supplied neighbour (bridge partner) strand. + + Parameters: + strand - neighbouring strand to find min resnum of the H bonds to + + Return value: + tuple (resnum_this, resnum_neighbour) where resnum_this is + largest residue number in this strand with an H-bond to a residue + in the supplied neighbour strand. + + Uses data members (readonly): + hydrogen_bond_list - list of H-bond (node, resnum1, resnum2, dist) + + """ + assert(self.is_neighbour(strand)) + return max([(self.get_residue_ordinal(resnum1), + strand.get_residue_ordinal(resnum2)) + for (node, resnum1, resnum2, dist) in + self.get_hbonds_to_neighbour(strand)]) + + + def calc_strand_neighbour_occupation(self, strand): + """ + Return the first and last residues on this strand that are + 'occupied' by the supplied neighbouring strand, i.e. the residues + that have residues in the neighbour strand beside them when they + are aligned according to H bonds. 
+ + Parameters: + strand - neighbour strand to find occupancy of + + Return value: + tuple (first_resnu, last_resnum) of the lowest and highest + residue sequence numbers in this strand 'occupied' by the + neighbour strand + + This is calculated as follows: if residue P is the residue in this + strand with maximum sequence number with an H bond to a residue Q + in the neighbour strand, then let a = Q - strand_min and + b = strand_max - Q where strand_min and strand_max are the min + and max residue sequence numbers respectively in the neighbouring + strand. Then the 'occupied' residues in this strand (those with + neighbours in the neighbouring strand) are those from + P - b through to P + a (inclusive). + + + neighbour + self strand + . + /#\ + P + a -------#-------------------- + # # | + # # a + #P #Q | + -------*==========*--------- + # H bond # | + # w/ max # | + # res num # b + # in self # | + # strand \./ | + P - b -------#-------------------- + # + # + # + # + + Note: this does not take into account beta-bulges at all, we are + considering the represnetatino of strands as perfectly straight and + lined up beside each other perfectly in the cartoon. + + Note2: The call to max_resnum_hbond_to_neighbour() may generate + a ValueError exception sometimes when using STRIDE which idnicates + bridge partners but no H bonds sometimes. + + """ + (P, Q) = self.max_resnum_hbond_to_neighbour(strand) + a = Q - strand.get_residue_ordinal(strand.get_start_res_seq()) + b = strand.get_residue_ordinal(strand.get_end_res_seq()) - Q + first_resnum = P - b + last_resnum = P + a + assert(last_resnum - first_resnum + 1 == strand.get_span()) + return (first_resnum, last_resnum) + + + def has_strand_extent_overlap(self, strand1, strand2): + """ + For two strands neighbouring (i.e. having bridge edges + from&to) this one, determine if the strands would overlap + if they are both aligned according to the H bonds on the same + side of this strand + + Parameters: + strand1 - a strand with a bridge edge to this strand (i.e. + node is in self.bridge_list) + strand2 - another strand, not strand1, with a bridge to this strand + + Return value: + True if the interval of residues on this strand that is + 'occupied' by the residues on strand1 (i.e. are adjacent if + aligned according to H bonds) overlaps that of strand2, + else False (the intervals are disjoint and the two strands + could possibly be drawn on the same side of this strand + without overlapping). + + """ + try: + (min_resnum_to_strand1, max_resnum_to_strand1) = \ + self.calc_strand_neighbour_occupation(strand1) + except ValueError: + # sometimes get no H bonds even to a neighbour strand (at least + # with STRIDE), resulting in ValueError min/max arg is an empty + # sequence. + # We'll just have to return False and let + # geometric criteria try to sort it out + sys.stderr.write( + 'WARNING: no hbonds available between neighbour strands ' + + self.nodeid + ', ' + + strand1.nodeid +'\n') + return False + try: + (min_resnum_to_strand2, max_resnum_to_strand2) = \ + self.calc_strand_neighbour_occupation(strand2) + except ValueError: + # sometimes get no H bonds even to a neighbour strand (at least + # with STRIDE), resulting in ValueError min/max arg is an empty + # sequence. 
+ # We'll just have to return False and let + # geometric criteria try to sort it out + sys.stderr.write( + 'WARNING: no hbonds available between neighbour strands ' + + self.nodeid + ', ' + + strand2.nodeid +'\n') + return False + + if verbose: + sys.stderr.write('test strand extent overlap '+self.nodeid + ': ' + + strand1.nodeid + ' ' + str(min_resnum_to_strand1) + + ',' + str(max_resnum_to_strand1) + ' ' + + strand2.nodeid + ' ' + str(min_resnum_to_strand2) + + ',' + str(max_resnum_to_strand2) + '\n') + + + if (max_resnum_to_strand1 >= min_resnum_to_strand2 and + max_resnum_to_strand2 >= min_resnum_to_strand1): + return True + else: + return False + + def get_hbonds_to_neighbour(self, strand): + """ + Return list of hydrogen bond tuples (node, resnum1, resnum2, dist) from + this strand to the the supplied neighbouring strand (i.e. one + that is in the bridge list of this strand). Because it is a bridge + partner of this strand it should have H bond(s) to it (or from it). + + Parameters: + strand - neighbouring strand to find the H bonds to + + Return value: + list of (node, resnum1, resnum2, dist) tuples of hbonds + where resnum1 is in self and resnum2 is in strand + (parameter). + + NB we are storing donor H bonds, not necessarily having + the symmetical edge from strand back to self, so it + can happen that there is no hbond from self to strand + even if there is a bridge between them (though there must + be a bond from strand to self) - in that case we + get the bond from strand to self and swap the residue numbers + in the tuple so resnum1 is the residue number in self + and resnum2 is the reisude number in the parameter strand. + + Uses data members (readonly): + hydrogen_bond_list - list of H-bond (node, resnum1, resnum2, dist) + + """ + assert(isinstance(strand, PTNodeStrand)) + + hblist = [] + for hbond in self.hydrogen_bond_list: + if hbond[0] == strand: + hblist.append(hbond) + + # also add bond(s) from other strand to this one + for (node, resnum1, resnum2, dist) in strand.get_hbond_list(): + if node == self: + # note resnum1,resnum1 swapped + hblist.append((strand, resnum2, resnum1, dist)) + + return hblist + + + + def get_side_of_neighbouring_strand(self, strand): + """ + Using the labels in the bridge_list set by + label_strand_bridge_sides() (q.v.), return the same/other (+/-) + side label for the supplied strand, which is expected to be + in the bridge list for this node. + + + Parameters: + strand - strand to find side label for in the bridge list + + Uses data members (readonly): + bridge_list - list of (node, bdir, side) tuples + + Return value: + same/other ('+' or '-') side label for the strand if found + in the bridge list or None if not found. + """ + assert(isinstance(strand, PTNodeStrand)) + for (node, bdir, side) in self.bridge_list: + if node == strand: + return side + return None + + + def is_parallel(self, strand): + """ + Return True iff the supplied strand is a parallel bridge partner + of this one. + + Parameters: + strand - strand to test for being in the bridge list of this node + + Uses data members (readonly): + bridge_list - list of bridge tuples (ptnodestrand, bdir, side) + + Return value: + True if strand marked as parallel in the bridge list, else False. + + Raises exceptions: + KeyError if strand not found in bridge list. 
+ """ + assert(isinstance(strand, PTNodeStrand)) + for (node, bdir, side_unused) in self.bridge_list: + if node == strand: + return (bdir == 'P') + raise KeyError('is_parallel(): neighbour strand not found') + + + def get_is_positioned(self): + """ + Strands are counted as always positioned, unlike helices, + so just always return True (This is used only for distance + matrix placement). + """ + return True + + +class PTNodeTerminus(PTNode): + """ + The PTNodeTerminus class is the type of PTNode for a (N or C) terminus. + """ + + def __init__(self, termtype, pseudo, *args): + """ + Construct PTNodeTerminus with supplied nodeid and type. + Parameters: + termtype - string "N" or "C" + pseudo - Boolean True for pseudo-terminus (domain boundary), + else False (actually N or C terminal of chain). + +Variable parameter list: straight to PTNode constructor (q.v.). + + This extends PTNode by adding the termtype (note nodeid is more + informative, includes information about break for domain etc. + but termtype just N or C is useful for checking in code) + and other than that just calls PTNode constructor. + Raises exceptions: + TypeError if termtype argument is invalid. + + """ + if termtype not in ['N', 'C']: + raise TypeError("PTNodeTerminus bad termtype " + termtype + '\n') + PTNode.__init__(self, *args) + self.termtype = termtype + + self.is_positioned = False # set to True if helix is already positioned + # (drawn). Used when old_helix_placement + # (i.e. -i option NOT supplied) is in use, + # sometimes as a special case we position + # helices before calling write_helices_svg() + # and this flag is set to mark these as + # already positioned. + # use get/set_is_positioned + + self.pseudo = pseudo # set to True if the terminus is a + # pseudo-terminus to mark domain boundary + # Else False. + # Use get/set_pseudo + + self.adjnode = None # for pseudo nodes, the node that is + # immediately adjacent, i.e. the most + # N-terminal SSE for pseudo-N-Terminus + # and the most C-terminal SSE for + # pseudo-N-terminus. Else None. + # use get/set_adjnode + + + + + def __str__(self): + """ + Return String representation of the node as 'TYPE id [resnum]' + """ + return "TERMINUS" + " " + self.nodeid + "[" + str(self.start_res_seq)+"]" + + def get_termtype(self): + """ + Just return the type of this terminus node, N or C + Parmeters: None + Return value: 'N' or 'C' for N or C terminus resp. + """ + return self.termtype + + def get_is_positioned(self): + """ + Return True if the node is marked already positioned + See is_positioned in __init__() + Parameters: None + Return value: True if node is marked already positioned else False + Uses member data (readonly): is_positioned + """ + return self.is_positioned + + def set_is_positioned(self, is_pos): + """ + Set the is_positioned flag to the supplied boolean value. + See is_positioned in __init__() + Parmeters: + is_pos - True to mark as already positioned, False to unmark + Return value: None + Uses member data (WRITE): is_positioned + """ + self.is_positioned = is_pos + + def get_pseudo(self): + """ + Return True if the node is marked as pseudo-terminus (domain boundary) + See pseudo in __init__() + Parameters: None + Return value: True if node is marked as pseudoterminus else False + Uses member data (readonly): pseudo + """ + return self.pseudo + + def set_pseudo(self, pseudo): + """ + Set the psuedo flag to the supplied boolean value. 
+ See pseudo in __init__() + Parmeters: + pseudo - True to mark as pseudo-terminus, False to unmark + Return value: None + Uses member data (WRITE): pseudo + """ + self.pseudo = pseudo + + def get_adjnode(self): + """ + Return the adjnode. Only for pseudo nodes. + See adjnode in __init__() + Parameters: None + Return value: The adjacent node to this pseudo node + Uses member data (readonly): adjnode + """ + return self.adjnode + + def set_adjnode(self, adjnode): + """ + Set the adjnode to the supplied PTNode.Only for pseudo nodes. + See adjnode in __init__() + Parmeters: + adjnode - The PTNode to set as the adjnode. + Return value: None + Uses member data (WRITE): adjnode + Raises exceptions: + TypeError if adjnode is not a PTNode instance. + """ + if not isinstance(adjnode, PTNode): + raise TypeError('bad adjnode parameter') + self.adjnode = adjnode + +class PTNodeLoop(PTNode): + """ + The PTNodeLoop class representes loops (coils) rather than SSEs as such. + Used for domain decomposition (domainfc.py) so that all residues + are represented in some node. + """ + def __str__(self): + """ + Return String representation of the node as + 'TYPE id [startResNum..endResNum] + """ + return "LOOP" + " " +\ + self.nodeid + "[" + str(self.start_res_seq) + \ + ".." + str(self.end_res_seq) + "]" + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +def dfs_strands_from(start_strand, visited, dfs_list, from_node): + """ + Make a depth-first search traversal of STRAND nodes + using bridge (not sequence) + edges starting at the specfieid strand, + returning list of (node,from_node) tuples in DFS traversal order + where from_node is the node from which node is reached. + + Parameters: + start_strand - STRAND node to start at + visited - (in/out) dictionary of {ptnode:True} visited nodes + dfs_list - (in/out) list of (node, from_node) visited in dfs order + from_node - node from which we are being (recursively) called + + Recursive function. call initially as + dfslist = [] + dfs_strands_from(startnode, {}, dfslist, None) + + Return value: + None. (output is dfs_list parameter) + + """ + visited[start_strand] = True + dfs_list.append((start_strand,from_node)) + for (node, bdir_unused, side_unused) in start_strand.get_bridge_list(): + if node not in visited: + dfs_strands_from(node, visited, dfs_list, start_strand) + + + + +def compute_align_positions(node, from_node): + """ + set relative vertical position based on offsets of + maximum (or minimum, for reversed) from last in strand + (or first, for reversed) residue sequence numbers in + strand of H bonds to previous neighbour strand (starting + at 0 for first node) + Called by build_sheet_constraints() in ptgraph2.py, and may be + called again to recompute align positions if we reverse the order of + strands in a sheet. (TODO: should be a more efficient way of just + recalcuating these without calling this again, but since we need the dfs + order anyway, it does not really matter much). + + Parameters: + node - not to set align positin in + from_node - node we reach this from in DFS order + Return value: + None. + + Sets the align_pos in node. 
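+ Illustrative arithmetic only (both strands non-reversed): if the chosen + H bond sits 2 residues below the end of this strand + (this_bond_offset = 2) and 4 residues below the end of from_node + (neighbour_bond_offset = 4), then the relative position is 4 - 2 = 2, + and align_pos is set to that value plus from_node's cumulative align_pos.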
+ """ + # set relative vertical position based on offsets of + # maximum (or minimum, for reversed) from last in strand + # (or first, for reversed) residue sequence numbers in + # strand of H bonds to previous neighbour strand (starting + # at 0 for first node) + + # TODO: introduction of get_residue_ordinal() has made his even more + # overly complicated than it was - since first residue is now always 1 + # using ordinal per-strand numbers rather than pdb sequence numbers, + # could simplify this eg by not having to get the + # ordinal for start_res_seq since it is always 1 anyway.; + # and in fact after changing again so that we use the pdb_resid_dict + # to get index in full sequence list don't even need get_residue_ordinal() + # any more... really need to clean all the code up. + + if node.get_reversed(): + try: + (this_bond_resnum, neighbour_bond_resnum) = \ + node.min_resnum_hbond_to_neighbour(from_node) + except ValueError: + # get 'min() arg is an empty sequence' when no H bonds + # somtimes happens e.g. 1CSE (ptnode.py gives a warning) + sys.stderr.write('WARNING: no H bonds found between ' + + from_node.nodeid + ' and ' + + node.nodeid + ', position may be wrong\n') + this_bond_resnum = node.get_residue_ordinal(node.get_start_res_seq()) + if node.is_parallel(from_node): + neighbour_bond_resnum = from_node.get_residue_ordinal(from_node.get_start_res_seq()) + else: + neighbour_bond_resnum = from_node.get_residue_ordinal(from_node.get_end_res_seq()) + + this_bond_offset = this_bond_resnum -\ + node.get_residue_ordinal(node.get_start_res_seq()) + else: + try: + (this_bond_resnum, neighbour_bond_resnum) = \ + node.max_resnum_hbond_to_neighbour(from_node) + except ValueError: + sys.stderr.write('WARNING: no H bonds found between ' + + from_node.nodeid + ' and ' + + node.nodeid + ', position may be wrong\n') + this_bond_resnum = node.get_residue_ordinal(node.get_end_res_seq()) + neighbour_bond_resnum = from_node.get_residue_ordinal(from_node.get_end_res_seq()) + if node.is_parallel(from_node): + neighbour_bond_resnum = from_node.get_residue_ordinal(from_node.get_end_res_seq()) + else: + neighbour_bond_resnum = from_node.get_residue_ordinal(from_node.get_start_res_seq()) + + this_bond_offset = node.get_residue_ordinal(node.get_end_res_seq()) -\ + this_bond_resnum + if from_node.get_reversed(): + neighbour_bond_offset = neighbour_bond_resnum - \ + from_node.get_residue_ordinal(from_node.get_start_res_seq()) + else: + neighbour_bond_offset = from_node.get_residue_ordinal(from_node.get_end_res_seq()) - \ + neighbour_bond_resnum + # node_align_pos is relative to neigbour (from_node) + node_align_pos = neighbour_bond_offset - this_bond_offset +# print 'rrrrrr',str(node),node_align_pos + # set the align_pos to pos relative to first (leftmost) node + # which is to say the cumulative align_pos + node.set_align_pos(node_align_pos + from_node.get_align_pos()) + + +def ptnode_set_verbose(verb): + """ + set the module global verbose flag in this module to supplied value + Parameters: verb - True (for verbose output) or False + Return value: None + Uses globals: verbose (in this module) + """ + global verbose + verbose = verb diff --git a/scripts/ptsecstruct.py b/scripts/ptsecstruct.py new file mode 100644 index 0000000..44c22c3 --- /dev/null +++ b/scripts/ptsecstruct.py @@ -0,0 +1,982 @@ +############################################################################### +# +# ptsecstruct.py - object to represent secondary structure elements (SSEs) and +# their associated Hydrogen bonds and functions 
to parse +# these from external programs (DSSP, STRIDE, etc.) +# +# File: ptsse.py +# Author: Alex Stivala +# Created: July 2007 +# +# $Id: ptsecstruct.py 2749 2009-08-12 07:33:14Z astivala $ +# +# PTSecStruct is a class representing secondary structures (helices, sheets) +# and their associated hydrogen bonds. +# +# This module contains a class to read protein secondary structure +# information generated by: +# +# STRIDE +# (see http://webclu.bio.wzw.tum.de/stride/ +# download source code from ftp://ftp.ebi.ac.uk/pub/software/unix/stride/) +# See +# Frishman and Argos 1995 "Knowledge-based secondary structure assignment" +# Proteins: structure function and genetics 23:566-579. +# +# DSSP +# see http://swift.cmbi.ru.nl/gv/dssp/ +# Note that a licence (free for academic use) is required. +# See also +# Kabsch and Sander 1983 "Dictionary of Protein Secondary Structure: +# Pattern Recognition of Hydrogen-Bonded and Geometrical Features" +# Biopolymers 22:2577-2637). +# +############################################################################### + +import os,sys +from time import strftime,localtime + +from ptutils import get_int_icode,pdb_res_seq_cmp + + +#----------------------------------------------------------------------------- +# +# Module globals +# +#----------------------------------------------------------------------------- + +# constants + + +DSSP_HBOND_ENERGY_THRESHOLD = -0.5 # (kcal/mol) + # DSSP hbonds must have energy <= this + # to be considered. + +# global variables + +verbose = False + +#----------------------------------------------------------------------------- +# +# Class definitions +# +#----------------------------------------------------------------------------- + +class PTSecStruct: + """ + The PTSecStruct class represents secondary structure + + Just access the helix_list and strand_list directly and pdb_id + and other member data directly. + + Member data (set and ready to read after calling this routine): + helix_list - list of helix tuples in form: + (start_chainid, start_resnum, end_chainid, end_resnum, + type) + where type is the DSSP code ('H', 'G' or 'I'). + NB start_resnum and and end_resnum are PDB residue numbers + and are strings, not integers. They may contain + an insertion code, so e.g. may have residue numbers + '60','60A','60B','61' which are 4 distinct and sequtnially + adjacnet residues. Also may have gapes e.g. '58','70' + adjacent in sequence. + strand_list - list of strand tuples, as per helix_list, but without + helix type + pdb_id - PDB identifier + pdb_header - PDB header string (description) + hbond_list - list of hydrogen bond tuples of the form: + (chainid1, resnum1, chainid2, resnum2, dist_N_O) + bridgeres_list - list of residues in bridges in tuples of form: + (chainid1, resnum1, chainid2, resnum2, bdir) + bdir is P for parallel or N for antiparallel. + Note for hbond_list and brideres_list: resnums + are PDB residue numbers as strings, may have insertino + codes. + """ + def __init__(self): + """ + Initialize empty PTSecStruct object. + See class documentation for more information. + """ + self.helix_list = [] # list of tuples, see comments above + self.strand_list = [] # as above + self.hbond_list = [] # list of tuples, see comments above + self.bridgeres_list = [] # list of bridge residue tuples (see above) + + + def __str__(self): + """ + Return string representation of secondary structure as, one per line, + sse type (H or E like DSSP) followed by start and end residue + numbers then chain id. 
Ordered by chain then residue number. + + NOTE H bonds and bridges not included in string representation. + """ + tuple_list = self.get_sse_tuple_list() + s = "" + for sse_tuple in tuple_list: + s += sse_tuple[3] + "\t" + str(sse_tuple[1]) +\ + "\t" + str(sse_tuple[2]) + \ + "\t" + sse_tuple[0] + "\n" + return s + + + def get_sse_tuple_list(self): + """ + build sse tuple list of (chain, start_res, end_res, type) and sort + type is 'H' or 'E'. + Parameters: None + Return value: list of (chain,strartres,end_res,type) + """ + # build sse tuple list of (chain, start_res, end_res, type) and sort + tuple_list = [ (chain, start_res, end_res, 'H') for \ + (chain, start_res, end_chainid, end_res, htype) in \ + self.helix_list ] + tuple_list += [ (chain, start_res, end_res, 'E') for \ + (chain, start_res, end_chainid, end_res) in \ + self.strand_list ] + tuple_list.sort(cmp=tuplecmp) + return tuple_list + + + def get_num_sses(self): + """ + Return the total number of SSEs, which is the number of helices + plus number of strands. + + Parameters: None + Uses data members (readonly): + helix_list, strand_list + Return value: number of helices + number of strands + """ + return len(self.helix_list) + len(self.strand_list) + + + def check_validity_and_fix(self): + """ + Check for overlapping secondary structures. This happens for + example in the PDB HELIX records for 1DLC. In such a case we + recover from it in for example this case + by adding or subtracting one to start/end of ovlerlapping + HELIX records, + + Parameters: + None + Return value: + True if OK, False if invalid (overlapping structures) + (Now returns True if it has fixed up overlaps itself) + Uses data members (READ/WRITE): + helix_list, strand_list + (start and end in helix and strand tuples may be modified; + lists are sorted by increasing residue sequence number) + """ + helices = [ (chain, start, end, endchain, 'H', htype) + for (chain, start, endchain, end, htype) + in self.helix_list ] + strands = [ (chain, start, end, endchain, 'E', None) + for (chain, start, endchain, end) + in self.strand_list ] + sselist = helices + strands + sselist.sort(cmp=tuplecmp) + is_valid = True + for i in xrange(1, len(sselist)): + sse = sselist[i] + prevsse = sselist[i-1] + if (prevsse[0] == sse[0] and + pdb_res_seq_cmp(sse[1], prevsse[2]) <= 0): + sys.stderr.write('WARNING: PDB has overlapping SSE definitions' + ' ' + str(prevsse) + ' and ' + str(sse) + ': ') + # remove overlap by shortening longer one and lengthing + # shorter one + # FIXME: this is ignoring insertion codes etc., really + # should convert to proper sequential residue sequence numbers + # to do this + (prevsse_start,prevsse_start_icode) = get_int_icode(prevsse[1]) + (prevsse_end,prevsse_end_icode) = get_int_icode(prevsse[2]) + (sse_start,sse_start_icode) = get_int_icode(sse[1]) + (sse_end,sse_end_icode) = get_int_icode(sse[2]) + if (prevsse_end_icode or sse_start_icode): + sys.stderr.write('contains insertion codes, giving up\n') + is_valid = False + continue + prevsse_len = prevsse_end - prevsse_start + 1 + sse_len = sse_end - sse_start + 1 + overlap = prevsse_end - sse_start + 1 + if sse_len > prevsse_len: + sse_start += overlap + else: + prevsse_end -= overlap + sselist[i] = (sse[0],str(sse_start),str(sse_end), + sse[3],sse[4],sse[5]) + sselist[i-1] = (prevsse[0],str(prevsse_start),str(prevsse_end), + prevsse[3],prevsse[4],prevsse[5]) + sys.stderr.write('changed to ' + str(sselist[i-1]) + ' and ' + + str(sselist[i]) + '\n') + i += 1 + + # rebuild the helix_list and strand_list 
with our modified tuples + self.helix_list = [ (chain, start, endchain, end, htype) + for (chain, start, end, endchain, ssetype, htype) + in sselist if ssetype == 'H' ] + self.strand_list = [ (chain, start, endchain, end) + for (chain, start, end, endchain, ssetype, htype) + in sselist if ssetype == 'E' ] + return is_valid + + + def write_pymol_sse_commands(self, fh, pdbfilename): + """ + Write PyMOL command file (use @filename in PyMOL) to load the PDB + and define SSEs according to the definition we used here (PDB cards + or STRIDE or DSSP) rather than PyMOL's own (DSS or PDB) definition. + + Parameters: + fh - open (write) filehandle to write commands to + pdbfilename -filename of the PDB file to load + Return value: + None + """ + write_pymol_prelude(fh) + fh.write('# ' + self.pdb_id + '\n') + fh.write('# ' + self.pdb_header + '\n') + fh.write('#\n') + write_pymol_load(fh, self.pdb_id, pdbfilename) + write_pml_define_sses(fh, self.pdb_id, self.get_sse_tuple_list()) + write_pymol_conclusion(fh) + + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def read_secstruct_from_stride(pdb_filename, pdb_secstruct = None): + """ + Build and return an instance of the PTSecStruct class + which represents the secondary structure as assigned + by the STRIDE program + (Frishman and Argos 1995 "Knowledge-based secondary structure assignment" + Proteins: structure function and genetics 23:566-579). + + Parameters: + pdb_filename - filename of PDB file to run STRIDE against + pdb_secstruct (read/write) default None + - a previously built PTSecStruct + (eg from PDB HELIX and SHEET cards). + If this is not None, only hydrogen bond information + is read, and added to this PTSecstruct. + + Return value: + PTSecStruct class representing secondary structure and H bonds + ass assigned by STRIDE (see following comments). + + The class is instantiated by supplying a file handle of an open + PDB file (for reading) + to the constructor. STRIDE is run with that PDB file as input and + the secondary structure assignments are parsed + from the STRIDE output. + + If the H-bond information is to be read then stride must have been + invoked with the -h option to generate the DNR and ACC records. + As described in the STRIDE documentation (stride.doc in the stride + source code distribution), these are very redundant in order to + facilitiate human readability. This subroutine uses only the DNR + records. + + In order to read bridge information we use the 'private' STRIDE option + -i which requires the 'secret' option -$ to work i.e. a command line like + (note necessity to escape $ in shell): + + stride -h -\$ -i 1QLP.pdb + + Actually, a modified version of stride is required, that indicates + whether bridges are parallel or antiparallel (amongst other + things), creating the new record types FA1 and FA2 from some of + the 'private' -i info. This modified version should be supplied + with this module (OK for academic use only; see the notice in + stride.doc in the stride directory). 
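+ Example usage (illustrative only; 1QLP.pdb as in the stride command + shown above): + + secstruct = read_secstruct_from_stride('1QLP.pdb') + print str(secstruct) # one SSE per line, as per PTSecStruct.__str__() + 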
+ """ + + if verbose: + sys.stderr.write("running stride...") + fd = os.popen("stride -\$ -i -h " + pdb_filename) + ptsecstruct = parse_stride_output(fd, pdb_secstruct) + fd.close() + if verbose: + sys.stderr.write("done\n") + sys.stderr.write(str(ptsecstruct)) + return ptsecstruct + + +def parse_stride_output(filehandle, pdb_secstruct): + """ + Build StrideStruct object data members using the STRIDE output + read from the supplied filehandle. + See more comments in read_secstruct_from_stride() above. + + Parameters: + filehandle - filehandle to read from (already open for read) + pdb_secstruct (read/write) + - a previously built PTSecStruct + (eg from PDB HELIX and SHEET cards). + If this is not None, only hydrogen bond information + is read, and added to this PTSecstruct. + + Return value: + PTSecStruct object representing secondary sturcture and H bonds + as assigned by STRIDE. + + """ + if pdb_secstruct == None: + pts = PTSecStruct() + hbonds_only = False + else: + pts = pdb_secstruct + hbonds_only = True + + # From the STRIDE documentation: + # + # STRIDE produces output that is easily readable both + # visually and with computer programs. The side effect of + # this conveniency is larger file size of individual + # STRIDE entries. Every record is 79 symbols long and has + # the following general format: + # + # Position Description + # + # 1-3 Record code + # 4-5 Not used + # 6-73 Data + # 74-75 Not used + # 75-79 Four letter PDB code (if available) + + chainid_dict = {} + + for line in filehandle: + rectype = line[0:3] + if rectype == "HDR": + # read pdb_header and pdb_id from HDR records + pts.pdb_header = line[5:74] + pts.pdb_id = line[74:79].lstrip() + elif rectype == "LOC" and not hbonds_only: + # + # Build helix_list and strand_list from LOC record + # types AlphaHelix/310Helix/PiHelix and Strand respectively. + # The helix_list is a list of tuples + # + # (start_chainid, start_resnum, end_chainid, end_resnum, type) + # + # and the strand_list is the same format. 
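+            #
+            # For example (values illustrative only), an AlphaHelix running
+            # from residue 10 to 21 of chain A ends up in helix_list as
+            # ('A', '10', 'A', '21', 'H').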
+ # + # From the stride doc: [lines marked WRONG are incorrect though] + # + # LOC Location of secondary structure elements + # + # Format: 6-17 Element name + # 19-21 First residue name + # WRONG 32-26 First residue PDB number + # WRONG 28-28 First residue chain identifier + # 36-38 Last residue name + # 42-45 Last residue PDB number + # 47-47 Last residue chain identifier + # + element_name = line[5:17].rstrip().upper() + start_chainid = line[28] + start_resnum = line[22:28].lstrip().rstrip() + end_resnum = line[40:46].lstrip().rstrip() + end_chainid = line[46] + + if not chainid_dict.has_key(start_chainid): + chainid_dict[start_chainid] = True + if element_name in ["ALPHAHELIX", "PIHELIX", "310HELIX"]: + if element_name == "ALPHAHELIX": + helixtype = "H" + elif element_name == "PIHELIX": + helixtype = "I" + elif element_name == "310HELIX": + helixtype = "G" + else: + helixtype = None # can't happen, TypeError later if does + element_tuple = (start_chainid, start_resnum, + end_chainid, end_resnum, + helixtype) + pts.helix_list.append(element_tuple) + elif element_name == "STRAND": + element_tuple = (start_chainid, start_resnum, + end_chainid, end_resnum) + pts.strand_list.append(element_tuple) + + elif rectype == "DNR": # donor of H-bond; requires stride -h + # From stride.doc: + # + # DNR Donor residue + # + # Format: 6-8 Donor residue name + # 10-10 Protein chain identifier + # 12-15 PDB residue number + # 17-20 Ordinal residue number + # 26-28 Acceptor residue name + # 30-30 Protein chain identifier + # 32-35 PDB residue number + # 37-40 Ordinal residue number + # 42-45 N..0 distance + # 47-52 N..O=C angle + # 54-59 O..N-C angle + # 61-66 Angle between the planes of donor + # complex and O..N-C + # 68-73 angle between the planes of acceptor + # complex and N..O=C + # + # Build a tuple representing the Hydrogen bond: + # + # (chainid1, resnum1, chainid2, resnum2, dist_N_O) + # + # and add to the list of H-bonds. + # + hbond_tuple = ( line[9], line[11:15].lstrip(), + line[29], line[31:35].lstrip(), + float(line[41:45]) ) # Angstroms + pts.hbond_list.append(hbond_tuple) + elif (rectype == "FA1" or rectype == "FA2") and not hbonds_only: + # NB: requires modified stride and -\$ -i options + # FA1 and FA2 show beta bridges, 4 hbonds for a bridge. + # Also note residues might be off-by-one on the end as this + # record is internal stride information output before the + # adjustments made for assigning E to a residue (see stride + # source code sheet.c and fillasn.c). We don't care much + # here, we'll just allow a fudge factor of 1 on begin and + # end of strand when checking if these bridges connect them. + # Example FA1 records: + # + # FA1 AntiPar From: - - 3 69 69 3 + # FA1 Par From: - - 2 30 32 2 + # + # (note '-' for blank chain id as per usual stride convention). 
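+            #
+            # Worked through the parsing below, the first example record
+            # above yields the two bridge tuples
+            # ('-', '3', '-', '69', 'N') and ('-', '69', '-', '3', 'N')
+            # (tuple format described next).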
+ # Build a tuple representing this (part of a) bridge: + # + # (chainid1, resnum1, chainid2, resnum2, bdir) + # + # dir is P for parallel or N for antiparallel + # + fa1_rec = line.split() + if fa1_rec[1] == "AntiPar": + bdir = "N" + elif fa1_rec[1] == "Par": + bdir = "P" + else: + sys.stderr.write("readstride.py: ignored bad " + + rectype + " record: " + + line + '\n') + + bridgepart_tuple1 = (fa1_rec[3], fa1_rec[5], + fa1_rec[4], fa1_rec[6], bdir) + bridgepart_tuple2 = (fa1_rec[4], fa1_rec[7], + fa1_rec[3], fa1_rec[8], bdir) + + pts.bridgeres_list.append(bridgepart_tuple1) + pts.bridgeres_list.append(bridgepart_tuple2) + return pts + + + +def stride_chainid_to_pdb_chainid(stride_chainid): + """ + Convert a STRIDE chainid to a PDB chainid. + STRIDE uses '-' for a 'blank' chainid while PDB uses ' ' (space). + So all this does is return the stride_chainid unless it is '-', then + it returns ' '. + + Parameters: + stride_chainid - the STRIDE chain identifier + Return value: + PDB chain identifier corresponding to supplied stride_chainid + """ + if stride_chainid == '-': + return ' ' + else: + return stride_chainid + +def pdb_chainid_to_stride_chainid(pdb_chainid): + """ + Convert a PDB chainid to a STRIDE chainid. + STRIDE uses '-' for a 'blank' chainid while PDB uses ' ' (space). + So all this does is return the pdb_chainid unless it is ' ', then + it returns '-'. + + We always use STRIDE style chain identifiers ('-' not ' ') internally. + Note that the remediated (2007) PDB files now no longer use blank + chainid anyway, they always name the chain 'A' etc. + + Parameters: + pdb_chainid - the PDB chain identifier + Return value: + STRIDE chain identifier corresponding to supplied pdb_chainid + """ + if pdb_chainid == ' ': + return '-' + else: + return pdb_chainid + + +def read_secstruct_from_dssp(pdb_filename, pdb_secstruct = None): + """ + Build and return an instance of the PTSecStruct class + which represents the secondary structure as assigned + by the DSSP program + (Kabsch and Sander 1983 "Dictionary of Protein Secondary Structure: + Pattern Recognition of Hydrogen-Bonded and Geometrical Features" + Biopolymers 22:2577-2637). + + Parameters: + pdb_filename - filename of PDB file to run DSSP against + pdb_secstruct (read/write) default None + - a previously built PTSecStruct + (eg from PDB HELIX and SHEET cards). + If this is not None, only hydrogen bond information + is read, and added to this PTSecstruct. + + Return value: + PTSecStruct class representing secondary structure and H bonds + ass assigned by DSSP (see following comments). + + The class is instantiated by supplying a file handle of an open + PDB file (for reading) + to the constructor. DSSP is run with that PDB file as input and + the secondary structure assignments are parsed + from the DSSP output. + + Note the DSSP executable is actually 'dsspcmbi' not 'dssp'. + """ + + if verbose: + sys.stderr.write("running dsspcmbi...") + fd = os.popen("dsspcmbi " + pdb_filename) + ptsecstruct = parse_dssp_output(fd, pdb_secstruct) + fd.close() + if verbose: + sys.stderr.write("done\n") + sys.stderr.write(str(ptsecstruct)) + return ptsecstruct + + +def parse_dssp_output(filehandle, pdb_secstruct): + """ + Build StrideStruct object data members using the DSSP output + read from the supplied filehandle. + See more comments in read_secstruct_from_dssp() above. 
+ As well as the DSSP paper by Kabsch and Sander in Biopolymers cited + there, see: + + http://swift.cmbi.ru.nl/gv/dssp/ + + Note that a licence is needed for DSSP (free for academic use - + see website above). + + Parameters: + filehandle - filehandle to read from (already open for read) + pdb_secstruct (read/write) + - a previously built PTSecStruct + (eg from PDB HELIX and SHEET cards). + If this is not None, only hydrogen bond information + is read, and added to this PTSecstruct. + + Return value: + PTSecStruct object representing secondary sturcture and H bonds + as assigned by DSSP. + + """ + if pdb_secstruct == None: + pts = PTSecStruct() + hbonds_only = False + else: + pts = pdb_secstruct + hbonds_only = True + + in_assignments = False + in_secstruct = False + prev_dssp_code = None + dssp_bp_tuple_list = [] # list of tuples + # (dssp_resnum, bp1, bp1_label, bp2, bp2_label) + # from DSSP + dssp_hbond_tuple_list = [] # list of tuples + # (dssp_resnum, hbond_resnum, energy) from DSSP + dssp_pdb_resnum_dict = {} # dict of {dssp_resnum : (pdb_resnum, chainid)} + for line in filehandle: + if line[:6] == "HEADER": + pts.pdb_header = line[10:80].rstrip() + pts.pdb_id = line.rstrip().split()[-2] + elif line[:25] == " # RESIDUE AA STRUCTURE": # header marking assigns + in_assignments = True # secondary struct assignments start next line + continue + elif in_assignments: + # go through and identify helices and strands and + # and them to helix_list and strand_list + # Then bridges and H-bonds + dssp_resnum = line[:5].lstrip() + if not dssp_resnum.isdigit(): + sys.stderr.write( + 'WARNING: bad DSSP residue sequence number ' + 'in DSSP output, ' + 'skipped line:\n' + line) + continue + dssp_resnum = int(dssp_resnum) + pdb_resnum = line[5:11].lstrip().rstrip() + residue = line[13] + if not pdb_resnum.isdigit(): + if not (len(pdb_resnum) > 0 and + (pdb_resnum[0] == '-' # can be negative, e.g. in 2CZR + or pdb_resnum[-1].isalpha())): #can have insertion code + if residue != '!': # '!' indicates chain break + sys.stderr.write('WARNING: bad PDB residue sequence ' + 'number in DSSP output, ' + 'skipped line:\n' + line) + continue + pdb_icode = line[10] # aleady included in pdb_resnum also + pdb_chainid = pdb_chainid_to_stride_chainid(line[11]) + dssp_pdb_resnum_dict[dssp_resnum] = (pdb_resnum, pdb_chainid) + # The DSSP code: + # H = alpha helix + # B = residue in isolated beta-bridge + # E = extended strand, participates in beta ladder + # G = 3-helix (3/10 helix) + # I = 5 helix (pi helix) + # T = hydrogen bonded turn + dssp_code = line[16] + if dssp_code != prev_dssp_code: + # add secondary structure elements, whose boundaries we + # detect in DSSP output by change of DSSP secondary structure + # summary code. 
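+            # For example, in a run of summary codes ...HHHHT... the
+            # H -> T change closes the helix at the previous residue, and
+            # no new SSE is opened until the next H/B/E/G/I code is seen.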
+ # TODO: deal with discrmiinating between continuous helices + # (as per identifySSE2 in TableauCreator) + if not in_secstruct: + if dssp_code in ['H','B','E','G','I']: +# print 'xxx START',dssp_code, pdb_resnum + in_secstruct = True + start_resnum = pdb_resnum + start_chainid = pdb_chainid + else: +# print 'xxx END',prev_dssp_code, pdb_resnum,dssp_code + in_secstruct = False + end_resnum = prev_pdb_resnum + end_chainid = prev_pdb_chainid + if prev_dssp_code in ['H','G','I']: + helixtype = prev_dssp_code + sse_tuple = (start_chainid, start_resnum, + end_chainid, end_resnum, helixtype) + else: + sse_tuple = (start_chainid, start_resnum, + end_chainid, end_resnum) + if prev_dssp_code == 'E': # ignore 'B' isolated bridge + if (not hbonds_only): + pts.strand_list.append(sse_tuple) + elif prev_dssp_code in ['H', 'G', 'I']: + if (not hbonds_only): + pts.helix_list.append(sse_tuple) + + if dssp_code in ['H','B','E','G','I']: +# print 'yyy START',dssp_code, pdb_resnum + in_secstruct = True + start_resnum = pdb_resnum + start_chainid = pdb_chainid + + prev_dssp_code = dssp_code + # END of dssp_code != prev_dssp_code + + if dssp_code == 'E': # only use bridge partners from E not B + bp1_resnum = int(line[26:29].lstrip()) + bp2_resnum = int(line[30:33].lstrip()) + + bp1_label = line[23] + bp2_label = line[24] + dssp_bp_tuple_list.append((dssp_resnum, + bp1_resnum, bp1_label, + bp2_resnum, bp2_label)) + + # Make tuple for each of four different H bond fields from DSSP + nho1_fields= line[40:51].lstrip().rstrip().split(',') #offset,energy + if int(nho1_fields[0]) != 0: + nho1_resnum = dssp_resnum + int(nho1_fields[0]) + nho1_energy = float(nho1_fields[1]) + dssp_hbond_tuple_list.append((dssp_resnum,nho1_resnum,nho1_energy)) + ohn1_fields= line[51:62].lstrip().rstrip().split(',') #offset,energy + if int(ohn1_fields[0]) != 0: + ohn1_resnum = dssp_resnum + int(ohn1_fields[0]) + ohn1_energy = float(ohn1_fields[1]) + dssp_hbond_tuple_list.append((dssp_resnum,ohn1_resnum,ohn1_energy)) + nho2_fields= line[62:73].lstrip().rstrip().split(',') #offset,energy + if int(nho2_fields[0]) != 0: + nho2_resnum = dssp_resnum + int(nho2_fields[0]) + nho2_energy = float(nho2_fields[1]) + dssp_hbond_tuple_list.append((dssp_resnum,nho2_resnum,nho2_energy)) + ohn2_fields= line[73:84].lstrip().rstrip().split(',') #offset,energy + if int(ohn2_fields[0]) != 0: + ohn2_resnum = dssp_resnum + int(ohn2_fields[0]) + ohn2_energy = float(ohn2_fields[1]) + dssp_hbond_tuple_list.append((dssp_resnum,ohn2_resnum,ohn2_energy)) +# print 'xxx',nho1_fields,ohn1_fields,nho2_fields,ohn2_fields + prev_pdb_resnum = pdb_resnum + prev_pdb_chainid = pdb_chainid + prev_residue = residue + prev_pdb_icode = pdb_icode + # END of iteration of lines in filehanlde + filehandle.close() + + if not hbonds_only: + for (dssp_resnum, bp1_resnum, bp1_label, bp2_resnum, bp2_label) \ + in dssp_bp_tuple_list: + # convert DSSP BP1 and BP2 fields to + # bridgeres_list format, a list of: + # (chainid1, resnum1, chainid2, resnum2, bdir) + # This involves converting the DSSP sequential residue numbers to + # PDB residue numbers which we use + (pdb_resnum, pdb_chainid) = dssp_pdb_resnum_dict[dssp_resnum] + if bp1_resnum != 0: # 0 indicates no beta brdige partner in this field + try: + (bp1_pdb_resnum, bp1_pdb_chainid) = dssp_pdb_resnum_dict[bp1_resnum] + except KeyError: + sys.stderr.write('WARNING: no PDB residue found for DSSP ' + 'residue ' + str(bp1_resnum) + + '; bridge skipped\n') + continue + # The beta bridge labels from DSSP (columns 23,24) are uppercase + # for 
antiparallel bridges and lowercase for parallel bridges + # (as per Kabsch & Sander 1983, p. 2581). + if bp1_label.islower(): + bdir = 'P' + else: + bdir = 'N' + pts.bridgeres_list.append((pdb_chainid, pdb_resnum, \ + bp1_pdb_chainid, bp1_pdb_resnum, \ + bdir)) + if bp2_resnum != 0: + try: + (bp2_pdb_resnum, bp2_pdb_chainid) = dssp_pdb_resnum_dict[bp2_resnum] + except: + sys.stderr.write('WARNING: no PDB residue found for DSSP ' + 'residue ' + str(bp2_resnum) + + '; bridge skipped\n') + continue + if bp2_label.islower(): + bdir = 'P' + else: + bdir = 'N' + pts.bridgeres_list.append((pdb_chainid, pdb_resnum, \ + bp2_pdb_chainid, bp2_pdb_resnum,\ + bdir)) + + for (dssp_resnum, hbond_dssp_resnum, hbond_energy) in dssp_hbond_tuple_list: + # convert the DSSP hydrogen bond fields to hbond_list format: + # (chainid1, resnum1, chainid2, resnum2, energy) + # TODO note energy (kcal/mol) for DSSP not N..O distance (Angstroms) + (pdb_resnum, pdb_chainid) = dssp_pdb_resnum_dict[dssp_resnum] + (hbond_resnum, hbond_chainid) = dssp_pdb_resnum_dict[hbond_dssp_resnum] + if hbond_energy <= DSSP_HBOND_ENERGY_THRESHOLD: + pts.hbond_list.append((pdb_chainid, pdb_resnum, + hbond_chainid, hbond_resnum, hbond_energy)) + + return pts + + +def read_secstruct_from_pdb_file(pdb_filename): + """ + Parameters: + pdb_filename - filename of PDB file to read HELIX and SHEET cards from. + + Return value: + PTSecStruct class representing secondary structure or + None if there are no HELIX and SHEET cards in PDB file. + + See the PDB format specification (secondary structure section) at + http://www.wwpdb.org/documentation/format23/sect5.html + """ + + # Note we are ignoring the sheet groupings here (though maybe we should + # use them) and relying on 'registration' fields to define bridge + # relationships between strands so we discover the correct sheets + # by connected components the same as when using DSSP or STRIDE. + # TODO: need more work on this, especially with tricky constructions + # like in 1MTP where there are duplicated SHEET lines for same + # strand in multiple sheets. + + pts = PTSecStruct() + found_secstruct = False + fh = open(pdb_filename) + for line in fh: + if line[:6] == "ATOM ": + break # secondary struct section must be before coordinate section + if line[:6] == "HELIX ": + found_secstruct = True + initChainID = pdb_chainid_to_stride_chainid(line[19]) + initSeqNum = line[21:26].lstrip().rstrip() # include insertion code + endChainID = pdb_chainid_to_stride_chainid(line[31]) + endSeqNum = line[33:38].lstrip().rstrip() #include insertion code + helixClass = int(line[38:40]) + if helixClass == 1 or helixClass == 6: # alpha + helixtype = "H" + elif helixClass == 3: # pi + helixtype = "I" + elif helixClass == 5: # 3_10 + helixtype = "G" + else: + helixtype = "H" + element_tuple = (initChainID, initSeqNum, + endChainID, endSeqNum, + helixtype) + pts.helix_list.append(element_tuple) + elif line[:6] == "SHEET ": + found_secstruct = True + initChainID = pdb_chainid_to_stride_chainid(line[21]) + initSeqNum = line[22:27].lstrip().rstrip() #include insertion code + endChainID = pdb_chainid_to_stride_chainid(line[32]) + endSeqNum = line[33:38].lstrip().rstrip() # include insertion code + element_tuple = (initChainID, initSeqNum, + endChainID, endSeqNum) + # can get duplicated strands in different sheets, e.g. 
1MTP + if element_tuple not in pts.strand_list: + pts.strand_list.append(element_tuple) + # "Registration" gives us bridge relationships between strands + sense = int(line[38:40]) + if sense == 0: + first_strand = True # no registration information for 1st strand + elif sense == 1: # parallel + first_strand = False + bdir = 'P' + elif sense == -1: # anti-parallel + first_strand = False + bdir = 'N' + else: + sys.stderr.write('WARNING: bad sense (columns 39-40) in line:\n' + + line + '\n') + found_secstruct = False + break + if not first_strand: + curChainId = pdb_chainid_to_stride_chainid(line[49]) + curResSeq = line[50:55].lstrip().rstrip() # include icode + prevChainId = pdb_chainid_to_stride_chainid(line[64]) + prevResSeq = line[65:70].lstrip().rstrip() # include icode + if curResSeq == '' or prevResSeq == '': + sys.stderr.write('WARNING: bad registration info in line:\n' + + line + '\n') + found_secstruct = False + break + bridge_tuple = (curChainId, curResSeq, prevChainId, prevResSeq, + bdir) + pts.bridgeres_list.append(bridge_tuple) + + fh.close() + + if not pts.check_validity_and_fix(): + return None + + # sort the strand and helix lists (by chainid,startresnum ascending) + # so that the numbering when we build graph is by sequence in chain + # TOOD: this is to match what we do with DSSP and STRIDE, but maybe + # there should be an option to use strand numbering from PDB file + # (typically strands numbered left to right in sheet). + pts.strand_list.sort(cmp=tuplecmp) + pts.helix_list.sort(cmp=tuplecmp) + +# print 'zzzzz',pts.helix_list + + if found_secstruct: + return pts + else: + return None + +def ptsecstruct_set_verbose(verb): + """ + set the module global verbose flag in this module to supplied value + Parameters: verb - True (for verbose output) or False + Return value: None + Uses globals: verbose (in this module) + """ + global verbose + verbose = verb + + +def tuplecmp(tup1, tup2): + """ + Comparison function for (chain, pdb_seqres_strt, pdb_seqres_end, type) + tuples used by sort in PTSecStruct.check_validity() and + PTSecStruct.__str__() functions. + """ + if tup1[0] < tup2[0]: + return -1 + elif tup1[0] > tup2[0]: + return 1 + else: + return pdb_res_seq_cmp(tup1[1], tup2[1]) + + +def write_pml_define_sses(fh, domid, sse_list): + """ + Write PyMOL commands to define the SSEs according to our (DSSP or STRIDE + or other) definitions rather than the PDB or PyMOL dss definitions. 
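+
+    For example, an SSE tuple ('A', '10', '22', 'H') for a structure loaded
+    as 1qlp (identifier illustrative only) produces the command:
+
+        alter /1qlp//A/10-22, ss = 'H'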
+ + Parameters: + fh - open (write) filehandle to write PyMOL commands to + domid - structure identifier to define SSEs in + sse_list - list of (chain,start,end,type) tuples specifyign SSEs in struct + Return value: None + """ + fh.write("alter /"+domid + ", ss='L'\n") # make entire struct loop region + fh.write("rebuild\n") + for (chain, start_resi, end_resi, ssetype) in sse_list: + if ssetype == 'E': + pml_ssetype = 'S' + elif ssetype == 'H': + pml_ssetype = 'H' + else: + raise ValueError('bad sse type ' + ssetype + '\n') + fh.write('alter ' + '/' + domid + '//' + chain + + '/' + str(start_resi) + '-' + str(end_resi) + + ", ss = '" + pml_ssetype + "'\n") + fh.write("rebuild\n") + + +def write_pymol_load(fh, domid, pdbfile): + """ + Write comands to load specified sturcture into PyMOL + + Parameters: + fh - open (write) filehandle to write to + domid - identifier of sructure + pdbfile - PDB file to get SSEs for + + Return value: None + """ + fh.write('load ' + pdbfile + '\n') +# fh.write('color ' + color + ', /'+domid + '\n') + + + + +def write_pymol_prelude(fh): + """ + Write startup information in PyMOL script (.pml) file. + + Parameters: + fh - open (write) filehandle to write to + Return value: None + """ + fh.write('# generated by ' + " ".join(sys.argv) +'\n') + timestamp = strftime("%d%b%Y %H:%M:%S", localtime()) + fh.write('# on ' + timestamp + '\n') + fh.write('#\n') + + + +def write_pymol_conclusion(fh): + """ + Write finalizing information in PyMOL script (.pml) file. + + Parameters: + fh - open (write) filehandle to write to + Return value: None + """ + fh.write('hide everything\n') + fh.write('show cartoon\n') + diff --git a/scripts/pttableau.py b/scripts/pttableau.py new file mode 100644 index 0000000..961c07c --- /dev/null +++ b/scripts/pttableau.py @@ -0,0 +1,934 @@ +############################################################################### +# +# pttableau.py - Object to represent protein tableaux and functions to +# parse output of TableauCreator program into tableau object. +# +# +# File: pttableau.py +# Author: Alex Stivala +# Created: October 2007 +# +# +# $Id: pttableau.py 2703 2009-07-27 06:01:05Z astivala $ +# +############################################################################### +""" +Ths module contains routines to generate protein tableaux using the +axes fitted to SSEs by functions in the ptnode.py module, and the relative +angle calculate in that module. + +This module also contains functions to parse the output of +Arun Konargurthu's TableauCreator program, which was used before +tableaux were re-implemented internally to this code. +NOTE: not yet published or available (as of October 2007). +IMPORTANT: this requires my modified versin of TableauCreator, +and also a patched version of Bio.PDB file PDBIO.py : +context diff for patching it is Bio.PDB.PDBIO.py.diff +This patch was made relative to BioPython release 1.43. + +Tableaux are described by Kamat and Lesk 2007 +'Contact Patterns Between Helices and Strands of Sheet Define Protein + Folding Patterns' Proteins 66:869-876 +and Lesk 2003 'From Electrons to Proteins and Back Again' +Int. J. Quant. Chem. 95:678-682 +and Lesk 1995 'Systematic representation of folding patterns' +J. Mol. Graph. 13:159-164. + +Two classes are provided, PTTableau and PTTableauPacked. The latter is +a more compact format based on LAPACK style symmetric matrix packed +array storage, useful for holding whole databse of tableaux in memory +(see buildtableauxdb.py) and dumping/loading it. 
They can be used +interchangeably when it comes to getting and setting with [] +(__getitem__ and __setitem__). (They should probably both inherit from +some tableau base class to make this explicity, but it doesn't really +matter with Python 'duck typing' of which this is a use (abuse?). +""" + +import os,sys +from math import pi +import numpy.oldnumeric as Numeric +from Bio.PDB import * + +from ptnode import * +from ptdomain import PTDomain +from ptsecstruct import pdb_chainid_to_stride_chainid +from ptutils import cleanup_tmpdir + +#----------------------------------------------------------------------------- +# +# Module globals +# +#----------------------------------------------------------------------------- + + +# constants + +TABLEAU_MIN_HELIX_LEN = 4 # min length of a helix in TableauCreator is 4 +TABLEAU_MIN_STRAND_LEN = 2 # min length of a strand in TableauCreator is 2 + +# global variables + +verbose = False + +#----------------------------------------------------------------------------- +# +# Class definitions +# +#----------------------------------------------------------------------------- + +class PTTableau: + """ + The PTTableau class is a protein tableau, as per Kamat and Lesk 2007 + 'Contact Patterns Between Helices and Strands of Sheet Define Protein + Folding Patterns' Proteins 66:869-876. + + The tableau is a 2 dimensional symmetric matrix indexed by SSEs + in the protein where each entry + is a two character code representing the angle between those SSEs. + (See paper(s) for details). + + We implement it as a mapping container, i.e. using __getitem__ and + __setitem__ so that elements can bet get/set with dictionary/array + type syntax e.g. tableau[(helix1,strand2)]. (index is a tuple of SSEs + represented by PTNode objects)- NB must have in parens to ensure tuple. + This is implemented with a standard dictionary object, + and since it is symmetric, only one copy is stored, the one where + i < j in (i,j) index; however either can be get/set, they are swapped + internally if necessary. Accessing (i,i) returns + 'xa', 'xi', 'xg' for respectively alpha,pi,310 helices and + 'e ' for strand. + + + """ + + def __init__(self, nodelist): + """ + Intialize a PTTableau with no tableau entries set yet. + + Parameters: + nodelist - list of nodes that will be in the tableau, ins + residue sequence number order. + """ + self.tabdict = {} # { (res1, res2) : code }; see class documentation\ + self.nodelist = nodelist # ptnodes with those not in tableau removed + + + def __str__(self): + """ + Return string representation of the tableau; we will write a full matrix + just like TableauCreator does. + """ + s = "" + for sse1 in self.nodelist: + for sse2 in self.nodelist: + try: + s += self[(sse1, sse2)] + ' ' + except KeyError: + s += "?? " + s += '\n' + return s + + + + # + # methods defined to implement container type + # + + def __len__(self): + """ + Return number of SSEs in the tableau + + Parameters: None + Return value: Number of SSEs in nodelist for building tableau + """ + return len(self.nodelist) + + + def __getitem__(self, ssepair): + """ + Return the entry in the tableau for the pair of SSEs + (sse1, sse2) where sse1 and sse2 are PTNode objects; + or if ssepair is (i,j) where i,j are integers, the corresponding + tableau entry for (nodelist[i], nodelist[j]). + + Parameters: + ssepair - tuple (sse1,sse2) (PTNode objects) or + tuple (i,j) (integers) to look up tableau entry for + Return value: + two character tableau string e.g. 'RD' or 'HH', or ' ' (2 spaces). 
+ On the main diagonal (self-orientation) since this has no menaing + we return a (two-char) encoding of the SSE type instead: + 'xa', 'xi', 'xg' for respectively alpha,pi,310 helices and + 'e ' for strand. + + Raises Exceptions: + TypeError if ssepair is not PTNode pair or int pair. + """ + ssespec1 = ssepair[0] + ssespec2 = ssepair[1] + if isinstance(ssespec1, PTNode) and isinstance(ssespec2, PTNode): + sse1 = ssespec1 + sse2 = ssespec2 + elif isinstance(ssespec1, int) and isinstance(ssespec2, int): + sse1 = self.nodelist[ssespec1] + sse2 = self.nodelist[ssespec2] + else: + raise TypeError("bad tuple type in PTTableau getitem") + if sse1 == sse2: + if isinstance(sse1, PTNodeHelix): + if sse1.get_type() == "ALPHA": + return "xa" + elif sse1.get_type() == "PI": + return "xi" + elif sse1.get_type() == "310": + return "xg" + else: + return "??" # should not happen + elif isinstance(sse1, PTNodeStrand): + return "e " + else: + return "??" # should not happen + elif sse1 < sse2: + ssepair = (sse1,sse2) + else: + ssepair = (sse2,sse1) + return self.tabdict[ssepair] + + + def __setitem__(self, ssepair, tabcode): + """ + Set the entry in the tableau for the pair of SSEs (sse1,sse2) + specified as the key (ssepair) parameter to the tabcode value. + + Parameters: + ssepair - tuple (sse1,sse2) to set. + tabccode - two character tableau string e.g. 'RD' or 'HH', or ' '. + + Return value: None + + Raises exceptions: + TypeError if tabcode is not a valid 2 char uppercase string or ' ' + """ + if len(tabcode) != 2 or not tabcode.isupper() and not tabcode.isspace(): + raise TypeError("bad tableau code '" + tabcode + "'\n") + + if (tabcode[0] not in ['L','R','P','O'] or \ + tabcode[1] not in ['E','D','S','T']) and \ + tabcode != 'HH' and tabcode != 'KK': + raise TypeError("bad tableau code '" + tabcode + "'\n") + + sse1 = ssepair[0] + sse2 = ssepair[1] + if sse1 == sse2: + return + elif sse1 < sse2: + ssepair = (sse1, sse2) + else: + ssepair = (sse2, sse1) + self.tabdict[ssepair] = tabcode + + + # have not implemented: __delitem__, __iter__, __contains__ + + + # TODO: work out how to implement things like tab[2:] to get row 2, + # ust like Numeric.array etc. + def getrow(self, i): + """ + Return a row of the tableau as a list of tableau codes. + + Parameters: + i - row to get 0 <= i < len(self) + + Return value: + list of two-character tableau codes for row i. + """ + return [self[(i,j)] for j in xrange(len(self))] + + +class PTTableauPacked: + """ + The PTTableauPacked class is a compact representation + of a protein tableau, as per Kamat and Lesk 2007 + 'Contact Patterns Between Helices and Strands of Sheet Define Protein + Folding Patterns' Proteins 66:869-876. + + The tableau is a 2 dimensional symmetric matrix indexed by SSEs + in the protein where each entry + is a two character code representing the angle between those SSEs. + (See paper(s) for details). + + We implement it as a mapping container, i.e. using __getitem__ and + __setitem__ so that elements can bet get/set with dictionary/array + type syntax e.g. tableau[(1,2)]. (index is a pair of sequential + SSE numbers, from 0 to n-1 where n is the order of tableau ie number + of SSEs) - NB must have in parens to ensure tuple. + + This is the compact respresentation, storing tableau as simply + a linear string of two-character tableau codes, in the same as as the + LAPACK 'packed' format for triangular/symmetric arrays. i.e. + each column of the matrix is stored in sequence. 
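+    For example, for a tableau of order n = 3 the packed list holds the
+    entries in the order (0,0), (0,1), (1,1), (0,2), (1,2), (2,2); the
+    entry for (i,j) with i <= j is stored at index i + j*(j+1)/2.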
+ We could save even more space by using only 4 bits for each tableau + code (since there are only 16 possible codes), but in Python + it doesn't really make sense to try to be so efficient - but + we are trying to save space to some degree so that the entire ASTRAL + PDB non-redundant set or similar can be loaded as tableaux in + memory. + As it happens, strings in python don't even support item assignemnt, + so we have to store it as a list anyway i.e ['xa','OT',...] + instead of 'xaOT...' + + Unlike PTTableau, this format contains just the tableau codes + and diagonal SSE type entries, i.e. just character data. there + are no PTNode object references or anything, so it is simple and + quick to dump/load with Python pickle module (or similar) with + no need to build all sorts of other objects (PTNode, Bio.PDB.Structure, + etc.). + + Accessing (i,i) returns 'xa', 'xi', 'xg' for respectively + alpha,pi,310 helices and 'e ' for strand. + """ + + def __init__(self, tableau): + """ + Intialize a PTTableauPacked given an already built tableaux in + the full PTTableau format. + + Parameters: + tableau - an already built PTTableau object + """ + #self.n = 3 + #self.uplist = ['xa','OS','e ','OT','PE','xa'] + self.n = len(tableau) # order of tableau (number of SSEs) + self.uplist = [] # packed format of matrix upper triangle + # NB COLUMN-MAJOR (LAPACK style) + for j in range(self.n): + for i in range(j+1): + try: + tabcode = tableau[(i,j)] + except: + tabcode = '??' + self.uplist.append(tabcode) + assert(len(self.uplist) == self.n * (self.n + 1) / 2) + + def __str__(self): + """ + Return string representation of the tableau; we will write a full matrix + just like TableauCreator does. + """ + s = "" + for i in range(self.n): + for j in range(self.n): + s += self[(i,j)] + ' ' + s += '\n' + return s + + + # + # methods defined to implement container type + # + + def __len__(self): + """ + Return number of SSEs respresented the tableau + + Parameters: None + Return value: order of tableau + """ + return self.n + + + def __getitem__(self, ssepair): + """ + Return the entry in the tableau for the pair of SSEs + (i,j) where i,j are integers, 0 <= i,j < n. + + Parameters: + ssepair - tuple (i,j) (integers) to look up tableau entry for + Return value: + two character tableau string e.g. 'RD' or 'HH', or ' ' (2 spaces). + On the main diagonal (self-orientation) since this has no menaing + we return a (two-char) encoding of the SSE type instead: + 'xa', 'xi', 'xg' for respectively alpha,pi,310 helices and + 'e ' for strand. + + Raises Exceptions: + TypeError if ssepair is not PTNode pair or int pair. + """ + i = ssepair[0] + 1 + j = ssepair[1] + 1 # more convenient to have 1 < i,j <= n internally + if j < i: + tmp = i + i = j + j = tmp + r = i + j*(j-1)/2 + r -= 1 # back to zero-based for list indexing + return self.uplist[r] + + + def __setitem__(self, ssepair, tabcode): + """ + Set the entry in the tableau for the pair of SSEs (sse1,sse2) + specified as the key (ssepair) parameter to the tabcode value. + + Parameters: + ssepair - tuple (i,j) to set, 0 <= i,j < n. + tabccode - two character tableau string e.g. 'RD' or 'HH', + or 'xa','e ' etc. if i==j for SSE type on diagonal. + + Return value: None + + Raises exceptions: + TypeError if tabcode is not a valid 2 char uppercase string + or lowercase type code for diagonal. 
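+
+        For example, tab[(2,2)] = 'e ' sets the diagonal (SSE type) entry
+        for SSE 2, while tab[(0,3)] = 'RD' sets the orientation code for
+        the pair (0,3) (and, by symmetry, (3,0)).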
+ """ + i = ssepair[0] + j = ssepair[1] + if i == j: + if tabcode not in ['xa','xi','xg','e ']: + raise TypeError('bad tableau sse type code ' + tabcode + '\n') + else: + if len(tabcode) != 2 or not tabcode.isupper(): + raise TypeError("bad tableau code '" + tabcode + "'\n") + + if ( (tabcode[0] not in ['L','R','P','O'] or + tabcode[1] not in ['E','D','S','T']) and + tabcode != 'HH' and tabcode != 'KK' ): + raise TypeError("bad tableau code '" + tabcode + "'\n") + + if j < i: + tmp = i + i = j + j = tmp + i += 1 + j += 1 # more convenient to have 1 < i,j <= n internally + r = i + j*(j-1)/2 # location in packed rep assuming each entry len 1 + r -= 1 # back to zero-based for list indexing + self.uplist[r] = tabcode + + # have not implemented: __delitem__, __iter__, __contains__ + + # TODO: work out how to implement things like tab[2:] to get row 2, + # ust like Numeric.array etc. + def getrow(self, i): + """ + Return a row of the tableau as a list of tableau codes. + + Parameters: + i - row to get 0 <= i < len(self) + + Return value: + list of two-character tableau codes for row i. + """ + # TODO: we could do this more efficiently for packed tableau + return [self[(i,j)] for j in xrange(len(self))] + + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +def angle_to_tabcode(omega): + """ + Convert an angle (radians in (-pi, pi]) to a two-character tableau + code (double quadrant encoding) as described in the papers cited + at top of module. + + Parmaters: + omega - relative angle to encode + Return value: + two-character tableua code (OS, PD, etc.) + Raises exceptions: + ValueError if angle is out of range + """ + if omega > -pi/4 and omega <= pi/4: + tabcode = 'P' # parallel + elif omega > pi/4 and omega <= 3*pi/4: + tabcode = 'R' # crossing-right + elif (omega > 3*pi/4 and omega <= pi) or omega > -pi and omega <= -3*pi/4: + tabcode = 'O' # antiparallel (opposite) + elif omega > -3*pi/4 and omega <= -pi/4: + tabcode = 'L' # crossing-left + else: + raise ValueError('bad omega value ' + str(omega) + '\n') + + if omega > 0 and omega <= pi/2: + tabcode += 'D' # dinner + elif omega > pi/2 and omega <= pi: + tabcode += 'T' # tea + elif omega > -pi and omega <= -pi/2: + tabcode += 'S' # supper + elif omega > -pi/2 and omega <= 0: + tabcode += 'E' # elevenses + else: + raise ValueError('bad omega value ' + str(omega) + '\n') + + return tabcode + + + +def compute_tableau(ptnode_list, pdb_structure, use_hk=True): + """ + Build a PTTableau object for the tableau by computing relative angles + between all SSEs in the ptnode_list. + + Parameters: + ptnode_list - list of PTNode objects (ie iterable of PTNode) + representing the SSEs (helices,strands) the + tabelau is for. + pdb_structure - parsed Bio.PDB structure + use_hk - If True, use the HH and KK codes for respectively + antiparallel and parallel strands of the same sheet. + Default True. + + Return value: + PTTableau object with entry for each pair of SSEs. + """ + tableau = PTTableau(ptnode_list) + for i in range(len(ptnode_list)): + for j in range(i+1, len(ptnode_list)): + omega = ptnode_list[i].relative_angle(ptnode_list[j], pdb_structure) + if omega != None: + try: + tabcode = angle_to_tabcode(omega) + except ValueError: + sys.stderr.write('WARNING: catch bad tableau angle, seting Parallel (%d,%d)\n' %(i,j)) + tabcode = "PE" # NaN -> 0.0 -> parallel: should not happen but does e.g. 
d7pcka_ -35 + # set tabcode to HH for antiparallel strands and + # KK for parallel strands + if (use_hk and + isinstance(ptnode_list[i], PTNodeStrand) and + isinstance(ptnode_list[j], PTNodeStrand) and + (ptnode_list[i].get_sheet_id() != None and + ptnode_list[i].get_sheet_id() == + ptnode_list[j].get_sheet_id())): + if tabcode[0] == 'O': + tableau[(ptnode_list[i], ptnode_list[j])] = 'HH' + elif tabcode[0] == 'P': + tableau[(ptnode_list[i], ptnode_list[j])] = 'KK' + else: + tableau[(ptnode_list[i], ptnode_list[j])] = tabcode + else: + tableau[(ptnode_list[i], ptnode_list[j])] = tabcode + + if verbose: + sys.stderr.write(str(tableau)) + + return tableau + + +def compute_omega_matrix(ptnode_list, pdb_structure): + """ + Return the omega (relative angle, in radians) matrix as a 2D Numeric.array + by computing relative angles between all SSEs in the ptnode_list + + Parameters: + ptnode_list - list of PTNode objects (ie iterable of PTNode) + representing the SSEs (helices,strands) the + tabelau is for. + pdb_structure - parsed Bio.PDB structure + + Return value: + Numeric.array square symmetric (order length of ptnode_list) where + each entry is relative angle between SSEs in radians. + Main diagonal entries set to 0. + + """ + n = len(ptnode_list) + omega_array = Numeric.zeros((n, n), Numeric.Float) + for i in range(n): + for j in range(i+1, n): + omega = ptnode_list[i].relative_angle(ptnode_list[j], pdb_structure) + if omega == None: + omega_array[i, j] = float('NaN') + else: + omega_array[i, j] = omega + omega_array[j, i] = omega_array[i, j] + + # set the diagonal as follows: + # 0.00 for strand + # 1.00 for alpha helix + # 2.00 for pi helix + # 3.00 for 3_10 helix + for i in range(n): + if isinstance(ptnode_list[i], PTNodeHelix): + if ptnode_list[i].get_type() == "ALPHA": + v = 1.00 + elif ptnode_list[i].get_type() == "PI": + v = 2.00 + elif ptnode_list[i].get_type() == "310": + v = 3.00 + else: + pass # should not happen + elif isinstance(ptnode_list[i], PTNodeStrand): + v = 0.00 + omega_array[i,i] = v + + return omega_array + + +#----------------------------------------------------------------------------- +# +# Classes and functions for running external TableauCreator +# +#----------------------------------------------------------------------------- + + +# Inherit from the PDBIO.Select class for writing only parts of PDB to file +# See the Bio.PDB documentation: biopython-1.43/Doc/biopdb_faq.pdf +class DomainSelect(Select): + """ + The DomainSelect class inherits from the PDBIO.Select class + and overrides function to select only certain residues for writing + ATOM records in the domain we are interested in to the + simplified PDB file for TableauCreator. + + See the Bio.PDB documentation by Thomas Hamelryck: + biopython-1.43/Doc/biopdb_faq.pdf + """ + def __init__(self, domain): + """ + Constructor for the DomainSelect class, sets the domain member + used to accept only residues in that domain. + Parameters: + domain - ptdomain object of domain to select residues from + """ + self.domain = domain + + def __repr__(self): + """ + Overrides the base __repr__ to write out the domain we have + """ + return "" + + def accept_residue(self, residue): + """ + overrides the base accept_residue() function to accept only + residues in our domain. Also reject HETATMS. + Paramteters: + residue - Bio.PDB Residue object of residue to test + Return value: + 1 to accept residue, 0 to reject. 
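+
+        For example, a standard residue numbered 42 has Bio.PDB id
+        (' ', 42, ' ') and is accepted iff residue 42 of its chain lies
+        in self.domain; any residue with a non-blank hetero field
+        (a HETATM) is rejected.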
+ """ + chain = residue.get_parent() + chainid = pdb_chainid_to_stride_chainid(chain.get_id()) + # id of a residue in Bio.PDB is tuple (hetatm, resseqnum, icode) + # so we choose those where chain and residue PDB number + # is in the domain. + resnum = residue.get_id()[1] + if (self.domain.is_in_domain(chainid, resnum) and + residue.get_id()[0] == ' '): + return 1 + else: + return 0 + + + +def get_tableau_from_pdbstruct(pdbid, domain, + pdb_structure, ptnode_list): + """ + Build a PTTableau object for the tableau by first creating a + simple PDB file with only the ATOM records for residues in the + domain we are processing, and also a .SSEsInfo file containing the + secnodary structure assignments we already have, then running + TableauCreator on it (using our simple PDB file and SSEsInfo) and + parsing the output. + + Parameters: + pdbid - PDB identifier of the strucutre + domain - The PTDomain object for our current domain + pdb_structure - parsed Bio.PDB structure + ptnode_list - list of PTNode objects (ie iterable of PTNode) + representing the SSEs (helices,strands) the + tabelau is for. + Return value: + PTTableau object built from TableauCreator output + + """ + TMPDIR = os.tempnam(None, "pttabin") + os.mkdir(TMPDIR) + try: + filename = pdbid + if domain.domainid != None: + filename += '-' + domain.domainid + filename += '.pdb' + domain_pdb_filename = os.path.join(TMPDIR, filename) + io = PDBIO() + io.set_structure(pdb_structure) + io.save(domain_pdb_filename, DomainSelect(domain)) + + ssesinfo_filename = os.path.join(TMPDIR, filename + ".input-SSEsInfo") + write_ssesinfo(ssesinfo_filename, ptnode_list) + + tableau = read_tableau_from_tableaucreator(domain_pdb_filename, + ptnode_list, + ssesinfo_filename) + os.unlink(domain_pdb_filename) + os.unlink(ssesinfo_filename) + finally: + cleanup_tmpdir(TMPDIR) + return tableau + + +def read_tableau_from_tableaucreator(pdb_filename, ptnode_list, + ssesinfo_filename): + """ + Run Arun's TableauCreator program on the supplied pdb_filename + using SSEsInfo file. + + Parameters: + pdb_filename - PDB file to run TableauCreator on + ptnode_list - list of PTNode objects (ie iterable of PTNode) + representing the SSEs (helices,strands) the + tabelau is for. + ssesinfo_filename - filename of the .SSEsInfo file that was written + to define SSEs for TableauCreator. + Return value: + PTTableau object built from TableauCreator output + + NB: TableauCreator is not yet published or available (October 2007) + and I am using a private version which Arun sent me, which I modified + to add the -s option to use STRIDE rather than DSSP + and to have the -i option to parse .SSEsInfo files. + """ + + # TableauCreator needs an output directory where it writes all its + # intermediate/output files, only puts progress information/errors + # to stdout/stderr. 
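+    # The command run below has the general form (paths illustrative only):
+    #
+    #   TableauCreator -i /tmp/X.input-SSEsInfo /tmp/X.pdb /tmp/pttabNNN >/dev/null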
+ + tmpdir = os.tempnam(None, "pttab") + os.mkdir(tmpdir) + command = "TableauCreator " + command += "-i " + ssesinfo_filename + " " + command += pdb_filename + " " + tmpdir + command += " >/dev/null" + if verbose: + sys.stderr.write("running '" + command + "'...") + os.system(command) + if verbose: + sys.stderr.write("done\n") + # output files are: + # .angles + # .SSEsInfo + # .stride or .dssp + # .tableau + outfile_prefix = os.path.join(tmpdir, os.path.basename(pdb_filename)) + if not os.path.isfile(os.path.join(tmpdir, "TABCREATE_OK")): + sys.stderr.write("ERROR: TableauCreator failed\n") + cleanup_tmpdir(tmpdir) + return None + # Now the tricky thing is TableauCreator indexes its matrix just with + # purely sequential numbers from 0 (as conventional) + # assuming all SSEs in one domain and in fact one chain + # (so we handle this by creating our own simple PDB file with only + # ATOM records for our current domain, and only one TER record on + # end so chains concatenated effectively). + # And also (as in comments above functions) we have the dodginess of + # doing the same thing in different ways in multiple places (DSSP/STRIDE + # parsing, PDB parsing, etc.). + # So let's check that the TableauCreator SSE info lines up with ours + # (otherwise we can't use the tableau data). + + # parse the SSEsInfo file and check lines up with ptnodes, + # returns list of ptnodes corresponding to Tableau entries (may be shorter + # than our input node list; some removed as no equivalent in tableua). + nodelist = parse_tableaucreator_ssesinfo(outfile_prefix + '.SSEsInfo', + ptnode_list) + if nodelist != None: + tableau_filename = outfile_prefix + ".tableau" + tableau = parse_tableaucreator_output(tableau_filename, nodelist) + if tableau != None: + if verbose: + sys.stderr.write(str(tableau)) + else: + sys.stderr.write('WARNING: problem parsing TableauCreator output;\n' + ' tableau information will not be used\n') + else: + sys.stderr.write('WARNING: problem with TableauCreator output;\n' + ' tableau information will not be used\n') + tableau = None + + cleanup_tmpdir(tmpdir) + return tableau + + +def parse_tableaucreator_ssesinfo(filename, nodelist): + """ + Parse the .SSEsInfo file created by TableauCreator and check that + it lines up with our SSE info in the form of the list of helix/strand + PTNodes. + + Parameters: + filename - filename of the .SSEsInfo file + nodelist - list of PTNodes, in order of residue sequence number + + Return value: + nodelist where nodes with no tableau entry removed (too short) if + OK (they line up) else None (different number of nodes, + residue sequence numbers/types don't match, etc). 
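+
+    The .SSEsInfo file is a count line followed by one record per SSE,
+    e.g. 'H 10 21 A' (type, start and end PDB residue numbers, chain);
+    only the first three fields are checked against the PTNode list here.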
+ """ + + # first remove all nodes with len < TABLEAU_MIN_SSE_LEN, since Tableau + # Creator won't have entries for them; we will have to set them +# ptnodelist = [ node for node in nodelist \ +# if ( (isinstance(node, PTNodeHelix) and +# node.get_span() >= TABLEAU_MIN_HELIX_LEN) or +# (isinstance(node, PTNodeStrand) and +# node.get_span() >= TABLEAU_MIN_STRAND_LEN) ) +# ] + + ptnodelist = nodelist + # FIXME: no longer need this filtering of too short SSEs, + # now that .SSEsInfo input is being used + if len(ptnodelist) != len(nodelist): + sys.stderr.write('WARNING: no tableau entry for ' + + str(len(nodelist)-len(ptnodelist)) + + ' nodes due to length to small' + + '\n') + + fh = open(filename) + # first line is number of SSEs, subsequent lines are + # type start_resnum end_resnum + # where type is E or H (DSSP code) and resnums are PDB residue numbers + # + numlines = int(fh.readline()) + if numlines < 2: + sys.stderr.write('ERROR: bad SSEsInfo data\n') + fh.close() + return None + linenum = 1 + sseinfo = [] # tuple (type, start, end) + line = fh.readline() + while line != "": + fields = line.split() + sseinfo.append((fields[0], int(fields[1]), int(fields[2]))) + linenum += 1 + line = fh.readline() + fh.close() + if len(sseinfo) != numlines: + sys.stderr.write('ERROR: TableauCreator SSEsInfo file specified ' \ + + str(numlines) + ' entries but ' \ + + str(len(sseinfo)) + ' read\n') + return None + if len(sseinfo) != len(ptnodelist): + sys.stderr.write('ERROR: TableauCreator SSEsInfo file has ' \ + + str(numlines) + ' entries but ' \ + + 'we have ' + str(len(ptnodelist)) + ' SSEs\n') + return None + for i in range(len(sseinfo)): + sse = sseinfo[i] + ptnode = ptnodelist[i] + if sse[0] == 'H' and not isinstance(ptnode, PTNodeHelix) or \ + sse[0] == 'E' and not isinstance(ptnode, PTNodeStrand) or \ + sse[1] != ptnode.get_start_res_seq() or \ + sse[2] != ptnode.get_end_res_seq(): + sys.stderr.write('ERROR: TableauCreator SSEInfo entry ' + \ + str(sse) + ' does not match node ' + + str(ptnode) + '\n') + return None + return ptnodelist + + +def parse_tableaucreator_output(filename, nodelist): + """ + Parse the .tableau file created by TableauCreator + + Parameters: + filename - filename of the .tableau file + nodelist - list of PTNodes, in order of residue sequence number + + Return value: + PTTableau for the tableau parsed + """ + tableau = PTTableau(nodelist) + # First line of file is number of SSEs (order of square matrix) + # The whole matrix is in the file (it is symmetric), diagonal elements + # and other unset (ie non-contact) elements are set to '--', or ' ' + # (two spaces) when cannot be calculated. + fh = open(filename) + numlines = int(fh.readline()) + if numlines < 2: + sys.stderr.write('ERROR: bad tableau data\n') + fh.close() + return None + linenum = 1 + line = fh.readline() + i = 0 + while line != "": + # we will use the fact that fields are fixed length rather than + # splitting on space separator as fields may be set to ' ' (2 spaces) + # when error calculating (helices too short etc.) in TableauCreator. 
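+        # (Illustrative only: a row for 3 SSEs might look like
+        # '--  OS  RD  ', i.e. one 4-character slot per SSE.)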
+ # fields are two chars with two spaces between each + node_i = nodelist[i] + if len(line) < len(nodelist) * 4: + sys.stderr.write('ERROR: bad line in tableau; line too short:\n') + sys.stderr.write(line) + fh.close() + return None + for j in range(i+1, len(nodelist)): # no need to store both i,j and j,i + node_j = nodelist[j] + col = 4*j # each field as 2 char tabcode then 2 spaces + tabcode = line[col:col+2] + if tabcode != ' ' and tabcode != '--': + tableau[(node_i, node_j)] = line[col:col+2] + i += 1 + linenum += 1 + line = fh.readline() + fh.close() + return tableau + + + +def write_ssesinfo(filename, nodelist): + """ + Write a TableauCreator .SSEsInfo file describing the SSE asignments + we have to the specified filename. This is so that we avoid having + TableauCreator re-run DSSP or STRIDE for the assignments, which is + inefficient and leads to inconsistencies. TableauCreator has been + modified to be able to read this .SSEsInfo file instead, allowing + the same assignments we have to be re-used by TableauCreator. + + WARNING: file is overwritten if it exists + + Parameters: + filenanme - filename to write SSEsInfo to + nodelist - list of PTNodes defining the SSEs + + Return value: None + """ + # The format of the .SSEsInfo file is (see writeTableauAnglesSSEinfo()) + # that the first line has number of records and each subsequent + # line (record) + # is whitespace-separated: + # dssp-code start end chainid + # e.g. + # H 10 21 A + # Only H and E codes are used. + # blank chainid is not allowed, '-' used instead. + fh = open(filename, 'w') + fh.write(str(len(nodelist)) + "\n") + for node in nodelist: + if isinstance(node, PTNodeHelix): + typecode = 'H' + elif isinstance(node, PTNodeStrand): + typecode = 'E' + else: + assert(False) + fh.write(typecode + " " + str(node.get_start_res_seq()) + " " + + str(node.get_end_res_seq()) + " " + node.get_chainid() + "\n") + fh.close() + + +def pttableau_set_verbose(verb): + """ + set the module global verbose flag in this module to supplied value + Parameters: verb - True (for verbose output) or False + Return value: None + Uses globals: verbose (in this module) + """ + global verbose + verbose = verb + diff --git a/scripts/ptutils.py b/scripts/ptutils.py new file mode 100644 index 0000000..9539038 --- /dev/null +++ b/scripts/ptutils.py @@ -0,0 +1,148 @@ +############################################################################### +# +# ptutils.py - Miscellaneous utility functions +# +# +# File: ptutils.py +# Author: Alex Stivala +# Created: October 2007 +# +# $Id: ptutils.py 1682 2008-07-15 02:55:14Z astivala $ +# +############################################################################### + +import os,sys +import glob + +def cleanup_tmpdir(tmpdir): + """ + Remove a temporary directory and its contents + Parameters: + tmpdir - temporary directory to remove + Return value: None + """ + try: + for filename in glob.glob(os.path.join(tmpdir, "*")): + os.remove(filename) + os.rmdir(tmpdir) + except OSError, inst: + sys.stderr.write('WARNING: could not remove temp files' + ' in ' + tmpdir + '\n' + str(inst) + '\n') + + + +def get_int_icode(res_seq): + """ + Return tuple (int, icode) with integer residue sequence number and + single char icode from PDB residue sequence string such as '60A' or '61' + etc. 
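+
+    For example, '60A' yields (60, 'A') and '61' yields (61, None).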
+ + Parameters: + res_seq - PDB resisue sequence number string, with or without icode + Return value: + tuple (int, icode) where + int is integer part of res_seq, and icode is char insertino code or None + """ + if not res_seq[-1].isdigit(): + int1 = int(res_seq[0:len(res_seq)-1]) + icode = res_seq[-1] + else: + int1 = int(res_seq) + icode = None + return (int1, icode) + + +def biopdbresid_to_pdbresseq(biopdb_residueid): + """ + Give a Bio.PDB Residue id tupe (hetatm, resseqnum, icode), return + the PDB residue sequence number string consisting of the sequence + number and the insertion code, if not blank. + + Parameters: + biopdb_residueid - tuple (hetatm, resseqnum, icode) from Residue.get_id() + Return value: + string residue PDB sequence number e.g. '60' or '60A'. + """ + # Residue.get_id() gives tuple (hetatm, resseqnum, icode) + res_seq = str(biopdb_residueid[1]) + if biopdb_residueid[2] != ' ': + res_seq += biopdb_residueid[2] + return res_seq + + +def pdb_res_seq_cmp(res_seq1, res_seq2): + """ + Comparison function for PDB residue sequence numbers, which + are strings that may have insertion code on end e.g. '60' + or '60A'. Compare in integer part as integers, if same then + use insertion code e.g. '60' < '61' regardless of what insertion + codes we could put on the end, but also '60' < '60A' < '60B' < '61'. + + DANGER: this comparison is not always correct for all PDB files: + is assumes residues are ordered as above (60, 60A, 60B, 61, etc.) + but some PDB files do NOT work this way, e.g. 1HVC, where the + first part of the chain numbered 1 up to 99 all has insertion code + B, then there are some more residues, then numbering starts + again at 1 but with insertion code A on all residues! + + Parameters: + res_seq1 - PDB residue sequence string as above + res_seq2 - PDB resiue seqwuence string as above + Return value: + -1 if res_seq1 < res_seq2 + 0 if res_seq1 = res_seq2 + 1 if res_seq1 > res_seq2, + according to the ordering defined above. + Uses no data members. + """ + (int1, icode1) = get_int_icode(res_seq1) + (int2, icode2) = get_int_icode(res_seq2) + + if int1 < int2: + return -1 + elif int1 > int2: + return 1 + else: # compare icodes, note None < x for any x + if icode1 < icode2: + return -1 + elif icode1 > icode2: + return 1 + else: + return 0 + +def char_if_not_blank(char): + """ + IF supplied character is not space, return it, else return empty + string. + Paramaters: + char - char to test + Return value: + char if char is not space, else ''. + """ + if char == ' ': + return '' + else: + return char + + + +def isNaN(x): + """ + Test if supplied float is an IEEE not-a-number (NaN). + For some reason Python does not hav a function to do this, + and nor does Numeric (although numpy and scipy have support for it). + + Parameters: + x - float to test for NaN + + Return value: + True if x is NaN, else False. 
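+
+    For example, isNaN(float('nan')) is True and isNaN(0.0) is False.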
+ """ + # NaN is the only float value that is not equal to itself (IEEE + # standard) + if x != x: + return True + else: + return False + + diff --git a/scripts/ptversion.py b/scripts/ptversion.py new file mode 100644 index 0000000..6ff572a --- /dev/null +++ b/scripts/ptversion.py @@ -0,0 +1,7 @@ +# autogenerated by /home/alexs/phd/ptgraph/buildversion.sh +# Mon Nov 23 16:31:03 EST 2009 +def get_version(): + """ + Return version string containing global version number and 'build' date + """ + return "Revision 2978, Mon Nov 23 16:31:03 EST 2009" diff --git a/scripts/pytableaucreate.py b/scripts/pytableaucreate.py new file mode 100755 index 0000000..773d17e --- /dev/null +++ b/scripts/pytableaucreate.py @@ -0,0 +1,589 @@ +#!/usr/bin/env python +############################################################################### +# +# pytableaucreate - Python implementation of protein Tableau creator +# +# File: pytableaucreate.py +# Author: Alex Stivala +# Created: February 2008 +# +# $Id: pytableaucreate.py 2950 2009-11-16 06:24:50Z astivala $ +# +# +# Create a protein tableau and write it to stdout. +# The implemntation is actually in pttableau.py which is used by ptgraph2.py +# (Pro-Origami), this is basically just a wrapper for testing / standalone +# tableau creation (see pttableau.py). +# +# Also used to create SSE midpoint distance matrix. +# +# Tableaux are described by Kamat and Lesk 2007 +# 'Contact Patterns Between Helices and Strands of Sheet Define Protein +# Folding Patterns' Proteins 66:869-876 +# and Lesk 2003 'From Electrons to Proteins and Back Again' +# Int. J. Quant. Chem. 95:678-682 +# and Lesk 1995 'Systematic representation of folding patterns' +# J. Mol. Graph. 13:159-164. +# +# The implementation is based on Arun Konagurthu's TableauCreator program, see +# Konagurthu, Stuckey and Lesk 2008 'Structural search and retrieval using +# a tableau representation of protein folding patterns' Bioinformatics +# (advance access, to be published Jan 5 2008). +# +# Example usage: +# +# pytableaucreate.py 1QLP.pdb +# +# Filenames may be either in the format above or the pdbq1lp.pdb format. +# Compressed pdb files are supported (gzip) (e.g. pdb1qlp.ent.gz). +# +# It is written in Python and depends on some Python libraries: +# +# . 
BioPython (including Bio.PDB) +# http://www.biopython.org +# +# Reference for Bio.PDB is: +# Hamelryck and Manderick 2003 "PDB parser and structure class implemented +# in Python" Bioinformatics 19:2308-2310 +# +# which in turn depends on Numeric +# http://sourceforge.net/projects/numpy +# +# +# Developed on Linux 2.6.9 (x86_64) with Python 2.5.1 +# and BioPython 1.43 with Numeric 24.2 +# +############################################################################### + + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os +import getopt +import re +import pickle +import random +import copy +from math import degrees +import numpy.oldnumeric as Numeric +from Bio.PDB import * + +import ptsecstruct +from ptnode import ptnode_set_verbose +from ptdomain import * +from ptutils import cleanup_tmpdir,isNaN +import getdomains +from tableaubuild import TableauBuild,make_tableaux +from pttableau import PTTableauPacked + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +def write_tableau(n, tableau, permutation, use_numeric, + fortran_format, build_distance_matrix): + """ + Write tableau or distance matrix to stdout. + + n - order of tableau or distance matrix (n by n) + tableau - PTTableau object for tableau or Numeric matrix for + Omega matrix or Numeric matrix for distance matrix + permutation - permuted list of ingeters in interval [0, n-1] to + permute the rows+cols of the tableau/matrix by + (so [0,1,2,...n-1] for no permutation. + use_numeric - boolean. If true, tableau is a Numeric Omega matrix + not a tableau. + fortran_format - boolean. If True, put in lower triangle format + for FORTRAN programs tsrchd etc. + build_distance_matrix - boolean. If True is a distance matrix not a + tableau or Omega matrix. + + """ + if build_distance_matrix: + distmatrix = tableau + if fortran_format: + for k in range(n): + for l in range(k+1): + kprime = permutation[k] + lprime = permutation[l] + if isNaN(distmatrix[kprime,lprime]): + dist = 0.0 + else: + dist = distmatrix[kprime,lprime] + if dist > 99.9: + sys.stderr.write('WARNING: distance %f at (%d,%d) truncated to 99.9 for fortran format\n' % (dist,kprime,lprime)) + dist = 99.9 + sys.stdout.write("%6.3f " % dist) + sys.stdout.write("\n") + else: + for k in range(n): + for l in range(n): + kprime = permutation[k] + lprime = permutation[l] + sys.stdout.write("% 6.2f " % distmatrix[kprime,lprime]) + sys.stdout.write("\n") + elif use_numeric: + Omega = tableau + if fortran_format: + for k in range(n): + for l in range(k+1): + kprime = permutation[k] + lprime = permutation[l] + if isNaN(Omega[kprime,lprime]): + angle = 0.0 + else: + angle = Omega[kprime,lprime] + sys.stdout.write("%6.3f " % angle) + sys.stdout.write("\n") + else: + for k in range(n): + for l in range(n): + kprime = permutation[k] + lprime = permutation[l] + sys.stdout.write("% 4.3f " % Omega[kprime,lprime]) + sys.stdout.write("\n") + else: + if fortran_format: + for k in range(n): + for l in range(k+1): + kprime = permutation[k] + lprime = permutation[l] + sys.stdout.write(tableau[(kprime,lprime)] + " ") + sys.stdout.write("\n") + else: + # can't just sys.stdout.write(str(tableau)) if shuffled... 
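+            # Here permutation maps output position to original row/column:
+            # the entry written at output position (k,l) is the original
+            # tableau entry at (permutation[k], permutation[l]).  With the
+            # null permutation [0,1,...,n-1] the matrix is written unchanged;
+            # e.g. permutation = [2,0,1] writes original row 2 first.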
+ for k in range(n): + for l in range(n): + kprime = permutation[k] + lprime = permutation[l] + sys.stdout.write(tableau[(kprime,lprime)] + " ") + sys.stdout.write('\n') + + +def write_tableau_old_format(n, Omega, ssestr): + """ + Write tableau to stdout in the original + (Arun) TableauCreator format, with angles in degrees, + full matrix, number of SSEs on first line and SSE sequence + (DSSP codes E,H) on second line. + + n - order of tableau matrix (n by n) + Omega - Numeric matrix for Omega matrix + sse_str - SSE string correspdonding to the Omega matrix + """ + sys.stdout.write(str(len(Omega)) + '\n') + sys.stdout.write(ssestr + '\n') + for k in range(n): + for l in range(n): + angle = degrees(Omega[k,l]) + if isNaN(angle) or k == l: + angle = -999.0 + sys.stdout.write("% 7.1f " % angle) + sys.stdout.write("\n") + +def write_distmatrix_old_format(n, dmat, ssestr): + """ + Write distance matrix to stdout in format for TableauComparer + (Arun) + full matrix, number of SSEs on first line and SSE sequence + (DSSP codes E,H) on second line. + + n - order of distance matrix (n by n) + dmat - Numeric matrix for distance matrix + sse_str - SSE string correspdonding to the distance matrix + """ + sys.stdout.write(str(len(dmat)) + '\n') + sys.stdout.write(ssestr + '\n') + for k in range(n): + for l in range(n): + d = dmat[k,l] + if isNaN(d) or k == l: + d = -999.0 + sys.stdout.write("% 7.1f " % d) + sys.stdout.write("\n") + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + + +def usage(progname): + """ + Print usage message and exit + """ + sys.stderr.write("Usage: " + progname + + " [-35knuvfe] [-d|-b] [-t struct_prog] " + "[-p domain_prog] [-a domainid] [-s sse_num_list] [-c chainid] " + "[-m min_sse_len] [-o ] [-i identifier] " + "\n") + sys.stderr.write(" -3 include 3_10 helices\n") + sys.stderr.write(" -5 include pi helices\n") + sys.stderr.write(" -k use HH and KK codes for anti/parallel strands in same sheet\n") + sys.stderr.write(" -n output numeric matrix rather than tableau\n") + sys.stderr.write(" -e output numeric tableau angles in degrees, in original TableauCreator .angles file format\n") + sys.stderr.write(" -f output in FORTRAN style format for TSRCHN\n") + sys.stderr.write(" -d build SSE axis midpoint distance matrix not tableau\n") + sys.stderr.write(" -b build both tableau and distance matrix\n") + sys.stderr.write(" -p domain decomposition method/db\n" + " valid values are none (default), " + "ddomain, cath:cdffile, pdomains:pdomainsfile\n") + sys.stderr.write(" -a domainid : only output for specified domain\n") + sys.stderr.write(" -t struct_prog : use struct_prog define " \ + "secondary structure\n") + sys.stderr.write(" supported is 'pdb' (default) or 'stride' or 'dssp'\n") + sys.stderr.write(" -s sse_num_list : specifies comma-separated list of " + "SSE sequential numbers to include in the tableau\n") + sys.stderr.write(" -m min_sse_len : minimum number of residues in SSE to " + "be included in tableau\n") + sys.stderr.write(" -c chainid : specify chain identifier; only build" + "tableau for that chain\n") + sys.stderr.write(" -i identifier : when using -f, specify identifier " + " to use rather than deriving from filename\n") + sys.stderr.write(" -o savefile : save tableau in packed format for use " + "in other programs such as tabsearchqpml.py\n" + " WARNING: savefile is overwritten if it exists.\n") + sys.stderr.write(" -u randomly permute the 
tableau/distance matrix\n")
+    sys.stderr.write("  -v print verbose debugging messages to stderr\n")
+    sys.exit(1)
+
+
+def main():
+    """
+    main for pytableaucreate.py
+
+    Usage: pytableaucreate [-35nefuv] [-d|-b] [-t structprog] [-p domainprog]
+                           [-a domainid]
+                           [-s sse_num_list] [-c chainid] [-m min_sse_len]
+                           [-o savefile]
+                           pdbfile
+
+
+    -3 specifies to include 3_10 helices in the diagram. Default is only
+       alpha helices.
+
+    -5 specifies to include pi helices in the diagram. Default is only
+       alpha helices.
+
+    -k use the HH and KK codes for respectively antiparallel and parallel
+       strands in the same sheet, rather than the O, P etc. codes.
+
+    -n output a numeric omega matrix instead of tableau.
+
+    -e output numeric tableau angles in degrees, in the original
+       TableauCreator .angles file format, with number of entries on
+       first line, SSE sequence description on second line (E/H), then
+       (full) matrix with angles in degrees (rather than radians).
+       For distance matrix, same format with distances between SSEs
+       in Angstroms.
+
+    -f output the matrix in 'FORTRAN style' lower triangle with
+       header line suitable for input to TMATN.
+
+    -d build SSE axis midpoint distance matrix rather than tableau.
+
+    -b build both the tableau and distance matrix and output together,
+       for use with tsrchd etc. for example. If -u is used to permute
+       the matrices, they are permuted the same way so they are still
+       consistent.
+
+    -p specify the domain decomposition method.
+       Valid values are 'none' (default), 'ddomain', 'cath:cdf_filename'.
+
+    -a domainid : only output specified domain
+
+    -t specifies the secondary structure assignment program to use.
+       Currently supported are 'pdb' (default), 'dssp' and 'stride'.
+
+    -s sse_num_list specifies a comma-separated
+       list of SSE sequential ids to build the
+       tableau for. SSE sequential id's start at 1 and go from N to C
+       terminus. E.g. -s1,5,8 includes only the 1st, 5th and 8th SSEs.
+       Numbers do not restart at chains (but do restart in each domain).
+       These numbers are those assigned by the 'ptgraph2 -b sequential' option.
+
+       TODO: this currently does not make sense when multiple domains
+       are being processed; this option applies to each domain.
+
+    -c chainid : specify chain identifier; only build tableau for that chain
+
+    -m min_sse_len : minimum number of residues in an SSE for it to be included
+
+    -i identifier : when using fortran format (-f), specify the identifier
+       to use in the output rather than deriving it from the filename
+
+    -o savefile : save tableau in packed format for use in other
+       programs, such as tabsearchqpml.py
+       WARNING: savefile is overwritten if it exists
+
+       TODO: this currently does not make sense when multiple domains
+       are being processed; this option only saves the first domain.
+
+    -u randomly permute the rows+cols (symmetric) of the tableau/distance matrix.
+       Writes the permutation vector in the form
+       permutation = i,j,..,m
+       e.g.
+       permutation = 3,1,2,4
+       as the first line of output, before the identifier information and tableau.
+
+    -v specifies verbose mode: debugging output is written to stderr.
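+
+    Example (the identifier QUERY1 is hypothetical; 1QLP.pdb is the example
+    structure mentioned in the header comments above):
+
+        pytableaucreate.py -b -35 -f -t dssp -p none -i QUERY1 1QLP.pdb
+
+    builds both the tableau and the SSE distance matrix for 1QLP.pdb,
+    including 3_10 and pi helices, using DSSP-defined secondary structure,
+    and writes them in the FORTRAN-style format read by tsrchd under the
+    identifier QUERY1.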
+ """ + global verbose + + try: + opts, args = getopt.getopt(sys.argv[1:], "35bdfknep:a:t:s:c:m:i:o:uv?") + except getopt.GetoptError: + usage(os.path.basename(sys.argv[0])) + + valid_secstruct_programs = ["dssp", "stride", "pdb"] + valid_domain_programs = getdomains.valid_domain_programs + [r"none"] + valid_domain_programs_re = [ re.compile(re_str) for re_str in + valid_domain_programs ] + + verbose = False # global (python globals are only 'global' to module though) + secstruct_program = "pdb" + include_310_helices = False + include_pi_helices = False + domain_program = "none" + sse_id_list = None + use_numeric = False + use_hk = False + savefilename = None + min_sse_len = None + fortran_format = False + build_distance_matrix = False + chainid = None + fident = None + do_shuffle = False + build_both = False # both tableau and dist matrix + use_old_format = False # size + SSE chain + degrees omega matrix + domainid = None + + for opt,arg in opts: + if opt == "-3": # include 3_10 helices + include_310_helices = True + elif opt == "-5": # include pi helices + include_pi_helices = True + elif opt == "-d": # build SSE midpoint distance matrix not tableau + build_distance_matrix = True + elif opt == "-b": # build both tableau and distance matrix + build_both = True + elif opt == "-k": # use HH and KK codes + use_hk = True + elif opt == "-n": # output numeric matrix not tableau + use_numeric = True + elif opt == "-e": # use TableauCreator .angles file format + use_old_format = True + elif opt == "-f": # FORTRAN style format for TMATN + fortran_format = True + elif opt == "-p": # domain parsing program + domain_program = None + for valid_domarg_re in valid_domain_programs_re: + if valid_domarg_re.match(arg): + domain_program = arg + break + if domain_program == None: + sys.stderr.write("valid values for -p are: " + + str(valid_domain_programs) + "\n") + usage(sys.argv[0]) + elif opt == "-a": # only output tableau for specified domain id + domainid = arg + elif opt == "-t": + if arg not in valid_secstruct_programs: + sys.stderr.write("valid values for -t are: " + + str(valid_secstruct_programs) + "\n") + usage(sys.argv[0]) + secstruct_program = arg + elif opt == "-s": + sse_id_list_str = arg.split(',') + sse_id_list = [] + sse_id_uniq_dict = {} # { id : True } just for checking all unique + for sse_id_str in sse_id_list_str: + if sse_id_str.isdigit(): + if sse_id_uniq_dict.has_key(int(sse_id_str)): + sys.stderr.write("duplicate SSE sequential number " + + sse_id_str + "\n") + usage(sys.argv[0]) + sse_id_uniq_dict[int(sse_id_str)] = True + sse_id_list.append(int(sse_id_str)) + else: + sys.stderr.write("not a valid SSE sequential number '" + + sse_id_str + "'\n") + usage(sys.argv[0]) + sse_id_list.sort() # ensure SSEs are in order + elif opt == "-c": # chain identifier + if len(arg) != 1: + sys.stderr.write("invalid chain identifier for -c option\n") + usage(sys.argv[0]) + chainid = arg.upper() + elif opt == "-m": # min sse len + min_sse_len = int(arg) + elif opt == "-i": # identifier to use for fortran format + fident = arg + elif opt == "-o": # save tableau in packed format + savefilename = arg + elif opt == "-u": # randomly permute the tableau/matrix + do_shuffle = True + elif opt == "-v": # verbose + verbose = True # this module only + ptnode_set_verbose(True) # ptnode module + ptsecstruct.ptsecstruct_set_verbose(True) # ptsecstruct module + ptdomain_set_verbose(True) # ptdomain module + else: + usage(sys.argv[0]) + + if use_numeric and use_hk: + sys.stderr.write("-n (numeric) and -k (use HH 
and KK codes) are " + "mutually exlusive\n") + usage(sys.argv[0]) + + if build_distance_matrix and build_both: + sys.stderr.write("WARNING: both -d (build dist matrix) and -b " + "(build both) specified, ignoring -d\n") + build_distance_matrix = False + + if savefilename and do_shuffle: + sys.stderr.write('WARNING: saved tableau will not be shuffled\n') + + if build_distance_matrix: + if use_numeric: + use_numeric = False + sys.stderr.write("WARNING: -n (numeric) ignored for -d (distance matrix)\n") + if use_hk: + sys.stderr.write("-k (use HH and KK) invalid for -d (distance matrix)\n"); + usage(sys.argv[0]) + + if fident: + if not fortran_format: + sys.stderr.write("-i is only valid with -f\n") + usage(sys.argv[0]) + elif len(fident) > 8: + sys.stderr.write("identifier must be 8 chars or less\n") + usage(sys.argv[0]) + + if use_old_format and (build_both or + use_hk or use_numeric or fortran_format or + do_shuffle or savefilename): + sys.stderr.write("-e (use old .angles format) is not compatible " + "with -b -k or -n or -f or -u or -o\n") + usage(os.path.basename(sys.argv[0])) + + if len(args) != 1: + usage(os.path.basename(sys.argv[0])) + + pdb_filename = args[0] + + # check for compressed files. We only support gzip (.gz) + # Note we are not using the zlib or GzipFile python modules + # since we are calling to external programs which require the + # file uncompressed themsevles anyway so we'll just run gzip + # to uncompress the file to a temporary directory. + pdb_file_basename = os.path.basename(pdb_filename) + (name,extension) = os.path.splitext(pdb_file_basename) + if extension == '.gz': + TMPDIR = os.tempnam(None, "ptgz") + os.mkdir(TMPDIR) + tmp_pdbfilename = os.path.join(TMPDIR, name) + os.system("gzip " + pdb_filename + " -d -c > " + tmp_pdbfilename) + our_pdb_filename = tmp_pdbfilename + used_tmp_file = True + else: + our_pdb_filename = pdb_filename + used_tmp_file = False + + try: + if fortran_format and fident: + pdbid = fident + else: + pdbid = name.upper() + if len(pdbid) >= 6 and pdbid[:3] == "PDB": + pdbid = pdbid[3:7] + if chainid: + pdbid += '_' + chainid + + # parse PDB file + pdb_parser = PDBParser() + pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename) + # create the Tableaux and output them + (tableaux_list, ssestr_list) = make_tableaux(our_pdb_filename, + pdb_struct, + secstruct_program, + domain_program, + include_310_helices, + include_pi_helices, + (use_numeric or use_old_format), + sse_id_list, + use_hk, + min_sse_len, + build_distance_matrix, + chainid, + domainid) + if build_both: + (distmatrix_list, ssestr_list) = make_tableaux(our_pdb_filename, + pdb_struct, + secstruct_program, + domain_program, + include_310_helices, + include_pi_helices, + use_numeric, + sse_id_list, + use_hk, + min_sse_len, + True, # build_distance_matrix + chainid, + domainid) + i = 1 + for tableau in tableaux_list: + n = len(tableau) + permutation = range(n) # used to permute rows/cols: null permutation + if do_shuffle: + random.shuffle(permutation) # actually permute for shuffle mode + if verbose: + sys.stderr.write('permutation is: ' + str(permutation)+'\n') + sys.stdout.write('permutation = ' + ','.join([str(x+1) for x in permutation]) + '\n') + if i > 1: + sys.stdout.write('\ndomain ' + str(i) + ':\n') + + if fortran_format: + sys.stdout.write("%7s %4d\n" % (pdbid.upper(), n)) + + if use_old_format: + if build_distance_matrix: + write_distmatrix_old_format(n, tableau, ssestr_list[i-1]) + else: + write_tableau_old_format(n, tableau, ssestr_list[i-1]) + else: + 
write_tableau(n, tableau, permutation, use_numeric, + fortran_format, build_distance_matrix) + + if build_both: + write_tableau(n, distmatrix_list[i-1], + permutation, use_numeric, + fortran_format, True) + + i += 1 + finally: + if used_tmp_file: + cleanup_tmpdir(TMPDIR) + + + if savefilename: + if verbose: + sys.stderr.write('writing tableau to ' + savefilename +'\n') + fh = open(savefilename, "w") + if len(tableaux_list) > 1: + sys.stderr.write('WARNING: only saving first tableau in list\n') + if build_distance_matrix: + pickle.dump(distmatrix, fh) + elif use_numeric: + # Numeric/numpy seems to have no 'packed' format for symmetric + # matrices, so we just have to dump the whole thing. + pickle.dump(Omega, fh) + else: + pickle.dump(PTTableauPacked(tableaux_list[0]), fh) + fh.close() + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) + main() diff --git a/scripts/qptabmatch_allall.py b/scripts/qptabmatch_allall.py new file mode 100755 index 0000000..9d7a091 --- /dev/null +++ b/scripts/qptabmatch_allall.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +############################################################################### +# +# qptabmatch_allall.py - run the QP tableau matching with all tableauxdistmatrix +# files in a directory against a db (file of multiple +# tableaux+distmatrices) +# +# File: qptabmatch_allall.py +# Author: Alex Stivala +# Created: September 2008 +# +# +# Run QP tableau match on all tableaux in a directory against database +# of tableaux such +# as that created by build_fischer_db.sh. Each file in the directory +# has a .tableaudistmatrix suffix and contains the header line with +# identifeir and dimension, then tabelau and SSE distance matrix +# (both lower triangle fortran format +# for tsrchrd_sparse etc.) +# The db is is many such files concantentaed together (with blank line +# between each) +# +# This is _allall since for Fischer data set the db is actually just +# all the tableaux in directory, so it is doing +# duplicate comparisons so for n (=68) tableux it does +# n*n (=4624) comparisons. +# +# +# Usage: +# qptabmatch_allall.py program query_directory dbfile results_directory +# +# query_directory is the directory containing .tableaudistmatrix files, +# as built with build_fischer_db.sh for example. +# +# db_file is the 'database' file of tableaux+distmatrices. +# +# results_dirctory is a directory to write the output to. +# Each query (.tableauxdistmatrix file) results in one file created +# with .out suffix, and stderr in file with .err suffix +# in the output directory, containing the results from that query +# against the db. +# Each file is created by tsrchd_sparse (see tsrchd.f for output format): +# each line is query identifier and score (whitespace delimited). +# WARNING: these files overwritten if they exist. +# results_directory is created if it does not exist. 
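+#
+# Example (hypothetical paths):
+#
+#   qptabmatch_allall.py tsrchd_sparse queries/ fischer.tableauxdb.ascii results/
+#
+# runs tsrchd_sparse once per .tableaudistmatrix file in queries/, writing
+# results/<query>.out (scores) and results/<query>.err (output of
+# /usr/bin/time) for each query.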
+# +# $Id: qptabmatch_allall.py 3583 2010-04-29 02:11:46Z alexs $ +# +############################################################################### + +import sys,os,glob + +def usage(progname): + """ + Print usage message and exit + """ + sys.stderr.write("Usage: " + progname + " \n") + sys.exit(1) + + +def main(): + """ + main for qptabmatch_allall.py + """ + if len(sys.argv) != 5: + usage(os.path.basename(sys.argv[0])) + + tsrchd_program = sys.argv[1] + query_directory = sys.argv[2] + db_file = sys.argv[3] + results_directory = sys.argv[4] + + if not os.path.exists(results_directory): + os.mkdir(results_directory) + elif not os.path.isdir(results_directory): + sys.stderr.write('%s is not a directory\n' % results_directory) + sys.exit(1) + + input_list = glob.glob(os.path.join(query_directory, '*.tableaudistmatrix')) + i = 0 + while i < len(input_list): + qfile = input_list[i] + qid = open(qfile).readline()[:8].lstrip().rstrip() + outfile = os.path.join(results_directory, + os.path.splitext(os.path.basename(qfile))[0] + '.out' ) + errfile = os.path.join(results_directory, + os.path.splitext(os.path.basename(qfile))[0] + '.err' ) + tsrchd_in = os.popen('/usr/bin/time ' + tsrchd_program + ' > '+ outfile + ' 2> ' + errfile, 'w') + tsrchd_in.write(db_file + '\n') # name of db file + tsrchd_in.write('T T F\n') # LTYPE LORDER LSOLN + tsrchd_in.write(open(qfile).read()) # tableau+distmatrix of qfile + tsrchd_in.close() + i += 1 + +if __name__ == "__main__": + main() + diff --git a/scripts/qptabmatch_allall_nodbfile.py b/scripts/qptabmatch_allall_nodbfile.py new file mode 100755 index 0000000..0ba6836 --- /dev/null +++ b/scripts/qptabmatch_allall_nodbfile.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +############################################################################### +# +# qptabmatch_allall_nodbfile.py - +# run the QP tableau matching with all tableauxdistmatrix +# files in a directory against each other +# +# File: qptabmatch_allall_nodbfile.py +# Author: Alex Stivala +# Created: September 2008 +# +# +# Run QP tableau match on all tableaux in a directory against each other +# Each file in the directory +# has a .tableaudistmatrix suffix and contains the header line with +# identifeir and dimension, then tabelau and SSE distance matrix +# (both lower triangle fortran format +# for tsrchrd_sparse etc.) +# +# This is _allall_nodbfile since for Fischer data set the db is actually just +# all the tableaux in directory, so it is doing +# duplicate comparisons so for n (=68) tableux it does +# n*n (=4624) comparisons. +# +# Note _allall uses a dbfile so n comparisons are done each run of tsrchd, +# so much more efficient (less overhead) than doing it this way. +# This _allall_nodbfile version does only one comparison on each run, +# the way MSVNS4MaxCMO (Pelta et al 2008) does, to make timings comparable. +# +# +# Usage: +# qptabmatch_allall_nodbfile.py query_directory results_directory +# +# query_directory is the directory containing .tableaudistmatrix files, +# as built with build_fischer_db.sh for example. +# +# results_dirctory is a directory to write the output to. +# Each query (.tableauxdistmatrix file) results in one file created +# with .out suffix +# in the output directory, containing the results from that query +# against all the other .tableauxdistmatrix files (and itself). +# Each file is created by parsing tsrchd_sparse output: +# each line is query identifier and score (whitespace delimited). +# WARNING: these files overwritten if they exist. 
+# results_directory is created if it does not exist. +# +# Environment variables: +# +# PATH must contain the location of tsrchd_sparse. +# +# +# $Id: qptabmatch_allall_nodbfile.py 1970 2008-10-10 01:12:47Z astivala $ +# +############################################################################### + +import sys,os,glob + +def usage(progname): + """ + Print usage message and exit + """ + sys.stderr.write("Usage: " + progname + " \n") + sys.exit(1) + + +def main(): + """ + main for qptabmatch_allall_nodbfile.py + """ + if len(sys.argv) != 3: + usage(os.path.basename(sys.argv[0])) + + query_directory = sys.argv[1] + results_directory = sys.argv[2] + + if not os.path.exists(results_directory): + os.mkdir(results_directory) + elif not os.path.isdir(results_directory): + sys.stderr.write('%s is not a directory\n' % results_directory) + sys.exit(1) + + input_list = glob.glob(os.path.join(query_directory, '*.tableaudistmatrix')) + i = 0 + while i < len(input_list): + qfile = input_list[i] + qid = open(qfile).readline()[:8].lstrip().rstrip() + if qid == '': + i += 1 + continue # empty file, happens for 41010 + outfile = os.path.join(results_directory, + os.path.splitext(os.path.basename(qfile))[0] + '.out' ) + outfh = open(outfile, 'w') + j = 0 + while j < len(input_list): + tgtfile = input_list[j] + if os.path.getsize(tgtfile) == 0: + j += 1 + continue # empty file, happens for 41010 + (tsrchd_in, tsrchd_out) = os.popen2('tsrchd_sparse') + tsrchd_in.write(tgtfile + '\n') # name of db file + tsrchd_in.write('T T F\n') # LTYPE LORDER LSOLN + tsrchd_in.write(open(qfile).read()) # tableau+distmatrix of qfile + tsrchd_in.close() + for line in tsrchd_out: + if line[0] == '#': + continue + sline = line.split() + if sline[0] == '': + continue + tgtid = sline[0] + score = float(sline[1]) + outfh.write('%8s %12.4f\n' % (tgtid,score)) + j += 1 + outfh.close() + i += 1 + +if __name__ == "__main__": + main() + diff --git a/scripts/qptabmatch_allpairs.py b/scripts/qptabmatch_allpairs.py new file mode 100755 index 0000000..6aa5f24 --- /dev/null +++ b/scripts/qptabmatch_allpairs.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +############################################################################### +# +# qptabmatch_allpairs.py - run the QP tableau matching pairwise on all +# tableaux in a directory +# +# File: qptabmatch_allpairs.py +# Author: Alex Stivala +# Created: September 2008 +# +# +# Run QP tableau match on all pairs of tableaux in a directory such +# as that created by build_skolnick_db.sh. Each file in the directory +# has a .tableaudistmatrix suffix and contains the header line with +# identifeir and dimension, then tabelau and SSE distance matrix +# (both lower triangle fortran format +# for tsrchrd_sparse etc.) +# +# This does not do duplicate comparisons (i.e. runs on a,b exactly once, +# does not also run on b,a) so for n .tableaudistmatrix files gives +# n(n-1)/2 scores. +# +# +# Usage: +# qptabmatch_allpairs.py db_directory +# +# db_directory is the directory containing .tableaudistmatrix files, +# as built with build_skolnick_db.sh for example. +# +# Output is to stdout, tab-delimited in the format +# +# pdbid1 pdbid2 score +# +# e.g. +# +# 1BAW 1KDI -50.9997 +# +# Environment variables: +# +# PATH must contain the location of tsrchd_sparse. +# The dssp program must also be in the PATH. 
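+#
+# Example (hypothetical path):
+#
+#   qptabmatch_allpairs.py skolnick_tableaux/ > allpairs.scores
+#
+# writes one "pdbid1 pdbid2 score" line to stdout (here redirected to
+# allpairs.scores) for each of the n(n-1)/2 pairs.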
+# +# +# $Id: qptabmatch_allpairs.py 1884 2008-09-11 07:59:29Z astivala $ +# +############################################################################### + +import sys,os,glob +from time import strftime,localtime + + +def usage(progname): + """ + Print usage message and exit + """ + sys.stderr.write("Usage: " + progname + " \n") + sys.exit(1) + + +def main(): + """ + main for qptabmatch_allpairs.py + """ + if len(sys.argv) != 2: + usage(os.path.basename(sys.argv[0])) + + db_directory = sys.argv[1] + + sys.stdout.write('# Generated by: \n') + sys.stdout.write('# ' + ' '.join(sys.argv)) + sys.stdout.write('\n') + timestamp = strftime("%d%b%Y %H:%M:%S", localtime()) + sys.stdout.write('# on ' + timestamp + '\n') + sys.stdout.write('#\n') + + input_list = glob.glob(os.path.join(db_directory, '*.tableaudistmatrix')) + i = 0 + while i < len(input_list): + j = i + 1 + while j < len(input_list): + file1 = input_list[i] + file2 = input_list[j] + qid = open(file1).readline()[:8].lstrip().rstrip() + (tsrchd_in, tsrchd_out) = os.popen2(['tsrchd_sparse']) + tsrchd_in.write(file2 + '\n') # name of db file + tsrchd_in.write('T T F\n') # LTYPE LORDER LSOLN + tsrchd_in.write(open(file1).read()) # tableau dn distmatrix of file1 + tsrchd_in.close() + for line in tsrchd_out: + if line[0] == '#': + continue + pdbid = line[:8].lstrip().rstrip() + score = float(line[7:]) + break + tsrchd_out.close() + sys.stdout.write('%s\t%s\t%.4f\n' % (qid, pdbid, score)) + j += 1 + i += 1 + +if __name__ == "__main__": + main() + diff --git a/scripts/qptabmatch_n3hd.sh b/scripts/qptabmatch_n3hd.sh new file mode 100755 index 0000000..cdefc3b --- /dev/null +++ b/scripts/qptabmatch_n3hd.sh @@ -0,0 +1,63 @@ +#!/bin/sh +############################################################################### +# +# qptabmatch_.nh3d.sh - run the QP tableau matching on Nh3D data set +# +# File: qptabmatch_nh3d.sh +# Author: Alex Stivala +# Created: September 2008 +# +# Run QP tableau matching on +# the Nh3D data set (Thiruv et al 2005 BMC Struct. Biol. 
5:12) +# with the 73 queries defined in Pelta et al 2008 BMC Bioinformatics 9:161 +# +# Usage: +# qptabmatch_nh3d.sh indir dbfile outdir +# +# indir is directory containing the tableaux+dismatrices built with +# build_nh3d_db.sh +# +# dbfile is the nh3d tableau+distmatrices database file built with +# build_nh3d_db.sh +# +# outdir is diretory to place corresponding output from tsrchd_sparse +# created if it does not exist +# WARNNG: .out files in outdir overwritten if they exist +# +# Environment variables: +# +# PATH must contain the location of tsrchd_sparse +# +# $Id: qptabmatch_n3hd.sh 1887 2008-09-13 00:52:50Z astivala $ +# +############################################################################### + + +# List of query CATH identifiers, from the Additional File 1 spreadsheet +# for Pelta et al 2008 +QUERY_LIST="1.10.1040 1.10.1320 1.10.533 1.10.645 1.20.1280 1.20.210 1.20.5 1.20.840 2.10.25 2.10.260 2.10.270 2.10.90 2.170.16 2.170.230 2.170.290 2.170.40 2.30.110 2.30.18 2.30.230 2.30.29 2.30.40 2.40.155 2.40.160 2.40.180 2.40.340 2.40.50 2.60.130 2.60.260 2.60.420 2.60.90 2.70.100 2.70.180 2.70.220 2.70.98 3.10.105 3.10.170 3.10.270 3.10.330 3.10.400 3.20.120 3.20.140 3.20.19 3.20.70 3.20.90 3.30.1530 3.30.1690 3.30.240 3.30.559 3.30.560 3.30.60 3.30.990 3.40.1210 3.40.1380 3.40.225 3.40.720 3.60.100 3.60.120 3.60.20 3.60.40 3.60.90 3.90.1280 3.90.1300 3.90.1350 3.90.1580 3.90.510 3.90.850 4.10.1080 4.10.1090 4.10.220 4.10.260 4.10.480 4.10.540 4.10.790" + +if [ $# -ne 3 ]; then + echo "Usage: $0 indir dbfile outdir" 2>&1 + exit 1 +fi + +indir=$1 +dbfile=$2 +outdir=$3 + +if [ ! -d ${outdir} ]; then + mkdir ${outdir} +fi + +for qid in ${QUERY_LIST} +do + tmpfile=/tmp/nh3d.${qid}.$$ + echo ${dbfile} >$tmpfile + echo "T T F" >> $tmpfile + tfile=${indir}/`echo $qid | tr -d .`.tableaudistmatrix + cat ${tfile} >> $tmpfile + tsrchd_sparse < $tmpfile > ${outdir}/${qid}.out + rm $tmpfile +done + diff --git a/scripts/qptabmatch_nh3d_nodbfile.sh b/scripts/qptabmatch_nh3d_nodbfile.sh new file mode 100755 index 0000000..2641588 --- /dev/null +++ b/scripts/qptabmatch_nh3d_nodbfile.sh @@ -0,0 +1,81 @@ +#!/bin/sh +############################################################################### +# +# qptabmatch_nh3d_nodbfile.sh - run the QP tableau matching on Nh3D data set +# not using db file of tableaux +# +# File: qptabmatch_nh3d.sh +# Author: Alex Stivala +# Created: September 2008 +# +# Run QP tableau matching on +# the Nh3D data set (Thiruv et al 2005 BMC Struct. Biol. 5:12) +# with the 73 queries defined in Pelta et al 2008 BMC Bioinformatics 9:161 +# +# This version does not use the file of tableaux+distmatrices crated +# by build_nh3d_db.sh, just the individual files in the indir, +# so that it can be timed comparably with MSVNS4MaxCMO that has to work +# this way. I.e. this has all the overhead of starting tsrchd_sparse +# for every pairise comparison, rather than doing one query against whole +# nh3d db in one run. 
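+#
+# For each (query, target) pair the script writes a temporary input file
+# containing the target .tableaudistmatrix filename, the options line
+# "T T F" (LTYPE LORDER LSOLN), and the query tableau+distmatrix, runs
+# tsrchd_sparse on it, and appends the resulting "id score" line to the
+# query's .out file.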
+# +# Usage: +# qptabmatch_nh3d.sh indir outdir +# +# indir is directory containing the tableaux+dismatrices built with +# build_nh3d_db.sh +# +# outdir is diretory to place corresponding output from tsrchd_sparse +# created if it does not exist +# WARNNG: .out files in outdir overwritten if they exist +# +# Environment variables: +# +# PATH must contain the location of tsrchd_sparse +# +# $Id: qptabmatch_nh3d_nodbfile.sh 1971 2008-10-10 01:25:16Z astivala $ +# +############################################################################### + + +# List of query CATH identifiers, from the Additional File 1 spreadsheet +# for Pelta et al 2008 +QUERY_LIST="1.10.1040 1.10.1320 1.10.533 1.10.645 1.20.1280 1.20.210 1.20.5 1.20.840 2.10.25 2.10.260 2.10.270 2.10.90 2.170.16 2.170.230 2.170.290 2.170.40 2.30.110 2.30.18 2.30.230 2.30.29 2.30.40 2.40.155 2.40.160 2.40.180 2.40.340 2.40.50 2.60.130 2.60.260 2.60.420 2.60.90 2.70.100 2.70.180 2.70.220 2.70.98 3.10.105 3.10.170 3.10.270 3.10.330 3.10.400 3.20.120 3.20.140 3.20.19 3.20.70 3.20.90 3.30.1530 3.30.1690 3.30.240 3.30.559 3.30.560 3.30.60 3.30.990 3.40.1210 3.40.1380 3.40.225 3.40.720 3.60.100 3.60.120 3.60.20 3.60.40 3.60.90 3.90.1280 3.90.1300 3.90.1350 3.90.1580 3.90.510 3.90.850 4.10.1080 4.10.1090 4.10.220 4.10.260 4.10.480 4.10.540 4.10.790" + +if [ $# -ne 2 ]; then + echo "Usage: $0 indir outdir" 2>&1 + exit 1 +fi + +indir=$1 +outdir=$2 + +if [ ! -d ${outdir} ]; then + mkdir ${outdir} +fi + + +tmpfile=/tmp/qtsnh3d$$ +for aid in ${QUERY_LIST} +do + afile=${indir}/`echo $aid | tr -d .`.tableaudistmatrix + outfile=${outdir}/${aid}.out + cat /dev/null > ${outfile} + for bfile in ${indir}/*.tableaudistmatrix + do + if [ `ls -s ${bfile} | cut -d' ' -f1` -eq 0 ]; then + # zero-size file, happens for 41010.tableauxdistmatrix + continue + fi + bid=`basename ${bfile} .tableaudistmatrix` + zbid=`echo ${bid} | tr -d .` + echo ${bfile} >$tmpfile + echo "T T F" >> $tmpfile + cat ${afile} >> $tmpfile + score=`tsrchd_sparse < $tmpfile | grep -v '^#' | grep -v '^$' | awk '{print $2}'` + printf '%8s %12.4f\n' ${zbid} ${score} >> ${outfile} + done +done +rm $tmpfile + + diff --git a/scripts/qptabmatch_pbs_script.sh b/scripts/qptabmatch_pbs_script.sh new file mode 100644 index 0000000..2412472 --- /dev/null +++ b/scripts/qptabmatch_pbs_script.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# +# File: qptabmatch_pbs_script.sh +# Author: Alex Stivala +# Created: July 2009 +# +# PBS script for submitting QP tableau search jobs on tango.vpac.org +# requires PATH and PYTHONPATH already set up in environment +# +# $Id: qptabmatch_pbs_script.sh 2716 2009-07-29 02:07:14Z astivala $ + +#PBS -N QP_tableau_search + +#PBS -l walltime=1:0:0 + +#PBS -v MKL_NUM_THREADS=1 + +module load python + +cd $PBS_O_WORKDIR +set CONV_RSH = ssh + + +time qptabmatchstructs.sh /home/alexs/share/ASTRAL/pdbstyle-sel-gs-bib-95-1.75/ql/d1qlpa_.ent /home/alexs/share/ASTRAL/pdbstyle-sel-gs-bib-95-1.75/ql/d1qlpa_.ent + + diff --git a/scripts/qptabmatchstructs.sh b/scripts/qptabmatchstructs.sh new file mode 100755 index 0000000..bfef075 --- /dev/null +++ b/scripts/qptabmatchstructs.sh @@ -0,0 +1,179 @@ +#!/bin/bash +############################################################################### +# +# qptabmatchstructs.sh - run the QP tableau matching on two structures +# +# File: qptabmatchstructs.sh +# Author: Alex Stivala +# Created: August 2008 +# +# Run the QP tableau matching program on two structures in PDB format. 
+# Outputs to stdout a PyMOL script (.pml file) containing a visualzation +# of the matches (maximally similar substructures coloured) with the match +# socre as a comment, and, if the -s option is given, writes a PDB +# file with structures superimposed in cwd. +# +# Usage: +# qptabmatchstructs.sh [-skquh] [-e sse_num_list] +# struct1.pdb struct2.pdb > output.pml +# +# -s: write PDB file with superposition to cwd, filenme is +# queryid_dbid.pdb. WARNING: overwrites if file exists. +# +# -k: use the tableau HH and KK codes for antiprallel/parallel +# strands in same sheet. +# +# -q: do not use the ordering constraint (allow nonsequential matchings) +# +# -u: randomly permute the rows+columns of the struct1 tableau+distmatrix +# (This is for testing non-sequential tableau matching; so isn't +# useful except with -q) +# +# -h: use fast heuristic (simulated annealing) version instead of QP +# +# -e sse_num_list: list of SSE sequential numbers to select from +# struct1 reather than whole structure +# +# Then in PyMOL use @output.pml to run the PyMOL script. +# +# Uses the Python scripts pytableaucreate.p to create tableaux for input +# to the FORTRAN tsrchd_sparse program, and Python scripts +# soln2ssemap.py and ssemap2pml.py to process the output of tsrchd_sparse +# into PyMOL script. +# +# Environment variables: +# +# PATH must contain the location of the Python scripts, ie where this +# script itself is and the ptgraph/ directory with pytableaucreate.py etc., +# and the location of tsrchd_sparse. +# The dssp program must also be in the PATH. +# +# PYTHONPATH must contain the directory containing the ptsecstruct.py +# and other Python modules used by the Python scripts. +# +# Note you would not use this script to do a large number of comparisons +# such as a database search - for that you would run tsrchd_sparse (or other) +# for a tableau against database of tableaux previously created with +# pytableaucrate.py and buildtableauxdb.py, and soln2ssemap.py | ssemap2pml.py +# pipline on output for desired matches (e.g. top n scores obtained by sorting +# output of tsrchd). This script is a convenient way of running a single +# matching between two structures and examining the result correspondence +# between SSEs. +# +# $Id: qptabmatchstructs.sh 3464 2010-03-15 05:38:48Z alexs $ +# +############################################################################### + + +# write tableau and distance matrix for tsrchd to stdout +# Parameters: +# pdbfile - filename of PDB file +# use_hk - if 1, use the HH and KK codes +# extra_opts - extra options to add to pytableaucreate.py +writetableau() { + pdbfile=$1 + use_hk=$2 + extra_opts="$3" + tabopts="-b -35 -f -t dssp -p none ${extra_opts}" + if [ $use_hk -eq 1 ]; then + tabopts="${tabopts} -k" + fi + pytableaucreate.py ${tabopts} ${pdbfile} +} + + + +writepdb=0 +use_hk=0 +use_ordering=1 +sse_num_list='' +sse_num_list_opt='' +randomly_permute=0 +heuristic=0 + +while getopts 'hskqe:u' opt +do + case $opt in + h) heuristic=1 + ;; + s) writepdb=1 + ;; + k) use_hk=1 + ;; + q) use_ordering=0 + ;; + e) sse_num_list="$OPTARG" + sse_num_list_opt="-s ${sse_num_list}" + ;; + u) randomly_permute=1 + ;; + ?) 
+ echo "Usage: $0 [-squk] [-e sse_num_list] struct1.pdb struct2.pdb" >&2 + exit 1 + ;; + esac +done +shift $(($OPTIND - 1)) + +if [ $# -ne 2 ]; then + echo "Usage: $0 [-squk] [-e sse_num_list] struct1.pdb struct2.pdb" >&2 + exit 1 +fi + +struct1=$1 +struct2=$2 + + +tmpfile1=/tmp/qptabsh1$$ +tmpfile2=/tmp/qptabsh2$$ +tmpfile3=/tmp/qptabsh3$$ +tmpfile4=/tmp/qptabsh4$$ + +writetableau ${struct2} ${use_hk} "" > ${tmpfile2} +echo ${tmpfile2} > ${tmpfile1} # filename of tableaux database +if [ $use_ordering -eq 0 ]; then + echo "T F T" >> ${tmpfile1} # options: type,order,output +else + echo "T T T" >> ${tmpfile1} # options: type,order,output +fi +extra_opts="${sse_num_list_opt}" +if [ $randomly_permute -ne 0 ]; then + extra_opts="${extra_opts} -u" + writetableau ${struct1} ${use_hk} "${extra_opts}" > ${tmpfile4} + # get permutation from first line of tableaucreate output + permutation=`awk 'NR == 1 {print $3}' ${tmpfile4}` + awk 'NR > 1' ${tmpfile4} >> ${tmpfile1} +else + writetableau ${struct1} ${use_hk} "${extra_opts}" >> ${tmpfile1} +fi + +qsize=`awk 'NR == 3 {print $2}' < ${tmpfile1}` + +if [ $heuristic -eq 0 ]; then + PROGRAM=tsrchd_sparse + solnremap_stage="soln2ssemap.py -q ${qsize}" +else + PROGRAM="cudaSaTabsearch -c -r2048" + solnremap_stage=cat # outputs sse map directly, no need to convert +fi + +if [ -z ${sse_num_list} ]; then + remap_stage="cat" # no need to remap SSE nums: stdin to stdout unchanged +else + remap_stage="ssesubsetremap.py ${sse_num_list}" +fi +if [ $randomly_permute -ne 0 ]; then + perm_remap_stage="ssepermutationremap.py ${permutation}" +else + perm_remap_stage="cat" +fi + +if [ $writepdb -eq 1 ]; then + $PROGRAM < ${tmpfile1} | ${solnremap_stage} | ${perm_remap_stage} | ${remap_stage} | tee ${tmpfile3} | ssemap2pml.py -s -u ${struct1} -b ${struct2} + superimposessemap.py -u ${struct1} -b ${struct2} -o . < ${tmpfile3} >/dev/null +else + $PROGRAM < ${tmpfile1} | ${solnremap_stage} | ${perm_remap_stage} | ${remap_stage} | ssemap2pml.py -s -u ${struct1} -b ${struct2} +fi + +rm -f ${tmpfile1} ${tmpfile2} ${tmpfile3} ${tmpfile4} + diff --git a/scripts/rocauc.r b/scripts/rocauc.r new file mode 100644 index 0000000..be8159c --- /dev/null +++ b/scripts/rocauc.r @@ -0,0 +1,116 @@ +# rocauc.r - plot ROC and compute AUC given score+label table with R using RROC +# +# Alex Stivala, October 2008 +# +# Requires the ROCR package from CRAN (developed with version 1.0-2) +# (ROCR in turn requires gplots, gtools, gdata) +# +# Run this on the output of e.g. tsevalfn.py with the -l option, +# it is a table with one column of scores from classifier, and second +# column of true class label (0 or 1) +# +# Uses commandArgs() R function to get trailing arguments from R command +# line ie after the --args option. The filename of the .slrtab file +# is obtained from --args, and the output file is constructed form it +# eg foo.slrtab results in foo.eps file for ROC plot with +# +# R --vanilla -f rocauc.r --args foo.slrtab +# +# +# The citation for the ROCR package is +# Sing et al 2005 "ROCR: visualizing classifier performance in R" +# Bioinformatics 21(20):3940-3941 +# +# +# Also calculate stdand eror (and 95% confidence inteval) for the AUC, +# according to Hanley-McNeil method: +# Hanley & McNeil 1982 "The Meaning and Use of the Area under a Receiver +# Operating Characteristic (ROC) Curve" Radiology 143(1):29-36 +# ROCR has all sorts of featuers but this isn't one of them so had +# to implement it here. 
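+#
+# The Hanley-McNeil standard error computed in compute_auc_error() below is
+#
+#   SE^2 = [ theta*(1-theta) + (nA-1)*(Q1-theta^2) + (nN-1)*(Q2-theta^2) ] / (nA*nN)
+#
+# where theta is the AUC (the Mann-Whitney U statistic divided by nA*nN),
+# nA and nN are the numbers of positive and negative examples, and
+# Q1 = theta/(2-theta), Q2 = 2*theta^2/(1+theta).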
+# + +# $Id: rocauc.r 3606 2010-05-04 06:03:55Z alexs $ + +library(ROCR) + +# +# functions +# + +# +# compute AUC and standard error and 95% CI by Hanley-McNeil method, +# using R wilcox.test for the Wilcoxon rank-sum (aka Mann-Whitney) test +# +# Parameters: +# tab : data frame with score and label columns +# +# Return value: +# list with auc and stderror members +# +compute_auc_error <- function(tab) +{ + # tab is a data frame with score and label columns + x <- tab$score[tab$label == 1] + y <- tab$score[tab$label == 0] + wilcox <- wilcox.test(x,y) + nA <- length(x) + nN <- length(y) + stopifnot(nA + nN == length(tab$label)) + nA <- as.double(nA) + nN <- as.double(nN) + U <- wilcox$statistic # Mann-Whitney U statistic + theta <- U / (nA * nN) # AUC + theta2 <- theta*theta + Q1 <- theta / (2 - theta) + Q2 <- 2*theta2 / (1 + theta) + SE2 <- (theta*(1-theta) + (nA - 1)*(Q1 - theta2) + (nN - 1)*(Q2 - theta2)) / + (nA*nN) + SE <- sqrt(SE2) + + retval <- list() + retval$auc <- theta + retval$stderror <- SE + return(retval) +} + +# +# plot the ROC curve and compute AUC using ROCR +# +plotroc <- function(filename) +{ + tab <- read.table(filename, header=TRUE) + # tab is a data frame with score and label columns + pred <- prediction(tab$score, tab$label) + perfroc <- performance(pred, measure="tpr",x.measure="fpr") + perfauc <- performance(pred, measure="auc") + + # EPS suitable for inserting into LaTeX + postscript(sub('[.]slrtab$','.eps',filename), + onefile=FALSE,paper="special",horizontal=FALSE, + width = 9, height = 6) + plot(perfroc) + auc <- perfauc@y.values + legend('bottomright', legend=paste('AUC =', format(auc, digits=4)), bty='n') + dev.off() + + aucerr <- compute_auc_error(tab) + quantile <- 1.96 # 0.975 quantile of normal distrib + low95 <- aucerr$auc - quantile * aucerr$stderr + high95 <- aucerr$auc + quantile * aucerr$stderr + + cat(filename,':\n') + cat('RROC AUC =',format(perfauc@y.values,digits=4),'\n') + cat('Hanley-McNeil AUC =',format(aucerr$auc,digits=4),'\n') + cat(' std. error =',format(aucerr$stderror,digits=5),'\n') + cat(' 95% CI =',format(low95,digits=4),',', + format(high95,digits=4),'\n') +} + + +# +# main +# +filename <- commandArgs(trailingOnly=TRUE) +plotroc(filename) + diff --git a/scripts/rocrcops.py b/scripts/rocrcops.py new file mode 100755 index 0000000..75d95fe --- /dev/null +++ b/scripts/rocrcops.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python +# +# +# rocrcops.py - Output score+label table for results on COPS dataset +# +# File: rocrcops.py +# Author: Alex Stivala +# Created: May 2010 +# +# Evaluate structure search for COPS benchmark data set +# (Frank et al. 1999 "COPS Benchmark: interactive analysis of database +# search methods" Bioinformatics 26(4):574-575) available from +# http://benchmark.services.came.sbg.ac.at/ +# +# $Id: rocrcops.py 3635 2010-05-12 06:48:14Z alexs $ +# +# + +""" +Write scores from structure search method and actual class labels +(0/1 for same/different fold as query). +Using COPS benchmark true positives as gold standard. + +Output is to stdout in a format that is easily usable in R with the +read.table() function, i.e. we use '#' to prefix lines not to be parsed, +and have a header line with suitable R variable names for the columns. 
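+
+The output has the form shown below (scores and identifiers here are
+placeholders, not real results):
+
+    #rocrcops.py -m results
+    score label
+    #-------------
+    # <queryid>
+          -12.34567890 1
+          -99.00000000 0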
+ +See usage in docstring for main() + +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os,glob +import getopt +from itertools import groupby + +from tsevalutils import parse_searchresult,iter_searchresult + + +#----------------------------------------------------------------------------- +# +# Constants +# +#----------------------------------------------------------------------------- + +# The true positives are stored in a text file, not embedded here + +COPS_DIR = "/home/alexs/phd/qptabsearch/data/COPS/" +COPS_TP_FILE = COPS_DIR + "cops.truepositives" +COPS_QUERYLIST = COPS_DIR + "cops.querylist" +COPS_DBLIST = COPS_DIR + "cops.dblist" + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +def parse_cops_tp_file(fname): + """ + Parse the COPS true positives file (copied from COPS readme.txt file). + Each (whitespace delmited) line starts with query id then has + (exactly 6) true positives for that query. + Comment lines starting with # are ignored. + + For conveniene for methods that mess with case of identifiers, + everything is converted to lowercase here. + + Parameters: + filename - name of COPS true positives text file to parse + + Return value: + dict { queryid : tp_list } where queryid is query structure name + and tp_list is list of the true positives structure names for + that query. + """ + tp_dict = {} + for line in open(fname): + if line[0] == '#': + continue + sline = line.split() + if len(sline) < 6: + sys.stderr.write('bad line in COPS tp file: %s\n' % line) + continue + tp_dict[sline[0].lower()] = [s.lower() for s in sline[1:]] + return tp_dict + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-mnv] [-z score] " + " \n") + sys.stderr.write(' -m read multiquery file on stdin\n') + sys.stderr.write(' -n negate scores (so that most -ve is best)\n') + sys.stderr.write(' -v verbose messages to stderr\n') + sys.stderr.write(' -z score : assign identifiers not present in the output a score of score\n') + sys.exit(1) + + +def main(): + """ + main for rocrcops.py + + Usage: rocrcops.py [-mnv] [-z score] + + + -v turns on debug output to stderr + -n negate all scores (so that most -ve is best) + -m read multiquery file (all query results in one file) from stdin + instead of separate .out file for each query + -z score : any identifiers that are not present in the output but + are in the gold standard data are given the specified score. + This is for programs that do not assign a score to all + domains, but only those above some threshold or the top n, + or just cannot assign a score to some domains for some reason. + This would generally be specified as some score lower + than all other scores. + + + is the directory containing output files as generated + by tsrchd_sparse (qptabmatch_allall.py) or msvns4maxcmo_allall.py + + The table of scores and labels is printed to stdout. 
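+
+    Example (hypothetical paths):
+
+        rocrcops.py -n -z -999 results_dir/ > cops.slrtab
+
+    negates the scores (use this when the method's most negative raw score
+    is the best match) and gives database entries missing from the results
+    a score of -999; the resulting score+label table can then be used with
+    rocauc.r.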
+ """ + global verbose + verbose = False + use_class = False + negateflag = False + multiquery = False + bottom_score = None + + try: + opts,args = getopt.getopt(sys.argv[1:], "vz:ncm?") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-v": # verbose + verbose = True # this module only + elif opt == "-n": # negate scores + negateflag = True + elif opt == '-c': # class not fold level evaluation + use_class = True + elif opt == '-m': # read multiquery file + multiquery = True + elif opt == "-z": # score to give to domains that have no score + bottom_score = float(arg) + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 1: + usage(os.path.basename(sys.argv[0])) + + outdir = args[0] + + cops_truepositives = parse_cops_tp_file(COPS_TP_FILE) + cops_dblist = [line.rstrip().lower() for line in open(COPS_DBLIST)] + cops_querylist = [line.rstrip().lower() for line in open(COPS_QUERYLIST)] + + # build dict of search results, one for each query + # and corresponding dict of gold standard results + + searchresult_dict = {} + goldstd_dict = {} + + if multiquery: + # get list of iterables each for same queryid. + # iter_searchresult() is isterable of tuples (queryid, score, domainid) + # groupby() requires that the iterable already has identical consecutive + # queryids (first element of tuple) - iter_searchresult() should yield this + # and does, when there is only one instance of the QUERY ID for each query, + # but new version of cudaSaTabsearch does all queries in small structure + # db, then in large structure db, so two QUERY ID for each query in the + # output, so we sort by queryid first before groupby. + query_group_iter = groupby(sorted( + iter_searchresult(sys.stdin, multiquery=True, + skip_self_query=True, + negateflag=negateflag) ), + lambda t : t[0]) + + for (query_id, result_iter) in query_group_iter: + try: + goldstd_ids = cops_truepositives[query_id.lower()] + except KeyError: + if verbose: + sys.stderr.write('skipped ' + query_id + '\n') + continue + searchresult_dict[query_id.lower()] = [(score, dbid.lower()) for (domainid,score,dbid) in result_iter] + goldstd_dict[query_id.lower()] = goldstd_ids + else: + # each query is in a separate .out file + for result_file in glob.glob(os.path.join(outdir, '*.out')): + query_id = os.path.splitext(os.path.basename(result_file))[0].lower() + result_fh = open(result_file) + (searchresult,commentlist) = parse_searchresult(result_fh, negateflag) + result_fh.close() + try: + goldstd_ids = cops_truepositives[query_id] + except KeyError: + if verbose: + sys.stderr.write('skipped ' + query_id + '\n') + continue + searchresult_dict[query_id] = searchresult + goldstd_dict[query_id] = goldstd_ids + + + sys.stdout.write('#' + ' '.join(sys.argv) + '\n') #identifying info about us + sys.stdout.write('score label\n') + sys.stdout.write('#-------------\n') + qcount = 0 + for query_id in searchresult_dict.iterkeys(): + sys.stdout.write('# %s\n' % query_id) # XXX + slcount=0 + qcount += 1 + searchresult = searchresult_dict[query_id] + goldstd_pos_dict = dict([(domainid,True) for domainid in + goldstd_dict[query_id.lower()]]) + if len(searchresult) == 0: + sys.stderr.write("warning: no results for query %s\n" % query_id) + for (score, domainid) in searchresult: + # skip self-query + if domainid.lower() == query_id.lower(): + continue + if goldstd_pos_dict.has_key(domainid.lower()): + label = 1 + else: + label = 0 + sys.stdout.write('%20.8f %d\n' % (score,label)) + slcount += 1 + + if bottom_score != None: + 
lowscore_domains = 0 + for domid in cops_dblist: + if domid.lower() == query_id.lower(): + continue #skip self-query + if domid.lower() not in [d.lower() for (s,d) in searchresult]: + lowscore_domains += 1 + if goldstd_pos_dict.has_key(domid.lower()): + label = 1 + else: + label = 0 + sys.stdout.write('%20.8f %d\n' % (bottom_score,label)) + slcount += 1 + if verbose and lowscore_domains > 0: + sys.stderr.write("(queryid %s): set score to %f for %d domains\n" % (query_id, bottom_score, lowscore_domains)) + if verbose: + sys.stderr.write('wrote %d (score,label) pairs for query %s\n' % (slcount, query_id)) + + if verbose: + sys.stderr.write("processed %d queries\n" % qcount) + + # some methods (actually only VAST as far as I've found) actually + # give NO results for some queries, which is a major hassle specially + # for StAR which requires all methods to have all results. + # so we'll just give the bottom score to all those matchings + if qcount < len(cops_querylist): + sys.stderr.write("WARNING: only %d of %d queries have results\n" % (qcount, len(cops_querylist))) + for qid in cops_querylist: + if qid not in searchresult_dict.iterkeys(): + sys.stderr.write("query %s has no results\n" %qid) + if bottom_score != None: + sys.stdout.write('# %s\n' % qid) # XXX + goldstd_ids = cops_truepositives[qid.lower()] + goldstd_dict[qid.lower()] = goldstd_ids + goldstd_pos_dict = dict([(domainid,True) for domainid in + goldstd_dict[qid.lower()]]) + lowscore_domains = 0 + for domid in cops_dblist: + if domid.lower() == qid.lower(): + continue #skip self-query + lowscore_domains += 1 + if goldstd_pos_dict.has_key(domid.lower()): + label = 1 + else: + label = 0 + sys.stdout.write('%20.8f %d\n' % (bottom_score,label)) + if verbose and lowscore_domains > 0: + sys.stderr.write("(queryid %s): set score to %f for %d domains\n" % (qid, bottom_score, lowscore_domains)) + + + + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) + main() + diff --git a/scripts/rocrfischer.py b/scripts/rocrfischer.py new file mode 100755 index 0000000..660e885 --- /dev/null +++ b/scripts/rocrfischer.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python +# +# +# rocrfischer.py - Output score+label table for results on Fischer dataset +# +# File: rocrfischer.py +# Author: Alex Stivala +# Created: September 2008 +# +# Evaluate structure search for all against all matching +# for the Fischer data set (Fischer et al 1996 Pac. Symp. Biocomput. 300-318)) +# as per Pelta et all 2008 BMC Bioinformatics 9:161 +# Output a table of scores and true class labels (binary: in or not in same +# class/fold) , for use with R CRAN package ROCR +# +# $Id: rocrfischer.py 3603 2010-05-04 04:47:51Z alexs $ +# +# + +""" +Write scores from structure search method and actual class labels +(0/1 for same/different fold as query). +Using Fischer Table II as the gold standard at fold or class level + +Output is to stdout in a format that is easily usable in R with the +read.table() function, i.e. we use '#' to prefix lines not to be parsed, +and have a header line with suitable R variable names for the columns. 
+ +See usage in docstring for main() + +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os,glob +import getopt +from itertools import groupby + +from tsevalutils import parse_searchresult,iter_searchresult +from fischer_tables import FISCHER_ID_FOLD_DICT,FISCHER_FOLD_IDLIST_DICT,FISCHER_ID_CLASS_DICT,FISCHER_CLASS_IDLIST_DICT + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-cmnv] [-z score] " + " \n") + sys.stderr.write(' -c class level not fold level evaluation\n') + sys.stderr.write(' -m read multiquery file on stdin\n') + sys.stderr.write(' -n negate scores (so that most -ve is best)\n') + sys.stderr.write(' -v verbose messages to stderr\n') + sys.stderr.write(' -z score : assign identifiers not present in the output a score of score\n') + sys.exit(1) + + +def main(): + """ + main for rocfischer.py + + Usage: rocfischer.py [-cmnv] [-z score] + + + -c evaluate at class level rather than default fold level + -v turns on debug output to stderr + -n negate all scores (so that most -ve is best) + -m read multiquery file (all query results in one file) from stdin + instead of separate .out file for each query + -z score : any identifiers that are not present in the output but + are in the gold standard data are given the specified score. + This is for programs that do not assign a score to all + domains, but only those above some threshold or the top n, + or just cannot assign a score to some domains for some reason. + This would generally be specified as some score lower + than all other scores. + + + is the directory containing output files as generated + by tsrchd_sparse (qptabmatch_allall.py) or msvns4maxcmo_allall.py + + The table of scores and labels is printed to stdout. + """ + global verbose + verbose = False + use_class = False + negateflag = False + multiquery = False + bottom_score = None + + try: + opts,args = getopt.getopt(sys.argv[1:], "vz:ncm?") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-v": # verbose + verbose = True # this module only + elif opt == "-n": # negate scores + negateflag = True + elif opt == '-c': # class not fold level evaluation + use_class = True + elif opt == '-m': # read multiquery file + multiquery = True + elif opt == "-z": # score to give to domains that have no score + bottom_score = float(arg) + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 1: + usage(os.path.basename(sys.argv[0])) + + outdir = args[0] + + # build dict of search results, one for each query + # and corresponding dict of gold standard results + + searchresult_dict = {} + goldstd_dict = {} + + if multiquery: + # get list of iterables each for same queryid. 
+ # iter_searchresult() is isterable of tuples (queryid, score, domainid) + # groupby() requires that the iterable already has identical consecutive + # queryids (first element of tuple) - iter_searchresult() should yield this + # and does, when there is only one instance of the QUERY ID for each query, + # but new version of cudaSaTabsearch does all queries in small structure + # db, then in large structure db, so two QUERY ID for each query in the + # output, so we sort by queryid first before groupby. + query_group_iter = groupby(sorted( + iter_searchresult(sys.stdin, multiquery=True, + skip_self_query=True, + negateflag=negateflag) ), + lambda t : t[0]) + + for (query_id, result_iter) in query_group_iter: + try: + if use_class: + goldstd_ids = FISCHER_CLASS_IDLIST_DICT[FISCHER_ID_CLASS_DICT[query_id.lower()]] + else: + goldstd_ids = FISCHER_FOLD_IDLIST_DICT[FISCHER_ID_FOLD_DICT[query_id.lower()]] + except KeyError: + if verbose: + sys.stderr.write('skipped ' + query_id + '\n') + continue + searchresult_dict[query_id.lower()] = [(score, dbid.lower()) for (domainid,score,dbid) in result_iter] + goldstd_dict[query_id.lower()] = goldstd_ids + else: + # each query is in a separate .out file + for result_file in glob.glob(os.path.join(outdir, '*.out')): + query_id = os.path.splitext(os.path.basename(result_file))[0].lower() + result_fh = open(result_file) + (searchresult,commentlist) = parse_searchresult(result_fh, negateflag) + result_fh.close() + try: + if use_class: + goldstd_ids = FISCHER_CLASS_IDLIST_DICT[FISCHER_ID_CLASS_DICT[query_id]] + else: + goldstd_ids = FISCHER_FOLD_IDLIST_DICT[FISCHER_ID_FOLD_DICT[query_id]] + except KeyError: + if verbose: + sys.stderr.write('skipped ' + query_id + '\n') + continue + searchresult_dict[query_id] = searchresult + goldstd_dict[query_id] = goldstd_ids + + + sys.stdout.write('#' + ' '.join(sys.argv) + '\n') #identifying info about us + sys.stdout.write('score label\n') + sys.stdout.write('#-------------\n') + qcount = 0 + for query_id in searchresult_dict.iterkeys(): + sys.stdout.write('# %s\n' % query_id) # XXX + slcount=0 + qcount += 1 + searchresult = searchresult_dict[query_id] + goldstd_pos_dict = dict([(domainid,True) for domainid in + goldstd_dict[query_id.lower()]]) + if len(searchresult) == 0: + sys.stderr.write("warning: no results for query %s\n" % query_id) + for (score, domainid) in searchresult: + # skip self-query + if domainid.lower() == query_id.lower(): + continue + if goldstd_pos_dict.has_key(domainid.lower()): + label = 1 + else: + label = 0 + sys.stdout.write('%20.8f %d\n' % (score,label)) + slcount += 1 + + if bottom_score != None: + lowscore_domains = 0 + for domid in FISCHER_ID_FOLD_DICT.keys(): + if domid.lower() == query_id.lower(): + continue #skip self-query + if domid.lower() not in [d.lower() for (s,d) in searchresult]: + lowscore_domains += 1 + if goldstd_pos_dict.has_key(domid.lower()): + label = 1 + else: + label = 0 + sys.stdout.write('%20.8f %d\n' % (bottom_score,label)) + slcount += 1 + if verbose and lowscore_domains > 0: + sys.stderr.write("(queryid %s): set score to %f for %d domains\n" % (query_id, bottom_score, lowscore_domains)) + if verbose: + sys.stderr.write('wrote %d (score,label) pairs for query %s\n' % (slcount, query_id)) + + if verbose: + sys.stderr.write("processed %d queries\n" % qcount) + + # some methods (actually only VAST as far as I've found) actually + # give NO results for some queries, which is a major hassle specially + # for StAR which requires all methods to have all results. 
+ # so we'll just give the bottom score to all those matchings + if qcount < len(list(FISCHER_ID_FOLD_DICT.iterkeys())): + sys.stderr.write("WARNING: only %d of %d queries have results\n" % (qcount, len(list(FISCHER_ID_FOLD_DICT.iterkeys())))) + for qid in FISCHER_ID_FOLD_DICT.iterkeys(): + if qid not in searchresult_dict.iterkeys(): + sys.stderr.write("query %s has no results\n" %qid) + if bottom_score != None: + sys.stdout.write('# %s\n' % qid) # XXX + if use_class: + goldstd_ids = FISCHER_CLASS_IDLIST_DICT[FISCHER_ID_CLASS_DICT[qid.lower()]] + else: + goldstd_ids = FISCHER_FOLD_IDLIST_DICT[FISCHER_ID_FOLD_DICT[qid.lower()]] + goldstd_dict[qid.lower()] = goldstd_ids + goldstd_pos_dict = dict([(domainid,True) for domainid in + goldstd_dict[qid.lower()]]) + lowscore_domains = 0 + for domid in FISCHER_ID_FOLD_DICT.keys(): + if domid.lower() == qid.lower(): + continue #skip self-query + lowscore_domains += 1 + if goldstd_pos_dict.has_key(domid.lower()): + label = 1 + else: + label = 0 + sys.stdout.write('%20.8f %d\n' % (bottom_score,label)) + if verbose and lowscore_domains > 0: + sys.stderr.write("(queryid %s): set score to %f for %d domains\n" % (qid, bottom_score, lowscore_domains)) + + + + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) + main() + diff --git a/scripts/rocrnh3d.py b/scripts/rocrnh3d.py new file mode 100755 index 0000000..67c2fb1 --- /dev/null +++ b/scripts/rocrnh3d.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python +# +# +# rocrnh3d.py - Output score+label table for results on NH3D data set +# +# File: rocnh3d.py +# Author: Alex Stivala +# Created: September 2008 +# +# Evaluate QP tableau search or MSVNS4MaxCMO for all against all matching +# for the Nh3D data set (Thiruv et al 2005 BMC Struct Biol 5:12) +# as per Pelta et all 2008 BMC Bioinformatics 9:161 +# +# $Id: rocrnh3d.py 2079 2009-03-03 07:43:11Z astivala $ +# + +""" +Write scores from structure search method and actual class labels +(0/1 for same/different fold as query). Using CATH architecture as gold +standard. +Output is to stdout in a format that is easily usable in R with the +read.table() function, i.e. we use '#' to prefix lines not to be parsed, +and have a header line with suitable R variable names for the columns. 
+ +See usage in docstring for main() + +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os,glob +import getopt + +from cathmap import CATHMAP +from tsevalutils import compute_auc,parse_searchresult + +#----------------------------------------------------------------------------- +# +# Constants +# +#----------------------------------------------------------------------------- + +# list of different CATH architectures +ARCH_LIST= ["1.10", "1.20", "2.10", "2.170", "2.30", "2.40", "2.60", "2.70", "3.10", "3.20", "3.30", "3.40", "3.60", "3.90", "4.10"] + +# List of query CATH identifiers, from the Additional File 1 spreadsheet +# for Pelta et al 2008 +QUERY_LIST=["1.10.1040", "1.10.1320", "1.10.533", "1.10.645", "1.20.1280", "1.20.210", "1.20.5", "1.20.840", "2.10.25", "2.10.260", "2.10.270", "2.10.90", "2.170.16", "2.170.230", "2.170.290", "2.170.40", "2.30.110", "2.30.18", "2.30.230", "2.30.29", "2.30.40", "2.40.155", "2.40.160", "2.40.180", "2.40.340", "2.40.50", "2.60.130", "2.60.260", "2.60.420", "2.60.90", "2.70.100", "2.70.180", "2.70.220", "2.70.98", "3.10.105", "3.10.170", "3.10.270", "3.10.330", "3.10.400", "3.20.120", "3.20.140", "3.20.19", "3.20.70", "3.20.90", "3.30.1530", "3.30.1690", "3.30.240", "3.30.559", "3.30.560", "3.30.60", "3.30.990", "3.40.1210", "3.40.1380", "3.40.225", "3.40.720", "3.60.100", "3.60.120", "3.60.20", "3.60.40", "3.60.90", "3.90.1280", "3.90.1300", "3.90.1350", "3.90.1580", "3.90.510", "3.90.850", "4.10.1080", "4.10.1090", "4.10.220", "4.10.260", "4.10.480", "4.10.540", "4.10.790"] + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-cnv] " + " \n") + sys.stderr.write(' -c evaulate at Class not Architecture level\n') + sys.stderr.write(' -n negate scores\n') + sys.stderr.write(' -v verbose messages to stderr\n') + sys.exit(1) + + +def main(): + """ + main for rocnh3d.py + + Usage: rocnh3d.py [-cnv] + + + -c evaluate at CATH Class not Architecture level + -v turns on debug output to stderr + -n negate scores + + is the directory containing output files as generated + by tsrchd_sparse (qptabmatch_allall.py) or msvns4maxcmo_allall.py + + The table of scores and class labels is printed to stdout. 
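+
+ An example invocation (directory and output file names are illustrative
+ only):
+
+ rocrnh3d.py results/nh3d/tsrchd > nh3d_tsrchd.slrtab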
+ """ + global verbose + verbose = False + negateflag = False + class_level = False + + try: + opts,args = getopt.getopt(sys.argv[1:], "cvn?") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-v": # verbose + verbose = True # this module only + elif opt == "-c": #class not arch + class_level = True + elif opt == "-n": # negate scores + negateflag = True + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 1: + usage(os.path.basename(sys.argv[0])) + + outdir = args[0] + + # build dict of search results, one for each query + # and corresponding dict of gold standard results + searchresult_dict = {} + goldstd_dict = {} + for result_file in glob.glob(os.path.join(outdir, '*.out')): + query_id = os.path.splitext(os.path.basename(result_file))[0].lower() + result_fh = open(result_file) + (searchresult,commentlist) = parse_searchresult(result_fh, negateflag) + result_fh.close() + searchresult_dict[query_id] = searchresult + + # get the gold standard as the list of 'compreseed' CATH ids + # that have same architecture as query + query_id_split = query_id.split('.') + query_class = query_id_split[0] + query_arch = query_id_split[1] + goldstd_ids = [] + for (compressed_id, cathid) in CATHMAP.iteritems(): + cathid_split = cathid.split('.') + cathid_class = cathid_split[0] + cathid_arch = cathid_split[1] + if cathid_class == query_class: + # only check Architecture match if not evaluating at Class level + if class_level or cathid_arch == query_arch: + goldstd_ids.append(compressed_id) + + goldstd_dict[query_id] = goldstd_ids + + + sys.stdout.write('#' + ' '.join(sys.argv) + '\n') #identifying info about us + + sys.stdout.write('score label\n') + sys.stdout.write('#-------------\n') + for query_id in searchresult_dict.iterkeys(): + searchresult = searchresult_dict[query_id] + goldstd_pos_dict = dict([(domainid,True) for domainid in + goldstd_dict[query_id]]) + for (score, domainid) in searchresult: + if goldstd_pos_dict.has_key(domainid): + label = 1 + else: + label = 0 + sys.stdout.write('%20.8f %d\n' % (score,label)) + + + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) + main() + diff --git a/scripts/runTableauSearch.sh b/scripts/runTableauSearch.sh new file mode 100755 index 0000000..6a37a0e --- /dev/null +++ b/scripts/runTableauSearch.sh @@ -0,0 +1,49 @@ +#!/bin/sh +# +# runTableauSearch.sh +# +# run Arun's TableauSearch +# on a given PDB file against database (hardcoded in exercutable) +# saving results and stderr (for $TIME) +# +# Uses other scripts to build query angles file, convert output +# to format for evaluation with tsevalfn.py etc. 
(also runs the latter) +# +# Usage: runTableauSearch.sh querypdbfile outdir +# +# Puts output in outdir, with names starting with the querypdbfile baesname +# WARNING: overwrites output files +# +# $Id: runTableauSearch.sh 2958 2009-11-19 04:10:07Z astivala $ +# + +TIME=/usr/bin/time +TABLEAUSEARCH=/home/alexs/phd/TableauxCompare/bin/TableauComparer + +if [ $# -ne 2 ]; then + echo "Usage: $0 querypdbfile outdir" >&2 + exit 1 +fi +infile=$1 +outdir=$2 +queryid=`basename $1` + +anglesfile=${outdir}/${queryid}.angles +outfile=${outdir}/${queryid}.TableauSearch.out + +pytableaucreate.py -e -35 -t dssp -p none ${infile} > ${anglesfile} + +# NB TableauSearch always writes to search.scores in output dir +$TIME ${TABLEAUSEARCH} ${anglesfile} ${outdir} > ${outdir}/${queryid}.err 2>&1 +tableausearchout2col.py < ${outdir}/search.scores > ${outfile} + +cat ${outdir}/${queryid}.err + +# run tsevalfn.py if it is a SCOP identifier +if [ `expr substr ${queryid} 1 1` = 'd' ]; then + scopsid=`basename ${queryid} .ent` + rtabfile=${outdir}/${scopsid}.TableauSearch.rtab + tsevalfn.py $scopsid ${outfile} > ${rtabfile} + grep AUC ${rtabfile} +fi + diff --git a/scripts/run_tlocsd_filter_tsrchd.sh b/scripts/run_tlocsd_filter_tsrchd.sh new file mode 100755 index 0000000..81538f4 --- /dev/null +++ b/scripts/run_tlocsd_filter_tsrchd.sh @@ -0,0 +1,144 @@ +#!/bin/bash +############################################################################### +# +# run_tlocsd_filter_tsrchd.sh - Run tlocsd heuristic (fast) then use top +# 10% hits only as db for slower tsrchd +# +# File: run_tlocsd_filter_tsrchd.sh +# Author: Alex Stivala +# Created: November 2009 +# +# For the top 10% (or other) hits according to tlocsd simulated annealing +# use that as database for slower QP tableau matching. +# +# Usage: +# run_tlocsd_filter_tsrchd.sh [-p percent] [-e sse_num_list] [-q] querystruct dbdir +# +# -q: do not use the ordering constraint (allow nonsequential matchings) +# +# -e sse_num_list: list of SSE sequential numbers to select from +# query struct rather than whole structure +# +# -p percent_hits : percentage of top hits to use (default 10) +# +# querystruct is a structure in PDB format for the query +# dbdir is the tableaux+distmatrix db directory. It must contain +# the files: +# distmatrixdb.pickle +# tableauxdb.pickle +# tableauxdistmatrixdb.ascii +# +# where the .pickle files were created with buildtableauxdb.py +# and the .ascii file is created from them with convdb2.py +# +# Output to stdout is the output from tsrchd +# +# The script works by first building the query tableau+distmatrix +# with pytableaucreate.py then using this as input to the fast +# tlocsd program. The top 10% (or n%) of the hits are then used +# to select only those structures from the database to build a +# new database with only those, and then tsrchd_pardiso run on that +# reduced database. +# TODO: this is just a temporary hack until it is done properly by +# having an option on tsrchd etc. to just examine the listed structures. +# Output is then the output from tsrchd run on the reduced database. +# +# Environment variables: +# +# PATH must contain the location of the Python scripts, ie where this +# script itself is and the ptgraph/ directory with pytableaucreate.py etc., +# and the location of tsrchd_sparse. +# The dssp program must also be in the PATH. +# +# PYTHONPATH must contain the directory containing the ptsecstruct.py +# and other Python modules used by the Python scripts. 
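+#
+# Example invocation (the file and directory names below are illustrative
+# only; dbdir must contain the three database files listed above):
+#
+#   run_tlocsd_filter_tsrchd.sh -p 5 query.pdb ${HOME}/tableauxdb > query.tsrchd.out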
+# +# $Id: run_tlocsd_filter_tsrchd.sh 2110 2009-03-18 05:58:44Z astivala $ +# +############################################################################### + +TIME=/usr/bin/time +TLOCSD=tlocsd +TSRCHD=tsrchd_pardiso + +# write tableau and distance matrix for tsrchd to stdout +# Parameters: +# pdbfile - filename of PDB file +# use_hk - if 1, use the HH and KK codes +# extra_opts - extra options to add to pytableaucreate.py +writetableau() { + pdbfile=$1 + use_hk=$2 + extra_opts="$3" + tabopts="-b -35 -f -t dssp -p none ${extra_opts}" + if [ $use_hk -eq 1 ]; then + tabopts="${tabopts} -k" + fi + pytableaucreate.py ${tabopts} ${pdbfile} +} + + + +use_ordering=1 +sse_num_list='' +sse_num_list_opt='' +percent_hits=10 +run_mustang=0 + +while getopts 'qe:p:' opt +do + case $opt in + q) use_ordering=0 + ;; + e) sse_num_list="$OPTARG" + sse_num_list_opt="-e ${sse_num_list}" + ;; + p) + percent_hits="$OPTARG" + ;; + ?) + echo "Usage: $0 [-q] [-e sse_num_list] [-p percent_hits] query_pdb_file dbdir" >&2 + exit 1 + ;; + esac +done +shift $(($OPTIND - 1)) + + +if [ $# -ne 2 ]; then + echo "Usage: $0 [-q] [-e sse_num_list] [-p percent_hits] query_pdb_file dbdir" >&2 + exit 1 +fi + +querystruct=$1 +dbdir=$2 + +db_size=`grep -c '^d' ${dbdir}/tableauxdistmatrixdb.ascii` +num_hits=`echo "$db_size * $percent_hits / 100" | bc` +#echo 'xxx num_hits =' $num_hits + +queryinput1_tmp=/var/tmp/rtft1$$ +tableauxdb_tmp=/var/tmp/rtftdb$$ +queryinput2_tmp=/var/tmp/rtft2$$ + +echo ${dbdir}/tableauxdistmatrixdb.ascii >${queryinput1_tmp} +if [ $use_ordering -eq 0 ]; then + echo "T F F" >> ${queryinput1_tmp} # options: type,order,output +else + echo "T T F" >> ${queryinput1_tmp} # options: type,order,output +fi +extra_opts="${sse_num_list_opt}" +writetableau ${querystruct} 0 "" >> ${queryinput1_tmp} + +trap "rm ${queryinput1_tmp} ${queryinput2_tmp} ${tableauxdb_tmp}" 0 + +${TIME} ${TLOCSD} < ${queryinput1_tmp} | sort -k2,2nr | head -${num_hits} | + cut -d' ' -f1 | + convdb2.py -l ${dbdir}/tableauxdb.pickle ${dbdir}/distmatrixdb.pickle > ${tableauxdb_tmp} + +# build input for tsrchd as same as original but using new (reduced) database +awk "NR == 1 {print(\"${tableauxdb_tmp}\")} NR > 1 {print}" < ${queryinput1_tmp} > ${queryinput2_tmp} + +${TIME} ${TSRCHD} < ${queryinput2_tmp} + + diff --git a/scripts/sarf2_6col_to_score.py b/scripts/sarf2_6col_to_score.py new file mode 100755 index 0000000..2149201 --- /dev/null +++ b/scripts/sarf2_6col_to_score.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# +# File: sarf2_6col_to_score.sh +# Author: Alex Stivala +# Created: April 2010 +# +# sarf2_6col_to_score.sh - Convert SARF2 6 column format from sarf2out6col +# to 2 column format for tsevalfn.py etc. +# +# +# Usage: sarf2out6col < results.sarf2 | sarf2_6col_to_score.py +# +# The input in 6 column (id1 id2 len1 len2 nres rmsd) is read from stdin +# Output is to stdout. +# +# +# $Id: sarf2_6col_to_score.py 3585 2010-04-29 03:56:03Z alexs $ +# + + +import sys,os +from itertools import groupby + + +def Q_ssm(nres, rmsd, n1, n2): + """ + Compute the SSM 'Q' score for an alignment of two proteins with n1 + and n2 residues, with nres residues aligned and RMSD value of rmsd + (Angstroms). + + This score is defined in + + Kirssinel, E. & Henrick, K. 
2004 'Secondary-structure matching (SSM), a new + tool for fast protein structure alignment in three dimensions' + Acta Crystallographica D60:2256-2268 + + Parameters: + nres - number of residues in alignment + rmsd - root mean square deviation of aligned residues (Angstroms) + n1 - number of residues in protein 1 + n2 - number of residues in protein 2 + + Return value: Q score for alignment + + """ + R0 = 3.0 # Krissinel & Henrick p. 2262 + return nres**2 / ( (1 + (rmsd / R0)**2) * n1 * n2) + +# +# main +# + +if len(sys.argv) != 1: + usage(os.path.basename(sys.argv[0])) + +sarflist = [] # list of (id1,id2,size1,size2,nres,rmsd) +for line in sys.stdin: + (id1, id2, size1, size2, nres, rmsd) = line.split() + size1 = int(size1) + size2 = int(size2) + nres = int(nres) + rmsd = float(rmsd) + sarflist.append((id1, id2, size1, size2, nres, rmsd)) + +query_group_iter = groupby(sorted(sarflist), lambda t : t[0]) + +for (queryid, result_iter) in query_group_iter: + sys.stdout.write("# QUERY ID = %s\n" % queryid) + for (id1, id2, size1, size2, nres, rmsd) in result_iter: + if size1 == 0 or size2 ==0: # somtimes we get junk results from SARF + sys.stderr.write("WARNING: bad results for %s - %s\n" % (id1, id2)) + score = 0 + else: + score = Q_ssm(nres, rmsd, size1, size2) + sys.stdout.write("%s %f\n" % (id2, score)) + + + + + + + diff --git a/scripts/sarf2fischerid.py b/scripts/sarf2fischerid.py new file mode 100755 index 0000000..bf152ae --- /dev/null +++ b/scripts/sarf2fischerid.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# +# File: sarf2fischerid.py +# Author: Alex Stivala +# Created: April 2010 +# +# sarf2fischerid.py - Convert SARF2 id never with chain to fischer id +# +# Usage: sarf2fischerid.py < sarfmultiqeryout +# +# The input file is 2 column from sarf2_6col_to_score.py +# (multiple query) on stdin +# Output is to stdout. +# +# +# We convert pdbid never with chain format e.g. 8ilb to format +# for Fischer evaluation e.g. 8ilb_a for those that have chain +# spcified in Fischer data set, otherwise leave the chain off. +# +# $Id: sarf2fischerid.py 3599 2010-05-03 04:22:27Z alexs $ +# + + +import sys,os +from itertools import groupby + +from tsevalutils import iter_searchresult +from fischer_tables import FISCHER_ID_FOLD_DICT + +# dict of { pdbid : pdbid_chain } where pdbid is each identifier +# in Fischer data set WITHOUT chainid and pdbid_chain has the chainid +# if it is specified in Fischer (oterhwise same as pdbid) + +FISCHER_CHAINID_DICT = dict( [ (d[:4], d) for d in FISCHER_ID_FOLD_DICT.keys() ] ) + +def sarfid_to_fischerid(sarfid): + """ + We convert pdbid never with chain format e.g. 8ilb to format + for Fischer evaluation e.g. 
8ilb_a for those that have chain + spcified in Fischer data set, otherwise leave the chain off + NB no inputs have chain + + Parameters: + sarf -sarf identifier, no chain + + Return value: + PDB identifier with chain after _ only if chain specified in Fischer + + Uses global FISCHER_CHAIN_ID dict + """ + return FISCHER_CHAINID_DICT[sarfid] + + +def usage(progname): + sys.stderr.write("Usage: " + progname + " < sarf2colout\n") + sys.exit(1) + + +if len(sys.argv) != 1: + usage(os.path.basename(sys.argv[0])) + + +query_groupby_iter = groupby(sorted(iter_searchresult(sys.stdin,multiquery=True) ), lambda t : t[0]) +for (queryid, result_iter) in query_groupby_iter: + sys.stdout.write("# QUERY ID = %s\n" % sarfid_to_fischerid(queryid)) + for (queryid, score,dbid) in result_iter: + sys.stdout.write("%s %f\n" % (sarfid_to_fischerid(dbid), score)) + diff --git a/scripts/sarf2out6col.py b/scripts/sarf2out6col.py new file mode 100755 index 0000000..b68ba9e --- /dev/null +++ b/scripts/sarf2out6col.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# +# File: sarf2out6col.sh +# Author: Alex Stivala +# Created: April 2010 +# +# sarf2out6col.sh - Convert SARF2 output format to 6 column format +# queryid dbid querylen dblen Nres RMSD +# +# +# Usage: sarf2out6col.sh < sarf2_results_file +# +# The input file is read fomr stdin +# Output is to stdout. +# +# +# This is quite tricky since it is not really space-delimited, but (sort of) +# fixed column where fields can end up too large so no space between them, +# and yet don't always START in same column. +# +# $Id: sarf2out6col.py 3583 2010-04-29 02:11:46Z alexs $ +# + + +import sys,os + + +if len(sys.argv) != 1: + usage(os.path.basename(sys.argv[0])) + + +querypdbid = None + + +for line in sys.stdin: + queryid = line[:8] + if line[11] == "*": # sometimes we get '***' for some reason + querylen = 0 + else: + querylen = int(line[11:14].lstrip()) + dbid = line[16:24] + if line[27] == "*": # sometimes we get '***' for some reason + dblen = 0 + else: + dblen = int(line[27:30].lstrip()) + nres = int(line[32:35].lstrip()) + if line[35] == "*": # sometimes we get '***' for some reason + sys.stderr.write('WARNING: bad RMSD for %s - %s\n' % (queryid,dbid)) + rmsd = 9999 + else: + rmsd = float(line[35:40].lstrip()) + sys.stdout.write("%s %s %3d %3d %3d %6.2f\n" + % (queryid, dbid, querylen, dblen, nres, rmsd)) + diff --git a/scripts/scopdominfo.py b/scripts/scopdominfo.py new file mode 100755 index 0000000..de77545 --- /dev/null +++ b/scripts/scopdominfo.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python +############################################################################### +# +# scomdominfo.py - Report information folds and classes of a list of SCOP sids +# +# File: scomdominfo.py +# Author: Alex Stivala +# Created: November 2008 +# +# $Id: scopdominfo.py 3009 2009-12-08 03:01:48Z alexs $ +# +############################################################################### + +""" +Report information on the folds, superfamilies and classes of a list +of SCOP domain identifiers (sids). + +See usage in docstring for main() + +SCOP and ASTRAL data is obtained using the Bio.SCOP library (Casbon et +al 2006 'A high level interface to SCOP and ASTRAL implemented in +Python' BMC Bioinformatics 7:10) and depends on having the data +downloaded, in SCOP_DIR (defined below). 
+ +Downloaded SCOP files from + +http://scop.mrc-lmb.cam.ac.uk/scop/parse/index.html + +and ASTRAL files (in scopseq-1.73) from + +http://astral.berkeley.edu/scopseq-1.73.html + +The files downlaoded are: + +/local/charikar/SCOP/: +dir.cla.scop.txt_1.73 +dir.des.scop.txt_1.73 +dir.hie.scop.txt_1.73 + +/local/charikar/SCOP/scopseq-1.73: +astral-scopdom-seqres-all-1.73.fa +astral-scopdom-seqres-sel-gs-bib-95-1.73.id + +Other files there are indices built by Bio.SCOP when first used. +""" + +import sys,os + +from Bio.SCOP import * + +from pathdefs import SCOP_DIR,SCOP_VERSION + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def write_scopdom_info(scopsid_list, fh, scop): + """ + Write information about the list of SCOP sids (domain identifiers) + in the scopsid_list to fh. For each domain write the fold and class, + then write stats about number of different folds represented + and the number of domains in each class. + + Parameters: + scopsid_list - list of SCOP sids (domain ids) + fh - open (write) filehandle to write to + scop - previously built Bio.SCOP Scop instance + + Return value: + None. + """ + superfamily_count = {} # dict of {sf_sunid : count} counting domains in eac superfamily + fold_count= {} # dict of {fold_sunid : count} counting domains in each fold + class_count={} # dict of {class_sunid : count} counting domains in each class + for sid in scopsid_list: + scop_dom = scop.getDomainBySid(sid) + scop_superfamily = scop_dom.getAscendent('superfamily') + scop_fold = scop_dom.getAscendent('fold') + scop_class = scop_dom.getAscendent('class') + if superfamily_count.has_key(scop_superfamily.sunid): + superfamily_count[scop_superfamily.sunid] += 1 + else: + superfamily_count[scop_superfamily.sunid] = 1 + if fold_count.has_key(scop_fold.sunid): + fold_count[scop_fold.sunid] += 1 + else: + fold_count[scop_fold.sunid] = 1 + if class_count.has_key(scop_class.sunid): + class_count[scop_class.sunid] += 1 + else: + class_count[scop_class.sunid] = 1 + fh.write('%s\t(%s) %s\t%s\t%s\n' % (sid, scop_superfamily.sccs,scop_superfamily.description, scop_fold.description, scop_class.description)) + + num_domains = len(scopsid_list) + num_superfamilies = len(superfamily_count) + num_folds = len(fold_count) + num_classes = len(class_count) + fh.write('Totals: %d domains\t%d superfamilies\t%d folds\t%d classes\n' % + (num_domains, num_superfamilies, num_folds, num_classes)) + fh.write('Class distribution:\n') + for (class_sunid, count) in class_count.iteritems(): + fh.write('\t%s:\t%d\n' % (scop.getNodeBySunid(class_sunid).description, + count)) + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + + " < domainidlist\n") + sys.exit(1) + + +def main(): + """ + main for scomdominfo.py + + Usage: scomdominfo.py < domainidlist + + + The list of SCOP domain ids (sids) is read from stdin + Output is written to stdout. 
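+
+ Each input sid produces one tab-separated line of the form (fields shown
+ symbolically, not actual values):
+
+ sid	(superfamily_sccs) superfamily_description	fold_description	class_description
+
+ followed by a Totals line and the per-class domain counts.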
+ """ + if len(sys.argv) != 1: + usage(os.path.basename(sys.argv[0])) + + + # read SCOP data + scop = Scop(dir_path=SCOP_DIR,version=SCOP_VERSION) + + scopsid_list = sys.stdin.read().split('\n')[:-1] + write_scopdom_info(scopsid_list, sys.stdout, scop) + + +if __name__ == "__main__": + main() diff --git a/scripts/scopsuperfamilyinfo.py b/scripts/scopsuperfamilyinfo.py new file mode 100755 index 0000000..2cf9aaa --- /dev/null +++ b/scripts/scopsuperfamilyinfo.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +############################################################################### +# +# scopsuperfamilyinfo.py - Report information folds and classes of a +# list of SCOP sccs identifiers +# +# File: scopsuperfamilyinfo.py +# Author: Alex Stivala +# Created: March 2009 +# +# $Id: scopsuperfamilyinfo.py 3009 2009-12-08 03:01:48Z alexs $ +# +############################################################################### + +""" +Report information on the folds, superfamilies and classes of a list +of SCOP superfamily identifiers in the form of sccs (SCOP Concise +Classification Strings) identifeirs +(e.g. d.58.1) + +See usage in docstring for main() + +SCOP and ASTRAL data is obtained using the Bio.SCOP library (Casbon et +al 2006 'A high level interface to SCOP and ASTRAL implemented in +Python' BMC Bioinformatics 7:10) and depends on having the data +downloaded, in SCOP_DIR (defined below). + +Downloaded SCOP files from + +http://scop.mrc-lmb.cam.ac.uk/scop/parse/index.html + +and ASTRAL files (in scopseq-1.73) from + +http://astral.berkeley.edu/scopseq-1.73.html + +The files downlaoded are: + +/local/charikar/SCOP/: +dir.cla.scop.txt_1.73 +dir.des.scop.txt_1.73 +dir.hie.scop.txt_1.73 + +/local/charikar/SCOP/scopseq-1.73: +astral-scopdom-seqres-all-1.73.fa +astral-scopdom-seqres-sel-gs-bib-95-1.73.id + +Other files there are indices built by Bio.SCOP when first used. +""" + +import sys,os + +from Bio.SCOP import * + +from pathdefs import SCOP_DIR,SCOP_VERSION + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def write_scopsuperfamily_info(scopsccs_list, fh, scop): + """ + Write information about the list of SCOP sccs ids + in the scopsccs_list to fh. For each sccs id write the + superfamily and fold description. + + Parameters: + sccsid_list - list of SCOP superfamilies (sccs ids) + fh - open (write) filehandle to write to + scop - previously built Bio.SCOP Scop instance + + Return value: + None. + """ + # Bio.SCOP actually doesn't seem to have a facility to look up by + # sccs so we'll build a dictionary ourselves of all superfamilies + # keyed by sccs + all_superfamilies = scop.getRoot().getDescendents('sf') + sccs_dict = dict([(sf.sccs, sf) for sf in all_superfamilies]) + + for sccs in scopsccs_list: + sf = sccs_dict[sccs] + fold = sf.getAscendent('fold') + fh.write('%s\t%s\t%s\n' % (sf.sccs, sf.description, fold.description)) + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + + " < sccslist\n") + sys.exit(1) + + +def main(): + """ + main for scopsuperfamilyinfo.py + + Usage: scopsuperfamilyinfo.py < sccslist + + + The list of SCOP sccs superfamily strings (e.g. 
'd.58.1') + is read from stdin + Output is written to stdout. + """ + if len(sys.argv) != 1: + usage(os.path.basename(sys.argv[0])) + + + # read SCOP data + scop = Scop(dir_path=SCOP_DIR,version=SCOP_VERSION) + + sccs_list = sys.stdin.read().split('\n')[:-1] + write_scopsuperfamily_info(sccs_list, sys.stdout, scop) + + +if __name__ == "__main__": + main() diff --git a/scripts/select_pdb_chain.py b/scripts/select_pdb_chain.py new file mode 100755 index 0000000..3cbe9d8 --- /dev/null +++ b/scripts/select_pdb_chain.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +############################################################################### +# +# select_pdb_chain.py - select just one chain from a PDB file +# +# +# File: select_pdb_chain.py +# Author: Alex Stivala +# Created: April 2010 +# +# +# $Id: select_pdb_chain.py 3572 2010-04-20 04:07:48Z alexs $ +# +############################################################################### +""" +This script parses a PDB file and writes out another PDB file with +only the specified chain in it, using BioPython. + +See usage in main() documentation. +""" + +import os,sys +import gzip +from Bio.PDB import * + +class ChainSelect(Select): + """ + The ChainSelect class inherits from the PDBIO.Select class + and overrides function to select only certain residues for writing + ATOM records in the chain we are interested in. + + See the Bio.PDB documentation by Thomas Hamelryck: + biopython-1.43/Doc/biopdb_faq.pdf + """ + def __init__(self, chainid): + """ + Constructor for the ChainSelect class, sets the chainid member + used to accept only residues in that chain. + Parameters: + chainid - chain id to select + """ + self.chainid = chainid.upper() + + def __repr__(self): + """ + Overrides the base __repr__ to write out the domain we have + """ + return "" + + + def accept_chain(self, chain): + """ + Overrides the base accept_chain() to select only the chain we want + + Parameters: + chain - Bio.PDB Chain object for the chain in question + + Return value: + 1 to accept the chain (its chainid is the one we want) + 0 otherwise (do not select the chain) + """ + if chain.get_id().upper() == self.chainid: + return 1 + else: + return 0 + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " pdbfile chainid\n") + sys.exit(1) + + +def main(): + """ + main for select_pdb_chain.py + + Usage: + select_pdb_chain.py pdbfile chainid + + Parses the specified PDB in pdbfile and writes out a version with + only the specified chain by chainid in it. + + Filenames may be either in the 1QLP.pdb format or the pdbq1lp.ent format. + Compressed pdb files are supported (gzip) (e.g. pdb1qlp.ent.gz). + + The output filename is the input with the chain appened after an + undserscore e.g. 1QLP_A.pdb. 
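+
+ For example (hypothetical input file), running
+
+ select_pdb_chain.py pdb1qlp.ent.gz A
+
+ reads the gzipped PDB entry and writes 1qlp_A.pdb to the current directory.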
+ """ + + if len(sys.argv) != 3: + usage(os.path.basename(sys.argv[0])) + + + pdb_filename = sys.argv[1] + chainid = sys.argv[2] + + if len(chainid) != 1: + usage(os.path.basename(sys.argv[0])) + + pdb_file_basename = os.path.basename(pdb_filename) + (name,extension) = os.path.splitext(pdb_file_basename) + + if extension == '.gz': + pdb_fh = gzip.open(pdb_filename) + else: + pdb_fh = open(pdb_filename) + + if len(name) == 4: + pdbid = name + elif len(name) >= 7 and name[:3] == 'pdb': + pdbid = name[3:7] + + outfilename = pdbid + '_' + chainid + '.pdb' + + parser = PDBParser() + structure = parser.get_structure(pdbid, pdb_fh) + pdb_fh.close() + + io = PDBIO() + io.set_structure(structure) + io.save(outfilename, ChainSelect(chainid)) + + +if __name__ == "__main__": + main() diff --git a/scripts/sheba2fischerid.py b/scripts/sheba2fischerid.py new file mode 100755 index 0000000..dacc2aa --- /dev/null +++ b/scripts/sheba2fischerid.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# +# File: sheba2fischerid.py +# Author: Alex Stivala +# Created: April 2010 +# +# sheba2fischerid.py - Convert SHEBA id always with chain to fischer id +# +# Usage: sheba2fischerid.py < shebamultiqeryout +# +# The input file is 2 column from shebaout2col (multiple query) on stdin +# Output is to stdout. +# +# +# We convert pdbid always with chain format e.g. 8ilb_a to format +# for Fischer evaluation e.g. 8ilb for those that do not have chain +# spcified in Fischer data set, otherwise leave the chain on. +# +# $Id: sheba2fischerid.py 3596 2010-05-03 02:38:39Z alexs $ +# + + +import sys,os +from itertools import groupby + +from tsevalutils import iter_searchresult +from fischer_tables import FISCHER_ID_FOLD_DICT + +def shebaid_to_fischerid(shebaid): + """ + We convert pdbid always with chain format e.g. 8ilb_a to format + for Fischer evaluation e.g. 8ilb for those that do not have chain + spcified in Fischer data set, otherwise leave the chain on. + NB all inputs have chain + + Parameters: + sheba -sheba identifier, with chain on end after underscore + + Return value: + PDB identifier with chain after _ only if chain specified in Fischer + """ + if FISCHER_ID_FOLD_DICT.has_key(shebaid[:4]): + return shebaid[:4] + else: + return shebaid + +def usage(progname): + sys.stderr.write("Usage: " + progname + " < sheba2colout\n") + sys.exit(1) + + +if len(sys.argv) != 1: + usage(os.path.basename(sys.argv[0])) + + +query_groupby_iter = groupby(sorted(iter_searchresult(sys.stdin,multiquery=True) ), lambda t : t[0]) +for (queryid, result_iter) in query_groupby_iter: + sys.stdout.write("# QUERY ID = %s\n" % shebaid_to_fischerid(queryid)) + for (queryid, score,dbid) in result_iter: + sys.stdout.write("%s %d\n" % (shebaid_to_fischerid(dbid), score)) + diff --git a/scripts/shebaout2col.sh b/scripts/shebaout2col.sh new file mode 100755 index 0000000..38d8160 --- /dev/null +++ b/scripts/shebaout2col.sh @@ -0,0 +1,29 @@ +#!/bin/sh +# +# File: shebaout2col.sh +# Author: Alex Stivala +# Created: Novermber 2008 +# +# shebaout2col.sh - Convert sheba -A summary column format to 2-column +# format as output by tsrchd_sparse etc. which can +# be processed with tsevalfn.py etc. +# +# Usage: shebaout2col.sh < shebaoutput.A.out +# +# Output has two columns, database id and SHEBA m-score. +# The query id is put in a comment line at top of file, it is assumed +# to be the same in every line of sheba -A output since that mode +# runs one query against a db. +# +# Output is to stdout. 
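+#
+# Example output (query/database identifiers and scores are illustrative
+# only):
+#
+#   # QUERYID = d1ubia_
+#   d1wiua_ 42
+#   d3sila_ 17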
+# +# Uses the output format from SHEBA 3.1.1, see documentation at +# http://rex.nci.nih.gov/RESEARCH/basic/lmb/mms/sheba.htm +# for more information +# +# $Id: shebaout2col.sh 2031 2008-11-22 04:57:16Z astivala $ +# +# Uses GNU head options (-n -1) + + +awk '/ pdb1 na pdb2 nb id m %ma %mb /,/^$/' | awk 'NR > 1' | head -n -1 | awk 'NR==1 {printf "# QUERYID = %s\n",$1} {printf "%s %s\n",$3,$6}' diff --git a/scripts/slrtabs2star.py b/scripts/slrtabs2star.py new file mode 100755 index 0000000..13dbcec --- /dev/null +++ b/scripts/slrtabs2star.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python +############################################################################### +# +# slrtabs2star.py - convert .slrtab files into format for StAR +# +# File: slrtabs2star.py +# Author: Alex Stivala +# Created: April 2010 +# +# $Id: slrtabs2star.py 3593 2010-05-02 05:02:35Z alexs $ +############################################################################### +""" + slrtabs2star.py - convert .slrtab files into format for StAR + + Usage: slrtabs2star.py [-v] posfile negfile < listfile + + + posfile is the postivies filename to create + negfile is the negatives filename to create + + -v specifies verbose output to stderr + + listfile read from stdin is a two-column tab-delimited file with method namd + in first column and slrtab filename in second column. The script + reads each of the slrtab files named and creates the postivies and + negatives file for StAR. + + The listfile looks like e.g. + +"QP Tableau Search" /astivala/phd/qptabsearch/results/query200/norm2/query200_roc.slrtab + + NB tab delimiter not space + + Converts the .slrtab files, one for each method, + created by rocrfischer.py, tsevalfn.py etc. + into the positives.dat and negatives.dat files used by + StAR. + Both files are tab-delimited, with names on first line, + and scores for the true negative and positive classes (for negatives + and postivies file respectively) for each + classifier (according to order of names on first line) on each + subsequent line. (See examples in StAR documentation). + + WARNING: the positives file and negatives files are overwritten + if they exist. 
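+
+ An illustrative positives.dat for two methods (the names are taken from a
+ hypothetical listfile and the scores are made up):
+
+"QP Tableau Search"	"TableauSearch"
+55.1	60.2
+32.9	12.4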
+ + The reference for StAR is: + + Vergara, Normabuena, Ferrada, Slater and Melo 'StAR: a simple tool for + the statistical comparison of ROC curves' BMC Bioinformatics 2008 9:265 + +""" + + +import sys,os +import getopt + +from tsevalutils import iter_slrtab + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + sys.stderr.write("Usage " + progname + " [-v] posfile negfile < listfile\n") + sys.exit(1) + +def main(): + """ + main for slrtabs2star.py + see usage message in header comment + """ + verbose = False + + try: + opts,args = getopt.getopt(sys.argv[1:], "v?") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-v": # verbose + verbose = True + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 2: + usage(os.path.basename(sys.argv[0])) + + posfilename = args[0] + negfilename = args[1] + + (namelist, slrtabfilelist) = zip(*[line.split('\t') for line in sys.stdin]) # trciky use of zip and * to unzip list + slrtabfilelist = [x[:-1] for x in slrtabfilelist] # remove newlines on end + + posfile_fh = open(posfilename, "w") + negfile_fh = open(negfilename, "w") + + posscores = [] # list of lists: each list is scores for each method in pos class + negscores = [] # similarly for negative class scores + for slrtabfile in slrtabfilelist: + if verbose: + sys.stderr.write("Reading results from file %s..." % slrtabfile) + slrlist = list(iter_slrtab(open(slrtabfile))) # (score,label) list + posscores.append([sl[0] for sl in slrlist if sl[1] == 1]) + negscores.append([sl[0] for sl in slrlist if sl[1] == 0]) + assert(len(posscores[-1]) + len(negscores[-1]) == len(slrlist)) + if verbose: + sys.stderr.write(" %d entries (%d pos, %d neg)\n" % (len(slrlist),len(posscores[-1]),len(negscores[-1]))) + + if verbose: + sys.stderr.write("writing output to %s and %s..." 
%(posfilename, negfilename)) + + posfile_fh.write('\t'.join(namelist) + '\n') + negfile_fh.write('\t'.join(namelist) + '\n') + + numpos = len(posscores[0]) # FIXME may be different lengths + for i in xrange(numpos): + for j in xrange(len(namelist)): + posfile_fh.write(str(posscores[j][i])) + if j < len(posscores) - 1: + posfile_fh.write('\t') + posfile_fh.write('\n') + + numneg = len(negscores[0]) # FIXME may be different lengths + for i in xrange(numneg): + for j in xrange(len(namelist)): + negfile_fh.write(str(negscores[j][i])) + if j < len(negscores) - 1: + negfile_fh.write('\t') + negfile_fh.write('\n') + + + posfile_fh.close() + negfile_fh.close() + if verbose: + sys.stderr.write("done\n") + + +if __name__ == "__main__": + main() diff --git a/scripts/soln2ssemap.py b/scripts/soln2ssemap.py new file mode 100755 index 0000000..97d2471 --- /dev/null +++ b/scripts/soln2ssemap.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python +############################################################################### +# +# soln2ssemap.py - Convert solution x vector from tsrchd/tsrchn to SSE mapping +# +# File: solns2ssemap.py +# Author: Alex Stivala +# Created: June 2008 +# +# $Id: soln2ssemap.py 2703 2009-07-27 06:01:05Z astivala $ +# +############################################################################### + +""" +Parse the output of the FORTRAN-77 tsrchn/tsrchd programs with the LSOLN +logical option set so that solution x vector is output as well as score +(objective function value at the solution), and convert to a correspondance +between SSEs in each structure (sequentially numbered from 1 along chains). + +Note we need to know the SSEs (or at least the number of them) so this +script actually gets the name of the tableaux database from the +tsrchd/tsrchnd output file it parses, and reads that information from the +database. + +This all is rather cumbersome and inefficient, but as we actually only +want the matching for comparatively few 'hits' (the best scoring ones) +it would be wasteful to have to do it for every match in the FORTRAN code +itself, this way the searching stays fast and we just have this slow +and cumbersome step to get the matching (and eg generate a PyMOL script +to show the correspondences with colours in PyMOL from it with ssemap2pml.py). + +Requires the Numeric library: + http://sourceforge.net/projects/numpy + +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os +import getopt +from time import strftime,localtime + +import numpy.oldnumeric as Numeric + +#----------------------------------------------------------------------------- +# +# Constants +# +#----------------------------------------------------------------------------- + +EPS = 1e-1 # epsilon for closesness to 0/1 + +#----------------------------------------------------------------------------- +# +# Class definitions +# +#----------------------------------------------------------------------------- + +class SearchSoln: + """ + SearchSoln is just a dummy class for containing the search results + with solution vectors, returned by parse_searchsoln() + """ + pass + +class QuerySoln: + """ + QuerySoln is a dummy class for containign result from individual query, + in SearchSoln. 
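+
+ (As set by parse_searchsoln() below, each QuerySoln carries the attributes
+ domid, score and soln, where soln is the list of solution-vector values
+ for that hit.)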
+ """ + pass + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def parse_searchsoln(fh, domid=None): + """ + Parse the output of tsrchn/tsrchd with LSOLN set to .true. + so each structure has first structure name and score, then + x vector one element per line e.g. + + # TSRCHD LTYPE = T LORDER = T LSOLN = T + # QUERY ID = D1KI9A_ + # DBFILE = /local/charikar/astivala/tableauxdb/astral/tableauxdb.ascii + # Mon Aug 4 12:34:07 2008 + + d1xksa_ -35.99999999 + 0.0000 + 1.0000 + 0.0000 + 0.0000 + 0.0000 + ... + d3sila_ -35.99999999 + 0.0000 + ... + etc. + + Parameters: + fh - open (read) filehandle to parse from + domid - (default None) if not None, get only this identifier, + not whole file. + + Return value: + search_soln - dummy class SearchSoln containing: + queryid - query identifier parsed from comments + dbfile - filename of database file parsed from comments + query_soln_list - list of dummy class QuerySoln containing: + domid - id of domain in db + score - score of queryid against domid + soln - list of values in solution vector + comment_lines - list of comment lines read + """ + search_soln = SearchSoln() + query_soln_list = [] + query_soln = None + search_soln.comment_lines = [] + for line in fh: + if line[0] == '#': + sline = line[1:].split('=') + if sline[0].lstrip().rstrip() == 'QUERY ID': + search_soln.queryid = sline[1].lstrip().rstrip().lower() + elif sline[0].lstrip().rstrip() == 'DBFILE': + search_soln.dbfile = sline[1].lstrip().rstrip() + search_soln.comment_lines.append(line) + continue + if len(line.split()) > 1: # new identifier + if query_soln: + query_soln_list.append(query_soln) # finsished with prev one + if domid: + query_soln = None # appended here, don't do again + break # if only getting one, we have finished + splitline = line.split() + if len(splitline) != 2: + sys.stderr.write('bad line: ' + line + '\n') + continue + domainid = splitline[0] + if (domid and domid != domainid): + continue # only interested in domid, skip others + score_str = splitline[1] + if score_str.lower() == 'nan' or score_str == '********': + # if we get any NaN values then then sort() gets completely + # screwed up and the evaluation is all wrong, so skip them. 
+ sys.stderr.write('skipping NaN: ' + line + '\n') + query_soln = None + continue + score = float(score_str) + query_soln = QuerySoln() + query_soln.domid = domainid + query_soln.score = score + query_soln.soln = [] + else: + # line should be an element of the solution vector + try: + xval = float(line) + except ValueError: + sys.stderr.write('bad float value: ' + line + '\n') + # skip whole query of a value is bad + if query_soln: + sys.stderr.write('skipping ' + query_soln.domid + '\n') + query_soln = None + continue + if query_soln: + query_soln.soln.append(xval) + if query_soln: + query_soln_list.append(query_soln) + + search_soln.query_soln_list = query_soln_list + return search_soln + + +def parse_tableauxdb_sizes(fh): + """ + Parse the dimensinos of tableaux from the ascii tableauxdb + + Parameters: + fh - open (read) filehandle for ascii tableauxdb file (numeric or + discrete) + Return value: + dictionary { domid : dim } mapping domain identifiers to tableau + dimension (numer of sses) + """ + dimdict = {} + line = fh.readline() + while line != "": + (domid, sdim) = line.split() + dim = int(sdim) + dimdict[domid.lstrip().rstrip()] = dim + i = 0 + while i < dim * 2: + line = fh.readline() # read through the tableau and dist matrix + i += 1 + line = fh.readline() # read the blank line between entries + line = fh.readline() + return dimdict + + +def soln_list_to_matrix(soln_vec, n1, n2): + """ + Convert the solution vector in the form of list of floats to matching + matrix where (i,j) is 1.0 for SSE i matching SSE j. + This requires dimensions of the two tableaux. + + Parameters: + soln_vec - solution vector as list of floats, length n1*n2+n1+n2 + n1 - dimension of tableau (number of SSEs) for 1st structure + n2 - dimension of tableau (number of SSEs) for 2nd structure + + Return value: + Numeric.array matrix dimension n1 x n2 of matching matrix. + """ + x = Numeric.array(soln_vec[:n1*n2]) + matchmat = Numeric.reshape(x, (n1, n2)) + return matchmat + +def matrix_to_tuplelist(matchmat): + """ + Convert the matrching matrix in the form of Numeric.array + matrix where (i,j) is 1.0 for SSE i matching SSE j + to a list of tuples (k,l) where k is sequential (from 1) SSE number + in first structure and l is sequential (from 2) SSE number in second + structure. + + Parameters: + matchmat - Numeric.array matching matrix (n1 x n2) + Return value: + list of tuples as described above. + """ + matchlist = [] + i = 0 + while i < Numeric.shape(matchmat)[0]: + j = Numeric.argmax(matchmat[i]) + if matchmat[i,j] >= 1.0 - EPS: + matchlist.append((i+1,j+1)) + i += 1 + return matchlist + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-d domainid] [-q querysize]\n") + sys.stderr.write(" -d domainid: only output result for hit against domainid\n") + sys.stderr.write(" -q querysize: specify size of query tableau\n") + sys.exit(1) + + +def main(): + """ + main for soln2ssemap.py + + Usage: sol2ssemap.py [ -d domainid ] [ -q querysize ] + + -d domainid: only output result for query against that domain (SCOP) id, + not all. + + -q querysize: specify size of query tableau, for when the query structure + is not itself in the database. NB if this does not match + actual size of query tableau, results undefined + (probably get ValueError on Numeric.reshape()). 
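+
+ As a sketch of how the mapping is recovered from the solution vector
+ (sizes here are symbolic): if the query has n1 SSEs and the database
+ structure has n2 SSEs, the first n1*n2 elements of the vector are
+ reshaped row-major into an n1 x n2 matching matrix, and each row i whose
+ largest entry (at column j) is within EPS of 1.0 is reported as the pair
+ "i+1 j+1", since SSEs are numbered sequentially from 1.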
+ + Input is on stdin, the output of tsrchn/tsrchd with LSOLN set to .true. + so each structure has first structure name and score, then + x vector one element per line e.g. + + # TSRCHD LTYPE = T LORDER = T LSOLN = T + # QUERY ID = D1KI9A_ + # DBFILE = /local/charikar/astivala/tableauxdb/astral/tableauxdb.ascii + # Mon Aug 4 12:34:07 2008 + d1xksa_ -35.99999999 + 0.0000 + 1.0000 + 0.0000 + 0.0000 + 0.0000 + ... + d3sila_ -35.99999999 + 0.0000 + ... + etc. + + Note the header 'comment' information is actually required, to get + the query id since we need to get the SSEs for that structure + (as well as the structures matched against it), we read this + from the tableaux database, the location of which is also in + the information at the top of the file we parse. + + Output format is identifier and score (as per input), then + for each matching a line containing + i and j separated by a space, + one per line (with blank line before next id) e.g.: + + # TSRCHD LTYPE = T LORDER = T LSOLN = T + # QUERY ID = D1KI9A_ + # DBFILE = /local/charikar/astivala/tableauxdb/astral/tableauxdb.ascii + # Mon Aug 4 12:34:07 2008 + d1wiua_ -23.0000 + 1 1 + 3 2 + 8 4 + 9 5 + 11 6 + 14 9 + + Note we copy the QUERY ID and DBFILE and other information to output for use + in later processing. + + """ + global verbose + verbose = False + + dbdomid = None + qsize = None + + try: + opts,args = getopt.getopt(sys.argv[1:], "d:q:") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-d": # domain id specified, only get this one + dbdomid = arg + elif opt == "-q": # specify query tableau szie + qsize = int(arg) + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 0: + usage(os.path.basename(sys.argv[0])) + + search_soln = parse_searchsoln(sys.stdin, dbdomid) + dim_dict = parse_tableauxdb_sizes(open(search_soln.dbfile)) + n1 = None + try: + n1 = dim_dict[search_soln.queryid] + except KeyError: + if not qsize: + sys.stderr.write("Query structure " + search_soln.queryid + + " not found in database. Use -q to specify" + + " query tableau size.\n") + sys.exit(1) + if qsize and n1 and qsize != n1: + sys.stderr.write("WARNING: -q " + str(qsize) + " does not match " + "size of query " + search_soln.queryid + + " in database (" + str(n1) + "). 
Using " + + " user-specified value " + str(qsize) + "\n") + n1 = qsize + elif qsize and not n1: + n1 = qsize + sys.stdout.write('# ' + os.path.basename(sys.argv[0]) + + ' processed:\n') + sys.stdout.write('#\n') + for cline in search_soln.comment_lines: + sys.stdout.write(cline) + sys.stdout.write('#\n') + timestamp = strftime("%d%b%Y %H:%M:%S", localtime()) + sys.stdout.write('# on ' + timestamp + '\n') + sys.stdout.write('#\n') + for query_soln in search_soln.query_soln_list: + if ((not dbdomid) or (query_soln.domid == dbdomid)): + n2 = dim_dict[query_soln.domid] + matchmat = soln_list_to_matrix(query_soln.soln, n1, n2) + matchlist = matrix_to_tuplelist(matchmat) + sys.stdout.write('%s %12.4f\n' % (query_soln.domid,query_soln.score)) + for (i,j) in matchlist: + sys.stdout.write(str(i) + ' ' + str(j) + '\n') + sys.stdout.write('\n') + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) + main() diff --git a/scripts/sort_tableux_db_pbs_script.sh b/scripts/sort_tableux_db_pbs_script.sh new file mode 100644 index 0000000..1e234ff --- /dev/null +++ b/scripts/sort_tableux_db_pbs_script.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# +# File: sort_tableaux_db_pbs_script.sh +# Author: Alex Stivala +# Created: February 2010 +# +# PBS script for sorting tableaux+distance matrix database +# + +#PBS -N sort_tableaux_db + +#PBS -l walltime=23:0:0 + +#PBS -l nodes=1 + +module load python/2.6.2-gcc + +cd $PBS_O_WORKDIR +set CONV_RSH = ssh + + +OUTPUT_TABLEAUX_DIR=/home/alexs/tableauxdb/ASTRAL-sel-gs-bib-95-1.75 +TABLEAUX_PICKLE=$OUTPUT_TABLEAUX_DIR/tableauxdb.pickle +DISTMATRIX_PICKLE=$OUTPUT_TABLEAUX_DIR/distmatrixdb.pickle +TABLEAUXDB_ASCII=$OUTPUT_TABLEAUX_DIR/tableauxdistmatrixdb.sorted.ascii + +convdb2.py -s $TABLEAUX_PICKLE $DISTMATRIX_PICKLE > $TABLEAUXDB_ASCII + +times + diff --git a/scripts/ssemap2html.py b/scripts/ssemap2html.py new file mode 100755 index 0000000..f866cdb --- /dev/null +++ b/scripts/ssemap2html.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python +############################################################################### +# +# ssemap2html.py - Convert ssemap format (e.g. soln2ssemap.py output) to HTML +# in the structure. +# +# See usage comment in docstring for main() +# +# File: ssemap2html.py +# Author: Alex Stivala +# Created: November 2008 +# +# $Id: ssemap2html.py 2038 2008-11-25 05:39:26Z astivala $ +# +############################################################################### + +""" +Converts the ssemap format from soln2ssemap.py or ssesubsetreamp.py +or tlocsd to HTML formatted text, with each dbid given link to +pro-origami webserver database prebuilt cartoon for that ASTRAL SCOP +sid, with the selected SSEs indicated to be highlighted, and link to +SCOP entry, also SCOP superfamily sccs and fold description. + +Output is to stdout. 
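+
+For example (file names illustrative only), a typical pipeline is:
+
+    soln2ssemap.py < tsrchd_output.txt | ssemap2html.py scopdominfo.pickle > hits.html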
+ +The cache file is a python pickle dictionary: + scopdominfo_dict - + dict {sid: (superfamily_sccs, superfamily_description, fold_sccs, fold_description)} + where + superfamily_sccs is SCOP sccs identifying the superfamily for the domain + superamily_description is SCOP dessription of the superfamily + fold_sccs is SCOP sccs of the fold it is in + fold_description is the SCOP descriptino of the fold the domain is in + +""" + +import sys,os +import getopt +import pickle + +from parsessemap import parse_ssemap,SearchMap,QuerySSEMap + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " cachefile\n") + sys.exit(1) + + +def querymap_cmp(qmap1, qmap2): + """ + compare two QuerySSEMap objects by score for use in sorting. + uses absolute value so works with QP tableau search (-ve) and + heuristic tableau search (+ve) scores + """ + if abs(qmap1.score) < abs(qmap2.score): + return -1 + elif abs(qmap1.score) > abs(qmap2.score): + return 1 + else: + return 0 + +def main(): + """ + main for ssemap2html.py + + Usage: ssemap2html.py cachefile + + + cachefile is the filename of the cache (pickled) file built by + build_fastscopdominfo_cache.py + + Input is on stdin, the output of soln2ssemap.py, + format is identifier and score, then + for each matching a line containing + i and j separated by a space, + one per line (with blank line before next id) e.g.: + + # TSRCHD LTYPE = T LORDER = T LSOLN = T + # QUERY ID = D1KI9A_ + # DBFILE = /local/charikar/astivala/tableauxdb/astral/tableauxdb.ascii + # Mon Aug 4 12:34:07 2008 + d1wiua_ -23.0000 + 1 1 + 3 2 + 8 4 + 9 5 + 11 6 + 14 9 + + Note we copy the QUERY ID and DBFILE and other information to output for use + in later processing. + + Ouput is HTML format text with each dbid given link to pro-origami + webserver database prebuilt cartoon for that ASTRAL SCOP sid, with + the selected SSEs indicated to be highlighted, and link to SCOP + entry, also superfamily sccs id with link to SCOP and fold + description. + + """ + dbdomid = None + + if len(sys.argv) != 2: + usage(os.path.basename(sys.argv[0])) + + pickle_filename = sys.argv[1] + scopdominfo_dict = pickle.load(open(pickle_filename)) + + search_maps = parse_ssemap(sys.stdin) + + print '' + print '' + print '
' + print '' + + print '' + + for query_ssemap in sorted(search_maps.query_ssemap_list,cmp=querymap_cmp,reverse=True): + if len(query_ssemap.sse_map) == 0: + sseseqnums = None + sseseqnums_str = "none" + else: + sseseqnums = [j for (i,j) in query_ssemap.sse_map] + sseseqnums_str = ','.join([str(s) for s in sseseqnums]) + + sid = query_ssemap.domid + entry = scopdominfo_dict[sid] + sf_sccs = entry[0] + sf_desc = entry[1] + fold_sccs = entry[2] + fold_desc = entry[3] + + sys.stdout.write("\n" + % (query_ssemap.score, + query_ssemap.domid, + sseseqnums_str, + query_ssemap.domid, query_ssemap.domid, + query_ssemap.domid, + sf_sccs, sf_sccs, sf_desc, + fold_sccs, fold_desc) + ) + + print '
 score  cartoon  SCOP entry  superfamily  fold
%s %s%s%s %s%s
' + print '
' + print '' + + +if __name__ == "__main__": + main() diff --git a/scripts/ssemap2pml.py b/scripts/ssemap2pml.py new file mode 100755 index 0000000..c0b3354 --- /dev/null +++ b/scripts/ssemap2pml.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python +############################################################################### +# +# ssemap2pml.py - Convert SSE mapping to PyMOL script for visualization +# +# File: ssemap2pml.py +# Author: Alex Stivala +# Created: June 2008 +# +# $Id: ssemap2pml.py 2117 2009-03-20 07:25:15Z astivala $ +# +############################################################################### + +""" + +Using the SSE mapping from soln2ssemap.py, which shows pairs of SSE +sequential (from 1) numbers that correspond to each other, produce +a PyMOL (.pml) script to visualize the mapping. + +Requires the ptsecstruct.py module to get secondary structures using +DSSP (or STRIDE) (add directory contianing to to PYTHONPATH). + +Note that these must be the same definintions used +to produce the mapping, i.e. that the tableaux database and query +were built with, otherwise it won't realy make sense. + +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os +import getopt +from time import strftime,localtime + +import ptsecstruct +from parsessemap import parse_ssemap,SearchMap,QuerySSEMap +from pathdefs import ASTRAL_ROOT + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +def get_sses(scopsid, thepdbfile=None): + """ + Get SSE definitions (residue ranges) from the supplied SCOP sid using + DSSP. Uses the ptsecstruct.py module, note comments at top of this + module also regarding ensuring the same definitions are used here + as for the actual search. + + Parameters: + scopsid - SCOP identifier to get SSEs for; used to locate file + under ASTRAL SCOP hierarchy. + thepdbfile - (default None) if not None, PDB file to get SSEs for, + overriding scopsid. + Return value: + list of (chain, start_resi, end_resi, type) tuples sorted by + ascending residue number. type is 'H' or 'E'. + """ + if thepdbfile: + pdbfile = thepdbfile + else: + pdbfilename = os.path.join(scopsid[2:4].lower(), + scopsid.lower() + '.ent') + pdbfile = os.path.join(ASTRAL_ROOT, pdbfilename) + + secstruct = ptsecstruct.read_secstruct_from_dssp(pdbfile) + return secstruct.get_sse_tuple_list() + + +def write_pymol_prelude(fh, search_maps): + """ + Write startup information in PyMOL script (.pml) file. 
+ + Parameters: + fh - open (write) filehandle to write to + search_maps - dummy class SearchMap containing: + queryid - query identifier parsed from comments + query_ssemap_list - list of dummy class QuerySSEMap containing: + domid - id of domain in db + score - score of queryid against domid + sse_map - list of (i,j) SSE sequential index tuples + comment_lines - list of comment lines read + Return value: None + """ + sys.stdout.write('# generated by ' + os.path.basename(sys.argv[0]) +'\n') + timestamp = strftime("%d%b%Y %H:%M:%S", localtime()) + sys.stdout.write('# on ' + timestamp + '\n') + sys.stdout.write('# from \n') + for cline in search_maps.comment_lines: + sys.stdout.write(cline) + sys.stdout.write('#\n') + sys.stdout.write('#\n') + + +def write_pymol_load(fh, domid, score, color, thepdbfile=None): + """ + Write comands to load specified sturcture into PyMOL + + Parameters: + fh - open (write) filehandle to write to + domid - identifier of sructure, filename built from this + score - socre of matching on this structure (just written as comment) + color - color name to color the whole structure initially + thepdbfile - (default None) if not None, PDB file to get SSEs for, + overriding scopsid. + + Return value: None + """ + if thepdbfile: + pdbfile = thepdbfile + else: + scopsid = domid + pdbfilename = os.path.join(scopsid[2:4].lower(), + scopsid.lower() + '.ent') + pdbfile = os.path.join(ASTRAL_ROOT, pdbfilename) + + if score: + fh.write('# ID = ' + domid + '\n# SCORE = ' + str(score) + '\n') + fh.write('load ' + pdbfile + '\n') + fh.write('color ' + color + ', /'+domid + '\n') + + + +def write_pymol_prelude(fh, search_maps): + """ + Write startup information in PyMOL script (.pml) file. + + Parameters: + fh - open (write) filehandle to write to + search_maps - dummy class SearchMap containing: + queryid - query identifier parsed from comments + query_ssemap_list - list of dummy class QuerySSEMap containing: + domid - id of domain in db + score - score of queryid against domid + sse_map - list of (i,j) SSE sequential index tuples + comment_lines - list of comment lines read + Return value: None + """ + sys.stdout.write('# generated by ' + ' '.join(sys.argv) +'\n') + timestamp = strftime("%d%b%Y %H:%M:%S", localtime()) + sys.stdout.write('# on ' + timestamp + '\n') + sys.stdout.write('# from \n') + for cline in search_maps.comment_lines: + sys.stdout.write(cline) + sys.stdout.write('#\n') + sys.stdout.write('#\n') + + + +def write_pymol_conclusion(fh): + """ + Write finalizing information in PyMOL script (.pml) file. + + Parameters: + fh - open (write) filehandle to write to + Return value: None + """ + sys.stdout.write('hide everything\n') + sys.stdout.write('show cartoon\n') + + + +def write_pymol_color_sses(fh, domid, sse_list, color_list): + """ + Write PyMOL commands to colour the supplied list of SSEs (specified + as (chain,start,end,type) tuples from get_sses()) the specified colours. 
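For illustration (hypothetical colour name, domain id, chain and residue range), the command written for one SSE has the form:

    color red,/d1ubia_//A/5-17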
+ + Parameters: + fh - open (write) filhandle to write to + domid - identifier of the structure to colour + sse_list list of (chain,start,end,type) tuples specifygin SSEs to color + color_list - list of color name to color the SSEs + + """ + i = 0 + for (chain, start_resi, end_resi, ssetype) in sse_list: + fh.write('color ' + color_list[i] + ',' + '/' + domid + '//' + chain + + '/' + str(start_resi) + '-' + str(end_resi) + '\n') + i += 1 + + +def write_pml_color_substructure_matching_query(fh, domid, ssemap_list, + sse_list): + """ + Color all the SSEs in the structure that have a match in the query + structure with each a different color from constant list + - the idea is to call this for both query and + hit structures, so matching SSEs have the same colors. + + Parameters: + fh - open (write) filehandle to write PyMOL script to + domid - identifier of domain to color + ssemap_list - list of (i,j) tuples matching SSE i (from 1) in query + structure with SSE j (from 1) in this structure + sse_list - list of (chain,start,end,type) tuples specifyign SSEs in struct + Return value: + None + """ + COLOR_LIST = ['red','green','blue','yellow','magenta','cyan','orange','pink','aquamarine','brightorange','brown','deepolive','deepteal','purple','ruby','violet','wheat','firebrick','hotpink','lead','lime','raspberry','splitpea','sulfur','zink','tantalum'] + color_sse_list = [ sse_list[j-1] for (i,j) in ssemap_list] + write_pymol_color_sses(fh, domid, color_sse_list, COLOR_LIST) + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-d domainid] [-s] [-u query_pdbfile] [-b db_pdbfile] \n") + sys.stderr.write( + "-d domainid: only output for this domain, not all\n" + "-s: color the query structure with matching SSEs also.\n" + " Only valid if there is only one domain (either becuase -d is\n" + " specified or there is only one in the input).\n" + "-u query_pdbfile: filename of query PDB file. If not specified then\n" + " identifier is used to find in ASTRAL SCOP hierarchy.\n" + "-b db_pdbfile: filename of database PDB file. If not specfied then\n" + " identifier is used to find in ASTRAL SCOP hierarchy.\n" + " Only valid if there is only one domain (either becuase -d is\n" + " specified or there is only one in the input).\n" + + + ) + sys.exit(1) + + +def main(): + """ + main for ssemap2pml.py + + Usage: ssemap2pml.py [-d domainid] [-s] [-u query_pdbfile] [-b db_pdbfile] + + -d domainid: only output for this domain, not all + -s: color the query structure with matching SSEs also. + Only valid if there is only one domain (either becuase -d is + specified or there is only one in the input). + -u query_pdbfile: filename of query PDB file. If not specified then + identifier is used to find in ASTRAL SCOP hierarchy. + -b db_pdbfile: filename of database PDB file. If not specfied then + identifier is used to find in ASTRAL SCOP hierarchy. + Only valid if there is only one domain (either becuase -d is + specified or there is only one in the input). 
+ + + Input is on stdin, the output of soln2ssemap.py, + identifier and score (as per input), then + for each matching a line containing + i and j separated by a space, + one per line (with blank line before next id) e.g.: + + d1wiua_ -23.0000 + 1 1 + 3 2 + 8 4 + 9 5 + 11 6 + 14 9 + + + The first SSE number on each line is in the query structure + (specified in header information), the second + is in the db structure (d1wiua_ in example). + + Output is a PyMOL script (.pml) on stdout. + """ + global verbose + verbose = False + + dbdomid = None + color_query = False + query_pdbfile = None + db_pdbfile = None + + try: + opts,args = getopt.getopt(sys.argv[1:], "sd:u:b:") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-d": # domain id specified, only get this one + dbdomid = arg + elif opt == "-s": # also color SSEs matched in the query structure + color_query = True + elif opt == "-u": # query PDB filename + query_pdbfile = arg + elif opt == "-b": # db PDB filename + db_pdbfile = arg + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 0: + usage(os.path.basename(sys.argv[0])) + + search_maps = parse_ssemap(sys.stdin) + if (color_query and not dbdomid and len(search_maps.query_ssemap_list) > 1): + sys.stderr.write("WARNING: -s specfied without -d and more than one " + "structure on input: ignored -s option\n") + color_query = False + if (db_pdbfile and not dbdomid and len(search_maps.query_ssemap_list) > 1): + sys.stderr.write("ERROR: -b specified without -d and more than one " + "structure on input\n") + sys.exit(1) + + query_sses = get_sses(search_maps.queryid, query_pdbfile) + write_pymol_prelude(sys.stdout, search_maps) + + if color_query: + write_pymol_load(sys.stdout, search_maps.queryid, + None, 'white', query_pdbfile) + ptsecstruct.write_pml_define_sses(sys.stdout, search_maps.queryid, query_sses) + # reverse order of (i,j) so that it is the query SSE numbers used + ssemap = [(j,i) for (i,j) in search_maps.query_ssemap_list[0].sse_map] + write_pml_color_substructure_matching_query(sys.stdout, + search_maps.queryid, + ssemap, + query_sses) + + for query_ssemap in search_maps.query_ssemap_list: + if ((not dbdomid) or (query_ssemap.domid == dbdomid)): + db_sses = get_sses(query_ssemap.domid, db_pdbfile) + write_pymol_load(sys.stdout, query_ssemap.domid, + query_ssemap.score,'white', db_pdbfile) + ptsecstruct.write_pml_define_sses(sys.stdout, query_ssemap.domid, db_sses) + write_pml_color_substructure_matching_query(sys.stdout, + query_ssemap.domid, + query_ssemap.sse_map, + db_sses) + write_pymol_conclusion(sys.stdout) + + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) + main() diff --git a/scripts/ssepermutationremap.py b/scripts/ssepermutationremap.py new file mode 100755 index 0000000..c9580ff --- /dev/null +++ b/scripts/ssepermutationremap.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python +############################################################################### +# +# ssepermutationremap.py - Remap SSE mapping output of solns2ssemap.py for the +# case that one of the tableaux was row+column permuted. 
+# +# See usage comment in docstring for main() +# +# File: ssepermutationremap.py +# Author: Alex Stivala +# Created: November 2008 +# +# $Id: ssepermutationremap.py 2043 2008-11-27 01:54:29Z astivala $ +# +############################################################################### + +""" +In the case that one of the tableaux has been row+column permuted +(for testing nonsequential matching) +(-u option on pytableaucreate.py (-u option on +qptabmatchstructs.sh)) we cannot just use the mapping produced +by soln2ssemap.py (q.v.) as it just numbers sequentially along +the tableau, but this is no longer in the same order as the SSEs in +the structure. So to get the actualy visualization/RMSD calculation +(superimposessemap.py, the next stage in the pipeline) correct, +we have to apply the permutation so that the numbers are mapped +back to the actual SSEs in the structure that that row+column in the +tableau represents. + +""" + +import sys,os +import getopt +from time import strftime,localtime + +from parsessemap import parse_ssemap,SearchMap,QuerySSEMap + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-d domainid] permutation_list\n") + sys.exit(1) + + +def main(): + """ + main for ssepermutationremap.py + + Usage: sol2ssemap.py [-d domainid] permutation_list + + -d domainid: only output result for query against that domain (SCOP) id, + not all. + + permutation_list is the comma-separted list of integers representing + the permutation. E.g. (for 4 SSEs) 1,2,3,4 is the null permutation, + and 4,3,2,1 represents reversing the sequence, etc. + Note that this must be the same list used to + to produce the mapping, i.e. that the tableaux query + was built with, otherwise it won't mean anything. + + Input is on stdin, the output of soln2ssemap.py, + format is identifier and score, then + for each matching a line containing + i and j separated by a space, + one per line (with blank line before next id) e.g.: + + # TSRCHD LTYPE = T LORDER = T LSOLN = T + # QUERY ID = D1KI9A_ + # DBFILE = /local/charikar/astivala/tableauxdb/astral/tableauxdb.ascii + # Mon Aug 4 12:34:07 2008 + d1wiua_ -23.0000 + 1 1 + 3 2 + 8 4 + 9 5 + 11 6 + 14 9 + + Note we copy the QUERY ID and DBFILE and other information to output for use + in later processing. + + Output is the same format, but the SSE numbers in the first column + have been remapped according to the sse_num_list; i.e. the numbers + in that column of the input as used as indices into the sse_num_list + and the resulting numbers from that list are the correspdoning output. 
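A minimal sketch of the remapping, with hypothetical values:

    permut_list = [4, 3, 2, 1]            # permutation given on the command line
    sse_map = [(1, 1), (3, 2), (4, 5)]    # (i, j) pairs read from soln2ssemap.py output
    remapped = [(permut_list[i-1], j) for (i, j) in sse_map]
    # remapped == [(4, 1), (2, 2), (1, 5)]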
+ """ + global verbose + verbose = False + + dbdomid = None + + try: + opts,args = getopt.getopt(sys.argv[1:], "d:") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-d": # domain id specified, only get this one + dbdomid = arg + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 1: + usage(os.path.basename(sys.argv[0])) + + permut_list_str = args[0].split(',') + permut_list = [] + sse_id_uniq_dict = {} # { id : True } just for checking all unique + for sse_id_str in permut_list_str: + if sse_id_str.isdigit(): + if sse_id_uniq_dict.has_key(int(sse_id_str)): + sys.stderr.write("duplicate SSE sequential number " + + sse_id_str + "\n") + usage(sys.argv[0]) + sse_id_uniq_dict[int(sse_id_str)] = True + permut_list.append(int(sse_id_str)) + else: + sys.stderr.write("not a valid SSE sequential number '" + + sse_id_str + "'\n") + usage(sys.argv[0]) + + search_maps = parse_ssemap(sys.stdin) + + sys.stdout.write('# generated by ' + os.path.basename(sys.argv[0]) + ' ' + ' '.join(sys.argv[1:]) +'\n') + timestamp = strftime("%d%b%Y %H:%M:%S", localtime()) + sys.stdout.write('# on ' + timestamp + '\n') + sys.stdout.write('# from:\n') + for cline in search_maps.comment_lines: + sys.stdout.write(cline) + sys.stdout.write('#\n') + for query_ssemap in search_maps.query_ssemap_list: + if ((not dbdomid) or (query_ssemap.domid == dbdomid)): + sys.stdout.write('%s %12.4f\n' % (query_ssemap.domid,query_ssemap.score)) + for (i,j) in query_ssemap.sse_map: + iprime = permut_list[i-1] + sys.stdout.write(str(iprime) + ' ' + str(j) + '\n') + + + + +if __name__ == "__main__": + main() diff --git a/scripts/ssesubsetremap.py b/scripts/ssesubsetremap.py new file mode 100755 index 0000000..9d8f812 --- /dev/null +++ b/scripts/ssesubsetremap.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python +############################################################################### +# +# ssesubsetremap.py - Remap SSE mapping output of solns2ssemap.py for the +# case that one of the tableaux was a subset of the SSEs +# in the structure. +# +# See usage comment in docstring for main() +# +# File: ssesubsetremap.py +# Author: Alex Stivala +# Created: November 2008 +# +# $Id: ssesubsetremap.py 2038 2008-11-25 05:39:26Z astivala $ +# +############################################################################### + +""" +In the case that one of the tableaux is not a whole structure but a +substructure (-s option on pytableaucreate.py (-e option on +qptabmatchstructs.sh)) we cannot just use the mapping produced +by soln2ssemap.py (q.v.) as it just numbers SSEs 1,2,3,... along +the tableau. If one of the tableaux actually reprsents for example +SSEs 4,5,6,7,8 (or 2,6,9,10,11,12,18, need not be sequential) then +we need to map the sequential 1-based numbering of the tableau columns +from soln2ssemap back to the actual SSE sequential nubmers in the structure +according to the mapping provided by the SSE subset list used on +pytableaucreate.py (and provided to this script). + +This all is rather cumbersome and inefficient, but as we actually only +want the matching for comparatively few 'hits' (the best scoring ones) +it would be wasteful to have to do it for every match in the FORTRAN code +itself, this way the searching stays fast and we just have this slow +and cumbersome step to get the matching (and eg generate a PyMOL script +to show the correspondences with colours in PyMOL from it with ssemap2pml.py). 
+ +""" + +import sys,os +import getopt +from time import strftime,localtime + +from parsessemap import parse_ssemap,SearchMap,QuerySSEMap + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-d domainid] sse_num_list\n") + sys.exit(1) + + +def main(): + """ + main for ssesubsetremap.py + + Usage: sol2ssemap.py [-d domainid] sse_num_list + + -d domainid: only output result for query against that domain (SCOP) id, + not all. + + sse_num_list is the comma-separted list of sequential SSE numbers + represented by the tableau of the first ('query') structure, + as supplied to pytableaucreate.py -s option and + qptabmatscructs.py -e option + Note that this must be the same list used to + to produce the mapping, i.e. that the tableaux query + was built with, otherwise it won't mean anything. + + Input is on stdin, the output of soln2ssemap.py, + format is identifier and score, then + for each matching a line containing + i and j separated by a space, + one per line (with blank line before next id) e.g.: + + # TSRCHD LTYPE = T LORDER = T LSOLN = T + # QUERY ID = D1KI9A_ + # DBFILE = /local/charikar/astivala/tableauxdb/astral/tableauxdb.ascii + # Mon Aug 4 12:34:07 2008 + d1wiua_ -23.0000 + 1 1 + 3 2 + 8 4 + 9 5 + 11 6 + 14 9 + + Note we copy the QUERY ID and DBFILE and other information to output for use + in later processing. + + Output is the same format, but the SSE numbers in the first column + have been remapped according to the sse_num_list; i.e. the numbers + in that column of the input as used as indices into the sse_num_list + and the resulting numbers from that list are the correspdoning output. 
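    For example (hypothetical numbers): if the query tableau was built with
    sse_num_list 2,6,9,10,11 then an input pair '3 7' is rewritten as '9 7',
    since column 3 of the query tableau corresponds to SSE 9 of the structure.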
+ """ + global verbose + verbose = False + + dbdomid = None + + try: + opts,args = getopt.getopt(sys.argv[1:], "d:") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-d": # domain id specified, only get this one + dbdomid = arg + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 1: + usage(os.path.basename(sys.argv[0])) + + sse_id_list_str = args[0].split(',') + sse_id_list = [] + sse_id_uniq_dict = {} # { id : True } just for checking all unique + for sse_id_str in sse_id_list_str: + if sse_id_str.isdigit(): + if sse_id_uniq_dict.has_key(int(sse_id_str)): + sys.stderr.write("duplicate SSE sequential number " + + sse_id_str + "\n") + usage(sys.argv[0]) + sse_id_uniq_dict[int(sse_id_str)] = True + sse_id_list.append(int(sse_id_str)) + else: + sys.stderr.write("not a valid SSE sequential number '" + + sse_id_str + "'\n") + usage(sys.argv[0]) + sse_id_list.sort() # ensure SSEs are in order + + search_maps = parse_ssemap(sys.stdin) + + sys.stdout.write('# generated by ' + os.path.basename(sys.argv[0]) + ' ' + ' '.join(sys.argv[1:]) +'\n') + timestamp = strftime("%d%b%Y %H:%M:%S", localtime()) + sys.stdout.write('# on ' + timestamp + '\n') + sys.stdout.write('# from:\n') + for cline in search_maps.comment_lines: + sys.stdout.write(cline) + sys.stdout.write('#\n') + for query_ssemap in search_maps.query_ssemap_list: + if ((not dbdomid) or (query_ssemap.domid == dbdomid)): + sys.stdout.write('%s %12.4f\n' % (query_ssemap.domid,query_ssemap.score)) + for (i,j) in query_ssemap.sse_map: + iprime = sse_id_list[i-1] + sys.stdout.write(str(iprime) + ' ' + str(j) + '\n') + + + + +if __name__ == "__main__": + main() diff --git a/scripts/ssmxmlout2col.py b/scripts/ssmxmlout2col.py new file mode 100755 index 0000000..9735632 --- /dev/null +++ b/scripts/ssmxmlout2col.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# +# File: ssmxmlout2col.sh +# Author: Alex Stivala +# Created: November 2008 +# +# ssmxmlout2col.sh - Convert SSM webserver XML output to 2 column +# format as output by tsrchd_sparse etc. which can +# be processed with tsevalfn.py etc. +# +# Usage: ssmxmlout2col.sh < domain.xml +# +# Output has two columns, database id and SSM Pcli score +# +# Output is to stdout. 
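# For illustration (hypothetical identifier and score), one output line
# looks like:
#
#   d1ubia_ 0.8321
#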
+# +# Uses the XML output format from the SSM webserver +# http://www.ebi.ac.uk/msd-srv/ssm/ +# +# Developed with SSM v2.36 output +# +# $Id: ssmxmlout2col.py 2103 2009-03-16 05:15:19Z astivala $ +# + +import os,sys +from xml.dom import minidom + +if (len(sys.argv) != 1): + sys.stderr.write('Usage: ' + sys.argv[0] + '\n') + sys.exit(1) + +doc = minidom.parse(sys.stdin) +matches = doc.getElementsByTagName("Match") +for match in matches: + qscore = [child for child in match.childNodes + if child.nodeType == child.ELEMENT_NODE and + child.nodeName == "Q-score"][0] + qval = qscore.firstChild.data + target = [child for child in match.childNodes + if child.nodeType == child.ELEMENT_NODE and + child.nodeName == "Target"][0] + name = [child for child in target.childNodes + if child.nodeType == child.ELEMENT_NODE and + child.nodeName == "name"][0] + sid = name.firstChild.data + sys.stdout.write('%s %s\n' % (sid, qval)) + + + diff --git a/scripts/star2auctable.py b/scripts/star2auctable.py new file mode 100755 index 0000000..9d241c0 --- /dev/null +++ b/scripts/star2auctable.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python +############################################################################### +# +# star2auctable.py - convert StAR results.txt format to table of differences +# +# File: star2auctable.py +# Author: Alex Stivala +# Created: April 2010 +# +# $Id: star2auctable.py 3681 2010-05-17 06:05:04Z alexs $ +############################################################################### +""" + star2auctable.py - convert StAR output to table of AUC differences + + Usage: star2auctable.py [-p pvalue] results.txt conf_intervals.txt reference-method + + -p pvalue - p-value to use for signficant differences, default 0.05 + + results.txt is the output of StAR to read, a matrix of AUC differences + (upper triangle) and p-values (lower triangle) + + conf_intervals.txt is the StAR conf_intervals.txt output file. + + reference-method is the method we want to measure the others + relative to i.e we produce a table of AUC difference from + this method with p-values + (don't include the quotes that StAR always includes) + + Output is to stdout. + + The reference for StAR is: + + Vergara, Normabuena, Ferrada, Slater and Melo 'StAR: a simple tool for + the statistical comparison of ROC curves' BMC Bioinformatics 2008 9:265 + +""" + + +import sys,os +import getopt + +import numpy + +def parse_results_file(results_fh): + """ + Parse the StAR results.txt file, a matrix of AUC differences + (upper triangle) and p-values (lower triangle). + The file is tab-delimited + + Parameters: + results_fh - open filehandle to read results.txt from + + Return value: + tuple (resarray, methodlist) + where resarray is the matrix parsed, with delta AUC in uppper + and p-value in lower triange, n square for n methods. 
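              (For illustration, with invented values for two hypothetical methods:
               resarray == [[0, 0.0213], [0.0015, 0]], where 0.0213 in the upper
               triangle is the AUC difference and 0.0015 in the lower triangle is
               the corresponding p-value.)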
+ methodlist is list of method names, position in list is index + in resarray for that method + """ + for line in results_fh: + sline = line.split('\t') + if len(sline) < 2: + continue + if line[0] == '\t': + methodlist = line.split('\t')[1:] + n = len(methodlist) + resarray = numpy.zeros((n,n)) + i = 0 + continue + j = 0 + for v in sline[1:]: + if i != j: + resarray[i,j] = v + j += 1 + i += 1 + + # remove quotes from method names (and newline from last one) + methodlist = [method.rstrip().lstrip('"').rstrip('"') for method in methodlist] + return (resarray, methodlist) + + +def parse_conf_intervals(ci_fh): + """ + Parse the StAR conf_intervals.txt file, each row a pair of methods + with AUC difference (this time WITH sign, so we know which is better + and which worse) and confidence interval + + Parameters: + ci_fh - open filehandle to read conf_intervals.txt from + + Return value: + dict { (method1,method2) : (auc_difference, cilower, ciupper) } + + mapping pair of methods to difference in AUC (method1 - method2), + and lower and upper confidence interval value + """ + ci_dict = {} + lineno = 1 + for line in ci_fh: + if lineno == 1: + lineno += 1 + continue + sline = line.split('\t') + (method1,method2) = sline[0].split('/') + method1 = method1.lstrip('"').rstrip('"') + method2 = method2.lstrip('"').rstrip('"') + deltaAUC = float(sline[1]) + cipair = sline[2] # ( -0.0642863 , -0.0410837 ) + cilower = cipair.split(' ')[1] + ciupper = cipair.split(' ')[3] + ci_dict[(method1,method2)] = (deltaAUC, cilower, ciupper) + lineno += 1 + return ci_dict + + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + sys.stderr.write("Usage " + progname + "[-p pvalue] results.txt conf_intervals.txt reference-method\n") + sys.exit(1) + +def main(): + """ + main for star2auctable.py + see usage message in header comment + """ + sigpvalue = 0.05 + + try: + opts,args = getopt.getopt(sys.argv[1:], "p:h?") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-p": + sigpvalue = float(arg) + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 3: + usage(os.path.basename(sys.argv[0])) + + resultsfilename = args[0] + confintervalsfilename = args[1] + referencemethod = args[2] + + (resarray, methodlist) = parse_results_file(open(resultsfilename)) + + confintervals_dict = parse_conf_intervals(open(confintervalsfilename)) + + + q = 0 + for q in xrange(len(methodlist)): + if methodlist[q] == referencemethod: + j = q + break + if q >= len(methodlist)-1: + sys.stderr.write("Method %s not found\n" % referencemethod) + sys.exit (1 ) + + notdiff_list = [] + for i in xrange(len(methodlist)): + if methodlist[i] != referencemethod: + try: + ci_tuple = confintervals_dict[(referencemethod,methodlist[i])] + signedDeltaAUC = ci_tuple[0] + except KeyError: + ci_tuple = confintervals_dict[(methodlist[i],referencemethod)] + signedDeltaAUC = -ci_tuple[0] + + if i < j: + deltaAUC = resarray[i,j] + pvalue = resarray[j,i] + else: + pvalue = resarray[i,j] + deltaAUC = resarray[j,i] + + assert(deltaAUC - abs(signedDeltaAUC) < 1e-5) + + if pvalue < sigpvalue: + sys.stdout.write("%s\t%5.4f\t%5.4g\t%5.4f\n" % (methodlist[i], deltaAUC, pvalue, signedDeltaAUC)) + else: + notdiff_list.append(methodlist[i]) + + + if len(notdiff_list) > 0: +# sys.stdout.write("Not signficantly different from %s at p = %5.4f:\n" % 
(referencemethod, sigpvalue)) +# for m in notdiff_list: +# sys.stdout.write("%s\n" % m) + + # output in same column format as different ones, to allow sort or + # other easy parsing of output + sys.stdout.write("%s\t%4.3f\t%5.4g\t%4.3f\n" % + (', '.join([referencemethod]+notdiff_list), + 0, sigpvalue, 0)) + + +if __name__ == "__main__": + main() diff --git a/scripts/starauctable2tex.sh b/scripts/starauctable2tex.sh new file mode 100755 index 0000000..ceb61b5 --- /dev/null +++ b/scripts/starauctable2tex.sh @@ -0,0 +1,39 @@ +#!/bin/sh +# +# File: starauctable2tex.sh +# Author: Alex Stivala +# Created: May 2010 +# +# starauctable2tex.sh -convert output of star2auctable.py to LaTeX format +# +# Usage: starauctable2tex.sh +# +# Input on on stdin is stdout of star2auctable.py +# Output is to stdout +# +# Uses options specific to GNU sort +# +# $Id: starauctable2tex.sh 3682 2010-05-17 06:05:18Z alexs $ +# + +if [ $# -ne 0 ]; then + echo "Usage: $0" 2>&1 + exit 1 +fi + + +cat <&2 + exit 1 +fi + +use_hours=1 +use_seconds=1 + +if [ $# -ge 1 ] ; then + if [ `expr substr $1 1 1` = "-" ]; then + if [ "$1" = "-m" ]; then + use_hours=0 + elif [ "$1" = "-h" ]; then + use_seconds=0 + else + echo "Usage: $0 [-m|h] file list" >&2 + exit 1 + fi + shift 1 + fi +fi + +total_seconds=0 +builtinformat=0 +for errfile in $* +do + grep --text elapsed ${errfile} >/dev/null 2>&1 + if [ $? -eq 0 ]; then + elapsed=`grep --text elapsed ${errfile} | awk '{print $3}' | sed 's/elapsed//' | tail -1` + dotindex=`expr index ${elapsed} '.'` + if [ ${dotindex} -ne 0 ]; then + # less than an hour + colonindex=`expr index ${elapsed} ':'` + colonindex=`expr $colonindex - 1` + hours=0 + mins=`expr substr ${elapsed} 1 ${colonindex}` + secindex=`expr $colonindex + 2` + secs=`expr substr ${elapsed} ${secindex} 2` + else + colonindex=`expr index ${elapsed} ':'` + colonindex=`expr $colonindex - 1` + hours=`expr substr ${elapsed} 1 ${colonindex}` + next=`expr $colonindex + 2` + rest=`expr substr ${elapsed} $next 999` + colonindex=`expr index ${rest} ':'` + colonindex=`expr $colonindex - 1` + mins=`expr substr ${rest} 1 $colonindex` + next=`expr $colonindex + 2` + rest=`expr substr ${rest} $next 999` + secs=$rest + fi + total_seconds=`expr $total_seconds + $hours \* 3600 + $mins \* 60 + $secs` + else + builtinformat=1 + elapsed=`grep --text real ${errfile} | awk '{print $2}'` + mindex=`expr index ${elapsed} 'm'` + mindex=`expr $mindex - 1` + mins=`expr substr ${elapsed} 1 ${mindex}` + secindex=`expr $mindex + 2` + sindex=`expr index ${elapsed} 's'` + seclen=`expr ${sindex} - ${secindex}` + secs=`expr substr ${elapsed} ${secindex} ${seclen}` + + total_seconds=`echo "$total_seconds + $mins * 60 + $secs" | bc -l` + fi +done + +if [ ${builtinformat} -eq 1 ]; then + total_seconds=`printf "%.0f" $total_seconds` +fi + +if [ $use_hours -eq 1 ]; then + hours=`expr $total_seconds / 3600` + mins=`expr $total_seconds - $hours \* 3600` + mins=`expr $mins / 60` + rsecs=`expr $total_seconds - $hours \* 3600` + rsecs=`expr $rsecs - $mins \* 60` + if [ $use_seconds -eq 1 ]; then + printf '%d h %02d m %02d s' ${hours} ${mins} ${rsecs} + else + if [ $rsecs -ge 30 ]; then + mins=`expr $mins + 1` + fi + printf '%d h %02d m' ${hours} ${mins} + fi +else + mins=`expr $total_seconds / 60` + rsecs=`expr $total_seconds - $mins \* 60` + printf '%02d m %02d s' ${mins} ${rsecs} +fi diff --git a/scripts/superimposessemap.py b/scripts/superimposessemap.py new file mode 100755 index 0000000..1a3920b --- /dev/null +++ b/scripts/superimposessemap.py @@ -0,0 +1,421 @@ 
+#!/usr/bin/env python +############################################################################### +# +# superimposessemap.py - Superimpose structures according to SSE mapping +# +# File: superimposessemap.py +# Author: Alex Stivala +# Created: August 2008 +# +# +# Supermipose in 3D the residues in corresponding SSEs by orthogonal +# transformations (using SVD) using the Bio.PDB.Superimposer module. +# +# $Id: superimposessemap.py 1821 2008-08-18 00:54:56Z astivala $ +# +############################################################################### + +""" + +Using the SSE mapping from soln2ssemap.py, which shows pairs of SSE +sequential (from 1) numbers that correspond to each other, use orthogonal +transormation to superimpose the residues in corresponding SSEs, +calculating RMSD and producing superimposition in a PDB file for visualization. + +Requires the ptsecstruct.py module to get secondary structures using +DSSP (or STRIDE) (add directory contianing to to PYTHONPATH). + +Note that these must be the same definintions used +to produce the mapping, i.e. that the tableaux database and query +were built with, otherwise it won't realy make sense. + +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os +import getopt +from time import strftime,localtime + +import Bio.PDB + +import ptsecstruct +from ptutils import biopdbresid_to_pdbresseq,get_int_icode + +from parsessemap import parse_ssemap,SearchMap,QuerySSEMap +from pathdefs import ASTRAL_ROOT + + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def get_structure(scopsid, thepdbfile=None): + """ + Get Bio.PDB parsed structure for specified identifier or PDB file. + + Parameters: + scopsid - SCOP identifier to get SSEs for; used to locate file + under ASTRAL SCOP hierarchy. + thepdbfile - (default None) if not None, PDB file to get SSEs for, + overriding scopsid. + Return value: + Bio.PDB parsed structure. + """ + if thepdbfile: + pdbfile = thepdbfile + else: + pdbfilename = os.path.join(scopsid[2:4].lower(), + scopsid.lower() + '.ent') + pdbfile = os.path.join(ASTRAL_ROOT, pdbfilename) + + parser = Bio.PDB.PDBParser() + structure = parser.get_structure(scopsid, pdbfile) + return structure + + +def get_sse_nodes(scopsid, thepdbfile=None): + """ + Get SSE definitions in form of PTNode objects + from the supplied SCOP sid using + DSSP. Uses the ptsecstruct.py module, note comments at top of this + module also regarding ensuring the same definitions are used here + as for the actual search. + + Parameters: + scopsid - SCOP identifier to get SSEs for; used to locate file + under ASTRAL SCOP hierarchy. + thepdbfile - (default None) if not None, PDB file to get SSEs for, + overriding scopsid. + Return value: + list of PTNode objects represneting the SSEs. 
+ """ + if thepdbfile: + pdbfile = thepdbfile + else: + pdbfilename = os.path.join(scopsid[2:4].lower(), + scopsid.lower() + '.ent') + pdbfile = os.path.join(ASTRAL_ROOT, pdbfilename) + + secstruct = ptsecstruct.read_secstruct_from_dssp(pdbfile) + return secstruct.get_sse_tuple_list() + + +def get_residue_list(model): + """ + Get list of Bio.PDB.Residue objects in supplied Bio.PDB.Model + Parmeters: + model - Bio.PDB.Model object + Return value: + List of Bio.PDB.Residue objects in the model + + """ + residue_list = [] + for chain in model: + # id of a residue in Bio.PDB is tuple (hetatm, resseqnum, icode) + residue_list += [ residue for residue in chain.get_unpacked_list() + if Bio.PDB.is_aa(residue) ] + return residue_list + + +def build_resid_dict(residue_list): + """ + Build dictionary mapping (chainid, pdb_resid) to index in residue_list + for all residues, not just those in this domain. + + Parameters: + residue_list - list of Bio.PDB.Residue objects + Return value: + dict of { {chainid,pdb_resseq) : seqindx } + where chainid and pdb_resseq make up + the PDB residue identifier, the pdb_resseq + being string resnum+icode if any e.g. + '60' or '60A', seqindx is the indiex + into sequential list of all residues + residue_list. + """ + pdb_resid_dict = {} + seq_indx = 0 + while seq_indx < len(residue_list): + residue = residue_list[seq_indx] + pdb_resid_dict[( ptsecstruct.pdb_chainid_to_stride_chainid( + residue.get_full_id()[2]), + biopdbresid_to_pdbresseq(residue.get_id()) )] = seq_indx + seq_indx += 1 + return pdb_resid_dict + + +def get_matched_residues(matched_sses, query_struct, db_struct): + """ + Given the list of correpsonding SSEs in the two structures, return + list of corresponding Bio.PDB.Residue objects. + + Parameters: + matched_sses - list of (A,B) tuples where A and B are + tuples (chain, start_resi, end_resi, type) in + query_struct and db_struct respectively. + query_struct - Bio.PDB.Structure + db_struct - Bio.PDB.Structure + Return value: + tuple (match_query_residues, match_db_residues) of equal length lists of + corresponding Bio.PDB.Residue objects in query and db structs resp. 
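    For illustration (hypothetical chain and residue numbers), one element of
    matched_sses pairing a query helix with a database helix looks like:

        (('A', '5', '17', 'H'), ('A', '3', '15', 'H'))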
+ """ + query_model = query_struct[0] # always using model 0 (TODO) + db_model = db_struct[0] # always using model 0 (TODO) + + query_residue_list = get_residue_list(query_model) + query_resid_dict = build_resid_dict(query_residue_list) + db_residue_list = get_residue_list(db_model) + db_resid_dict = build_resid_dict(db_residue_list) + + match_query_residues = [] + match_db_residues = [] + for ((qchain, qstart_resi, qend_resi, qtype), + (dchain, dstart_resi, dend_resi, dtype)) in matched_sses: + try: + start_indx = query_resid_dict[(qchain, qstart_resi)] + except KeyError: + # May be HETATM + while not query_resid_dict.has_key((qchain, qstart_resi)): + qstart_resi = str(get_int_icode(qstart_resi)[0] + 1) + start_indx = query_resid_dict[(qchain, qstart_resi)] + try: + end_indx = query_resid_dict[(qchain, qend_resi)] + except KeyError: + # May be HETATM + while not query_resid_dict.has_key((qchain, qend_resi)): + qend_resi = str(get_int_icode(qend_resi)[0] - 1) + end_indx = query_resid_dict[(qchain, qend_resi)] + query_residues = query_residue_list[start_indx : end_indx + 1] + try: + start_indx = db_resid_dict[(dchain, dstart_resi)] + except KeyError: + # May be HETATM + while not db_resid_dict.has_key((dchain, dstart_resi)): + dstart_resi = str(get_int_icode(dstart_resi)[0] + 1) + start_indx = db_resid_dict[(dchain, dstart_resi)] + try: + end_indx = db_resid_dict[(dchain, dend_resi)] + except KeyError: + # May be HETATM + while not db_resid_dict.has_key((dchain, dend_resi)): + dend_resi = str(get_int_icode(dend_resi)[0] - 1) + end_indx = db_resid_dict[(dchain, dend_resi)] + db_residues = db_residue_list[start_indx : end_indx + 1] + +# # if the SSEs are of unequal length, just truncate the longer +# # FIXME: should do something better here, e.g. use residues +# # in middle of SSEs since definitions at ends probably less certain + +# if len(db_residues) > len(query_residues): +# db_residues = db_residues[:len(query_residues)] +# elif len(query_residues) > len(db_residues): +# query_residues = query_residues[:len(db_residues)] + +# match_query_residues += query_residues +# match_db_residues += db_residues + + +# # use the first and last residues in each SSE +# # FIXME: should really use projected enpoints on vector +# # to represent the vector actually used to construct tableau +# # as per fit_axis in ptnode.py +# match_query_residues += [query_residues[0], query_residues[-1]] +# match_db_residues += [db_residues[0], db_residues[-1]] + + + # another dodgy way: just the 'most cetnral' residue (FIXME) + match_query_residues.append(query_residues[len(query_residues)/2]) + match_db_residues.append(db_residues[len(db_residues)/2]) + + + assert(len(match_query_residues) == len(match_db_residues)) + return (match_query_residues, match_db_residues) + + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-d domainid] [-u query_pdbfile] [-b db_pdbfile] [-o outputdir] \n") + sys.stderr.write( + "-d domainid: use this structure, if more than one in input\n" + "-u query_pdbfile: filename of query PDB file. If not specified then\n" + " identifier is used to find in ASTRAL SCOP hierarchy.\n" + "-b db_pdbfile: filename of database PDB file. 
If not specfied then\n" + " identifier is used to find in ASTRAL SCOP hierarchy.\n" + " Only valid if there is only one domain (either becuase -d is\n" + " specified or there is only one in the input).\n" + "-o outputdir: directory to write PDB of superimposed structures in.\n" + ) + sys.exit(1) + + +def main(): + """ + main for superimposessemap.py + + Usage: superimposessemap.py [-d domainid] [-s] [-u query_pdbfile] [-b db_pdbfile] [-o outputdir] + + -d domainid: only output for this domain, not all + -u query_pdbfile: filename of query PDB file. If not specified then + identifier is used to find in ASTRAL SCOP hierarchy. + -b db_pdbfile: filename of database PDB file. If not specfied then + identifier is used to find in ASTRAL SCOP hierarchy. + Only valid if there is only one domain (either becuase -d is + specified or there is only one in the input). + -o outputdir: directory to write PDB files of superimposed structures in. + + + Input is on stdin, the output of soln2ssemap.py, + identifier and score (as per input), then + for each matching a line containing + i and j separated by a space, + one per line (with blank line before next id) e.g.: + + d1wiua_ -23.0000 + 1 1 + 3 2 + 8 4 + 9 5 + 11 6 + 14 9 + + + The first SSE number on each line is in the query structure + (specified in header information), the second + is in the db structure (d1wiua_ in example). + + Output is RMSD value on stdout, and PDB file(s) in specified directory if -o + specfied. + stdout output format is one result per line, fields whitespace delimited: + + identifier score num_sses_matched num_aligned_points rmsd + + e.g. + + d1t10a_ -40.9999 8 16 16.93 + + num_aligned_points is number of points used in the superposition, + RMSD is the RMS deviation of those points (in Angstroms). 
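    The superposition itself is a least-squares fit with Bio.PDB.Superimposer;
    a minimal sketch, assuming fixed_ca and moving_ca are equal-length lists of
    Bio.PDB Atom objects (e.g. the CA atoms of the matched residues) and
    db_structure is the parsed database structure:

        superimposer = Bio.PDB.Superimposer()
        superimposer.set_atoms(fixed_ca, moving_ca)   # fit moving onto fixed (SVD)
        print superimposer.rms                        # RMSD over the atoms used in the fit
        superimposer.apply(db_structure.get_atoms())  # transform all atoms of the db structure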
+ + """ + global verbose + verbose = False + + dbdomid = None + query_pdbfile = None + db_pdbfile = None + outputdir = None + + try: + opts,args = getopt.getopt(sys.argv[1:], "d:u:b:o:") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-d": # domain id specified, only get this one + dbdomid = arg + elif opt == "-u": # query PDB filename + query_pdbfile = arg + elif opt == "-b": # db PDB filename + db_pdbfile = arg + elif opt == "-o": # output directory + outputdir = arg + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 0: + usage(os.path.basename(sys.argv[0])) + + search_maps = parse_ssemap(sys.stdin) + + if (db_pdbfile and not dbdomid and len(search_maps.query_ssemap_list) > 1): + sys.stderr.write("ERROR: -b specified without -d and more than one " + "structure on input\n") + sys.exit(1) + + query_sse_nodes = get_sse_nodes(search_maps.queryid, query_pdbfile) + query_structure = get_structure(search_maps.queryid, query_pdbfile) + for query_ssemap in search_maps.query_ssemap_list: + if ((not dbdomid) or (query_ssemap.domid == dbdomid)): + db_sse_nodes = get_sse_nodes(query_ssemap.domid, db_pdbfile) + db_structure = get_structure(query_ssemap.domid, db_pdbfile) + sse_map = query_ssemap.sse_map + if len(sse_map) == 0: + sys.stderr.write('no SSEs matched for ' + query_ssemap.domid + + ': skipping\n') + continue + + matched_sse_nodes = [(query_sse_nodes[i-1],db_sse_nodes[j-1]) for (i,j) in sse_map] + matched_residues = get_matched_residues(matched_sse_nodes, + query_structure, + db_structure) + + # get Carbon alpha atoms for matched residues + query_atoms = [residue['CA'] for residue in matched_residues[0]] + db_atoms = [residue['CA'] for residue in matched_residues[1]] + + # get orthogonal transformation to superimpose query and db atoms + superimposer = Bio.PDB.Superimposer() + superimposer.set_atoms(query_atoms, db_atoms) + + # get the RMSD for the atoms used to calculate transformation + rmsd = superimposer.rms + + sys.stdout.write('%s %8.4f %4d %4d %6.2f\n' % + (query_ssemap.domid,query_ssemap.score, + len(sse_map), + len(matched_residues[0]), rmsd)) + + if outputdir: + if not os.path.isdir(outputdir): + sys.stderr.write("'" + outputdir + "' is not an existing " + "directory, no output written\n") + else: + # apply the transformation to all db atoms + superimposer.apply(db_structure.get_atoms()) + + # save aligned structure as PDB file + io = Bio.PDB.PDBIO() + io.set_structure(db_structure) + outpdbfilename = search_maps.queryid.lstrip().rstrip() + \ + '_' + \ + query_ssemap.domid.lstrip().rstrip() + \ + '.pdb' + outpdbfh = open(os.path.join(outputdir,outpdbfilename), 'w') + outpdbfh.write('REMARK generated by ' + + os.path.basename(sys.argv[0]) + '\n') + timestamp = strftime("%d%b%Y %H:%M:%S", localtime()) + outpdbfh.write('REMARK on ' + timestamp + '\n') + outpdbfh.write('REMARK \n') + outpdbfh.write('REMARK ' + query_ssemap.domid + + ' superimposed on ' + search_maps.queryid + + '\n') + outpdbfh.write('REMARK SCORE = %8.4f\n' % query_ssemap.score) + outpdbfh.write('REMARK NSSES = %4d\n' % len(sse_map)) + outpdbfh.write('REMARK NRES = %4d\n' % len(matched_residues[0])) + outpdbfh.write('REMARK RMSD = %6.2f\n' % rmsd) + outpdbfh.write('REMARK \n') + outpdbfh.write('REMARK from:\n') + for cline in search_maps.comment_lines: + outline = cline[:65] + outpdbfh.write('REMARK ' + outline) + if outline[-1] != '\n': + outpdbfh.write('\n') + io.save(outpdbfh) + outpdbfh.close() + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 
'tempnam', RuntimeWarning) + main() diff --git a/scripts/tabcolmax.sh b/scripts/tabcolmax.sh new file mode 100755 index 0000000..da5dc0c --- /dev/null +++ b/scripts/tabcolmax.sh @@ -0,0 +1,98 @@ +#!/bin/sh +# +# tabcolmax.sh - indicate the maximum value in a LaTeX table column +# by enclosing it in \colmax{} +# +# Alex Stivala, originally 2006 +# modified for real numbers not integers October 2008, +# works on room0219pos09.cs.mu.oz.au (maybe not charikar.cs.mu.oz.au, +# note strange problem with differnece in handling \\ and \\\\ etc.) even +# though both Linux +# +# Usage: tablcolmax ... +# +# The is the file the table is read from (must be able +# to be read twice, so we can't use stdin). +# +# etc are the column (field) numbers (from 1) to process. +# The field separator is '&' +# +# Define something like: +# +# \newcommand{\colmax}[1]{\textbf{1}} % indicate maximum value in a table column +# +# in the LaTeX file to make the maximum value in boldface. +# +# $Id: tabcolmax.sh 1961 2008-10-07 06:08:06Z astivala $ +# + + + +if [ $# -lt 2 ]; then + echo "Usage: $0 ..." >&2 + exit 1 +fi + +infile=$1 +shift +col_list=$@ + +delim='&' + +tempfile=/var/tmp/tabcolmax.$$.tmp +cat /dev/null > $tempfile + +# we are assuming every row has the same numbef of fields +numfields=`head -1 $infile | sed "s/[^${delim}]//g" | wc -c` + +for col_num in $col_list ; do + # we assume that the field we are processing is numeric + maxval=`cut -d $delim -f $col_num < $infile | sort -n -r | head -1` + if [ $col_num -eq $numfields ]; then + # last field is messy: has whitesapce then \\\ so fix it + maxval=`echo "$maxval" | cut -f1 -d' '` + fi + /bin/echo -e "$col_num\t$maxval" >> $tempfile +done + + +# tricky: sed needed to replace \ with \\ since read removes single \ +#sed 's/\\/\\\\/g' $infile | while read line ; do +cat $infile | while read line ; do + i=1 + while [ $i -le $numfields ]; do + done_field=0 + field=`echo "$line" | cut -d $delim -f $i` +# if [ $i -eq $numfields ]; then +# # last field is messy: has whitesapce then \\\ so fix it +# field=`echo "$field" | cut -f1 -d' '` +# fi + for col_num in $col_list ; do + if [ $col_num -eq $i ]; then + maxline=`grep "^${col_num}[ ]*" $tempfile` + if [ $? -eq 0 ]; then + maxval=`echo "$maxline" | cut -f2` + iseq=`echo "$field == $maxval" | bc` + if [ $iseq -ne 0 ]; then + /bin/echo -En "\colmax{$field}" + else + echo -n "$field" + fi + done_field=1 + fi + fi + done + if [ $done_field -eq 0 ]; then + echo -n "$field" + fi + if [ $i -eq $numfields ]; then + echo \\\\ + else + echo -n $delim + fi + i=`expr $i + 1` + done +done + +rm $tempfile + diff --git a/scripts/tableaubuild.py b/scripts/tableaubuild.py new file mode 100644 index 0000000..61508da --- /dev/null +++ b/scripts/tableaubuild.py @@ -0,0 +1,1088 @@ +############################################################################### +# +# tableaubuild.py - class for building protein tableaux in Python +# +# File: tableaubuild.py +# Author: Alex Stivala +# Created: May 2008 (moved from pytableaucreate.py) +# +# $Id: tableaubuild.py 2852 2009-10-12 03:18:51Z astivala $ +# +############################################################################### + +""" +Build a protein tableau. +The implemntation is actually in pttableau.py which is used by ptgraph2.py +(Pro-Origami), + +Also used to create SSE midpoint distance matrix. 
+ +Tableaux are described by Kamat and Lesk 2007 +'Contact Patterns Between Helices and Strands of Sheet Define Protein + Folding Patterns' Proteins 66:869-876 +and Lesk 2003 'From Electrons to Proteins and Back Again' +Int. J. Quant. Chem. 95:678-682 +and Lesk 1995 'Systematic representation of folding patterns' +J. Mol. Graph. 13:159-164. + +The implementation is based on Arun Konagurthu's TableauCreator program, see +Konagurthu, Stuckey and Lesk 2008 'Structural search and retrieval using +a tableau representation of protein folding patterns' Bioinformatics +(advance access, to be published Jan 5 2008). + +Filenames may be either in the format above or the pdbq1lp.pdb format. +Compressed pdb files are supported (gzip) (e.g. pdb1qlp.ent.gz). + +It is written in Python and depends on some Python libraries: + +. BioPython (including Bio.PDB) + http://www.biopython.org + + Reference for Bio.PDB is: + Hamelryck and Manderick 2003 "PDB parser and structure class implemented + in Python" Bioinformatics 19:2308-2310 + + which in turn depends on Numeric + http://sourceforge.net/projects/numpy + + +Developed on Linux 2.6.9 (x86_64) with Python 2.5.1 +and BioPython 1.43 with Numeric 24.2 + +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os +import getopt +import numpy.oldnumeric as Numeric +from Bio.PDB import * + +from ptnode import * +import pttableau +import ptsecstruct +from ptdomain import * +from ptutils import cleanup_tmpdir,get_int_icode,biopdbresid_to_pdbresseq +from domeval import * +import getdomains +from ptdistmatrix import compute_sse_midpoint_dist_matrix + +#----------------------------------------------------------------------------- +# +# Class definitions +# +#----------------------------------------------------------------------------- + + +# +# Empty classes for exceptions +# + +class NoSSE_Exception(Exception): # raised when no helices or strands found + pass + +# +# Real classes +# +class TableauBuild: + """ + The protein representation consists of a sequence of structure + (helix, strand) nodes with sequence edges in and out of them in + sequence from N-terminus to C-terminus and adjacency edges for + SSEs less than a threshold distance apart. + + Note there may be multiple such sequences (one for each + chain). + + Also the nodes are all labelled with start and end residue + sequence numbers, and node types etc. but this is not used at all + in the here, it is only included because this code was reused from + another program (ptraph2.py) which does require the node + labelling. + """ + + # + # member functions + # + + def __init__(self, pdb_structure, pdbid, + include_310_helices = False, include_pi_helices = False, + add_loop_nodes = False): + """ + Construct empty TableauBuild. To build the structure call + build_graph_from_secstruct(). 
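        A minimal usage sketch (hypothetical PDB filename; assumes the modules
        imported at the top of this file are available):

            parser = PDBParser()
            structure = parser.get_structure('d1ubia_', 'd1ubia_.ent')
            secstruct = ptsecstruct.read_secstruct_from_dssp('d1ubia_.ent')
            tabbuild = TableauBuild(structure, 'd1ubia_')
            tabbuild.build_graph_from_secstruct(secstruct, PTDomain(None, None))  # whole structure as one domain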
+ + Parameters: + pdb_structure - parsed PDB structure from Bio.PDB + pdbid - PDB identifier + include_310_helices - include 3_10 helices in the graph if True + include_pi_helices - include pi_helices in the graph if True + add_loop_nodes - include nodes for loop regions between SSEs if True + + """ + self.pdb_struct = pdb_structure + self.pdbid = pdbid + self.chain_dict = None # Each value of the chain_dict is a + # List of nodes in order from N to C terminus + # so chain_dict is { chainid : node_list } + self.seqnum2node = {} # dictionary of { seqnum : PTNode } + # maping int sequence numbers to PTNode objs + self.tableau = None # PTTableau build in build_tableau + self.include_310_helices = include_310_helices + self.include_pi_helices = include_pi_helices + self.pdb_resid_dict = None # dict of { {chainid,pdb_resseq) : seqindx } + # where chainid and pdb_resseq make up + # the PDB residue identifier, the pdb_resseq + # being string resnum+icode if any e.g. + # '60' or '60A', seqindx is the indiex + # into sequential list of all residues + # residue_list. + self.residue_list = None # list of all residues (for all chains) + # in sequence, built by get_residue_list() + + + + def iter_chains(self): + """ + This generator function iterates over all chains in this PTGraph. + A chain is just a list of nodes so it yields a node list for each + chain. + + Parameters: Nonde. + Return value: YIELDs a node list. + Uses data members (readony): + chain_dict - dict of {chainid:node_list} + """ + # FIXME: can we just 'return self.chain_dict.itervalues()' here? + for nodelist in self.chain_dict.itervalues(): + yield nodelist + + + def iter_nodes(self): + """ + This generator function iterates over all the node in this PTGraph. + + Parameters: None + Return Value: YIELDs a node. + Uses data members: (readonly): + chain_dict - dict of {chainid_node_list} + """ + for nodelist in self.iter_chains(): + for ptnode in nodelist: + yield ptnode + + + + def build_graph_from_secstruct(self, secstruct, domain, chainid=None): + """ + Build the list of nodes from the the supplied PTSecStruct + object. + + + Parameters: + secstruct - PTSecStruct (ptsecstruct.py) object to build from + domain - PTDomain (ptdomain.py) object listing the segment(s) + that make up this domain (only one domain processed at a + time). + (in/out) NOTE: may be modified by having a segment + added if SSE is only partly in domain. + chainid - chain identifier to build graph for only this chain, + or None for all chains (default) + + Uses member data (write): + chain_dict - dict of { chainid : node_list } where node_list is + list of nodes in order, built in this function + secstruct - keeps a pointer to the supplied secstruct + + (readonly): + pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates) + for this protein. + include_310_helices, include_pi_helices - if true, include + these kinds of helices. + + Raises exceptions: + NoSSE_Exception if no helices or strands found + + Return value: + None. + + """ + + self.secstruct = secstruct + + helix_num = 1 + strand_num = 1 + + num_helices_in_domain = 0 + num_strands_in_domain = 0 + + # + # Build dictionary mapping (chainid, pdb_resid) to index in residue_list + # for ALL residues, not just those in this domain. 
+ # + self.residue_list = self.get_residue_list(self.pdb_struct, + PTDomain(None, None)) + self.pdb_resid_dict = {} + seq_indx = 0 + while seq_indx < len(self.residue_list): + residue = self.residue_list[seq_indx] + self.pdb_resid_dict[( ptsecstruct.pdb_chainid_to_stride_chainid( + residue.get_full_id()[2]), + biopdbresid_to_pdbresseq( + residue.get_id()) )] = seq_indx + seq_indx += 1 + + # Note that now we are only adding elements in the supplied domain, + # so the so-called 'chains' may really be segments, i.e. subsequences + # of chains (rest of chain may be in other domain(s) + + self.chain_dict = {} # dict of {chainid : node_list} + + for (start_chainid, start_resnum, end_chainid, end_resnum, helixtype) \ + in secstruct.helix_list: + assert(start_chainid == end_chainid) #helix must be same chain + if chainid and chainid != start_chainid: + continue # chainid specified, skip ones not in that chain + # will consider structures in domain if first residue is in domain + if domain.is_in_domain(start_chainid, + get_int_icode(start_resnum)[0]): + num_helices_in_domain += 1 + if helixtype == "H": + idprefix = "ALPHAHELIX_" + htype = "ALPHA" + this_helix_num = helix_num + helix_num += 1 + elif helixtype == "I": + if not self.include_pi_helices: + continue + idprefix = "PIHELIX_" + htype = "PI" + this_helix_num = helix_num + helix_num += 1 + elif helixtype == "G": + if not self.include_310_helices: + continue + idprefix = "310HELIX_" + htype = "310" + this_helix_num = helix_num + helix_num += 1 + else: # shouldn't happen + sys.stderr.write("ERROR: bad helix type " + helixtype+"\n") + ah_node = PTNodeHelix(htype, + idprefix + start_chainid+"_" +\ + str(this_helix_num), + this_helix_num, + start_resnum, end_resnum, start_chainid, + domain.domainid, + self.residue_list, self.pdb_resid_dict) + if not self.chain_dict.has_key(start_chainid): + self.chain_dict[start_chainid] = [] + self.chain_dict[start_chainid].append(ah_node) + + # we must already have handled the case of SSEs that cross + # domain boundaries (by moving whole SSE to one of the domains) + assert( domain.is_in_domain(end_chainid, get_int_icode(end_resnum)[0]) ) + + for (start_chainid, start_resnum, end_chainid, end_resnum) \ + in secstruct.strand_list: + assert(start_chainid == end_chainid) # must be in same chain + if chainid and chainid != start_chainid: + continue # chainid specified, skip ones not in that chain + if domain.is_in_domain(start_chainid, + get_int_icode(start_resnum)[0]): + num_strands_in_domain += 1 + bs_node = PTNodeStrand("STRAND_"+start_chainid +"_"+\ + str(strand_num), + strand_num, + start_resnum, end_resnum, start_chainid, + domain.domainid, + self.residue_list, + self.pdb_resid_dict) + strand_num += 1 + if not self.chain_dict.has_key(start_chainid): + self.chain_dict[start_chainid] = [] + + # we must already have handled the case of SSEs that cross + # domain boundaries (by moving whole SSE to one of the domains) + assert( domain.is_in_domain(end_chainid, get_int_icode(end_resnum)[0]) ) + self.chain_dict[start_chainid].append(bs_node) + + + # raise an exception if there are no SSEs at all in this domain + if num_helices_in_domain == 0 and num_strands_in_domain == 0: + raise NoSSE_Exception + + delete_chainid_list = [] # list of chainids to delete from chain_dict + for (chainid, nodelist) in self.chain_dict.iteritems(): + # sort in order of start residue id ascending (all must be disjoint) + nodelist.sort() + + if len(nodelist) < 1: + # There are no SSEs in this chain, get rid of it. 
+ sys.stderr.write('WARNING: no SSEs in chain ' + chainid + + '; chain ignored\n') + delete_chainid_list.append(chainid) # don't delete while in loop + continue + else: + # Check for chain with only SSEs that will not be drawn + # (i.e. pi or 310 helices), and delete those too + found_useful_node = False + for ptnode in nodelist: + if isinstance(ptnode, PTNodeStrand): + found_useful_node = True + break + elif isinstance(ptnode, PTNodeHelix): + if ptnode.get_type() == "ALPHA": + found_useful_node = True + break + elif ((ptnode.get_type() == "310" and + self.include_310_helices) or + (ptnode.get_type() == "PI" and + self.include_pi_helices)): + found_useful_node = True + break + if not found_useful_node: + sys.stderr.write('WARNING: only pi or 310 helices in chain ' + + chainid + + '; chain ignored\n') + delete_chainid_list.append(chainid) + continue + + + # delete chains from chain_dict that were marked earlier for deletion + for chainid in delete_chainid_list: + self.chain_dict.pop(chainid) + + # ------------------------------------------------------------------- + + # This is needed only for labelling sheets for HH and KK codes + # (see dfs_strands() etc. below) + + # add edges for hydrogen bonds + # uses secstruct and chainid member data + # these are used for determining which side bridge partners are + # on (and also for drawing a hydrogen bond graph if requested) + self.add_hbond_edges_from_secstruct() + + # add edges for bridge partners + # uses secstruct and chainid member data + self.add_bridge_edges_from_secstruct() + + #--------------------------------------------------------------------- + + + # for sequential numbering, we'll build this dictionary mapping + # sequential number (note NOT restarting for each chain) + # to PTNode + # so that sequential numbers as used in ptgraph2 -b sequential + # option. + # this is a dictionary of { seqnum : PTNode } + self.seqnum2node = {} + for (seqnum, node) in \ + enumerate([node for node in self.iter_nodes() if \ + not ( (isinstance(node, PTNodeTerminus)) or + (isinstance(node, PTNodeHelix) and + ( (node.get_type() == "310" and + not self.include_310_helices) or + (node.get_type() == "PI" and + not self.include_pi_helices) ) ) ) ]): + self.seqnum2node[seqnum+1] = node # start at 1 not 0 + + # ------------------------------------------------------------------------ + + def get_residue_list(self, pdb_struct, domain, getchainid = None): + """ + Return list of Bio.PDB Residue objects in this domain, and optionally + in the specified chain., + + Parameters: + pdb_struct - Bio.PDB parsed PDB struct for the protein + domain - PTDomain (ptdomain.py) object listing the segment(s) + that make up this domain (only one domain processed at a + time). + getchainid - chain identifier to get residues in (default None - + all chains). + + Return value: + list of Bio.PDB Residue objects in the domain (and optionally chain). + Raises exceptions: + NoSSE_Exception for empty structure (happens eg on d1oayi_.ent) + + """ + residue_list = [] + try: + pdb_model = self.pdb_struct[0] # TODO always using model 0 for now + except KeyError: + raise NoSSE_Exception + + for chain in pdb_model: + chainid = ptsecstruct.pdb_chainid_to_stride_chainid(chain.get_id()) + if getchainid and getchainid != chainid: + continue # this is not the chain we want + + # Build a list of Bio.PDB Residue objects that are in this + # domain. 
+ # id of a residue in Bio.PDB is tuple (hetatm, resseqnum, icode) + # so we choose those where residue PDB number + # (in the current chain) is in the domain. + # TODO: maybe should use polypeptide builder for this instead + # (and indeed should probably use it right from the beginning) - + residue_list += [ residue for residue in chain.get_unpacked_list() + if is_aa(residue) and + domain.is_in_domain(chainid, residue.get_id()[1]) + ] + if getchainid: + break # if getchainid specified, we now have it so can quit + return residue_list + + + # iter_strands(), dfs_strands(), + # find_connected_components() and label_sheets() are needed to assign + # strands to sheets in order for HH and KK codes to be used for strands + # only when they are in the same sheet. + # TODO this code is cut&pasted from ptgraph2.py, probably should + # have a base class that this at PTGraph2 both inherit from or somethibng + # rather than all this duplication. + + def add_hbond_edges_from_secstruct(self): + """ + Add edges between structural elements for hydrogen bonds between + those nodes. Called by build_graph_from_secstruct(). + + NB: adds bonds between STRANDs only, not between HELIXes (helices). + + Parameters: None. + Return value: None. + Uses data members: + readonly: + secstruct - PTSecStruct object to get hbonds from + chainid - chainid of chain in PTSecStruct to use + read/write: + chain_dict - dict by chainid of + list of nodes (changes node data, not list as such) + + Precondition: each nodelist in chain_dict + is sorted (by start res seq ascending); + this is done by build_graph_from_secstruct() + before calling. + + """ + hbond_list = self.secstruct.hbond_list + # TODO: do this more efficiently using presorting (ie how it used to + # be done when only one chain) + for (chainid1, resnum1, chainid2, resnum2, dist) in hbond_list: + for ptnode in self.iter_strands(): + if chainid1 == ptnode.get_chainid() and \ + resnum1 >= ptnode.get_start_res_seq() and \ + resnum1 <= ptnode.get_end_res_seq(): + dest_node = self.find_node_containing_seqnum(resnum2, + chainid2) + if dest_node != None and \ + isinstance(dest_node, PTNodeStrand): # only STRANDs + ptnode.add_hbond(dest_node, resnum1, resnum2, dist) + + def add_bridge_edges_from_secstruct(self): + """ + Add edges between strand nodes representing beta brdiges between + those nodes (add just one edge between any two strands). + Called by build_graph_from_secstruct(). + + NB: adds bonds between STRANDs only, not between HELIXes (helices). + + Parameters: None. + Return value: None. 
+ Uses data members: + readonly: + secstruct - PTSecStruct object to get hbonds from + chainid - chainid of chain in PTSecStruct to use + read/write: + chain_dict - dict by chainid of + list of nodes (changes node data, not list as such) + + """ + + bridge_list = self.secstruct.bridgeres_list + # (chainid1, resnum1, chainid2, resnum2, bdir) + + # TODO: do this more efficiently using presorting (ie how it used to + # be done when only one chain) + + for ptnode in self.iter_strands(): + for (chainid1, resnum1, chainid2, resnum2, bdir) in bridge_list: + if chainid1 == ptnode.get_chainid() and \ + resnum1 >= ptnode.get_start_res_seq() and \ + resnum1 <= ptnode.get_end_res_seq(): + try: + dest_node = self.find_node_containing_seqnum(resnum2, + chainid2) + except KeyError: + dest_node = None + sys.stderr.write('WARNING: chain ' + chainid2 + \ + ' involved in beta bridge not found.'+\ + '\n Probably due to domain parsing' +\ + ' breaking a beta sheet.\n') + if dest_node != None and \ + isinstance(dest_node, PTNodeStrand): # only STRANDs + if ptnode == dest_node: + sys.stderr.write('WARNING: ignoring self-bridge ' + + ptnode.nodeid + '\n') + else: + ptnode.add_bridge(dest_node, bdir) + + + def iter_strands(self): + """ + This generator function iterates over all strands in this PTGraph + object. I.e. it yields a strand for each strand in the + node lists. + + Parameters: None. + Return value: YIELDs a strand. + Uses data members (readonly): + self.chain_dict - dict of { chainid : list of nodes } + """ + for nodelist in self.iter_chains(): + for ptnode in nodelist: + if isinstance(ptnode, PTNodeStrand): + yield ptnode + + def find_node_containing_seqnum(self, res_seqnum, chainid): + """ + Find and return node in node list for chain chainid + containing supplied PDB residue + sequence number. + + Parameters: + res_seqnum - PDB residue sequence number to find node for + chainid - chain identifier to find node in + + Return value: + PTNode pointer of PTNode containing the supplied residue seq num + in supplied chainid + or None if the residue is not in a structural element PTNode + + Uses data members (readonly): + chain_dict - chainid dict of list of PTNodes + """ + # TODO: since node_list is sorted should use binary search here + # (maybe try the Python bisect module) + if not self.chain_dict.has_key(chainid): + return None # no such chain, can happen due to domain parsing + for ptnode in self.chain_dict[chainid]: + if ptnode.is_in_interval(res_seqnum): + return ptnode + return None + + def dfs_strands(self, start_strand, visited, dfs_list, from_node, + back_edge_list, + sheet_id=None): + """ + Make a depth-first search traversal of STRAND nodes + using bridge (not sequence) + edges starting at the specfied strand. + + Parameters: + start_strand - STRAND node to start at + visited - (in/out) dictionary of {ptnode:True} visited nodes + dfs_list - (in/out) list of ptnodes visited in dfs order + from_node - node from which we are being (recursively) called + back_edge_list - list of (node, node) tuples representing an + edge between the two nodes, which is a back + edge, i.e. from a node to an ancestor of that + node in the spanning tree. The back edge + means there is a cycle of which the back + edge forms a part. + sheet_id - identifier of this sheet (connected component) to mark + each strand in it with, or None to not mark at all + (default). + + + Recursive function. 
call initially as + dfslist = [] + back_edge_list = [] + dfs_strands(startnode, {}, dfslist, None, back_edge_list) + + Return value: + None. (Output is dfs_list, back_edge_list parameters) + + Uses members (readonly): + chain_dict - dict by chainid of list of PTNodes + + """ + visited[start_strand] = True + if sheet_id != None: + start_strand.set_sheet_id(sheet_id) + #print 'xxx',str(start_strand),sheet_id + dfs_list.append(start_strand) + for (node, bdir_unused, side_unused) in start_strand.get_bridge_list(): + if node not in visited: + self.dfs_strands(node, visited, dfs_list, start_strand, + back_edge_list, sheet_id) + elif node != from_node: #not parent of start_strand in spanning tree + # don't add duplicate back edges + # ((node1,node2) is same as (node2,node1)) + duplicate = False + for (a,b) in back_edge_list: + if ((start_strand == a and node == b) or + (node == a and start_strand == b)): + duplicate = True + break + if not duplicate: + if verbose: + sys.stderr.write('dfs_strands back edge from ' + + str(start_strand) + ' to ' + + str(node) + + '\n') + back_edge_list.append((start_strand, node)) + + + + def find_connected_components(self): + """ + Find the connected components (considering only STRAND nodes + and bridge [not sequence] edges in the graph). + + This is done by a DFS traversal at every node in the graph + (skipping already visited ones), giving us the partition of + the graph into connected components. + + Parameters: None + + Uses member data: + chain_dict - dict by chainid of list + of PTNodes in the graph (modifies PTNodes not list) + + (WRITE): + + sheet_dict - + dictionary of { sheet_id : ptnode_list } where sheet_id is 'A', + 'B', etc. and ptnode_list is a list of PTNodeStrand instances + in that connected component (sheet). + + self.sheet_backedges_dict - + dict of {sheet_id : ((node1,node2))} + listing 'back edges' i.e. edges + to an ancestor in DFS spanning tree + in the connected component (sheet). + note (node1,node2) and (node2,node1) + are the same (undirected graph) and + only one of the two is present in the + + + Labels each strand node with the sheet id it belongs to as it goes. + """ + + sheet_id = 'A' # sheet id is single alpha char A, B, etc. + # (will be a problem for more than 26 sheets... eg + # this actually happens on 2J28), wrap to lowercase + + visited = {} # dictionary of {ptnode : True} visited nodes + back_edge_list = [] # list of (ptnode, ptnode) tuples for back edges + self.sheet_dict = {} # dictionary of {sheet_id : nodelist} + self.sheet_backedges_dict = {} # dict of {sheet_id : ((node1,node2))} + # listing 'back edges' i.e. edges + # to an ancestor in DFS spanning tree + # in the connected component (sheet). + # note (node1,node2) and (node2,node1) + # are the same (undirected graph) and + # only one of the two is present in the + # list. + for node in self.iter_strands(): + if node not in visited: + connected_node_list = [] + back_edge_list = [] + self.dfs_strands(node, visited, connected_node_list, None, + back_edge_list, + sheet_id) + self.sheet_dict[sheet_id] = list(connected_node_list) + self.sheet_backedges_dict[sheet_id] = list(back_edge_list) + sheet_id = chr(ord(sheet_id)+1) + if sheet_id == '[': + sheet_id = 'a' # if go past Z, wrap to lowercase + + + def label_sheets(self): + """ + Label strands with sheet id to which each belongs by finding + connected components; strands in a connected componenent of + the graph (considering nonly STRAND nodes and bridge edges) + form a sheet. 
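+        For example (illustrative values only), a structure with two sheets
+        of three and two strands would give a sheet dictionary like
+        { 'A' : [strand1, strand2, strand3], 'B' : [strand4, strand5] }.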
+ + Parameters: None + + Uses member data: + node_list - list of nodes. Modifies nodes by labelling them. + + Return value: + Returns the sheet dictionary (dictionary of + { sheet_id : ptnode_list }) from find_connected_components. + """ + # ACtually don't do anything except call find_connected_components() + # which does the labeling itself (more efficient since it knows + # as each one is added which sheet it is added to) + return self.find_connected_components() + + + # ------------------------------------------------------------------------- + + def build_tableau(self, pdbid, domain, ptnode_list = None, + use_hk = True): + """ + Build the tableau data member (see PTTableau in pttableau.py) + by calling function in pttableau.py. + + Parameters: + pdbid - PDB identifier of the strucutre + domain - The PTDomain object for our current domain + ptnode_list - list of PTNodes (in sequence order, but not + necessarily continguous) to build the tableau for, + or None to use all nodes in domain. + Default None. + use_hk - If True, use the HH and KK codes for respectively + antiparallel and parallel strands. Default True. + + Return value: None + Uses data members (WRITE): + tableau - created by this function + (readonly): + chain_dict - dict { chainid : ptnode_list } of nodes in chains + pdb_structure - Bio.PDB parsed PDB structure + """ + if ptnode_list == None: + # Build list of all helix and strand PTNodes + ptnode_list = [] + for nodelist in self.iter_chains(): + for node in nodelist: # these nodes are only those in our domain + if (not isinstance(node, PTNodeTerminus)): # not terminii + ptnode_list.append(node) + + self.tableau = pttableau.compute_tableau(ptnode_list, self.pdb_struct, + use_hk) + + + + def build_omega_matrix(self, pdbid, domain, ptnode_list = None): + """ + Return the relative angles matrix by calling function in pttableau.py + + Parameters: + pdbid - PDB identifier of the strucutre + domain - The PTDomain object for our current domain + ptnode_list - list of PTNodes (in sequence order, but not + necessarily continguous) to build the tableau for, + or None to use all nodes in domain. + Default None. + Return value: Numeric.array Omega matrix. + Uses data members: + (readonly): + chain_dict - dict { chainid : ptnode_list } of nodes in chains + pdb_structure - Bio.PDB parsed PDB structure + """ + if ptnode_list == None: + # Build list of all helix and strand PTNodes + ptnode_list = [] + for nodelist in self.iter_chains(): + for node in nodelist: # these nodes are only those in our domain + if (not isinstance(node, PTNodeTerminus)): # not terminii + ptnode_list.append(node) + + return pttableau.compute_omega_matrix(ptnode_list, self.pdb_struct) + + + def build_sse_dist_matrix(self, pdbid, domain, ptnode_list = None): + """ + Return SSE axis midpoint distance matrix by calling function + in ptdistmatrix.py + + Parameters: + pdbid - PDB identifier of the strucutre + domain - The PTDomain object for our current domain + ptnode_list - list of PTNodes (in sequence order, but not + necessarily continguous) to build the matrix for, + or None to use all nodes in domain. + Default None. + Return value: Numeric.array SSE midpoint distance matrix. 
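+        (Distances are in the same units as the PDB atomic coordinates,
+        i.e. Angstroms.)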
+ Uses data members: + (readonly): + chain_dict - dict { chainid : ptnode_list } of nodes in chains + pdb_structure - Bio.PDB parsed PDB structure + """ + if ptnode_list == None: + # Build list of all helix and strand PTNodes + ptnode_list = [] + for nodelist in self.iter_chains(): + for node in nodelist: # these nodes are only those in our domain + if (not isinstance(node, PTNodeTerminus)): # not terminii + ptnode_list.append(node) + + return compute_sse_midpoint_dist_matrix(ptnode_list, self.pdb_struct) + + + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + + +def make_tableaux(pdb_filename, + pdb_struct, + secstruct_program, + domain_program, + include_310_helices = False, + include_pi_helices = False, + use_numeric = False, + sse_id_list = None, + use_hk = False, + min_sse_len = None, + build_dist_matrix = False, + chainid = None, + domainid = None): + """ + For the supplied filemame, read PDB format data from that file + and create tableaux or SSE distance matrix for that structre. + This function is called by get_tableaux() (below), which handles + compressed files etc. + + Note: for multi-domains, will be multiple output tableaux, one for + each domain. + + Paramteters: + pdb_filename - filename of PDB file to read + pdb_struct - Bio.PDB parsed PDB structure + secstruct_program - secondary structure definition program + ('stride' or 'dssp' or 'pdb') to use. + domain_progam - domain decompositino method ('ddomain','cath', etc.) + include_310_helices - if True, include 3_10 helices in the graph + include_pi_helices - if True, include pi helices in the graph + use_numeric - If True, use numeric matrix rather than tableau + sse_id_list - list of ints representing SSE sequential id numbers + to include in tableau. Default None. + When None, all SSEs are included. + use_hk - If True, use HH and KK codes for strands. + min_sse_len - if not None, the minimum length of SSE to include + in tableau. + build_dist_matrix - If True, build SSE midpoint distance matrix + instead of tableau. + chainid - If not None, only build tableau for that chain id. + domainid - If note None, only build tableau for that domain id. + + Return value: tuple (tableaux_list, sse_string_list) + where tableaux_list is + list of tableaux (only one in list unless domain decomp + is used and finds multiple domains); + or list of omega matrices (Numeric.array) if use_numeric + is True + or list of SSE axis midpiont distance matrices + (Numeric.array) if build_dist_matrix is True + and + sse_string_list is SSE string description e.g. 'EEHHE' etc. 
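+
+    Example (a minimal sketch only; 'pdb1ubi.ent' is a hypothetical filename,
+    and PDBParser is assumed imported from Bio.PDB as in get_tableaux() below):
+
+        pdb_struct = PDBParser().get_structure('1UBI', 'pdb1ubi.ent')
+        (tableaux_list, sse_str_list) = make_tableaux('pdb1ubi.ent', pdb_struct,
+                                                      'dssp', None)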
+ """ + (pdbid,suffix) = os.path.splitext(os.path.basename(pdb_filename)) + pdbid = pdbid.upper() + if len(pdbid) >= 6 and pdbid[:3] == "PDB": + pdbid = pdbid[3:7] + + if secstruct_program == "pdb": + secstruct = ptsecstruct.read_secstruct_from_pdb_file(pdb_filename) + if secstruct != None: + secstruct.pdb_header = pdb_struct.header['head'] + else: + secstruct_program = "dssp" + sys.stderr.write('WARNING: error with HELIX or SHEET cards in PDB' + ': ' + secstruct_program + + ' will be used instead\n') + else: + secstruct = None + + if secstruct == None: + # read secondary structure information from STRIDE or DSSP + if secstruct_program == "stride": + secstruct = ptsecstruct.read_secstruct_from_stride(pdb_filename) + elif secstruct_program == "dssp": + secstruct = ptsecstruct.read_secstruct_from_dssp(pdb_filename) + else: + assert(False) + + if domain_program != None: + domain_list = getdomains.get_domains(domain_program, + pdbid, pdb_filename, pdb_struct) + else: + domain_list = [PTDomain(None, None)] # one-domain protein, no further info + + + # for SSEs that cross domain boundaries, move whole SSE to one of the domains + fixup_crossdomain_sses(secstruct, domain_list) + + tableaux_list = [] # NB may be list of PTTableau or list of Numeric.array + sse_str_list = [] + for domain in domain_list: + if domainid and domain.domainid != domainid: + if verbose: + sys.stderr.write("skipped domainid " + domainid + "\n") + continue + + ptg = TableauBuild(pdb_struct, pdbid, + include_310_helices, include_pi_helices) + # build tableaubuild object from secondary structure + try: + ptg.build_graph_from_secstruct(secstruct, domain, chainid) + except NoSSE_Exception: + if chainid: + sys.stderr.write('WARNING: No helices or strands found in ' + + pdbid + + ' chain ' + chainid + + ': skipping\n') + else: + sys.stderr.write('WARNING: No helices or strands found in ' + + pdbid + + ': skipping\n') + continue + + if use_hk: # only need to know sheets if using HH and KK codes + ptg.label_sheets() + + if verbose: + for nodelist in ptg.iter_chains(): + for node in nodelist: + sys.stderr.write(str(node) + '\n') + + # if list of int SSE sequential ids supplied, convert to list of + # PTNode objects + if sse_id_list: + try: + ptnode_list = [ptg.seqnum2node[sse_id] for sse_id in sse_id_list] + except KeyError,k: + sys.stderr.write("SSE sequential id " + str(k) + + " does not exist\n") + sys.exit(1) + else: + ptnode_list = None + + if not ptnode_list: + # Build list of all helix and strand PTNodes with len >= min_sse_len + ptnode_list = [] + for nodelist in ptg.iter_chains(): + for node in nodelist: # these nodes are only those in our domain + if (not isinstance(node, PTNodeTerminus)): # not terminii + ptnode_list.append(node) + + if min_sse_len: + ptnode_list = [node for node in ptnode_list + if node.get_span() >= min_sse_len] + + if build_dist_matrix: + dist_matrix = ptg.build_sse_dist_matrix(pdbid, domain, ptnode_list) + tableaux_list.append(dist_matrix) + elif use_numeric: + Omega = ptg.build_omega_matrix(pdbid, domain, ptnode_list) + tableaux_list.append(Omega) + else: + ptg.build_tableau(pdbid, domain, ptnode_list, use_hk) + tableaux_list.append(ptg.tableau) + + sse_str = "" + for node in ptnode_list: + if isinstance(node, PTNodeStrand): + sse_str += 'E' + elif isinstance(node, PTNodeHelix): + sse_str += 'H' + else: + raise ValueError('bad node type ' + str(node)) + sse_str_list.append(sse_str) + + + return (tableaux_list, sse_str_list) + + + +def get_tableaux(pdb_filename, + secstruct_program = 'dssp', + 
domain_program = 'none', + include_310_helices = True, + include_pi_helices = True, + sse_id_list = None, + min_sse_len = None, + use_numeric = False, + use_hk = False, + build_dist_matrix = False): + + """ + Get a tableau for a single PDB or ASTRAL pdb-style file + (compressed files e.g. pdb1qlp.ent.gz) or uncompressed + or the ASTRAL pdb-style hierarchy + (uncompressed files e.g. d1qlpa_.ent). + + Parameters: + pdb_filename - filename of PDB or ASTRAL pdb-style file, as above. + secstruct_program - secondary structure definition program + ('stride' or 'dssp' or 'pdb') to use. + domain_progam - domain decompositino method ('ddomain','cath', etc.) + include_310_helices - if True, include 3_10 helices in the graph + include_pi_helices - if True, include pi helices in the graph + sse_id_list - list of ints representing SSE sequential id numbers + to include in tableau. Default None. + When None, all SSEs are included. + min_sse_len - min number of residues in SSE to be ncluded. + Default None (no min length). + use_numeric - if True build Numeric.array Omega matrix not PTTableau + use_hk - If True build tableaux with HH and KK codes for strands in + same sheet. default False. + build_dist_matrix - If True, build SSE midpoint distance matrices + instead of tableaux. + + + Return value: + tuple (pdbid, tableaux_list, sse_string_list) + from the pdb file, only one in lists unless + domain decomposition is used and finds multidomains in input. + tableaux_list is list of tableaux or omega matrices + sse_string_list is SSE string description e.g. 'EEHHE' etc. + """ + tableaux_list = [] + # check for compressed files. We only support gzip (.gz) + # Note we are not using the zlib or GzipFile python modules + # since we are calling to external programs which require the + # file uncompressed themsevles anyway so we'll just run gzip + # to uncompress the file to a temporary directory. + pdb_file_basename = os.path.basename(pdb_filename) + (name,extension) = os.path.splitext(pdb_file_basename) + if extension == '.gz': + TMPDIR = os.tempnam(None, "ptgz") + os.mkdir(TMPDIR) + tmp_pdbfilename = os.path.join(TMPDIR, name) + os.system("gzip " + pdb_filename + " -d -c > " + tmp_pdbfilename) + our_pdb_filename = tmp_pdbfilename + used_tmp_file = True + else: + our_pdb_filename = pdb_filename + used_tmp_file = False + + try: + pdbid = name + if len(pdbid) >= 6 and pdbid[:3].upper() == "PDB": + pdbid = pdbid[3:7].upper() + # parse PDB file + pdb_parser = PDBParser() + pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename) + # create the Tableaux and output them + (tableaux_list, sse_string_list) = make_tableaux(our_pdb_filename, + pdb_struct, + secstruct_program, + domain_program, + include_310_helices, + include_pi_helices, + use_numeric, + sse_id_list, + use_hk, + min_sse_len, + build_dist_matrix) + + finally: + if used_tmp_file: + cleanup_tmpdir(TMPDIR) + return (pdbid, tableaux_list, sse_string_list) + + diff --git a/scripts/tableausearchout2col.py b/scripts/tableausearchout2col.py new file mode 100755 index 0000000..6aa8bda --- /dev/null +++ b/scripts/tableausearchout2col.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# +# File: tableausearchout2col.py +# Author: Alex Stivala +# Created: March 2009 +# +# tableausearchout2col.py - Convert Arun's TableauSearch (TableauComparer) +# output format to same +# format as output by tsrchd_sparse etc. which can +# be processed with tsevalfn.py etc. 
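+#
+# For example (hypothetical filenames), the converted output can be fed
+# straight to tsevalfn.py for evaluation:
+#
+#   tableausearchout2col.py < search.scores > d1ubia_.out
+#   tsevalfn.py d1ubia_ d1ubia_.out > d1ubia_.rtab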
+# +# Usage: tableausearchout2col.py < tableausearchoutput.A.out +# +# Output has two columns, database id and tableau comparer score +# +# Input is TableauComparer output (search.scores) on stdin, e.g.: +#/local/charikar/TableauSearchDB/d1u3ya_.ent.angles Score-of-comparison: -149.2 +#/local/charikar/TableauSearchDB/d1geea_.ent.angles Score-of-comparison: -593.7 +# +# Output is to stdout. +# +# $Id: tableausearchout2col.py 2088 2009-03-07 02:30:49Z astivala $ +# + + +import os,sys + +for line in sys.stdin: + splitline = line.split() + fname = splitline[0] + dbid = os.path.splitext(os.path.splitext(os.path.basename(fname))[0])[0] + score = splitline[-1] + sys.stdout.write('%s %s\n' % (dbid, score)) diff --git a/scripts/tops_to_strings.sh b/scripts/tops_to_strings.sh new file mode 100755 index 0000000..13b2ad7 --- /dev/null +++ b/scripts/tops_to_strings.sh @@ -0,0 +1,48 @@ +#!/bin/sh +# +# File: tops_to_strings.sh +# Author: Alex Stivala +# Created: March 2009 +# +# tops_to_strings.sh - build TOPS strings from database of TOPS files +# +# Usage: tops_to_strings.sh tops_db_dir +# +# top_db_dir is the name of the directory containing TOPS file +# (as built with build_tops_files) +# +# The TOPS strings, one per line, are written to stdout +# +# $Id: tops_to_strings.sh 3689 2010-05-18 00:52:29Z alexs $ + +# location of tops_comparison directory, contains jars/translation.jar etc. +TOPS_COMPARISON_ROOT=$HOME/tops_comparison + +if [ $# -ne 1 ]; then + echo "Usage: $0 tops_db_dir" 2>&1 + exit 1 +fi + +tops_db_dir=$1 + + +for topsfile in `find $tops_db_dir -maxdepth 1 -name \*.tops` +do + #sid=`basename $topsfile .tops` + sid=`basename $topsfile .pdb.tops` + pdbcode=`echo $sid | cut -c2-5` + # for some reason even though TOPS only handles PDB ids e.g. 1NDD not + # SCOP sids e.g. d1ndda_, Tops2String crashes when when the PDB id + # is used, regardless of what we use as the 'string name' on the command + # line so we have to replace it in the .tops file with a SCOP sid + # e.g. + # DOMAIN_NUMBER 0 1ndd 1 1 74 + # becomes + # DOMAIN_NUMBER 0 d1ndda_ 1 1 74 + # then we can use anythnign as the 'string name' but it gets junk added + # to the end, which can can anyway remove later (topscompreout2col.sh). + temp_tops_file=/var/tmp/topstmp.$$.tops + sed "s/DOMAIN_NUMBER 0 [a-z0-9.]* \(.*\) \(.*\) \(.*\)/DOMAIN_NUMBER 0 ${sid} \1 \2 \3/" < ${topsfile} > ${temp_tops_file} + java -cp ${TOPS_COMPARISON_ROOT}/jars/translation.jar tops.translation.Tops2String $temp_tops_file $sid + rm ${temp_tops_file} +done diff --git a/scripts/topscompareout2col.sh b/scripts/topscompareout2col.sh new file mode 100755 index 0000000..33f8607 --- /dev/null +++ b/scripts/topscompareout2col.sh @@ -0,0 +1,21 @@ +#!/bin/sh +# +# File: topscompareout2col.sh +# Author: Alex Stivala +# Created: March 2009 +# +# topscompareout2col.sh - Convert tops_comparison output format to 2-column +# format as output by tsrchd_sparse etc. which can +# be processed with tsevalfn.py etc. +# +# Usage: topscompareout2col.sh < topscompareoutput +# +# +# Output has two columns, database id and tops_comparison compressino score +# +# Output is to stdout. 
+# +# $Id: topscompareout2col.sh 2161 2009-03-29 03:27:50Z astivala $ +# + +awk '$2 != "probe" {print substr($2,1,7),$1}' diff --git a/scripts/tsevalfischer.py b/scripts/tsevalfischer.py new file mode 100755 index 0000000..db70440 --- /dev/null +++ b/scripts/tsevalfischer.py @@ -0,0 +1,337 @@ +#!/usr/bin/env python +############################################################################### +# +# tsevalfischer.py - Evaluate tableaux search or MAX-CMO +# against Fischer data set +# +# File: tsevalfischer.py +# Author: Alex Stivala +# Created: September 2008 +# +# Evaluate QP tableau search or MSVNS for MAX-CMO (Pelta et al 2008) +# for all against all matching +# for the Fischer data set (Fischer et al 1996 Pac. Symp. Biocomput. 300-318)) +# as per Pelta et all 2008 BMC Bioinformatics 9:161 +# +# $Id: tsevalfischer.py 1956 2008-10-07 00:28:16Z astivala $ +# +############################################################################### + +# OBSOLETE - use rocrfischer.py and rocauc.r now + +""" +Evaluate false negatives using Fischer Table II at each cutoff (rank/score), +domains that are not included above the cuttoff but are in same fold +(for fold level evaluation) or class (for class level evaluation) +in Table II of Fischer 1996 are false negatives. + +Output is to stdout in a format that is easily usable in R with the +read.table() function, i.e. we use '#' to prefix lines not to be parsed, +and have a header line with suitable R variable names for the columns. + +See usage in docstring for main() + +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os +import getopt + +from tsevalutils import compute_auc,parse_searchresult,eval_fn + +#----------------------------------------------------------------------------- +# +# Constants +# +#----------------------------------------------------------------------------- + +# The 68 probe sequences from Fischer 1996 Table II +# Note several PDB ids obsoleted, so change to the replacments + +# map id to name of fold +FISCHER_ID_FOLD_DICT = { + '1dxt_b' : 'globin-like', + '1cpc_l' : 'globin-like', + '1c2r_a' : 'cytochrome', + '2mta_c' : 'cytochrome', + '1bbh_a' : 'helical bundle', + '1bge_b' : 'helical bundle', + '1rcb' : 'helical bundle', + '1aep' : 'helical bundle', + '1osa' : 'ef-hand', + '2sas' : 'ef-hand', + '1hom' : 'other alpha', + '1lga_a' : 'other alpha', + '2hpd_a' : 'other alpha', + '1chr_a' : 'tim barrel', + '2mnr' : 'tim barrel', + '3rub_l' : 'tim barrel', + '1crl' : 'hydrolase', + '1tah_a' : 'hydrolase', + '1aba' : 'thieredoxin', + '1dsb_a' : 'thieredoxin', + '1gpl_a' : 'thieredoxin', + '1atn_a' : 'ribonuclease', + '1hrh_a' : 'ribonuclease', + '3chy' : 'open sheet', + '2ak3_a' : 'open sheet', + '1gky' : 'open sheet', + '2cmd' : 'open sheet', + '1eaf' : 'open sheet', + '2gbp' : 'open sheet', + '1mio_c' : 'open sheet', + '2pia' : 'open sheet', + '1gal' : 'open sheet', + '1npx' : 'open sheet', + '2hhm_a' : 'mixed', + '1hip' : 'small', + '1isu_a' : 'small', + '1fc1_a' : 'ig', + '2fbj_l' : 'ig', + '1cid' : 'ig-like', + '1pfc' : 'ig-like', + '1ten' : 'ig-like', + '1tlk' : 'ig-like', + '3cd4' : 'ig-like', + '3hla_b' : 'ig-like', + '1aaj' : 'copredoxin', + '2afn_a' : 'copredoxin', + '2aza_a' : 'copredoxin', + '4sbv_a' : 'virus', + '1bbt_1' : 'virus', + '1sac_a' : 'lectin-like', + '1lts_d' : 'ob-fold', + '1tie' : 'trefoil', + '8i1b' : 'trefoil', + '1arb' : 'trypsin', + '2sga' : 'trypsin', + '2snv' : 'trypsin', + '1mdc' : 'lipocalin', + '1mup' : 'lipocalin', + '2sim' : 'propeller', + '1cau_b' : 
'other beta', + '2omf' : 'other beta', + '1fxi_a' : 'ub fold', + '1cew' : 'cystatin', + '1stf_i' : 'cystatin', + '2pna' : 'sh2', + '2sar_a' : 'other alpha+beta', + '1onc' : 'other alpha+beta', + '5fd1' : 'other alpha+beta' +} + +# map name of fold to list of ids +FISCHER_FOLD_IDLIST_DICT = { + 'globin-like' : ['1dxt_b','1cpc_l'], + 'cytochrome' : ['1c2r_a','2mta_c'], + 'helical bundle' : ['1bbh_a','1bge_b','1rcb','1aep'], + 'ef-hand' : ['1osa','2sas'], + 'other alpha' : ['1hom','1lga_a','2hpd_a'], + 'tim barrel' : ['1chr_a','2mnr','3rub_l'], + 'hydrolase' : ['1crl','1tah_a'], + 'thieredoxin' : ['1aba','1dsb_a','1gpl_a'], + 'ribonuclease' : ['1atn_a','1hrh_a'], + 'open sheet' : ['3chy','2ak3_a','1gky','2cmd','1eaf','2gbp','1mio_c','2pia','1gal','1npx'], + 'mixed' : ['2hhm_a'], + 'small' : ['1hip','1isu_a'], + 'ig' : ['1fc1_a','2fbj_l'], + 'ig-like' : ['1cid','1pfc','1ten','1tlk','3cd4','3hla_b'], + 'copredoxin' : ['1aaj','2afn_a','2aza_a'], + 'virus' : ['4sbv_a','1bbt_1'], + 'lectin-like' : ['1sac_a'], + 'ob-fold' : ['1lts_d'], + 'trefoil' : ['1tie','8i1b'], + 'trypsin' : ['1arb','2sga','2snv'], + 'lipocalin' : ['1mdc','1mup'], + 'propeller' : ['2sim'], + 'other beta' : ['1cau_b','2omf'], + 'ub fold' : ['1fxi_a'], + 'cystatin' : ['1cew','1stf_i'], + 'sh2' : ['2pna'], + 'other alpha+beta': ['2sar_a','1onc','5fd1'] +} + + + +# map id to name of class +FISCHER_ID_CLASS_DICT = { + '1dxt_b' : 'alpha', + '1cpc_l' : 'alpha', + '1c2r_a' : 'alpha', + '2mta_c' : 'alpha', + '1bbh_a' : 'alpha', + '1bge_b' : 'alpha', + '1rcb' : 'alpha', + '1aep' : 'alpha', + '1osa' : 'alpha', + '2sas' : 'alpha', + '1hom' : 'alpha', + '1lga_a' : 'alpha', + '2hpd_a' : 'alpha', + '1chr_a' : 'alpha/beta', + '2mnr' : 'alpha/beta', + '3rub_l' : 'alpha/beta', + '1crl' : 'alpha/beta', + '1tah_a' : 'alpha/beta', + '1aba' : 'alpha/beta', + '1dsb_a' : 'alpha/beta', + '1gpl_a' : 'alpha/beta', + '1atn_a' : 'alpha/beta', + '1hrh_a' : 'alpha/beta', + '3chy' : 'alpha/beta', + '2ak3_a' : 'alpha/beta', + '1gky' : 'alpha/beta', + '2cmd' : 'alpha/beta', + '1eaf' : 'alpha/beta', + '2gbp' : 'alpha/beta', + '1mio_c' : 'alpha/beta', + '2pia' : 'alpha/beta', + '1gal' : 'alpha/beta', + '1npx' : 'alpha/beta', + '2hhm_a' : 'other', + '1hip' : 'other', + '1isu_a' : 'other', + '1fc1_a' : 'beta', + '2fbj_l' : 'beta', + '1cid' : 'beta', + '1pfc' : 'beta', + '1ten' : 'beta', + '1tlk' : 'beta', + '3cd4' : 'beta', + '3hla_b' : 'beta', + '1aaj' : 'beta', + '2afn_a' : 'beta', + '2aza_a' : 'beta', + '4sbv_a' : 'beta', + '1bbt_1' : 'beta', + '1sac_a' : 'beta', + '1lts_d' : 'beta', + '1tie' : 'beta', + '8i1b' : 'beta', + '1arb' : 'beta', + '2sga' : 'beta', + '2snv' : 'beta', + '1mdc' : 'beta', + '1mup' : 'beta', + '2sim' : 'beta', + '1cau_b' : 'beta', + '2omf' : 'beta', + '1fxi_a' : 'alpha+beta', + '1cew' : 'alpha+beta', + '1stf_i' : 'alpha+beta', + '2pna' : 'alpha+beta', + '2sar_a' : 'alpha+beta', + '1onc' : 'alpha+beta', + '5fd1' : 'alpha+beta' +} + +# map name of class to list of ids +FISCHER_CLASS_IDLIST_DICT = { + 'alpha' : ['1dxt_b','1cpc_l','1c2r_a','2mta_c', '1bbh_a','1bge_b','1rcb','1aep','1osa','2sas', '1hom','1lga_a','2hpd_a'], + 'alpha/beta' : ['1chr_a','2mnr','3rub_l','1crl','1tah_a','1aba','1dsb_a','1gpl_a', '1atn_a','1hrh_a','3chy','2ak3_a','1gky','2cmd','1eaf','2gbp','1mio_c','2pia','1gal','1npx'], + 'other' : ['2hhm_a','1hip','1isu_a'], + 'beta' : ['1fc1_a','2fbj_l', '1cid','1pfc','1ten','1tlk','3cd4','3hla_b', '1aaj','2afn_a','2aza_a','4sbv_a','1bbt_1', '1sac_a','1lts_d', '1tie','8i1b', '1arb','2sga','2snv', '1mdc','1mup', '2sim', 
'1cau_b','2omf'], + 'alpha+beta' : ['1fxi_a', '1cew','1stf_i', '2pna', '2sar_a','1onc','5fd1'] +} + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-rcv] " + " \n") + sys.stderr.write(' -c class level not fold level evaluation\n') + sys.stderr.write(' -r higher scores are better (for MSVNS4MaxCMO)\n') + sys.stderr.write(' -v verbose messages to stderr\n') + sys.exit(1) + + +def main(): + """ + main for tsevalfischer.py + + Usage: tsevalfischer.py [-crv] + + + -c evaluate at class level rather than default fold level + -v turns on debug output to stderr + -r higher scores are better (rather than default of more negative + scores better). MSVNS4MaxCMO requires this (QP tableau search + gives lower (more negative) scores for better matches) + + is the PDB id (e.g. 1CRL or 1C2R_A) of the query structure + + is the output from the tabsearchqpml.file, + which is a text file where each line is identifier then whitespace + then score, sorted by score from most negative to least negative e.g. + + d1xksa_ -35.99999999 + d3sila_ -35.99999999 + .... + d2mhua_ -0.499999999 + + ie this means the best hit as the top of the file, and worst at bottom. + May be specified as - for stdin. + + The table of positive and false negative rates is printed to stdout. + """ + global verbose + verbose = False + use_class = False + reverseflag = False + + try: + opts,args = getopt.getopt(sys.argv[1:], "vrc?") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-v": # verbose + verbose = True # this module only + elif opt == "-r": # reversed: higher scores better + reverseflag = True + elif opt == '-c': # class not fold level evaluation + use_class = True + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 2: + usage(os.path.basename(sys.argv[0])) + + query_id = args[0] + tabsearch_file = args[1] + + if use_class: + goldstd_ids = FISCHER_CLASS_IDLIST_DICT[FISCHER_ID_CLASS_DICT[query_id.lower()]] + else: + goldstd_ids = FISCHER_FOLD_IDLIST_DICT[FISCHER_ID_FOLD_DICT[query_id.lower()]] + goldstd_ids = [pdbid.upper() for pdbid in goldstd_ids] + + if verbose: + sys.stderr.write('parsing search results...\n') + if tabsearch_file == '-': + tabsearch_fh = sys.stdin + else: + tabsearch_fh = open(tabsearch_file) + (searchresult,commentlist) = parse_searchresult(tabsearch_fh, reverseflag) + if tabsearch_file != '-': + tabsearch_fh.close() + + sys.stdout.write('#' + ' '.join(sys.argv) + '\n') #identifying info about us + sys.stdout.write('# results from:\n') + for line in commentlist: + sys.stdout.write('# ') + sys.stdout.write(line) # identifying information about search run + sys.stdout.write('\n') + eval_fn(goldstd_ids, searchresult, reverseflag) + + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) + main() diff --git a/scripts/tsevalfischer_all.sh b/scripts/tsevalfischer_all.sh new file mode 100755 index 0000000..fe48313 --- /dev/null +++ b/scripts/tsevalfischer_all.sh @@ -0,0 +1,115 @@ +#!/bin/sh +############################################################################### +# +# tsevalfischer_all.sh - run tsevalfischer.py on all output in a directory +# +# File: tsevalfischer_all.sh +# Author: Alex Stivala +# Created: September 2008 +# +# Run the tsevalfischer.py script on all .out files in the 
supplied +# directory, creating correpsonding .rtab files in that directory +# WARNING: overwrites the .rtab files if they exist +# +# query id is dervied from filename e.g. 8i1b.out is for 8i1b query, +# output is produced in this form from qptabmatch_allall.py script +# from input generated by build_fischer_db.sh script +# +# +# Also print average AUC to stdout +# +# Usage: +# tsevalfischer_all.sh [-rc] outdir +# +# -c evaluate on class not fold level (produces .class.rtab not +# .fold.rtab files) +# -n negate scores so that most -ve is best +# -x exclude folds that don't appear to be inlucded in Pelta et al 2008 +# Figure 5. +# +# outdir is the directory containing the .out files and to which the +# .rtab files are to be written. +# +# Environment variables: +# +# PATH must contain the location of the Python scripts, ie where this +# script itself is, and tsevalfischer.py +# +# PYTHONPATH must contain the directory containing the tsevalutils.py +# and other Python modules used by the Python scripts. +# +# $Id: tsevalfischer_all.sh 2079 2009-03-03 07:43:11Z astivala $ +# +############################################################################### + +# OBSOLETE - use rocrfischer.py and rocauc.r now + +use_class=0 +negate=0 +use_skip=0 + +# skip the following folds that for some reason arenot included in +# Pelta et al 2008 assessment (see Figure 5): +# cytochrome, mixed, small, copredoxin, lectin-like, ob-fold, trefoil, +# propellor, ub-fold, cystatin, sh2 +# (27 folds in Fischer Table II but only 16 in Pelta Figure 5). +SKIP_IDS="1c2r_a 2mta_c 2hhm_a 1hip 1isu_a 1aaj 2afn_a 2aza_a 1sac_a 1lts_d 1tie 8i1b 2sim 1fxi_a 1cew 1stf_i 2pna" + +while getopts 'ncx' opt +do + case $opt in + n) negate=1 + ;; + c) use_class=1 + ;; + x) use_skip=1 + ;; + ?) + echo "Usage: $0 [-nc] outdir" >&2 + exit 1 + ;; + esac +done +shift `expr $OPTIND - 1` + +if [ $# -ne 1 ]; then + echo "Usage: $0 [-rc] outdir" >&2 + exit 1 +fi + +outdir=$1 + +if [ $use_class -eq 1 ]; then + if [ $use_skip -eq 1 ]; then + echo "$0: -x ignored as not applicable with -c" >&2 + use_skip=0 + fi + suffix="class.rtab" + pyopts="-c" +else + suffix="fold.rtab" + pyopts="" +fi + +if [ $negate -eq 1 ]; then + pyopts="${pyopts} -n" +fi + +for resultfile in ${outdir}/*.out +do + queryid=`basename ${resultfile} .out` + if [ $use_skip -eq 1 ]; then + for i in $SKIP_IDS + do + if [ $i = `echo $queryid | tr '[A-Z]' '[a-z]'` ]; then +# echo xx $queryid + continue 2 + fi + done + fi + tsevalfischer.py ${pyopts} ${queryid} ${resultfile} > ${outdir}/${queryid}.${suffix} +done + +grep 'AUC' ${outdir}/*.${suffix} | awk '{sum += $4}; END {printf("for %d results, average AUC is %4.2f\n", NR, sum/NR)}' + + diff --git a/scripts/tsevalfn.py b/scripts/tsevalfn.py new file mode 100755 index 0000000..590c11d --- /dev/null +++ b/scripts/tsevalfn.py @@ -0,0 +1,457 @@ +#!/usr/bin/env python +############################################################################### +# +# tsevalfn.py - Evaluate tableaux search false negative rate against SCOP +# +# File: tsevalfn.py +# Author: Alex Stivala +# Created: June 2008 +# +# $Id: tsevalfn.py 3497 2010-03-19 06:02:21Z alexs $ +# +############################################################################### + +""" +Evaluate false negatives using SCOP at each cutoff (rank/score), +domains that are not included above the cuttoff but are in the SOCP +related domains for the query, are false negatives. + +Output is to stdout in a format that is easily usable in R with the +read.table() function, i.e. 
we use '#' to prefix lines not to be parsed, +and have a header line with suitable R variable names for the columns. + +See usage in docstring for main() + +SCOP and ASTRAL data is obtained using the Bio.SCOP library (Casbon et +al 2006 'A high level interface to SCOP and ASTRAL implemented in +Python' BMC Bioinformatics 7:10) and depends on having the data +downloaded, in SCOP_DIR (defined below). + +Downloaded SCOP files from + +http://scop.mrc-lmb.cam.ac.uk/scop/parse/index.html + +and ASTRAL files (in scopseq-1.73) from + +http://astral.berkeley.edu/scopseq-1.73.html + +The files downlaoded are: + +/local/charikar/SCOP/: +dir.cla.scop.txt_1.73 +dir.des.scop.txt_1.73 +dir.hie.scop.txt_1.73 + +/local/charikar/SCOP/scopseq-1.73: +astral-scopdom-seqres-all-1.73.fa +astral-scopdom-seqres-sel-gs-bib-95-1.73.id + +Other files there are indices built by Bio.SCOP when first used. +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os +import getopt + + +from Bio.SCOP import * + +from tsevalutils import compute_auc,parse_searchresult,eval_fn,get_betagrasp_containing_domains + +from pathdefs import SCOP_DIR,SCOP_VERSION + +#----------------------------------------------------------------------------- +# +# Module globals +# +#----------------------------------------------------------------------------- + +verbose = False + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +def get_scop_domains(query_sid, scop): + """ + Get a list of SCOP domains that have the same fold as the query domain, + + Parameters: + query_sid - SCOP domain id (eg 'd1ubia_' of the domain to query. + scop - previously built Bio.SCOP Scop instance + Return value: + list of Bio.SCOP domain instances that have the same fold as the + query_sid. + """ + + if verbose: + sys.stderr.write('getting domains related to ' + query_sid +'...\n') + dom =scop.getDomainBySid(query_sid) + fold = dom.getAscendent('fold') + related = fold.getDescendents('domain') + if verbose: + sys.stderr.write('found %d domains\n' % len(related)) + return related + + +def get_domains_in_same_superfamily(sid, scop): + """ + Return a list of SCOP domain instances that are in the same + superfamily as the supplied SCOP domain id. + + Parameters: + sid - (string) SCOP domain identifier as sid e.g. d1ubia_ + scop - previuosly created Bio.SCOP Scop object + + Return value: + list of SCOP domains that are in the same superfamily as the + supplied sid (including the domain for the sid itself). + """ + return scop.getDomainBySid(sid).getAscendent( + 'superfamily').getDescendents('domain') + + + + +def filter_domains_astral95(scop_domain_list, scop, astral): + """ + Given list of Bio.SCOP domain objects, return list of those domains + that are in the ASTRAL 95% sequence identity nonredundant subset. + + Parameters: + scop_domain_list - list of Bio.SCOP domain objects + scop - previously built Bio.SCOP Scop instance + astral - previously build Bio.SCOP Astral instance + + """ + related95 = [ r for r in scop_domain_list if astral.isDomainInId(r, 95) ] + return related95 + + +def get_domains_not_in_searchresult(searchresult, scop, astral, + use_nonredundant): + """ + Return a list of domain identifiers that are in SCOP but are not + present in the search result. 
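+    (These are the domains that callers such as write_slrtab() and main()
+    assign the floor score supplied with the -z option.)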
+ + Parameters: + searchresult - list of (score, domainid) + scop - previously built Bio.SCOP Scop instance + astral - previously build Bio.SCOP Astral instance + use_nonredundant - If True, filter out domains not in ASTRAL 95% nr subset + + Return value: + list of domainid where each domainid is in SCOP but not in searchresult + """ + scoproot = scop.getRoot() + all_domains = scoproot.getDescendents('domain') + if verbose: + sys.stderr.write('got %d domains total\n' % len(all_domains)) + if use_nonredundant: + all_domains = filter_domains_astral95(all_domains, scop, astral) + if verbose: + sys.stderr.write('filtered to %d domains in ASTRAL 95\n' % + len(all_domains)) + searchresult_dict = dict([(sid, True) + for sid in + [sid for (score,sid) in searchresult] ]) + domainid_list = [scopdom.sid for scopdom in all_domains + if scopdom.sid not in searchresult_dict] + return domainid_list + + +def get_goldstd_domains(query_sid, use_prosmos_dataset, use_superfamily, + use_nonredundant, + scop, astral): + """ + Geth the "gold standard" list of domains. This is the domains that are + in the same fold or superfamily (according to use_superfamily option) + as the supplied domain specified by query_sid (or the beta grasp containg + domains if use_prosmos_dataset is True). + + Parameters: + query_sid - SCOP domain identifier (sid) of query + use_prosmos_dataset - Bool gold standard is the ProSMos beta-grasp data set + use_superfamily - Bool use domains in same superfamily rather than same fold + use_nonredundant - Bool if True use ASTRAL 95% nr subset + scop - previously built Bio.SCOP Scop instance + astral - previously build Bio.SCOP Astral instance + + Return value: + list of sids for domains in same fold/superfamily as query_sid + """ + if use_prosmos_dataset: + goldstd_domains = get_betagrasp_containing_domains(scop) + elif use_superfamily: + if verbose: + sys.stderr.write('getting domains in same superfamily as ' + query_sid + + '...\n') + goldstd_domains = get_domains_in_same_superfamily(query_sid, scop) + if verbose: + sys.stderr.write('found ' + str(len(goldstd_domains)) + ' domains\n') + else: + goldstd_domains = get_scop_domains(query_sid,scop) + + if use_nonredundant: + goldstd_domains = filter_domains_astral95(goldstd_domains, scop, astral) + + if verbose: + sys.stderr.write('got ' + str(len(goldstd_domains)) + ' domains in ASTRAL.\n') + + return goldstd_domains + + +def write_slrtab(searchresult, goldstd_domains, bottom_score, + use_nonredundant, + scop, astral): + """ + Write the slrtab output to stdout. This is two columms with score in + first column and label (0 or 1) in second column, for use with R + ROCR package. + + Parameters: + searchresult -list of (score, domainid) tuples parsed from pogram to eval + goldstd_domains - list of gold standard domains to eval against + bottom_score - if not None, a float for score to give any domain + not assigned a score in the searchresult + use_nonredundant - Bool if True use ASTRAL 95% nr subset + scop - previously built Bio.SCOP Scop instance + astral - previously build Bio.SCOP Astral instance + """ + # convert goldstd list of domainids to dictionary keyed by domainid + # for fast lookup as we iterate through search results. + # The dictionary is { domainid : True } (we don't have a value, + # just need to quickly test for presence of domainid in gold std + # postive list). 
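+    # The output written below is two whitespace-separated columns,
+    # score then 0/1 label, e.g. (scores illustrative only):
+    #      -35.99999999 1
+    #       -0.49999999 0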
+ goldstd_pos_dict = dict([(sid, True) for + sid in [scopdom.sid for scopdom in goldstd_domains]]) + for (score, domainid) in searchresult: + if goldstd_pos_dict.has_key(domainid): + label = 1 + else: + label = 0 + sys.stdout.write('%20.8f %d\n' % (score,label)) + + if bottom_score != None: + # write out an entry with the 'lowest' score for all domains + # that are in the gold standard data but not given a score + # in the parsed output from the program being evaluated. + lowscore_domains = get_domains_not_in_searchresult(searchresult, + scop, astral, + use_nonredundant) + for domainid in lowscore_domains: + if goldstd_pos_dict.has_key(domainid): + label = 1 + else: + label = 0 + sys.stdout.write('%20.8f %d\n' % (bottom_score, label)) + if verbose: + sys.stderr.write('set score to %f for %d domains\n' % + (bottom_score, len(lowscore_domains))) + + + + +def tsevalfn_set_verbose(verb): + """ + set the module global verbose flag in this module to supplied value + Parameters: verb - True (for verbose output) or False + Return value: None + Uses globals: verbose (in this module) + """ + global verbose + verbose = verb + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-xvrbfluno] [-z score] " + " \n") + sys.stderr.write(' -v verbose messages to stderr\n') + sys.stderr.write(' -x output is from external database, ignore key erors\n') + sys.stderr.write(' -b use the ProSMoS data set for validation of beta-grasp query\n') + sys.stderr.write(' -n negate scores\n') + sys.stderr.write(' -o take log10 of scores\n') + sys.stderr.write(' -f use full SCOP not ASTRAL 95% nonredundant subset\n') + sys.stderr.write(' -l write table of values and labels intead of TPR,FPR\n') + sys.stderr.write(' -u evaluate at superfamily rather than fold level\n') + sys.stderr.write(' -z score : assign identifiers not present in the output a score of score\n') + sys.stderr.write(' -e version: SCOP version (default %s)\n' % SCOP_VERSION) + sys.exit(1) + + +def main(): + """ + main for tsevalfn.py + + Usage: tsevalfn.py [-vbxnoflu] [-e version] [-z score] + + + -v turns on debug output to stderr + + -n negate all scores (so that most -ve score becomes highest score) + (if used with -o then log10 is taken and then negated) + + -o take log10 of all scores + + -b use the PRoSMoS data set (Shi et al 2007) data set for validation + of the beta-grasp substructure query + + -x results are from external program eg TableauSearch webserver so + ignore SCOP sids that are in our database but not in search results + (since they may be from older version of database that does not + have these domains). This is also needed when evaluating output + from a program that does not output a score for every query + (many only output scores for queries with score above some threshold + e.g. VAST or top n scores), otherwise invalid rates and AUC are + calculated. (This latter is not applicable to -l option). + + -b use the PRoSMoS data set (Shi et al 2007) data set for validation + of the beta-grasp substructure query + + -f use the full SCOP data set not the ASTRAL 95% sequence identity subset + + -l instead of computing TPR/FPR table and AUC, just + write table with one column of values (scores) and other of + corresponding true labels (0/1 for same/different fold as query) + for use with the R ROCR packge. 
+ + -u use domains from the same superfamily as the query rather than from + the same fold as the query as the gold standard. + + -z score : any identifiers that are not present in the output but + are in the gold standard data are given the specified score. + This is for programs that do not assign a score to all + domains, but only those above some threshold or the top n, + or just cannot assign a score to some domains for some reason. + This would generally be specified as some score lower + than all other scores. + + -e version: use SCOP version specified e.g. -e1.73 + + is the SCOP id (e.g. 'd1ubia_') of the query domain + + is the output from the tabsearchqpml.file, + which is a text file where each line is identifier then whitespace + then score + + d1xksa_ -35.99999999 + d3sila_ -35.99999999 + .... + d2mhua_ -0.499999999 + + May be specified as - for stdin. + + The table of positive and false negative rates is printed to stdout. + """ + negateflag = False + ignore_search_keyerror = False + use_prosmos_dataset = False + use_nonredundant = True + compute_rates = True + use_superfamily = False + bottom_score = None + logflag = False + scop_version = SCOP_VERSION + + try: + opts,args = getopt.getopt(sys.argv[1:], "e:lbfxrnos:uz:v?") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-s" : #self match score + sys.stderr.write("-s option is obsolete; ignored\n") + elif opt == "-r": # reversed: higher scores better + sys.stderr.write("-r option is obsolete, fix caller: exiting\n") + sys.exit(1) + elif opt == "-n": # negate scores + negateflag = True + elif opt == "-o": # take log10 of scores + logflag = True + elif opt == "-v": # verbose + tsevalfn_set_verbose(True) + elif opt == "-x": # results from search of external db + ignore_search_keyerror = True + elif opt == "-b": # use the ProSMoS dataset to validate beta-grasp query + use_prosmos_dataset = True + elif opt == "-f": # use full SCOP not nonredundant subset + use_nonredundant = False + elif opt == "-l": # write scores and labels for ROCR, don't compute + compute_rates = False + elif opt == "-u": # evaluate at superfamily not fold level + use_superfamily = True + elif opt == "-z": # score to give to domains that have no score + bottom_score = float(arg) + elif opt == "-e": # specify SCOP version + scop_version = float(arg) + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 2: + usage(os.path.basename(sys.argv[0])) + + query_sid = args[0] + tabsearch_file = args[1] + + # read SCOP and ASTRAL data + if verbose: + sys.stderr.write('Reading SCOP data...\n') + scop = Scop(dir_path=SCOP_DIR,version=scop_version) + astral = Astral(dir_path=SCOP_DIR,version=scop_version,scop=scop) + + goldstd_domains = get_goldstd_domains(query_sid, use_prosmos_dataset, + use_superfamily, + use_nonredundant, scop, astral) + + if verbose: + sys.stderr.write('parsing search results...\n') + if tabsearch_file == '-': + tabsearch_fh = sys.stdin + else: + tabsearch_fh = open(tabsearch_file) + (searchresult,commentlist) = parse_searchresult(tabsearch_fh, negateflag, + logflag) + if tabsearch_file != '-': + tabsearch_fh.close() + + sys.stdout.write('#' + ' '.join(sys.argv) + '\n') #identifying info about us + sys.stdout.write('# results from:\n') + for line in commentlist: + sys.stdout.write('# ') + sys.stdout.write(line) # identifying information about search run + sys.stdout.write('\n') + if compute_rates: + if bottom_score != None: + lowscore_domains = get_domains_not_in_searchresult(searchresult, + scop, 
astral, + use_nonredundant) + searchresult += [(bottom_score, sid) for sid in lowscore_domains] + if verbose: + sys.stderr.write('set score to %f for %d domains\n' % + (bottom_score, len(lowscore_domains))) + searchresult.sort() # sort by ascending score + + eval_fn([scopdom.sid for scopdom in goldstd_domains], + searchresult, + ignore_search_keyerror) + else: + sys.stdout.write('score label\n') + sys.stdout.write('#-----------\n') + write_slrtab(searchresult, goldstd_domains, bottom_score, + use_nonredundant, + scop, astral) + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) + main() diff --git a/scripts/tsevalnh3d.py b/scripts/tsevalnh3d.py new file mode 100755 index 0000000..f017234 --- /dev/null +++ b/scripts/tsevalnh3d.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python +############################################################################### +# +# tsevalnh3d.py - Evaluate tableaux search against Nh3D data set +# +# File: tsevalnh3d.py +# Author: Alex Stivala +# Created: June 2008 +# +# Evaluate QP tableau search for all against all matching +# for the Nh3D data set at CATH architecture level +# as per Pelta et all 2008 BMC Bioinformatics 9:161 +# +# $Id: tsevalnh3d.py 1956 2008-10-07 00:28:16Z astivala $ +# +############################################################################### + +# OBSOLETE - use rocrnh3d.py and rocauc.r now + +""" +Evaluate false negatives using CATH architecture : +domains that are not included above the cuttoff but have same architecture id +according to CATH are false negatives. +Output is to stdout in a format that is easily usable in R with the +read.table() function, i.e. we use '#' to prefix lines not to be parsed, +and have a header line with suitable R variable names for the columns. + +See usage in docstring for main() +""" + +import warnings # so we can suppress the annoying tempnam 'security' warning +import sys,os +import getopt + +from cathmap import CATHMAP +from tsevalutils import compute_auc,parse_searchresult,eval_fn + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ + Print usage message and exit + """ + + sys.stderr.write("Usage: " +progname + " [-rv] " + " \n") + sys.stderr.write(' -v verbose messages to stderr\n') + sys.stderr.write(' -r higher scores are better (for MSVNS4MaxCMO)\n') + sys.exit(1) + + +def main(): + """ + main for tsevalnh3d.py + + Usage: tsevalnh3d.py [-rv] + + + -r higher scores are better (rather than default of more negative + scores better). MSVNS4MaxCMO requires this (QP tableau search + gives lower (more negative) scores for better matches) + -v turns on debug output to stderr + + is the CATH id (e.g. 1.10.1290) of the query structure + + is the output from the tabsearchqpml.file, + which is a text file where each line is identifier then whitespace + then score, sorted by score from most negative to least negative e.g. + + 1101290 -35.99999999 + + ie this means the best hit as the top of the file, and worst at bottom. + May be specified as - for stdin. + + The table of positive and false negative rates is printed to stdout. 
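+
+    Example (file names are illustrative only; this is the way the
+    tsevalnh3d_all.sh wrapper script invokes this script for each query):
+
+        tsevalnh3d.py 1.10.1290 results/nh3d/1.10.1290.out > 1.10.1290.rtab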
+ """ + global verbose + verbose = False + reverseflag = False + + try: + opts,args = getopt.getopt(sys.argv[1:], "rv?") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-v": # verbose + verbose = True # this module only + elif opt == "-r": # reversed: higher scores better + reverseflag = True + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 2: + usage(os.path.basename(sys.argv[0])) + + query_id = args[0] + tabsearch_file = args[1] + + # get the gold standard as the list of 'compreseed' CATH ids + # that have same architecture as query + query_id_split = query_id.split('.') + query_class = query_id_split[0] + query_arch = query_id_split[1] + goldstd_ids = [] + for (compressed_id, cathid) in CATHMAP.iteritems(): + cathid_split = cathid.split('.') + cathid_class = cathid_split[0] + cathid_arch = cathid_split[1] + if cathid_class == query_class and cathid_arch == query_arch: + goldstd_ids.append(compressed_id) + + if verbose: + sys.stderr.write('parsing search results...\n') + if tabsearch_file == '-': + tabsearch_fh = sys.stdin + else: + tabsearch_fh = open(tabsearch_file) + (searchresult,commentlist) = parse_searchresult(tabsearch_fh, reverseflag) + if tabsearch_file != '-': + tabsearch_fh.close() + + sys.stdout.write('#' + ' '.join(sys.argv) + '\n') #identifying info about us + sys.stdout.write('# results from:\n') + for line in commentlist: + sys.stdout.write('# ') + sys.stdout.write(line) # identifying information about search run + sys.stdout.write('\n') + eval_fn(goldstd_ids, searchresult, reverseflag) + + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) + main() diff --git a/scripts/tsevalnh3d_all.sh b/scripts/tsevalnh3d_all.sh new file mode 100755 index 0000000..2d696e3 --- /dev/null +++ b/scripts/tsevalnh3d_all.sh @@ -0,0 +1,84 @@ +#!/bin/sh +############################################################################### +# +# tsevalnh3d_all.sh - run tsevalnh3d.py on all output in a directory +# +# File: tsevalnh3d_all.sh +# Author: Alex Stivala +# Created: September 2008 +# +# Run the tsevalnh3d.py script on all .out files in the supplied +# directory, creating correpsonding .rtab files in that directory +# WARNING: overwrites the .rtab files if they exist +# +# query id is dervied from filename e.g. 1.10.1290.out is for 1.10.1290 query, +# output is produced in this form from qptabmatch_allall.py script +# from input generated by build_nh3d_db.sh script +# +# +# Also print average AUC to stdout for each CATH architecture +# +# Usage: +# tsevalnh3d_all.sh [-n] outdir +# +# -n negate scores (so that most -ve is best rather than highest) +# +# Environment variables: +# +# PATH must contain the location of the Python scripts, ie where this +# script itself is, and tsevalnh3d.py +# +# PYTHONPATH must contain the directory containing the tsevalutils.py +# and other Python modules used by the Python scripts. +# +# $Id: tsevalnh3d_all.sh 2079 2009-03-03 07:43:11Z astivala $ +# +############################################################################### + +# OBSOLETE - use rocrnh3d.py and rocauc.r now + +negate=0 + +while getopts 'n' opt +do + case $opt in + n) negate=1 + ;; + ?) 
+            echo "Usage: $0 [-n] outdir" >&2
+            exit 1
+            ;;
+    esac
+done
+shift `expr $OPTIND - 1`
+
+if [ $# -ne 1 ]; then
+    echo "Usage: $0 [-n] outdir" >&2
+    exit 1
+fi
+
+outdir=$1
+
+pyopts=""
+if [ $negate -eq 1 ]; then
+    pyopts="${pyopts} -n"
+fi
+
+suffix='rtab'
+
+for resultfile in ${outdir}/*.out
+do
+    queryid=`basename ${resultfile} .out`
+    tsevalnh3d.py ${pyopts} ${queryid} ${resultfile} > ${outdir}/${queryid}.${suffix}
+done
+
+# list of different CATH architectures
+# obtained by
+# ls -1 results/nh3d/*.out |cut -d/ -f3 | cut -d. -f1,2|sort | uniq
+ARCH_LIST="1.10 1.20 2.10 2.170 2.30 2.40 2.60 2.70 3.10 3.20 3.30 3.40 3.60 3.90 4.10"
+
+for arch in $ARCH_LIST
+do
+    grep 'AUC' ${outdir}/${arch}.*.${suffix} | awk "{sum += \$4}; END {printf(\"for %d results in CATH ${arch}, average AUC is %4.2f\n\", NR, sum/NR)}"
+done
+
diff --git a/scripts/tsevalrmsd.py b/scripts/tsevalrmsd.py
new file mode 100755
index 0000000..01af7f8
--- /dev/null
+++ b/scripts/tsevalrmsd.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python
+###############################################################################
+#
+# tsevalrmsd.py - Evaluate tableaux search TPR/FPR against superposition
+#
+# File: tsevalrmsd.py
+# Author: Alex Stivala
+# Created: June 2008
+#
+# $Id: tsevalrmsd.py 2079 2009-03-03 07:43:11Z astivala $
+#
+###############################################################################
+
+"""
+Evaluate true/false positives against superposition of
+the matched substructures at each cutoff (rank/score).
+Domains that are included below the cutoff and
+with superposition RMSD below some (constant) threshold
+at the cutoff are considered true positives, and false positives if they
+are included below the cutoff score but RMSD is above the (constant) threshold.
+
+Output is to stdout in a format that is easily usable in R with the
+read.table() function, i.e. we use '#' to prefix lines not to be parsed,
+and have a header line with suitable R variable names for the columns.
+
+See usage in docstring for main()
+
+"""
+
+import warnings # so we can suppress the annoying tempnam 'security' warning
+import sys,os
+import getopt
+
+from tsevalutils import compute_auc,parse_searchresult,eval_fn
+
+#-----------------------------------------------------------------------------
+#
+# Constants
+#
+#-----------------------------------------------------------------------------
+
+# RMSD threshold equal to or below which structures considered
+# adequately superimposed (Angstroms)
+RMSD_THRESHOLD = 3.5
+
+
+#-----------------------------------------------------------------------------
+#
+# Class definitions
+#
+#-----------------------------------------------------------------------------
+
+class RMSDRecord:
+    """
+    Dummy class for containing data from line of superimposessemap.py
+    output parsed by parse_superimposessemap() i.e.
+
+    identifier, score, nsses, nres, rmsd
+    """
+    pass
+
+
+#-----------------------------------------------------------------------------
+#
+# Function definitions
+#
+#-----------------------------------------------------------------------------
+
+def parse_superimposessemap(fh):
+    """
+    Parse the output of the superimposessemap.py script:
+    format is one result per line, fields whitespace delimited:
+
+    identifier score num_sses_matched num_aligned_points rmsd
+
+    e.g.
+
+    d1t10a_ -40.9999 8 16 16.93
+
+    num_aligned_points is number of points used in the superposition,
+    RMSD is the RMS deviation of those points (in Angstroms).
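+
+    A typical use (file name is illustrative only), as in main() below, is to
+    keep only the records whose superposition RMSD is at or below
+    RMSD_THRESHOLD:
+
+        rr_list = parse_superimposessemap(open('superimpose.out'))
+        good_ids = [rr.identifier for rr in rr_list
+                    if rr.rmsd <= RMSD_THRESHOLD]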
+
+    Parameters:
+       fh - open (read) filehandle to read superimposessemap.py output from
+    Return value:
+       list of RMSDRecord objects, each having the fields
+         identifier - identifier of structure
+         score - score of matching query to structure
+         nsses - number of SSEs matched by maximally similar subtableau finding
+         nres - number of points used in superposition of query and structure
+         rmsd - RMSD of superposition with the nres points
+    """
+    rr_list = []
+    for line in fh:
+        if line[0] == '#':
+            continue # skip over comment lines
+        rr = RMSDRecord()
+        (rr.identifier, rr.score, rr.nsses, rr.nres, rr.rmsd) = line.split()
+        rr.score = float(rr.score)
+        rr.nsses = int(rr.nsses)
+        rr.nres = int(rr.nres)
+        rr.rmsd = float(rr.rmsd)
+        rr_list.append(rr)
+    return rr_list
+
+#-----------------------------------------------------------------------------
+#
+# Main
+#
+#-----------------------------------------------------------------------------
+
+def usage(progname):
+    """
+    Print usage message and exit
+    """
+
+    sys.stderr.write("Usage: " +progname + " [-v] "
+                     "<tabsearch_file>\n")
+    sys.stderr.write(' -v verbose messages to stderr\n')
+    sys.stderr.write('superimposessemap.py output is read from stdin\n')
+    sys.exit(1)
+
+
+def main():
+    """
+    main for tsevalrmsd.py
+
+    Usage: tsevalrmsd.py [-v] <tabsearch_file>
+
+    -v turns on debug output to stderr
+
+    superimposessemap.py output is read from stdin
+
+    <tabsearch_file> is the output from the tabsearchqpml.file,
+    which is a text file where each line is identifier then whitespace
+    then score, sorted by score from most negative to least negative e.g.
+
+    d1xksa_ -35.99999999
+    d3sila_ -35.99999999
+    ....
+    d2mhua_ -0.499999999
+
+    ie this means the best hit is at the top of the file, and worst at bottom.
+
+    The superimposessemap.py output read on stdin has
+    one result per line, fields whitespace delimited:
+
+    identifier score num_sses_matched num_aligned_points rmsd
+
+    e.g.
+
+    d1t10a_ -40.9999 8 16 16.93
+
+    num_aligned_points is number of points used in the superposition,
+    RMSD is the RMS deviation of those points (in Angstroms).
+
+    The table of positive and false negative rates is printed to stdout.
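+
+    Example (file names are illustrative only):
+
+        tsevalrmsd.py d1ubia_.tsrchd.out < d1ubia_.superimpose.out > d1ubia_.rmsd.rtab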
+ """ + global verbose + verbose = False + + + try: + opts,args = getopt.getopt(sys.argv[1:], "v?") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-v": # verbose + verbose = True # this module only + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 1: + usage(os.path.basename(sys.argv[0])) + + tabsearch_file = args[0] + + if verbose: + sys.stderr.write('parsing supersimposessemap.py output...') + + rr_list = parse_superimposessemap(sys.stdin) + if verbose: + sys.stderr.write('parsed ' + str(len(rr_list)) + ' records\n') + + + # get dict of identifiers where number of SSEs matched less than query size + querysize = 8 #FIXME 8 for d1ubia_ + remove_dict = dict([(rr.identifier,True) + for rr in rr_list if rr.nsses < querysize]) + + + # select only those not less than query size and + # with RMSD below threshold as gold standard positives + goldstd_domains=[rr.identifier for rr in rr_list if not remove_dict.has_key(rr.identifier) and rr.rmsd <= RMSD_THRESHOLD] + if verbose: + sys.stderr.write('got ' + str(len(goldstd_domains)) + + ' structures below RMSD threshold ' + + str(RMSD_THRESHOLD) + '\n') + sys.stderr.write(str([str(d) for d in goldstd_domains])) + + if verbose: + sys.stderr.write('parsing search results...') + tabsearch_fh = open(tabsearch_file) + (searchresult,commentlist) = parse_searchresult(tabsearch_fh, negateflag=True) + tabsearch_fh.close() + if verbose: + sys.stderr.write('parsed ' + str(len(searchresult)) + ' records\n') + + # set score to 0 for those where number of SSEs matched less than query size + oldsearchresult = list(searchresult) + searchresult = [] + for i in range(len(oldsearchresult)): + # set score to 0 for identifiers matching fewer than query size SSEs + if remove_dict.has_key(oldsearchresult[i][1]): + searchresult.append((0.0, oldsearchresult[i][1])) + else: + searchresult.append(oldsearchresult[i]) + + if verbose: + sys.stderr.write('set score to 0 for ' + str(len(remove_dict)) + + ' structures matching fewer than ' + + str(querysize) + ' SSEs\n') + searchresult.sort() + + + sys.stdout.write('#' + ' '.join(sys.argv) + '\n') #identifying info about us + sys.stdout.write('# results from:\n') + for line in commentlist: + sys.stdout.write('# ') + sys.stdout.write(line) # identifying information about search run + sys.stdout.write('\n') + eval_fn(goldstd_domains, searchresult) + + +if __name__ == "__main__": + warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) + main() diff --git a/scripts/tsevalutils.py b/scripts/tsevalutils.py new file mode 100644 index 0000000..d93ee24 --- /dev/null +++ b/scripts/tsevalutils.py @@ -0,0 +1,848 @@ +############################################################################### +# +# tsevalutils.py - Functions for tableau search evaluation +# +# File: tsevalutils.py +# Author: Alex Stivala +# Created: June 2008 +# +# $Id: tsevalutils.py 3631 2010-05-12 01:20:01Z alexs $ +# +############################################################################### + +""" +Functions for protein db search program evaluation scripts tsevalfn.py +and tsevalrmsd.py and mkroc50tab, and others. + +Includes functions to parse output, compue AUC and ROC50 values, +get lists of true hits from SCOP, filter down to nonredundant ASTRAL +subsets, etc. 
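+
+A small worked example of the trapezoid-rule AUC computed by compute_auc()
+below (values are illustrative only):
+
+    fprlist = [0.0, 0.5, 1.0]
+    tprlist = [0.0, 0.8, 1.0]
+    # trapezoids: 0.5*(0.0+0.8)/2 + 0.5*(0.8+1.0)/2 = 0.2 + 0.45
+    auc = compute_auc(fprlist, tprlist)   # 0.65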
+""" + +import sys +from math import log10 + +from Bio.SCOP import * + + +#----------------------------------------------------------------------------- +# +# Module globals +# +#----------------------------------------------------------------------------- + +verbose = False + + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + + +def compute_auc(fprlist, tprlist): + """ + Compute area under ROC curve (AUC) given a list of false postive + rates and corresponding list of true postive rates. + Sums the areas of trapezoids formed by each adjacent entry in + the lists. + + Parameters: + fprlist - list of false positive rates (FPR), ordered from 0 to 1 + tprlist - corresponding list of true positive rates (TPR) + Return value: + AUC (area under [ROC] curve) + """ + # in R: auc <- sum(diff(fprlist)*(tprlist[-1]+tprlist[-length(tprlist)]))/2 + + n = len(fprlist) + assert(len(tprlist) == n) + widths = [b - a for (a,b) in zip(fprlist[:n-1], fprlist[1:])] + avgheights = [(a+b)/2.0 for (a,b) in zip(tprlist[1:], tprlist[:n-1])] + auc = sum([x*y for (x,y) in zip(widths, avgheights)]) + return auc + + + + +def parse_searchresult(fh, negateflag=False, logflag=False, sortflag=True): + """ + Parse a text file where each line is identifier then whitespace + then score, e.g. + + d1xksa_ -35.99999999 + d3sila_ -35.99999999 + .... + d2mhua_ -0.499999999 + + Then sort it by score from lowest to highest + ie this means the best hit as the bottom of the file, and worst at top. + If the negateflag is True then the scores are negated before sorting + (so in the example above where -36 is the 'best' score, then it becomes + 36 and so is last in the list after sorting). + + Parameters: + fh - open (read) filehandle to parse from + negateflag - if True, negate scores before sorting + logflag - if True take log10 of scores. If used with negateflag + then log10 is taken first then negated + sortflag - If True sort results by score ascending. Default True. + + Return value: + tuple (scorelist, commentlines) where + scorelist is list of (score, domainid), sorted by score (if + sortflag=TRUE) + commentlines is list of strings, each one a comment line from file + """ + reslist = [] + commentlist = [] + for line in fh: + if line[0] == '#': + commentlist.append(line) + continue + splitline = line.split() + if len(splitline) < 2: + sys.stderr.write('bad line: ' + line + '\n') + continue + domainid = splitline[0] + score_str = splitline[1] + if score_str.lower() == 'nan' or score_str == '********': + # if we get any NaN values then then sort() gets completely + # screwed up and the evaluation is all wrong, so skip them. + sys.stderr.write('skipping NaN: ' + line + '\n') + continue + try: + score = float(score_str) + except ValueError: + sys.stderr.write('skipping invalid score ' + line + '\n') + continue + if logflag: + score = log10(score) + if negateflag: + score = -score + reslist.append((score, domainid)) + if sortflag: + reslist.sort() # sort ascending + return (reslist, commentlist) + + + +def eval_fn(goldstd_positive_domains, searchresult, + ignore_search_keyerror=False): + """ + Evaluate the false negative rate for different score cutoffs + and print table to stdout. + + Parameters: + goldstd_positive_domains - + list of identifiers that the query + should match i.e. the 'gold standard' list of + hits. 
+ searchresult - list of (score, domainid), sorted by ascending score + ignore_search_keyerror - If True, ignore key errors in search result. + Usually this should not happen, but when using external + results i.e. those not from same database (ASTRAL subset) + used here, it can. Eg. using TableauSearch webserver + results which have older SCOP as database, + so there are SCOP sids in our database that are + not in the serach results at all. Then this option + just ignores them rather than raising exception KeyError. + + Return value: + None - table written to stdout. + + """ + + sys.stdout.write('score fp_count tpr fpr\n') + sys.stdout.write('#------------------------------\n') + + fprlist = [] + tprlist = [] + + resultlist = searchresult + + # remove gold standard SCOP domains that are not in the search + # result, which can happen from using external results, also + # happens for some domains where for some reason the Bio.SCOP + # isDomainInId(95) does not contain some domains that actually + # are in the downlaoded ASTRAL SCOP 95% set (e.g. d2aeph1) + # This can also happen when the search got an error for a domain + # so it is not in the search results. + # Use the -x option (ignore_search_keyerror) to handle these cases + # (not always on by default since these things "shouldn't" happen). + if ignore_search_keyerror: + search_dict = dict([(pdbid, (score, rank)) for (rank, (score, pdbid)) + in enumerate(searchresult)]) + our_goldstd_positive_domains = [scopdom for scopdom in goldstd_positive_domains if search_dict.has_key(scopdom) ] + else: + our_goldstd_positive_domains = goldstd_positive_domains + +# sys.stderr.write('original len, reduced len ' +str(len(goldstd_positive_domains)) + ' , ' + str(len(our_goldstd_positive_domains)) + '\n') + + + # convert goldstd list of domainids to dictionary + # keyed by domainid for fast lookup as we iterate through search results. + # The dictionary is { domainid : True } (we don't have a value, + # just need to quickly test for presendce of domainid in gold stad + # postive list0 + + goldstd_pos_dict = dict([(scopdom, True) for + scopdom in our_goldstd_positive_domains]) + + # start at classifying all as true (TPR=FPR=1) + tp_count = len(our_goldstd_positive_domains) + fp_count = len(resultlist) - len(our_goldstd_positive_domains) + for cutoff_rank in xrange(len(resultlist)): + cutoff_score = resultlist[cutoff_rank][0] + if cutoff_rank > 0: + # we are now classifying the previous one and all below as negative + prev_scopsid = resultlist[cutoff_rank - 1][1] + if goldstd_pos_dict.has_key(prev_scopsid): + tp_count -= 1 + else: + fp_count -= 1 + + tpr = float(tp_count) / float(len(our_goldstd_positive_domains)) #sensitivity = true pos rate + fpr = float(fp_count) / float(len(resultlist) - len(our_goldstd_positive_domains)) #FP rate + specificity = 1.0 - fpr + + fprlist.append(fpr) + tprlist.append(tpr) + + sys.stdout.write('%5.1f %8d %5.3f %5.3f\n' % + (cutoff_score, fp_count, tpr, fpr)) + + fprlist.reverse() + tprlist.reverse() + auc = compute_auc(fprlist, tprlist) + sys.stdout.write('\n') + sys.stdout.write('# AUC = %5.3f\n' % auc) + + +def iter_searchresult(fh, multiquery = False, skip_self_query = False, + negateflag = False, logflag = False): + """ + This is a generator function version of parse_searchresult(), + that yields one searh result at a time from the file, instead + of parsing the whole thing and returning a list. Note because of this + it cannot sort the result, the file must already be sorted. 
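+
+    A minimal (illustrative) use, streaming an already-sorted single-query
+    results file instead of reading it all into memory:
+
+        for (score, domainid) in iter_searchresult(open('d1ubia_.out')):
+            sys.stdout.write('%s %f\n' % (domainid, score))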
+ + Also, if multiquery=True then this parses a format with multiple + queryies in the one file. In this format, a special 'comment' line + delimits results from each queryu. This line has the format: + + # QUERYID = querid + + or + + # QUERY ID = querid + + e.g. + + # QUERYID = d1ubia_ + + to delimit results for each separate query. The blastout2col.sh etc. + scripts create this format. Note that the ordering of results + (scores) is only within each individual query. + + The format of the file is identifier then whitespace then score e.g.: + + d2mhua_ 0.499999999 + .... + d1xksa_ 35.99999999 + d3sila_ 35.99999999 + + + Parameters: + fh - open (read) filehandle to parse from + multiquery - Boolean. If True multiquery format as described above. + skip_self_query - Boolean. If True, omit match of query against itself + (only for multiquery=True). + negateflag - if True, negate scores before sorting + logflag - if True take log10 of scores. If used with negateflag + then log10 is taken first then negated + + Return value: + Yields a tuple (score, domainid) for each search result line in the + file. Ordered according to the file order, i.e. must be from + worst to best hit. + If multiquery = True then the tuples yielded are + (queryid, score, domainid). All same queryid must be consecutive + and ordering is only within each block of consecutive identical + queryid. + """ + queryid = None + for line in fh: + if line[0] == '#': + if multiquery: + splitline = line.split() + if ( (len(splitline) > 1 and splitline[1] == 'QUERYID') or + (len(splitline) > 2 and splitline[1] == 'QUERY' and + splitline[2] == 'ID') ): + queryid = line.split('=')[1].lstrip().rstrip() + continue + splitline = line.split() + if len(splitline) != 2: + sys.stderr.write('bad line: ' + line + '\n') + continue + domainid = splitline[0] + score_str = splitline[1] + if score_str.lower() == 'nan' or score_str == '********': + # if we get any NaN values then then sort() gets completely + # screwed up and the evaluation is all wrong, so skip them. + sys.stderr.write('skipping NaN: ' + line + '\n') + continue + score = float(score_str) + + if logflag: + if (score == 0.0): + score = -1e308 # dodgy: very small number + else: + score = log10(score) + if negateflag: + score = -score + + if multiquery: + if skip_self_query and queryid.lower() == domainid.lower(): + continue + else: + yield((queryid, score, domainid)) + else: + yield((score, domainid)) + + +def get_searchresult_info(fh): + """ + Used in conjunction with iter_searchresult() to get comments and + number of search result lines from the search results file; cannot + combine this functionality in iter_searchresult() as that is a generator + function unlike parse_searchresult() - this function and iter_searchresult() + together form the generator function replacement for parse_searchresult() + for very large files so we don't have to read the whole file at once + into core. + + Parameters: + fh - open (read) filehandle to parse from + + Return value: + tuple (num_results, commentlines) + num_results is number of results lines in file, i.e. 
the number + of result tuples that will be returned by iter_searchresult() + commentlines is list of strings, each one a comment line from file + + """ + num_results = 0 + commentlist = [] + for line in fh: + if line[0] == '#': + commentlist.append(line) + continue + splitline = line.split() + if len(splitline) != 2: + sys.stderr.write('bad line: ' + line + '\n') + continue + domainid = splitline[0] + score_str = splitline[1] + if score_str.lower() == 'nan' or score_str == '********': + # if we get any NaN values then then sort() gets completely + # screwed up and the evaluation is all wrong, so skip them. + sys.stderr.write('skipping NaN: ' + line + '\n') + continue + num_results += 1 + return (num_results, commentlist) + + +def iter_slrtab(fh): + """ + This is a generator function + that yields one (score, label) tuple at a time from the slrtab + file, instead of parsing the whole thing and returning a list. + Note because of this + it cannot sort the result, the file must already be sorted, from + 'lowest' (i.e. least likely to be a positive instance, i.e. 'worst hit') + to 'highest' (least likely to be a postivie instance). + + The format of the slrtab file is score then class label (0 or 1) e.g. + + 0.499999999 0 + ... + 35.99999999 1 + 35.99999999 1 + + + Parameters: + fh - open (read) filehandle to parse from + + Return value: + Yields a tuple (score, label) for each search result line in the + file. Ordered according to the file order, i.e. must be from + best to worst hit. + """ + for line in fh: + if line[0] == '#': + continue + splitline = line.split() + if len(splitline) == 0: + continue # skip blank lines + if splitline[0] == "score": + continue + if len(splitline) != 2: + sys.stderr.write('bad line: ' + line + '\n') + continue + score_str = splitline[0] + label_str = splitline[1] + score = float(score_str) + label = int(label_str) + yield((score, label)) + + +def get_slrtab_info(fh): + """ + Used in conjunction with iter_slrtab() to get comments and + number of (Score, label) lines from the slrtab file; cannot + combine this functionality in iter_slrtab() as that is a generator. + + Parameters: + fh - open (read) filehandle to parse from + + Return value: + tuple (num_results, num_positive, commentlines, total_tp_possible, + num_queries) + num_results is number of results lines in file, i.e. 
the number + of (score, label) tuples that will be returned by iter_slrtab() + num_positive is number of results with class label 1 + commentlines is list of strings, each one a comment line from file + total_tp_possible is the total number of true positives possible in the database + num_queries is the number of queries in the slrtab + + total_tp_possible and num_queries are parsed from the + TOTAL_TP_POSSIBLE and NUM_QUERIES comment lines generated + by mkslrtabmultiquery.py or mkslrtab.py + + """ + num_results = 0 + num_positive = 0 + total_tp_possible = None + num_queries = None + commentlist = [] + for line in fh: + if line[0] == '#': + commentlist.append(line) + splitline = line.split('=') + if len(splitline) == 2: + if splitline[0] == '# TOTAL_TP_POSSIBLE ': + total_tp_possible = int(splitline[1]) + elif splitline[0] == '# NUM_QUERIES ': + num_queries = int(splitline[1]) + continue + splitline = line.split() + if len(splitline) != 2: + sys.stderr.write('bad line: ' + line + '\n') + continue + num_results += 1 + score_str = splitline[0] + label_str = splitline[1] + score = float(score_str) + label = int(label_str) + if (label == 1): + num_positive += 1 + elif (label != 0): + sys.stderr.write('bad label "%d"\n' % label) + + return (num_results, num_positive, commentlist, + total_tp_possible, num_queries) + + + +def get_domain_by_sid(query_sid, scop): + """ + Get a Bio.SCOP domain id for supplied sid. This could be done + directly by scop.getDomainBySid(query_sid) except the various + exceptions and error conditions that can arise and we need to deal + with (see comments in function). + + Parameters: + query_sid - SCOP domain id (eg 'd1ubia_' of the domain to query. + scop - previously built Bio.SCOP Scop instance + + Return value: + Bio.SCOP domain instance fo the query_sid + """ + dom =scop.getDomainBySid(query_sid) + if dom == None: + # some domains seem to start with g rather than d + # e.g. g1avo.4 or g1dk9.1 + # they are in the astral-scopdom-seqres-all-1.73.fa file + # with the g at the start, but for some reason get None when + # looking up that sid, need to replace g with d instead. + if query_sid[0] == 'g': + query_sid = 'd' + query_sid[1:] + # when using sids from older versions of ASTRAL SCOP + # (e.g. 1.65 as used in Hulsen et al. (2006) sometimes + # we get identifiers with _ as chainid e.g. d1ab4__ which + # have been changed to have a as chainid instead as per recent PDB + # remediation. So we'll try chainid a instead of _ + if query_sid[5] == '_': + query_sid = query_sid[:5] + 'a' + query_sid[6] + + dom = scop.getDomainBySid(query_sid) + + return dom + + +def get_scop_domains(query_sid, scop): + """ + Get a list of SCOP domains that have the same fold as the query domain, + + Parameters: + query_sid - SCOP domain id (eg 'd1ubia_' of the domain to query. + scop - previously built Bio.SCOP Scop instance + Return value: + list of Bio.SCOP domain instances that have the same fold as the + query_sid. + """ + if verbose: + sys.stderr.write('getting domains in same fold as ' + query_sid +'...\n') + dom = get_domain_by_sid(query_sid, scop) + fold = dom.getAscendent('fold') + related = fold.getDescendents('domain') + if verbose: + sys.stderr.write('found %d domains\n' % len(related)) + return related + + +def get_domains_in_same_superfamily(sid, scop): + """ + Return a list of SCOP domain instances that are in the same + superfamily as the supplied SCOP domain id. + + Parameters: + sid - (string) SCOP domain identifier as sid e.g. 
d1ubia_ + scop - previuosly created Bio.SCOP Scop object + + Return value: + list of SCOP domains that are in the same superfamily as the + supplied sid (including the domain for the sid itself). + """ + dom = get_domain_by_sid(sid, scop) + return dom.getAscendent('superfamily').getDescendents('domain') + + +def get_domains_in_same_family(sid, scop): + """ + Return a list of SCOP domain instances that are in the same + family as the supplied SCOP domain id. + + Parameters: + sid - (string) SCOP domain identifier as sid e.g. d1ubia_ + scop - previuosly created Bio.SCOP Scop object + + Return value: + list of SCOP domains that are in the same superfamily as the + supplied sid (including the domain for the sid itself). + """ + dom = get_domain_by_sid(sid, scop) + return dom.getAscendent('family').getDescendents('domain') + + +def filter_domains_astral_nrpercent(scop_domain_list, scop, astral, nrpercent): + """ + Given list of Bio.SCOP domain objects, return list of those domains + that are in the ASTRAL nr% sequence identity nonredundant subset. + + Parameters: + scop_domain_list - list of Bio.SCOP domain objects + scop - previously built Bio.SCOP Scop instance + astral - previously build Bio.SCOP Astral instance + nrpercent - (integer) percent sequence identity subset to use (e.g. 95) + + """ + related = [r for r in scop_domain_list if astral.isDomainInId(r, nrpercent)] + return related + + +def get_db_size(scop, astral, nrpercent): + """ + Return number of domains in supplied SCOP. If nrpercent is not None + then return nubmer of domains in specified ASTRAL sequence identity + nonredundant subset. + + Parameters: + scop - previously built Bio.SCOP Scop instance + astral - previously build Bio.SCOP Astral instance + nrpercent - (integer) percent sequence identity subset to use (e.g. 95) + or None for whole SCOP. + + Return value: + Number of domains in SCOP or ASTRAL nonredundant subset + """ + all_domains = scop.getRoot().getDescendents('domain') + if nrpercent != None: + all_domains = filter_domains_astral_nrpercent(all_domains, + scop, astral, + nrpercent) + return len(all_domains) + + + + +def get_seq_db_size(scop, astral, nrpercent): + """ + Return number of sequences in supplied SCOP. If nrpercent is not None + then return nubmer of domains in specified ASTRAL sequence identity + nonredundant subset. + This is not the same as get_db_size(), which gives all the domains + in SCOP, here we try to find the number of domains that have + sequences in the FASTA (or actually, due to the way Bio.SCOP works, + in the .id file, which is sometimes not the same for reasons + I don't understand). + + Parameters: + scop - previously built Bio.SCOP Scop instance + astral - previously build Bio.SCOP Astral instance + nrpercent - (integer) percent sequence identity subset to use (e.g. 95) + or None for whole SCOP. + + Return value: + Number of domains in SCOP or ASTRAL nonredundant subset + """ + if nrpercent == None: + db_size = len(astral.fasta_dict) # dodgy direct access, but works + else: + db_size = len(astral.domainsClusteredById(nrpercent)) + return db_size + + + +def get_goldstd_domains(query_sid, level, + use_nonredundant, + scop, astral, nrpercent): + """ + Geth the "gold standard" list of domains. This is the domains that are + in the same fold or superfamily or family (according to level option) + as the supplied domain specified by query_sid. 
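+
+    For example (illustrative), the gold standard at superfamily level for
+    the ubiquitin-like query d1ubia_, restricted to the ASTRAL 95% subset:
+
+        goldstd_domains = get_goldstd_domains('d1ubia_', 'superfamily', True,
+                                              scop, astral, 95)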
+ + Parameters: + query_sid - SCOP domain identifier (sid) of query + level - 'fold' or 'superfamily' or 'family' - the level of SCOP + hierarchy to use to define gold standard. + use_nonredundant - Bool if True use ASTRAL 95% nr subset + scop - previously built Bio.SCOP Scop instance + astral - previously build Bio.SCOP Astral instance + nrpercent - (integer) percent sequence identity subset to use (e.g. 95) + Only used if use_nonredundant=True + + Return value: + list of sids for domains in same fold/superfamily as query_sid + """ + if level == 'superfamily': + if verbose: + sys.stderr.write('getting domains in same superfamily as ' + query_sid + + '...\n') + goldstd_domains = get_domains_in_same_superfamily(query_sid, scop) + if verbose: + sys.stderr.write('found ' + str(len(goldstd_domains)) + ' domains\n') + elif level == 'family': + if verbose: + sys.stderr.write('getting domains in same family as ' + query_sid + + '...\n') + goldstd_domains = get_domains_in_same_family(query_sid, scop) + if verbose: + sys.stderr.write('found ' + str(len(goldstd_domains)) + ' domains\n') + elif level == 'fold': + goldstd_domains = get_scop_domains(query_sid,scop) + else: + raise ValueError('unknown level: ' + level) + + if use_nonredundant: + goldstd_domains = filter_domains_astral_nrpercent(goldstd_domains, scop, astral, nrpercent) + if verbose: + sys.stderr.write('got ' + str(len(goldstd_domains)) + ' domains in ASTRAL ' + str(nrpercent) + '% sequence nr subset.\n' ) + + return goldstd_domains + + +def get_betagrasp_containing_domains(scop): + """ + Get a list of SCOP domains that contain the beta-grasp motif. + Note that these are not just the domains in the SCOP beta-grasp + (ubiquitin-like) fold, but the three categories described in + + Shi et al (2007) 'Searching for three-dimensional secondary structure + patterns in proteins with ProSMoS' Bioinformatics 23(11):1331-1338 + + Refer to Table 1 in the above for the list of supefamilies built + by this subroutine. + + Parameters: + scop - previously built Bio.SCOP Scop instance + + Return value: + list of Bio.SCOP domain instances that contain the beta-grasp + fold, either as (1) the core (2) a gregarious fold or (3) strutural + drift (see Shi et al 2007 and refernces therein). + + Implemented with SCOP 1.73 (so note some sids are different due to + remediation of chainid e.g. d1ubq__ (SCOP 1.69 as used in Shi et al) + is now d1ubqa_ in SCOP 1.73). 
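+
+    Example (illustrative; SCOP_DIR and SCOP_VERSION as used by the calling
+    scripts, e.g. tsevalfn.py):
+
+        scop = Scop(dir_path=SCOP_DIR, version=SCOP_VERSION)
+        betagrasp_domains = get_betagrasp_containing_domains(scop)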
+ """ + # + # Category 1: beta-grasp core + # + + # ubiquitin-like + +# core = get_domains_in_same_superfamily('d1ubqa_', scop) +# core += get_domains_in_same_superfamily('d1c9fa_', scop) +# core += get_domains_in_same_superfamily('d1fm0d_', scop) +# core += get_domains_in_same_superfamily('d1frda_', scop) +# core += get_domains_in_same_superfamily('d2saka_', scop) +# core += get_domains_in_same_superfamily('d1an8a2', scop) +# core += get_domains_in_same_superfamily('d1pgxa_', scop) +# core += get_domains_in_same_superfamily('d1tifa_', scop) +# core += get_domains_in_same_superfamily('d1f1hl1', scop) +# core += get_domains_in_same_superfamily('d1qf6a2', scop) +# core += get_domains_in_same_superfamily('d1mfwa_', scop) +# core += get_domains_in_same_superfamily('d1t0qc_', scop) +# core += get_domains_in_same_superfamily('d2fug13', scop) # Nqo1 like +# core += get_domains_in_same_superfamily('d2gria1', scop) # NSP3A like + + core = get_scop_domains('d1ubia_', scop) # ubiquitin-like fold + + # Nudix + core += get_domains_in_same_superfamily('d1muta_', scop) + + # Anthrax protective antigen + core += get_domains_in_same_superfamily('d1acca_', scop) + + # Oxidoreductase domain + core += get_domains_in_same_superfamily('d1soxa3', scop) + + # AF oxidoreductase domain + core += get_domains_in_same_superfamily('d1aorb2', scop) + + # + # Category 2: gregarous folds + # + gregarious = [] + # Ribosomal protein L25-like + gregarious += get_domains_in_same_superfamily('d1d6ka_', scop) + + # BtrG-like + gregarious += get_domains_in_same_superfamily('d1vkba_', scop) + + # RNA-polymerase + gregarious += get_domains_in_same_superfamily('d1i3qb_', scop) + + # Hypothetical protein HP Ym1108w + gregarious += get_domains_in_same_superfamily('d1n6za_',scop) + + # QueA-like + gregarious += get_domains_in_same_superfamily('d1vkyb_',scop) + + # + # Category 3: structural drift + # + drift = [] + + # NB as per notes to Table 1 in Shi et al (2007), note that in Category 3 + # (structural drift) proteins some SSEs of the beta-grasp motif are not + # part of the domain core, so not all superfamily members may hvae the + # beta-grasp motif. Hence we do NOT get all domains in the superfamily + # of the respresentative in the table, only the representative itself. 
+ + + # 4Fe-4S ferredoxins + drift.append(scop.getDomainBySid('d1h0hb_')) + drift.append(scop.getDomainBySid('d1vlen2')) # prosmos found, manually checked + drift.append(scop.getDomainBySid('d1kqfb1')) # prosmos found, manually checked + + + # TIM barrel - Enolase C-domain-like + drift.append(scop.getDomainBySid('d1e9ia1')) + drift.append(scop.getDomainBySid('d1yela1')) # prosmos, manually checked + drift.append(scop.getDomainBySid('d1iyxa1')) # prosmos, manually checked + drift.append(scop.getDomainBySid('d2fyma1')) # prosmos, manually checked + + + # TIM barrel - (Trans)glycosidases + drift.append(scop.getDomainBySid('d1fhla_')) + drift.append(scop.getDomainBySid('d1hjsa_')) # prosmos, manually checked + drift.append(scop.getDomainBySid('d1ur4a_')) # prosmos, manually checked + drift.append(scop.getDomainBySid('d1hjqa_')) # prosmos, manually checked + + + # alpha/alpha-Toroid - six-hairpin glycosyltransferases + drift.append(scop.getDomainBySid('d1ut9a1')) + + # Metal ATPase domain - Metal cation-transporting ATPase + drift.append(scop.getDomainBySid('d1su4a3')) + drift.append(scop.getDomainBySid('d1q3ia_')) # prosmos, manually checked + drift.append(scop.getDomainBySid('d1wpga3')) # prosmos, manually checked (dubious on this one though) + + + # Cystatin-like - NTF2-like + drift.append(scop.getDomainBySid('d1e3va_')) + drift.append(scop.getDomainBySid('d1q40a_')) # prosmos, manually checked + drift.append(scop.getDomainBySid('d1q40b_')) # prosmos, manually checked + drift.append(scop.getDomainBySid('d1jkgb_')) # prosmos, manually checked + drift.append(scop.getDomainBySid('d1nwwa_')) # prosmos, manually checked + drift.append(scop.getDomainBySid('d1tuha_')) # prosmos, manually checked + drift.append(scop.getDomainBySid('d1s5aa_')) # prosmos, manually checked + drift.append(scop.getDomainBySid('d2a15a1')) # prosmos, manually checked + + + # Knottins - Growth factor receptor domain + drift.append(scop.getDomainBySid('d1igra3')) + + return core + gregarious + drift + + +def is_true_positive(domainid, goldstd_pos_dict): + """ + Return true if domainid is in positive class according to supplied + dict (i.e. in the dict) - need a function not just simple dict + lookup due to dodgy stuff with domains not starting with 'd' when + using Bio.SCOP (seem comments in code for details). + + Parameters: + domainid -domain identifier (sid) to test + goldstd_pos_dict - dict { domainid : True } for domains in pos class + Return value: + True if domainid is in positive class according to supplied dict + """ + + if not goldstd_pos_dict.has_key(domainid): + # some domains seem to start with g rather than d + # e.g. g1avo.4 or g1dk9.1 + # they are in the astral-scopdom-seqres-all-1.73.fa file + # with the g at the start, but for some reason get None when + # looking up that sid, need to replace g with d instead. + if domainid[0] == 'g': + domainid = 'd' + domainid[1:] + # when using sids from older versions of ASTRAL SCOP + # (e.g. 1.65 as used in Hulsen et al. (2006) sometimes + # we get identifiers with _ as chainid e.g. d1ab4__ which + # have been changed to have a as chainid instead as per recent PDB + # remediation. 
So we'll try chainid a instead of _ + if domainid[5] == '_': + if len(domainid) < 7: + sys.stderr.write('WARNING: Bad domain id %s\n' % domainid) + return False # bizarrely, we get 'd1orf_' from swsse2 sometimes + domainid = domainid[:5] + 'a' + domainid[6] + return goldstd_pos_dict.has_key(domainid) + else: + return True + + + +def tsevalutils_set_verbose(verb): + """ + set the module global verbose flag in this module to supplied value + Parameters: verb - True (for verbose output) or False + Return value: None + Uses globals: verbose (in this module) + """ + global verbose + verbose = verb diff --git a/scripts/tsrchd_pardiso_pbs_script.sh b/scripts/tsrchd_pardiso_pbs_script.sh new file mode 100644 index 0000000..629df93 --- /dev/null +++ b/scripts/tsrchd_pardiso_pbs_script.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# File: tsrchd_pardiso_pbs_script.sh +# Author: Alex Stivala +# Created: July 2009 +# +# PBS script for submitting QP tableau search jobs on tango.vpac.org +# requires PATH and PYTHONPATH already set up in environment +# +# $Id: tsrchd_pardiso_pbs_script.sh 2941 2009-11-15 03:27:36Z astivala $ + +#PBS -N d1ae6h1_tsrchd_pardiso +#PBS -l walltime=23:0:0 +#PBS -l nodes=1 +#PBS -v MKL_NUM_THREADS=1 + + + +QUERY=d1ae6h1 +INPUT_DIR=${HOME}/phd/qptabsearch/data +OUTDIR=. +TSRCHD=${HOME}/phd/qptabsearch/src/tsrchd_pardiso +TIME=/usr/bin/time + +cd $PBS_O_WORKDIR +set CONV_RSH = ssh + + +$TIME $TSRCHD -t < ${INPUT_DIR}/${QUERY}.input > ${OUTDIR}/${QUERY}.out 2> ${OUTDIR}/${QUERY}.err + + diff --git a/scripts/tsrchd_umfpack_pbs_script.sh b/scripts/tsrchd_umfpack_pbs_script.sh new file mode 100644 index 0000000..2199558 --- /dev/null +++ b/scripts/tsrchd_umfpack_pbs_script.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# +# File: tsrchd_umfpack_pbs_script.sh +# Author: Alex Stivala +# Created: July 2009 +# +# PBS script for submitting QP tableau search jobs on tango.vpac.org +# requires PATH and PYTHONPATH already set up in environment +# +# Run from qptabsearch/ directory, i.e. : +# +# qsub scripts/tsrchd_umfpack_pbs_script.sh +# +# $Id: tsrchd_umfpack_pbs_script.sh 2726 2009-08-05 04:56:11Z astivala $ + +#PBS -N tsrchd_umfpack +#PBS -l walltime=16:0:0 +#PBS -l nodes=1 +#PBS -v MKL_NUM_THREADS=1 + + +QUERY=d1f6dc_ +OUTDIR=tango_results_umfpack + +TIME=/usr/bin/time +cd $PBS_O_WORKDIR +set CONV_RSH = ssh + + +$TIME src/tsrchd_sparse < data/${QUERY}.input > ${OUTDIR}/${QUERY}.out 2> ${OUTDIR}/${QUERY}.err + + diff --git a/scripts/tsrchn_pardiso_pbs_script.sh b/scripts/tsrchn_pardiso_pbs_script.sh new file mode 100644 index 0000000..b4cca3e --- /dev/null +++ b/scripts/tsrchn_pardiso_pbs_script.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# File: tsrchn_pardiso_pbs_script.sh +# Author: Alex Stivala +# Created: July 2009 +# +# PBS script for submitting QP tableau search (numeric) jobs on tango.vpac.org +# requires PATH and PYTHONPATH already set up in environment +# +# $Id: tsrchn_pardiso_pbs_script.sh 2906 2009-11-06 00:05:34Z astivala $ + +#PBS -N tsrchn_pardiso +#PBS -l walltime=23:0:0 +#PBS -l nodes=1 +#PBS -v MKL_NUM_THREADS=1 + + + +QUERY=d1f6dc_ +INPUT_DIR=${HOME}/phd/qptabsearch/data +OUTDIR=. 
+TSRCHN=${HOME}/phd/qptabsearch/src/tsrchn_pardiso +TIME=/usr/bin/time + +cd $PBS_O_WORKDIR +set CONV_RSH = ssh + + +$TIME $TSRCHN -t < ${INPUT_DIR}/${QUERY}.omega.input > ${OUTDIR}/${QUERY}.tsrchn.out 2> ${OUTDIR}/${QUERY}.tsrchn.err + + diff --git a/scripts/tszscore.py b/scripts/tszscore.py new file mode 100755 index 0000000..d69e27e --- /dev/null +++ b/scripts/tszscore.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +############################################################################### +# +# tszscore.py - compute Z scores for QP tableau search output +# +# File: tszscore.py +# Author: Alex Stivala +# Created: March 2009 +# +# See usage in docstring for main() +# +# Imports from tsevalutils.py, so the directory containing it (ptgraph/) +# must be in the PYTHONPATH +# Also requires numpy. +# +# $Id: tszscore.py 2166 2009-03-30 03:59:01Z astivala $ +# +############################################################################### + +import sys,os +import getopt + +from tsevalutils import parse_searchresult +from numpy import mean,std + +#----------------------------------------------------------------------------- +# +# Function definitions +# +#----------------------------------------------------------------------------- + +def usage(progname): + """ print usage message and exit + """ + sys.stderr.write("Usage: " + progname + " [-no]\n") + sys.stderr.write(' -n negate scores\n') + sys.stderr.write(' -o take log10 of scores\n') + sys.exit(1) + + +#----------------------------------------------------------------------------- +# +# Main +# +#----------------------------------------------------------------------------- + +def main(): + """ + main for tszscore.py + + + Given list of scores from QP tableau search (tsrchd_sparse etc.) output, + compute Z-scores for hits to the query, and output sorted by + descending Z-score. + + Usage: + tszscore.py < tsrchd_output + + -n negate scores + -o take log10 of scores + + Input is from stdin, tab-delimited in the format + + pdbid score + + Output is to stdout, in the same format as input i.e. 
+ + pdbid Z-score + """ + negateflag = False + logflag = False + + try: + opts,args = getopt.getopt(sys.argv[1:], "no?") + except: + usage(os.path.basename(sys.argv[0])) + for opt,arg in opts: + if opt == "-n": # negate scores + negateflag = True + elif opt == "-o": # take log10 of scores + logflag = True + else: + usage(os.path.basename(sys.argv[0])) + + if len(args) != 0: + usage(os.path.basename(sys.argv[0])) + + + (searchresult,commentlist) = parse_searchresult(sys.stdin, + negateflag, + logflag) + + sys.stdout.write('#' + ' '.join(sys.argv) + '\n') #identifying info about us + sys.stdout.write('# Z-scores computed from:\n') + for line in commentlist: + sys.stdout.write('# ') + sys.stdout.write(line) # identifying information about search run + sys.stdout.write('\n') + + scores = [s[0] for s in searchresult] + mu = mean(scores) + sigma = std(scores) + # Z-score for s is (s - mu) / sigma + zscores = [ ( (s[0] - mu) / sigma, s[1]) for s in searchresult ] + + zscores.sort(reverse=True) + for zs in zscores: + sys.stdout.write("%s %5.3f\n" % (zs[1], zs[0])) + + +if __name__ == "__main__": + main() diff --git a/scripts/update_query_list_for_new_astral_version.sh b/scripts/update_query_list_for_new_astral_version.sh new file mode 100755 index 0000000..2e2a127 --- /dev/null +++ b/scripts/update_query_list_for_new_astral_version.sh @@ -0,0 +1,56 @@ +#!/bin/bash +############################################################################### +# +# update_query_list_for_new_astral_version.sh.sh - +# update list of queries generated by genquerylist.sh for new ASTRAL +# +# File: update_query_list_for_new_astral_version.sh.shb +# Author: Alex Stivala +# Created: Februrary 2010 +# +# +# Given a list of domain sids generated by genquerylist.sh, find those +# not in the supplied tableaux database file (due to it being a later +# ASTRAL SCOP version than that on which the query list was generated), +# generate a new list with domains that are not in the new version replaced +# with ones in the same superfamily that are in the new version. +# +# Used to update the query200 list generated with ASTRAL SCOP 1.73 +# to ASTRAL SCOP 1.75. +# +# +# Usage: +# update_query_list_for_new_astral_version.sh.sh tableauxdb < query.list +# +# Environment variables: +# +# PATH must contain the location of the Python scripts, ie where this +# script itself is and the ptgraph/ directory with pytableaucreate.py etc., +# and the location of tsrchd_sparse. +# The dssp program must also be in the PATH. +# +# PYTHONPATH must contain the directory containing the +# Python modules used by the Python scripts. +# +# $Id: update_query_list_for_new_astral_version.sh 3340 2010-02-14 23:08:09Z alexs $ +# +############################################################################### + + +if [ $# -ne 1 ]; then + echo "usage: $0 tableauxdbfile < querylist" >&2 + exit 1 +fi +tableauxdbfile=$1 + +while read sid +do + grep $sid $tableauxdbfile >/dev/null 2>/dev/null + if [ $? -ne 0 ]; then + echo $sid | scopdominfo.py | grep '^d' | cut -f2 | cut -d'(' -f2 | \ + cut -d')' -f1 | getdomainsinsf.py + else + echo $sid + fi +done + diff --git a/scripts/vastout2col.py b/scripts/vastout2col.py new file mode 100755 index 0000000..e25d43c --- /dev/null +++ b/scripts/vastout2col.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# +# File: vastout2col.sh +# Author: Alex Stivala +# Created: November 2008 +# +# vastout2col.sh - Convert VAST .gibbs output format to 2 column +# format as output by tsrchd_sparse etc. 
which can +# be processed with tsevalfn.py etc. +# +# Usage: vastout2col.sh < domain.gibbs +# +# Output has two columns, database id and VAST Pcli score +# +# Output is to stdout. +# +# Uses the output format from VAST (Gibrat et al 1996; Madej et al 1995), +# available from +# http://migale.jouy.inra.fr/outils/mig/vast/ +# +# $Id: vastout2col.py 3603 2010-05-04 04:47:51Z alexs $ +# + +import os,sys +from itertools import groupby + +value_header = False +dbid = None + +scorelist = [] # list of (targetpdbid,Pcli) tuples + +for line in sys.stdin: + splitline = line.split() + if len(splitline) > 1 and splitline[1] == 'Nclique=': + dbid = splitline[0] + value_header = False + elif splitline[0] == 'Nres' and splitline[6] == 'Pcli': + value_header = True + elif value_header: + Pcli = splitline[6] + scorelist.append((dbid, Pcli)) + value_header = False + +# for reasons I don't entirely understand +# there are sometimes two or more entries for the same target +# with differing Pcli and other values. We will also choose the one +# with highest Pcli +single_scorelist = [] +targetpdbid_group_iter = groupby(sorted(scorelist), lambda t : t[0]) +for (targetpdbid, targetpdbid_iter) in targetpdbid_group_iter: + maxPcli = max([Pcli for (pdbid,Pcli) in targetpdbid_iter]) + single_scorelist.append((targetpdbid, maxPcli)) + + +for (targetpdbid, Pcli) in single_scorelist: + sys.stdout.write('%s %s\n' % (targetpdbid, Pcli)) + diff --git a/scripts/yakusaout2col.sh b/scripts/yakusaout2col.sh new file mode 100755 index 0000000..c0ff4ea --- /dev/null +++ b/scripts/yakusaout2col.sh @@ -0,0 +1,55 @@ +#!/bin/sh +# +# File: yakusaout2col.sh +# Author: Alex Stivala +# Created: March 2010 +# +# yakusaout2col.sh - Convert yakusa default output format to same +# format as output by tsrchd_sparse etc. which can +# be processed with tsevalfn.py etc. +# +# Usage: yakusaout2col.sh [-q] < yakusaoutput.yakusa.out +# +# -q : add QUERY ID = line +# +# Output has two columns, database id and SHEBA m-score. +# The query id is put in a comment line at top of file, it is assumed +# to be the same in every line of yakusa -A output since that mode +# runs one query against a db. +# +# Output is to stdout. +# +# Uses the output format from YAKUSA, see documentation at +# http://bioserv.rpbs.jussieu.fr/Yakusa/download/README_yakusa +# for more information +# E.g: +# +# Protein rank: 1 score: 118.48 Z-score: 24.29 name: d1u6ra1 : 0000 SCOP/ASTRAL domain d1u6ra1 [11960CHAIN A +# +# $Id: yakusaout2col.sh 3397 2010-03-05 04:28:21Z alexs $ +# +# Uses GNU head options (-n -1) + +outputqueryid=0 +if [ $# -gt 1 ]; then + echo "usage: $0 [-q] < yakusaoutput" >&2 + exit 1 +elif [ $# -eq 1 ]; then + if [ $1 = "-q" ]; then + outputqueryid=1 + else + echo "usage: $0 [-q] < yakusaoutput" >&2 + exit 1 + fi +fi + +awk -v outputqueryid=$outputqueryid ' + /^Protein rank:/ { score = $7; + if (score == "inf") score = 99999; + print $9, score; + } + /^Description query :/ { if (outputqueryid == 1) + printf("# QUERY ID = %s\n", $7); + } + /^Query: / { printf("# %s\n", $0);} + /^Database: / {printf("# %s\n", $0)}'
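+#
+# For the example "Protein rank:" line shown in the header comment above,
+# the emitted row would be:
+#   d1u6ra1 24.29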