Skip to content

Commit

Permalink
update from 2011 PhD archive
Browse files Browse the repository at this point in the history
  • Loading branch information
stivalaa committed Aug 26, 2021
1 parent db9346d commit 71e94f8
Show file tree
Hide file tree
Showing 20 changed files with 752 additions and 60 deletions.
16 changes: 11 additions & 5 deletions scripts/fakepdb_to_cops.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@
# Author: Alex Stivala
# Created: April 2010
#
# $Id: fakepdb_to_cops.py 3686 2010-05-17 07:31:00Z alexs $
# $Id: fakepdb_to_cops.py 3909 2010-07-11 04:45:25Z alexs $
"""
fakepdb_to_cops.py - Convert fake PDB identifiers back to COPS identifiers
Usage: fakepdb_to_cops.py < DaliteLite-2col-output
Usage: fakepdb_to_cops.py [fakepdbids_filename] < DaliteLite-2col-output
The input file is 2 column from dalilitout2col.py
on stdin.
Output is to stdout.
If the fakepdbids_filename is supplied, the translation table is read
from it, otherwise the COPS file is used.
Note that DaliLite ONLY allows 4 char PDB codes, with chain appended, so
SCOP or COPS type codes will not work in cases where there are more than
Expand Down Expand Up @@ -68,13 +70,17 @@ def main():
"""
main for fakepdb_to_cops.py - see usage message at file header
"""
if len(sys.argv) != 1:
if len(sys.argv) == 2:
fakeids_filename = sys.argv[1]
elif len(sys.argv) == 1:
fakeids_filename = COPS_FAKEPDBIDS_FILE
else:
usage(os.path.basename(sys.argv[0]))

FAKE_TO_COPS_DICT = parse_fakepdbids_file(COPS_FAKEPDBIDS_FILE)[0]
FAKE_TO_COPS_DICT = parse_fakepdbids_file(fakeids_filename)[0]

for (score, dbid) in iter_searchresult(sys.stdin,multiquery=False):
sys.stdout.write("%s %f\n" % (FAKE_TO_COPS_DICT[dbid], score))
sys.stdout.write("%s %f\n" % (FAKE_TO_COPS_DICT[dbid[:4]], score))


if __name__ == "__main__":
Expand Down
214 changes: 214 additions & 0 deletions scripts/fitgumbeldist.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#############################################################################3
#
# File: fitgumbeldist.r
# Author: Alex Stivala
# Created: July 2010
#
# fitgumbeldist.r - compute Gumbel dist. parameters and build plots
#
# R script for fitting Gumbel distribution (finding parameters location
# and scale) and plotting histograms and fit over them.
# Also, compute the z-score and p-values using this, then
# compute coverage-vs-EPQ graph as used in Ortiz et al 2002 (and earlier
# Levitt and Gerstein (1998)) to compare the analytic and observed p-values
#
#
# Uses commandArgs() R function to get trailing arguments from R command
# line ie after the --args option. The filename of the .slrtab file
# (score and label)
# is obtained from --args.
#
# label is used to separate same fold (1) and different fold (0) scores,
# different distributions fitted to each
#
# outputs stuff on stdout (including the parameter values) and
# creates PostScript file with histograms and curves, with filename
# derived from input filename: basename_gumbel.eps
#
# R --vanilla -f fitgumbeldist.r --args query200.slrtab
#
# creates query200_gumbel.eps
# and query200_epq_gumbel.eps
#
# Uses the evir R package for gumbel fitting and the evd package for dgumbel()
#
# $Id: fitgumbeldist.r 3955 2010-07-21 05:52:47Z alexs $
#############################################################################3

library(evir)
library(evd)


#############################################################################3
#
# constants
#
#############################################################################3

# Plotting style cycles: entry i of each vector styles curve/series i,
# so histogram, fitted curve and legend entries stay visually matched.
colorvec=c('deepskyblue4','brown','red','turquoise','blue','purple','green','cyan','gray20','magenta','darkolivegreen2','midnightblue','magenta3','darkseagreen','violetred3','darkslategray3')
ltyvec=c(1,2,4,5,6,1,2,1,5,6,1,2,4,5,6,1,2)
pchvec=c(20,21,22,23,24,25,19,21,22)

# Euler-Mascheroni constant; used to convert Gumbel (location, scale)
# parameters into (mean, sd) for Z-score computation in z_gumbel().
eulergamma <- 0.5772156649015328606

#############################################################################3
#
# functions
#
#############################################################################3


#
# z_gumbel() - compute Z-score from Gumbel distribution
#
# Parameters:
#    x - score (scalar or vector) to compute the Z-score for
#    gum - fit object from evir::gumbel(); par.ests[1] is the scale b,
#          par.ests[2] the location a (same ordering used in the cat()
#          output below in this script)
#
# Return value:
#    Z-score computed for x according to the gum distribution
#
z_gumbel <- function(x, gum)
{
  # BUG FIX: the original body read the global gum0 instead of the gum
  # parameter, so the argument was silently ignored (a call with gum1
  # would still have used gum0's parameters).
  eulergamma <- 0.5772156649015328606  # Euler-Mascheroni constant
  b <- gum$par.ests[1]                 # scale
  a <- gum$par.ests[2]                 # location
  # Gumbel mean and standard deviation in terms of (a, b)
  mu <- a + b * eulergamma
  sigma <- (pi / sqrt(6)) * b
  (x - mu) / sigma
}

#
# pv_gumbel() - compute P-value for Z-score from Gumbel distribution
#
# Parameters:
#    z - Z-score from z_gumbel()
#
# Return value:
#    P-value for the Z-score, i.e. the Gumbel upper-tail probability
#    1 - exp(-exp(-(pi/sqrt(6) * z + eulergamma)))
#
# Note: uses the file-level eulergamma constant.
#
pv_gumbel <- function(z)
{
  # Map the Z-score back onto the standard Gumbel scale, then take the
  # complement of the Gumbel CDF to get the upper-tail probability.
  gumbel_arg <- (pi / sqrt(6)) * z + eulergamma
  1 - exp(-exp(-gumbel_arg))
}

#############################################################################3
#
# main
#
#############################################################################3


# Input .slrtab filename (table with 'score' and 'label' columns) is the
# trailing command-line argument after --args.
filename <- commandArgs(trailingOnly=TRUE)

# EPS suitable for inserting into LaTeX; output name is the input name
# with .slrtab replaced by _gumbel.eps
postscript(sub('[.]slrtab$','_gumbel.eps',filename),
onefile=FALSE,paper="special",horizontal=FALSE,
width = 9, height = 6)

slrtab <- read.table(filename, header=T)
slrtab <- subset(slrtab, score >= 0) # one or two -ve scores sometimes

#
# Fit Gumbel distribution to same fold scores and different fold scores
#

# gum0: scores for different-fold pairs (label == 0, the null scores)
# gum1: scores for same-fold pairs (label == 1)
gum0 <- gumbel(subset(slrtab, label == 0)$score)
gum1 <- gumbel(subset(slrtab, label == 1)$score)

# par.ests[2] is treated as the location a and par.ests[1] as the scale b
# (ordering as returned by evir::gumbel — confirm if evir version changes)
cat('different folds: a = ', format(gum0$par.ests[2], digits=16), ' ' )
cat('b = ', format(gum0$par.ests[1], digits=16), '\n')
cat('same fold: a = ', format(gum1$par.ests[2], digits=16), ' ' )
cat('b = ', format(gum1$par.ests[1], digits=16), '\n')

#
# Plot histograms and Gumbel distributions fitted to them
#

# Compute density histograms without drawing them (plot=F)
h0 <- hist(subset(slrtab, label == 0)$score, breaks=20, plot=F)
h1 <- hist(subset(slrtab, label == 1)$score, breaks=20, plot=F)
# Pad x with the first break and y with zeros at both ends so the
# type='s' step curves rise from and return to zero height.
xh0 <- c(min(h0$breaks), h0$breaks)
yh0 <- c(0, h0$density, 0)
xh1 <- c(min(h1$breaks), h1$breaks)
yh1 <- c(0, h1$density, 0)

# Step-curve histograms for both classes on one set of axes
plot(xh0, yh0, type='s', lty=ltyvec[1], col=colorvec[1], xlab='score',ylab='frequency', ylim=c(0,1.0)) # ylim=c(0, max(yh0, yh1)) )
lines(xh1, yh1, type='s', lty=ltyvec[2], col=colorvec[2])

# Overlay the fitted Gumbel densities; evd::dgumbel(x, location, scale),
# with location = par.ests[2] and scale = par.ests[1] as above
curve(dgumbel(x, gum0$par.ests[2], gum0$par.ests[1]), col=colorvec[3], lty=ltyvec[3], add=T)
curve(dgumbel(x, gum1$par.ests[2], gum1$par.ests[1]), col=colorvec[4], lty=ltyvec[4], add=T)

legend("topright", lty=ltyvec, col=colorvec, bty='n',
legend=c("Different folds, histogram",
"Same fold, histogram",
"Different folds, fitted Gumbel distribution",
"Same fold, fitted Gumbel distribution") )


dev.off()

#
# compute P-values for all scores
#

# P-values are computed against the different-fold (null) distribution
# gum0 for every score, regardless of label.
pvslrtab <- slrtab
pvslrtab$pvalue <- pv_gumbel(z_gumbel(slrtab$score, gum0))

#
# compute coverage-vs-EPQ graph as used in Ortiz et al 2002 (and earlier
# Levitt and Gerstein (1998)) to compare the analytic and observed p-values
#

# sort by ascending P-value so 'best' are at start of list
pvslrtab <- pvslrtab[sort.list(pvslrtab$pvalue),]

# Walk the P-value-sorted table accumulating true/false positive counts,
# recording (coverage, observed p-value, analytic p-value) at every rank
# until the observed error rate exceeds 5%.
n <- length(pvslrtab$label)
tptotal <- length(subset(pvslrtab, label == 1)$label)  # total same-fold rows
fptotal <- n - tptotal                                 # total different-fold rows
tpcount <- 0
fpcount <- 0

# Preallocate to the maximum possible length instead of growing the
# vectors with c() inside the loop (which is O(n^2)); they are truncated
# to the number of ranks actually visited after the loop.
coverage_vec <- numeric(n)
obs_pv_vec <- numeric(n)
analytic_pv_vec <- numeric(n)
nkept <- 0

# seq_len() rather than 1:n so an empty table yields an empty loop
for (i in seq_len(n))
{
  if (pvslrtab$label[i] == 0)
  {
    fpcount <- fpcount + 1
  }
  else
  {
    tpcount <- tpcount + 1
  }
  # observed p-value: fraction of all negatives ranked at or above here
  observed_pvalue <- fpcount / fptotal
  # coverage: fraction of all positives recovered so far
  coverage <- tpcount / tptotal

  nkept <- nkept + 1
  coverage_vec[nkept] <- coverage
  obs_pv_vec[nkept] <- observed_pvalue
  analytic_pv_vec[nkept] <- pvslrtab$pvalue[i]

  #cat (sprintf("%d\t%f\t%f\t%f\n", i, pvslrtab$pvalue[i], observed_pvalue, coverage) )

  # stop once the observed error rate exceeds 5%, as plotted below
  if (observed_pvalue > 0.05)
  {
    break
  }
}

# Drop the unused preallocated tail
coverage_vec <- coverage_vec[seq_len(nkept)]
obs_pv_vec <- obs_pv_vec[seq_len(nkept)]
analytic_pv_vec <- analytic_pv_vec[seq_len(nkept)]

# EPS suitable for inserting into LaTeX; output name is the input name
# with .slrtab replaced by _epq_gumbel.eps
postscript(sub('[.]slrtab$','_epq_gumbel.eps',filename),
onefile=FALSE,paper="special",horizontal=FALSE,
width = 9, height = 6)

# Set up empty axes (type='n') spanning the analytic curve, both axes
# expressed as percentages
plot(analytic_pv_vec*100, coverage_vec*100, type='n', xlab='Errors per query (%)',
ylab='Coverage (%)')

# Analytic (fitted Gumbel) vs observed error-per-query curves
lines(analytic_pv_vec*100, coverage_vec*100, col=colorvec[1], lty=ltyvec[1])
lines(obs_pv_vec*100, coverage_vec*100, col=colorvec[2], lty=ltyvec[2])

legend('topleft', col=colorvec, lty=ltyvec, legend=c('Analytic','Observed'),
bty='n')

dev.off()


25 changes: 18 additions & 7 deletions scripts/generate_pml_pdb_for_top_scores.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#
# Usage:
# generate_pml_pdb_for_top_scores.sh [-q] [-e sse_num_list] [-n num_hits]
# [-m] query_domid out_dir
# [-m] [-h] query_domid out_dir
#
# -q: do not use the ordering constraint (allow nonsequential matchings)
#
Expand All @@ -35,6 +35,8 @@
# -m : also generate multiple alignment of structures with MUSTANG
# (may take a long time if many structures)
#
# -h: use fast heuristic (simulated annealing) version instead of QP
#
# query_domid is the SCOP sid for the query structure
#
# out_dir is the directory to place output files. It is created
Expand Down Expand Up @@ -65,7 +67,7 @@
# PYTHONPATH must contain the directory containing the ptsecstruct.py
# and other Python modules used by the Python scripts.
#
# $Id: generate_pml_pdb_for_top_scores.sh 2110 2009-03-18 05:58:44Z astivala $
# $Id: generate_pml_pdb_for_top_scores.sh 3733 2010-05-31 02:53:38Z alexs $
#
###############################################################################

Expand All @@ -92,15 +94,17 @@ function count_models() {


# Root of ASTRAL divided PDB style hierarchy
ASTRAL_ROOT=/local/charikar/ASTRAL/pdbstyle-1.73
#ASTRAL_ROOT=/local/charikar/ASTRAL/pdbstyle-1.73
ASTRAL_ROOT=/usr/local/ASTRAL/pdbstyle-1.75

use_ordering=1
sse_num_list=''
sse_num_list_opt=''
num_hits=10
run_mustang=0
use_heuristic=0

while getopts 'sqe:n:m' opt
while getopts 'sqe:n:mh' opt
do
case $opt in
q) use_ordering=0
Expand All @@ -113,8 +117,10 @@ do
;;
m) run_mustang=1
;;
h) use_heuristic=1
;;
?)
echo "Usage: $0 [-q] [-e sse_num_list] [-n num_hits] [-m] query_sid out_dir" >&2
echo "Usage: $0 [-q] [-h] [-e sse_num_list] [-n num_hits] [-m] query_sid out_dir" >&2
exit 1
;;
esac
Expand All @@ -123,7 +129,7 @@ shift $(($OPTIND - 1))


if [ $# -ne 2 ]; then
echo "Usage: $0 [-q] [-e sse_num_list] [-n num_hits] [-m] query_sid out_dir" >&2
echo "Usage: $0 [-q] [-h] [-e sse_num_list] [-n num_hits] [-m] query_sid out_dir" >&2
exit 1
fi

Expand All @@ -142,6 +148,11 @@ extra_opts="${sse_num_list_opt}"
if [ $use_ordering -eq 0 ]; then
extra_opts="${extra_opts} -q"
fi
sort_opts="-k2,2n"
if [ $use_heuristic -ne 0 ]; then
extra_opts="${extra_opts} -h"
sort_opts="-k2,2nr"
fi

# list of structures (whitespace delimited) for MUSTANG
structlist=${ASTRAL_ROOT}/${qdiv}/${qid}.ent
Expand All @@ -151,7 +162,7 @@ loadfile=loadhits_${qid}.pml

echo "# Generated by $0 $*" > $loadfile
echo "# on `date`" >> $loadfile
for dom in `grep -v '^#' | sort -k2,2n | head -${num_hits} | cut -d' ' -f1`
for dom in `grep -v '^#' | sort $sort_opts | head -${num_hits} | cut -d' ' -f1`
do
div=`echo $dom | cut -c3-4`
hit_struct=${ASTRAL_ROOT}/${div}/${dom}.ent
Expand Down
4 changes: 2 additions & 2 deletions scripts/mkfischer3dtab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#
# Uses options specific to GNU sort
#
# $Id: mkfischer3dtab.sh 1961 2008-10-07 06:08:06Z astivala $
# $Id: mkfischer3dtab.sh 4055 2010-08-16 05:05:07Z alexs $
#

if [ $# -ne 2 ]; then
Expand All @@ -37,7 +37,7 @@ do
fi
resdir=`echo ${statsdir} | cut -d/ -f1`
if [ ${resdir} = "results" ]; then
method="QP tableau search"
method="QP Tableau Search"
elif [ ${resdir} = "maxcmo_results" ]; then
method="MSVNS3"
else
Expand Down
4 changes: 2 additions & 2 deletions scripts/mkquery200tab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#
# Uses options specific to GNU sort
#
# $Id: mkquery200tab.sh 2153 2009-03-28 05:20:23Z astivala $
# $Id: mkquery200tab.sh 4052 2010-08-16 04:22:55Z alexs $
#

if [ $# -ne 0 ]; then
Expand Down Expand Up @@ -54,7 +54,7 @@ do
fi
resdir=`echo ${statsfile} | cut -d/ -f1`
if [ ${resdir} = "results" ]; then
method="QP tableau search"
method="QP Tableau Search"
else
method=`echo ${statsfile} | cut -d/ -f2`
if [ "${method}" != "TableauSearch" ]; then
Expand Down
4 changes: 2 additions & 2 deletions scripts/mkquery200timestab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#
# Uses options specific to GNU sort
#
# $Id: mkquery200timestab.sh 3202 2010-01-04 00:26:06Z alexs $
# $Id: mkquery200timestab.sh 3909 2010-07-11 04:45:25Z alexs $
#

if [ $# -ne 0 ]; then
Expand All @@ -31,7 +31,7 @@ EOF

baseline_s=0

for statsfile in ../tango_results_umfpack/query200/norm2/*.stats ../tango_results_pardiso/query200/norm2/*.stats ../tango_results_ma57/query200/norm2/*.stats ../tango_results_umfpack/query200-minlen4/norm2/*.stats ../tango_results/localsearch/query200/norm2/*.stats
for statsfile in ../tango_results_umfpack/query200/norm2/*.stats ../tango_results_pardiso/query200/norm2/*.stats ../tango_results_ma57/query200/norm2/*.stats ../tango_results_umfpack/query200-minlen4/norm2/*.stats #../tango_results/localsearch/query200/norm2/*.stats
do
resdir=`echo ${statsfile} | cut -d/ -f2`
if [ ${resdir} = "tango_results_pardiso" ]; then
Expand Down
Loading

0 comments on commit 71e94f8

Please sign in to comment.