Skip to content

Commit

Permalink
update from 2011 PhD archive
Browse files Browse the repository at this point in the history
  • Loading branch information
stivalaa committed Aug 26, 2021
1 parent db9346d commit 71e94f8
Show file tree
Hide file tree
Showing 20 changed files with 752 additions and 60 deletions.
16 changes: 11 additions & 5 deletions scripts/fakepdb_to_cops.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@
# Author: Alex Stivala
# Created: April 2010
#
# $Id: fakepdb_to_cops.py 3686 2010-05-17 07:31:00Z alexs $
# $Id: fakepdb_to_cops.py 3909 2010-07-11 04:45:25Z alexs $
"""
fakepdb_to_cops.py - Convert fake PDB identifiers back to COPS identifiers
Usage: fakepdb_to_cops.py < DaliteLite-2col-output
Usage: fakepdb_to_cops.py [fakepdbids_filename] < DaliteLite-2col-output
The input file is 2 column from dalilitout2col.py
on stdin.
Output is to stdout.
If the fakepdbids_filename is supplied, the translation table is read
from it, otherwise the COPS file is used.
Note that DaliLite ONLY allows 4 char PDB codes, with chain appended, so
SCOP or COPS type codes will not work in cases where there are more than
Expand Down Expand Up @@ -68,13 +70,17 @@ def main():
"""
main for fakepdb_to_cops.py - see usage message at file header
"""
if len(sys.argv) != 1:
if len(sys.argv) == 2:
fakeids_filename = sys.argv[1]
elif len(sys.argv) == 1:
fakeids_filename = COPS_FAKEPDBIDS_FILE
else:
usage(os.path.basename(sys.argv[0]))

FAKE_TO_COPS_DICT = parse_fakepdbids_file(COPS_FAKEPDBIDS_FILE)[0]
FAKE_TO_COPS_DICT = parse_fakepdbids_file(fakeids_filename)[0]

for (score, dbid) in iter_searchresult(sys.stdin,multiquery=False):
sys.stdout.write("%s %f\n" % (FAKE_TO_COPS_DICT[dbid], score))
sys.stdout.write("%s %f\n" % (FAKE_TO_COPS_DICT[dbid[:4]], score))


if __name__ == "__main__":
Expand Down
214 changes: 214 additions & 0 deletions scripts/fitgumbeldist.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#############################################################################3
#
# File: fitgumbeldist.r
# Author: Alex Stivala
# Created: July 2010
#
# fitgumbeldist.r - compute Gumbel dist. parameters and build plots
#
# R script for fitting Gumbel distribution (finding parameters location
# and scale) and plotting histograms and fit over them.
# Also, compute the z-score and p-values using this, then
# compute coverage-vs-EPQ graph as used in Ortiz et al 2002 (and earlier
# Levitt and Gerstein (1998)) to compare the analytic and observed p-values
#
#
# Uses commandArgs() R function to get trailing arguments from R command
# line ie after the --args option. The filename of the .slrtab file
# (score and label)
# is obtained from --args.
#
# label is used to separate same fold (1) and different fold (0) scores,
# different distributions fitted to each
#
# outputs stuff on stdout (including the parameter values) and
# creates PostScript file with histograms and curves, with filename
# derived from input filename: basename_gumbel.eps
#
# R --vanilla -f fitgumbeldist.r --args query200.slrtab
#
# creates query200_gumbel.eps
# and query200_epq_gumbel.eps
#
# Uses the evir R package for gumbel fitting and the evd package for dgumbel()
#
# $Id: fitgumbeldist.r 3955 2010-07-21 05:52:47Z alexs $
#############################################################################3

library(evir)
library(evd)


#############################################################################3
#
# constants
#
#############################################################################3

# Plotting style cycles: entry i of each vector styles curve/series i,
# so histogram, fitted curve and legend entries stay visually matched.
colorvec=c('deepskyblue4','brown','red','turquoise','blue','purple','green','cyan','gray20','magenta','darkolivegreen2','midnightblue','magenta3','darkseagreen','violetred3','darkslategray3')
ltyvec=c(1,2,4,5,6,1,2,1,5,6,1,2,4,5,6,1,2)
pchvec=c(20,21,22,23,24,25,19,21,22)

# Euler-Mascheroni constant; used to convert Gumbel (location, scale)
# parameters into (mean, sd) for Z-score computation in z_gumbel().
eulergamma <- 0.5772156649015328606

#############################################################################3
#
# functions
#
#############################################################################3


#
# z_gumbel() - compute Z-score from Gumbel distribution
#
# Parameters:
#    x - score (scalar or vector) to compute the Z-score for
#    gum - fit object from evir::gumbel(); par.ests[1] is the scale b,
#          par.ests[2] the location a (same ordering used in the cat()
#          output below in this script)
#
# Return value:
#    Z-score computed for x according to the gum distribution
#
z_gumbel <- function(x, gum)
{
  # BUG FIX: the original body read the global gum0 instead of the gum
  # parameter, so the argument was silently ignored (a call with gum1
  # would still have used gum0's parameters).
  eulergamma <- 0.5772156649015328606  # Euler-Mascheroni constant
  b <- gum$par.ests[1]                 # scale
  a <- gum$par.ests[2]                 # location
  # Gumbel mean and standard deviation in terms of (a, b)
  mu <- a + b * eulergamma
  sigma <- (pi / sqrt(6)) * b
  (x - mu) / sigma
}

#
# pv_gumbel() - compute P-value for Z-score from Gumbel distribution
#
# Parameters:
#    z - Z-score from z_gumbel()
#
# Return value:
#    P-value for the Z-score, i.e. the Gumbel upper-tail probability
#    1 - exp(-exp(-(pi/sqrt(6) * z + eulergamma)))
#
# Note: uses the file-level eulergamma constant.
#
pv_gumbel <- function(z)
{
  # Map the Z-score back onto the standard Gumbel scale, then take the
  # complement of the Gumbel CDF to get the upper-tail probability.
  gumbel_arg <- (pi / sqrt(6)) * z + eulergamma
  1 - exp(-exp(-gumbel_arg))
}

#############################################################################3
#
# main
#
#############################################################################3


# Input .slrtab filename (table with 'score' and 'label' columns) is the
# trailing command-line argument after --args.
filename <- commandArgs(trailingOnly=TRUE)

# EPS suitable for inserting into LaTeX; output name is the input name
# with .slrtab replaced by _gumbel.eps
postscript(sub('[.]slrtab$','_gumbel.eps',filename),
onefile=FALSE,paper="special",horizontal=FALSE,
width = 9, height = 6)

slrtab <- read.table(filename, header=T)
slrtab <- subset(slrtab, score >= 0) # one or two -ve scores sometimes

#
# Fit Gumbel distribution to same fold scores and different fold scores
#

# gum0: scores for different-fold pairs (label == 0, the null scores)
# gum1: scores for same-fold pairs (label == 1)
gum0 <- gumbel(subset(slrtab, label == 0)$score)
gum1 <- gumbel(subset(slrtab, label == 1)$score)

# par.ests[2] is treated as the location a and par.ests[1] as the scale b
# (ordering as returned by evir::gumbel — confirm if evir version changes)
cat('different folds: a = ', format(gum0$par.ests[2], digits=16), ' ' )
cat('b = ', format(gum0$par.ests[1], digits=16), '\n')
cat('same fold: a = ', format(gum1$par.ests[2], digits=16), ' ' )
cat('b = ', format(gum1$par.ests[1], digits=16), '\n')

#
# Plot histograms and Gumbel distributions fitted to them
#

# Compute density histograms without drawing them (plot=F)
h0 <- hist(subset(slrtab, label == 0)$score, breaks=20, plot=F)
h1 <- hist(subset(slrtab, label == 1)$score, breaks=20, plot=F)
# Pad x with the first break and y with zeros at both ends so the
# type='s' step curves rise from and return to zero height.
xh0 <- c(min(h0$breaks), h0$breaks)
yh0 <- c(0, h0$density, 0)
xh1 <- c(min(h1$breaks), h1$breaks)
yh1 <- c(0, h1$density, 0)

# Step-curve histograms for both classes on one set of axes
plot(xh0, yh0, type='s', lty=ltyvec[1], col=colorvec[1], xlab='score',ylab='frequency', ylim=c(0,1.0)) # ylim=c(0, max(yh0, yh1)) )
lines(xh1, yh1, type='s', lty=ltyvec[2], col=colorvec[2])

# Overlay the fitted Gumbel densities; evd::dgumbel(x, location, scale),
# with location = par.ests[2] and scale = par.ests[1] as above
curve(dgumbel(x, gum0$par.ests[2], gum0$par.ests[1]), col=colorvec[3], lty=ltyvec[3], add=T)
curve(dgumbel(x, gum1$par.ests[2], gum1$par.ests[1]), col=colorvec[4], lty=ltyvec[4], add=T)

legend("topright", lty=ltyvec, col=colorvec, bty='n',
legend=c("Different folds, histogram",
"Same fold, histogram",
"Different folds, fitted Gumbel distribution",
"Same fold, fitted Gumbel distribution") )


dev.off()

#
# compute P-values for all scores
#

# P-values are computed against the different-fold (null) distribution
# gum0 for every score, regardless of label.
pvslrtab <- slrtab
pvslrtab$pvalue <- pv_gumbel(z_gumbel(slrtab$score, gum0))

#
# compute coverage-vs-EPQ graph as used in Ortiz et al 2002 (and earlier
# Levitt and Gerstein (1998)) to compare the analytic and observed p-values
#

# sort by ascending P-value so 'best' are at start of list
pvslrtab <- pvslrtab[sort.list(pvslrtab$pvalue),]

# Walk the P-value-sorted table accumulating true/false positive counts,
# recording (coverage, observed p-value, analytic p-value) at every rank
# until the observed error rate exceeds 5%.
n <- length(pvslrtab$label)
tptotal <- length(subset(pvslrtab, label == 1)$label)  # total same-fold rows
fptotal <- n - tptotal                                 # total different-fold rows
tpcount <- 0
fpcount <- 0

# Preallocate to the maximum possible length instead of growing the
# vectors with c() inside the loop (which is O(n^2)); they are truncated
# to the number of ranks actually visited after the loop.
coverage_vec <- numeric(n)
obs_pv_vec <- numeric(n)
analytic_pv_vec <- numeric(n)
nkept <- 0

# seq_len() rather than 1:n so an empty table yields an empty loop
for (i in seq_len(n))
{
  if (pvslrtab$label[i] == 0)
  {
    fpcount <- fpcount + 1
  }
  else
  {
    tpcount <- tpcount + 1
  }
  # observed p-value: fraction of all negatives ranked at or above here
  observed_pvalue <- fpcount / fptotal
  # coverage: fraction of all positives recovered so far
  coverage <- tpcount / tptotal

  nkept <- nkept + 1
  coverage_vec[nkept] <- coverage
  obs_pv_vec[nkept] <- observed_pvalue
  analytic_pv_vec[nkept] <- pvslrtab$pvalue[i]

  #cat (sprintf("%d\t%f\t%f\t%f\n", i, pvslrtab$pvalue[i], observed_pvalue, coverage) )

  # stop once the observed error rate exceeds 5%, as plotted below
  if (observed_pvalue > 0.05)
  {
    break
  }
}

# Drop the unused preallocated tail
coverage_vec <- coverage_vec[seq_len(nkept)]
obs_pv_vec <- obs_pv_vec[seq_len(nkept)]
analytic_pv_vec <- analytic_pv_vec[seq_len(nkept)]

# EPS suitable for inserting into LaTeX; output name is the input name
# with .slrtab replaced by _epq_gumbel.eps
postscript(sub('[.]slrtab$','_epq_gumbel.eps',filename),
onefile=FALSE,paper="special",horizontal=FALSE,
width = 9, height = 6)

# Set up empty axes (type='n') spanning the analytic curve, both axes
# expressed as percentages
plot(analytic_pv_vec*100, coverage_vec*100, type='n', xlab='Errors per query (%)',
ylab='Coverage (%)')

# Analytic (fitted Gumbel) vs observed error-per-query curves
lines(analytic_pv_vec*100, coverage_vec*100, col=colorvec[1], lty=ltyvec[1])
lines(obs_pv_vec*100, coverage_vec*100, col=colorvec[2], lty=ltyvec[2])

legend('topleft', col=colorvec, lty=ltyvec, legend=c('Analytic','Observed'),
bty='n')

dev.off()


25 changes: 18 additions & 7 deletions scripts/generate_pml_pdb_for_top_scores.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#
# Usage:
# generate_pml_pdb_for_top_scores.sh [-q] [-e sse_num_list] [-n num_hits]
# [-m] query_domid out_dir
# [-m] [-h] query_domid out_dir
#
# -q: do not use the ordering constraint (allow nonsequential matchings)
#
Expand All @@ -35,6 +35,8 @@
# -m : also generate multiple alignment of structures with MUSTANG
# (may take a long time if many structures)
#
# -h: use fast heuristic (simulated annealing) version instead of QP
#
# query_domid is the SCOP sid for the query structure
#
# out_dir is the directory to place output files. It is created
Expand Down Expand Up @@ -65,7 +67,7 @@
# PYTHONPATH must contain the directory containing the ptsecstruct.py
# and other Python modules used by the Python scripts.
#
# $Id: generate_pml_pdb_for_top_scores.sh 2110 2009-03-18 05:58:44Z astivala $
# $Id: generate_pml_pdb_for_top_scores.sh 3733 2010-05-31 02:53:38Z alexs $
#
###############################################################################

Expand All @@ -92,15 +94,17 @@ function count_models() {


# Root of ASTRAL divided PDB style hierarchy
ASTRAL_ROOT=/local/charikar/ASTRAL/pdbstyle-1.73
#ASTRAL_ROOT=/local/charikar/ASTRAL/pdbstyle-1.73
ASTRAL_ROOT=/usr/local/ASTRAL/pdbstyle-1.75

use_ordering=1
sse_num_list=''
sse_num_list_opt=''
num_hits=10
run_mustang=0
use_heuristic=0

while getopts 'sqe:n:m' opt
while getopts 'sqe:n:mh' opt
do
case $opt in
q) use_ordering=0
Expand All @@ -113,8 +117,10 @@ do
;;
m) run_mustang=1
;;
h) use_heuristic=1
;;
?)
echo "Usage: $0 [-q] [-e sse_num_list] [-n num_hits] [-m] query_sid out_dir" >&2
echo "Usage: $0 [-q] [-h] [-e sse_num_list] [-n num_hits] [-m] query_sid out_dir" >&2
exit 1
;;
esac
Expand All @@ -123,7 +129,7 @@ shift $(($OPTIND - 1))


if [ $# -ne 2 ]; then
echo "Usage: $0 [-q] [-e sse_num_list] [-n num_hits] [-m] query_sid out_dir" >&2
echo "Usage: $0 [-q] [-h] [-e sse_num_list] [-n num_hits] [-m] query_sid out_dir" >&2
exit 1
fi

Expand All @@ -142,6 +148,11 @@ extra_opts="${sse_num_list_opt}"
if [ $use_ordering -eq 0 ]; then
extra_opts="${extra_opts} -q"
fi
sort_opts="-k2,2n"
if [ $use_heuristic -ne 0 ]; then
extra_opts="${extra_opts} -h"
sort_opts="-k2,2nr"
fi

# list of structures (whitespace delimited) for MUSTANG
structlist=${ASTRAL_ROOT}/${qdiv}/${qid}.ent
Expand All @@ -151,7 +162,7 @@ loadfile=loadhits_${qid}.pml

echo "# Generated by $0 $*" > $loadfile
echo "# on `date`" >> $loadfile
for dom in `grep -v '^#' | sort -k2,2n | head -${num_hits} | cut -d' ' -f1`
for dom in `grep -v '^#' | sort $sort_opts | head -${num_hits} | cut -d' ' -f1`
do
div=`echo $dom | cut -c3-4`
hit_struct=${ASTRAL_ROOT}/${div}/${dom}.ent
Expand Down
4 changes: 2 additions & 2 deletions scripts/mkfischer3dtab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#
# Uses options specific to GNU sort
#
# $Id: mkfischer3dtab.sh 1961 2008-10-07 06:08:06Z astivala $
# $Id: mkfischer3dtab.sh 4055 2010-08-16 05:05:07Z alexs $
#

if [ $# -ne 2 ]; then
Expand All @@ -37,7 +37,7 @@ do
fi
resdir=`echo ${statsdir} | cut -d/ -f1`
if [ ${resdir} = "results" ]; then
method="QP tableau search"
method="QP Tableau Search"
elif [ ${resdir} = "maxcmo_results" ]; then
method="MSVNS3"
else
Expand Down
4 changes: 2 additions & 2 deletions scripts/mkquery200tab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#
# Uses options specific to GNU sort
#
# $Id: mkquery200tab.sh 2153 2009-03-28 05:20:23Z astivala $
# $Id: mkquery200tab.sh 4052 2010-08-16 04:22:55Z alexs $
#

if [ $# -ne 0 ]; then
Expand Down Expand Up @@ -54,7 +54,7 @@ do
fi
resdir=`echo ${statsfile} | cut -d/ -f1`
if [ ${resdir} = "results" ]; then
method="QP tableau search"
method="QP Tableau Search"
else
method=`echo ${statsfile} | cut -d/ -f2`
if [ "${method}" != "TableauSearch" ]; then
Expand Down
4 changes: 2 additions & 2 deletions scripts/mkquery200timestab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#
# Uses options specific to GNU sort
#
# $Id: mkquery200timestab.sh 3202 2010-01-04 00:26:06Z alexs $
# $Id: mkquery200timestab.sh 3909 2010-07-11 04:45:25Z alexs $
#

if [ $# -ne 0 ]; then
Expand All @@ -31,7 +31,7 @@ EOF

baseline_s=0

for statsfile in ../tango_results_umfpack/query200/norm2/*.stats ../tango_results_pardiso/query200/norm2/*.stats ../tango_results_ma57/query200/norm2/*.stats ../tango_results_umfpack/query200-minlen4/norm2/*.stats ../tango_results/localsearch/query200/norm2/*.stats
for statsfile in ../tango_results_umfpack/query200/norm2/*.stats ../tango_results_pardiso/query200/norm2/*.stats ../tango_results_ma57/query200/norm2/*.stats ../tango_results_umfpack/query200-minlen4/norm2/*.stats #../tango_results/localsearch/query200/norm2/*.stats
do
resdir=`echo ${statsfile} | cut -d/ -f2`
if [ ${resdir} = "tango_results_pardiso" ]; then
Expand Down
Loading

0 comments on commit 71e94f8

Please sign in to comment.