-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #36 from kundajelab/signflip_thresholding
Improved Null Distributions
- Loading branch information
Showing
22 changed files
with
6,120 additions
and
7,520 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
**/*.gz | ||
**/*.txt | ||
**/*.fa | ||
lsgkm |
289 changes: 289 additions & 0 deletions
289
examples/H1ESC_Nanog_gkmsvm/Generate Importance Scores.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,289 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## This notebook demonstrates how to get importance scores for use with TF-MoDISco using GkmExplain\n", | ||
"\n", | ||
"It relies on a gkm-SVM model pretrained on Nanog H1ESC ChIP-seq" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Cloning into 'lsgkm'...\n", | ||
"remote: Enumerating objects: 4, done.\u001b[K\n", | ||
"remote: Counting objects: 100% (4/4), done.\u001b[K\n", | ||
"remote: Compressing objects: 100% (4/4), done.\u001b[K\n", | ||
"remote: Total 293 (delta 0), reused 1 (delta 0), pack-reused 289\u001b[K\n", | ||
"Receiving objects: 100% (293/293), 489.52 KiB | 0 bytes/s, done.\n", | ||
"Resolving deltas: 100% (196/196), done.\n", | ||
"Checking connectivity... done.\n", | ||
"/Users/avantishrikumar/Research/modisco/examples/H1ESC_Nanog_gkmsvm/lsgkm/src\n", | ||
"c++ -Wall -Wconversion -O3 -fPIC -c libsvm.cpp\n", | ||
"c++ -Wall -Wconversion -O3 -fPIC -c libsvm_gkm.c\n", | ||
"c++ -Wall -Wconversion -O3 -fPIC gkmtrain.c libsvm.o libsvm_gkm.o -o gkmtrain -lm -lpthread\n", | ||
"c++ -Wall -Wconversion -O3 -fPIC gkmpredict.c libsvm.o libsvm_gkm.o -o gkmpredict -lm -lpthread\n", | ||
"c++ -Wall -Wconversion -O3 -fPIC gkmexplain.c libsvm.o libsvm_gkm.o -o gkmexplain -lm -lpthread\n", | ||
"/Users/avantishrikumar/Research/modisco/examples/H1ESC_Nanog_gkmsvm\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from __future__ import print_function, division\n", | ||
"%matplotlib inline\n", | ||
"\n", | ||
"try:\n", | ||
" reload # Python 2.7\n", | ||
"except NameError:\n", | ||
" try:\n", | ||
" from importlib import reload # Python 3.4+\n", | ||
" except ImportError:\n", | ||
" from imp import reload # Python 3.0 - 3.3\n", | ||
" \n", | ||
"#install lsgkm from the kundajelab repo\n", | ||
"!rm -rf lsgkm\n", | ||
"! git clone https://github.com/kundajelab/lsgkm\n", | ||
"% cd lsgkm/src\n", | ||
"! make\n", | ||
"%cd ../.." | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Grab the data and pretrained model" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"--2019-02-11 15:05:27-- https://raw.githubusercontent.com/AvantiShri/model_storage/2e603c/modisco/gkmexplain_scores/positives_test.fa.gz\n", | ||
"Resolving raw.githubusercontent.com... 151.101.188.133\n", | ||
"Connecting to raw.githubusercontent.com|151.101.188.133|:443... connected.\n", | ||
"HTTP request sent, awaiting response... 200 OK\n", | ||
"Length: 75038 (73K) [application/octet-stream]\n", | ||
"Saving to: 'positives_test.fa.gz'\n", | ||
"\n", | ||
"100%[======================================>] 75,038 --.-K/s in 0.01s \n", | ||
"\n", | ||
"2019-02-11 15:05:27 (5.50 MB/s) - 'positives_test.fa.gz' saved [75038/75038]\n", | ||
"\n", | ||
"--2019-02-11 15:05:27-- https://raw.githubusercontent.com/AvantiShri/model_storage/2e603c/modisco/gkmexplain_scores/lsgkm_defaultsettings_t2.model.txt.gz\n", | ||
"Resolving raw.githubusercontent.com... 151.101.188.133\n", | ||
"Connecting to raw.githubusercontent.com|151.101.188.133|:443... connected.\n", | ||
"HTTP request sent, awaiting response... 200 OK\n", | ||
"Length: 655201 (640K) [application/octet-stream]\n", | ||
"Saving to: 'lsgkm_defaultsettings_t2.model.txt.gz'\n", | ||
"\n", | ||
"100%[======================================>] 655,201 --.-K/s in 0.1s \n", | ||
"\n", | ||
"2019-02-11 15:05:27 (5.37 MB/s) - 'lsgkm_defaultsettings_t2.model.txt.gz' saved [655201/655201]\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"#Download things and gunzip them\n", | ||
"!wget https://raw.githubusercontent.com/AvantiShri/model_storage/2e603c/modisco/gkmexplain_scores/positives_test.fa.gz -O positives_test.fa.gz \n", | ||
"!gunzip positives_test.fa.gz\n", | ||
"!wget https://raw.githubusercontent.com/AvantiShri/model_storage/2e603c/modisco/gkmexplain_scores/lsgkm_defaultsettings_t2.model.txt.gz -O lsgkm_defaultsettings_t2.model.txt.gz \n", | ||
"!gunzip lsgkm_defaultsettings_t2.model.txt.gz" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Compute the gkmexplain importance scores on positives" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"INFO 2019-02-11 15:05:28: Number of threads is set to 1\n", | ||
"INFO 2019-02-11 15:05:28: load model lsgkm_defaultsettings_t2.model.txt\n", | ||
"INFO 2019-02-11 15:05:28: reading... 1000/8873\n", | ||
"INFO 2019-02-11 15:05:28: reading... 2000/8873\n", | ||
"INFO 2019-02-11 15:05:28: reading... 3000/8873\n", | ||
"INFO 2019-02-11 15:05:28: reading... 4000/8873\n", | ||
"INFO 2019-02-11 15:05:29: reading... 5000/8873\n", | ||
"INFO 2019-02-11 15:05:29: reading... 6000/8873\n", | ||
"INFO 2019-02-11 15:05:29: reading... 7000/8873\n", | ||
"INFO 2019-02-11 15:05:29: reading... 8000/8873\n", | ||
"INFO 2019-02-11 15:05:29: write prediction result to gkmexplain_positives_impscores.txt\n", | ||
"INFO 2019-02-11 15:06:07: 100 scored\n", | ||
"INFO 2019-02-11 15:06:44: 200 scored\n", | ||
"INFO 2019-02-11 15:07:22: 300 scored\n", | ||
"INFO 2019-02-11 15:08:01: 400 scored\n", | ||
"INFO 2019-02-11 15:08:38: 500 scored\n", | ||
"INFO 2019-02-11 15:09:16: 600 scored\n", | ||
"INFO 2019-02-11 15:09:55: 700 scored\n", | ||
"INFO 2019-02-11 15:10:31: 800 scored\n", | ||
"INFO 2019-02-11 15:11:09: 900 scored\n", | ||
"INFO 2019-02-11 15:11:31: 960 scored\n", | ||
"INFO 2019-02-11 15:11:31: Number of threads is set to 1\n", | ||
"INFO 2019-02-11 15:11:31: load model lsgkm_defaultsettings_t2.model.txt\n", | ||
"INFO 2019-02-11 15:11:32: reading... 1000/8873\n", | ||
"INFO 2019-02-11 15:11:32: reading... 2000/8873\n", | ||
"INFO 2019-02-11 15:11:32: reading... 3000/8873\n", | ||
"INFO 2019-02-11 15:11:32: reading... 4000/8873\n", | ||
"INFO 2019-02-11 15:11:32: reading... 5000/8873\n", | ||
"INFO 2019-02-11 15:11:33: reading... 6000/8873\n", | ||
"INFO 2019-02-11 15:11:33: reading... 7000/8873\n", | ||
"INFO 2019-02-11 15:11:33: reading... 8000/8873\n", | ||
"INFO 2019-02-11 15:11:33: write prediction result to gkmexplain_positives_hypimpscores.txt\n", | ||
"INFO 2019-02-11 15:12:51: 100 scored\n", | ||
"INFO 2019-02-11 15:14:04: 200 scored\n", | ||
"INFO 2019-02-11 15:15:19: 300 scored\n", | ||
"INFO 2019-02-11 15:16:31: 400 scored\n", | ||
"INFO 2019-02-11 15:17:42: 500 scored\n", | ||
"INFO 2019-02-11 15:18:53: 600 scored\n", | ||
"INFO 2019-02-11 15:20:04: 700 scored\n", | ||
"INFO 2019-02-11 15:21:16: 800 scored\n", | ||
"INFO 2019-02-11 15:22:27: 900 scored\n", | ||
"INFO 2019-02-11 15:23:11: 960 scored\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"#importance scores on the positives\n", | ||
"# Note that it is not strictly necessary to compute these because they can be extracted from\n", | ||
"# the hypothetical importance scores by doing an elementwise multiplication of the one-hot\n", | ||
"# encoding with the hypothetical importance scores, as demonstrated in the TF-MoDISco notebook\n", | ||
"!lsgkm/src/gkmexplain positives_test.fa lsgkm_defaultsettings_t2.model.txt gkmexplain_positives_impscores.txt\n", | ||
" \n", | ||
"#hypothetical importance scores on the positives\n", | ||
"!lsgkm/src/gkmexplain -m 1 positives_test.fa lsgkm_defaultsettings_t2.model.txt gkmexplain_positives_hypimpscores.txt" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"source": [ | ||
"## Generate emprical null distribution of importance scores" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"For identifying regions of high importance, it is helpful (but not strictly necessary) to supply a null distribution of per-position scores to TF-MoDISco. We will do this by dinucleotide shuffling the sequences and scoring them." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Requirement already satisfied: deeplift in /Users/avantishrikumar/Research/clean_deeplift/deeplift (0.6.9.0)\n", | ||
"Requirement already satisfied: numpy>=1.9 in /Users/avantishrikumar/anaconda/lib/python2.7/site-packages (from deeplift) (1.14.6)\n", | ||
"\u001b[33mYou are using pip version 18.0, however version 19.0.2 is available.\n", | ||
"You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", | ||
"INFO 2019-02-11 15:28:35: Number of threads is set to 1\n", | ||
"INFO 2019-02-11 15:28:35: load model lsgkm_defaultsettings_t2.model.txt\n", | ||
"INFO 2019-02-11 15:28:35: reading... 1000/8873\n", | ||
"INFO 2019-02-11 15:28:35: reading... 2000/8873\n", | ||
"INFO 2019-02-11 15:28:35: reading... 3000/8873\n", | ||
"INFO 2019-02-11 15:28:36: reading... 4000/8873\n", | ||
"INFO 2019-02-11 15:28:36: reading... 5000/8873\n", | ||
"INFO 2019-02-11 15:28:36: reading... 6000/8873\n", | ||
"INFO 2019-02-11 15:28:36: reading... 7000/8873\n", | ||
"INFO 2019-02-11 15:28:36: reading... 8000/8873\n", | ||
"INFO 2019-02-11 15:28:37: write prediction result to gkmexplain_dnshuff_impscores.txt\n", | ||
"INFO 2019-02-11 15:29:12: 100 scored\n", | ||
"INFO 2019-02-11 15:29:48: 200 scored\n", | ||
"INFO 2019-02-11 15:30:23: 300 scored\n", | ||
"INFO 2019-02-11 15:30:59: 400 scored\n", | ||
"INFO 2019-02-11 15:31:35: 500 scored\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"#install the deeplift package to use the dinucleotide-shuffling code\n", | ||
"!pip install deeplift\n", | ||
"from deeplift.dinuc_shuffle import dinuc_shuffle\n", | ||
"\n", | ||
"import numpy as np\n", | ||
"import random\n", | ||
"np.random.seed(1234)\n", | ||
"random.seed(1234)\n", | ||
"\n", | ||
"num_dinuc_shuffled_seqs = 500\n", | ||
"#Generate the dinucleotide shuffled sequences and write to a file\n", | ||
"fasta_seqs_no_N = [x.rstrip() for (i,x) in enumerate(open(\"positives_test.fa\"))\n", | ||
" if (i%2==1 and ('N' not in x))]\n", | ||
"open(\"dnshuff_seqs.fa\", 'w').write(\n", | ||
" \"\\n\".join([\">seq\"+str(i)+\"\\n\"+dinuc_shuffle(np.random.choice(fasta_seqs_no_N))\n", | ||
" for i in range(num_dinuc_shuffled_seqs)]))\n", | ||
"\n", | ||
"#Score the dinucleotide shuffled sequences\n", | ||
"!lsgkm/src/gkmexplain dnshuff_seqs.fa lsgkm_defaultsettings_t2.model.txt gkmexplain_dnshuff_impscores.txt" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 2", | ||
"language": "python", | ||
"name": "python2" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 2 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython2", | ||
"version": "2.7.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |
Oops, something went wrong.