Commit

Merge pull request #36 from kundajelab/signflip_thresholding

Improved Null Distributions

AvantiShri authored Feb 12, 2019
2 parents 91a81d5 + 1c401ac commit f4f94d6
Showing 22 changed files with 6,120 additions and 7,520 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -8,7 +8,9 @@ From source: clone the repo, then run `pip install --editable /path/to/cloned/re

Video of talk at NIPS MLCB: https://www.youtube.com/watch?v=fXPGVJg956E

Please see https://github.com/kundajelab/modisco_private/blob/master/examples/simulated_tf_binding/TF%20MoDISco%20TAL%20GATA.ipynb for a self-contained example notebook that uses pre-computed importance scores as inputs. See https://github.com/kundajelab/tfmodisco/blob/master/examples/simulated_tf_binding/Generate%20Importance%20Scores.ipynb for an example notebook demonstrating how to compute such scores using DeepLIFT.
Please see the following example notebooks:
- [TF MoDISco TAL GATA](examples/simulated_TAL_GATA_deeplearning/TF%20MoDISco%20TAL%20GATA.ipynb): a self-contained example notebook that uses pre-computed importance scores (generated by a neural network) as input. Scores were generated using deeplift as illustrated in [this notebook](examples/simulated_TAL_GATA_deeplearning/Generate%20Importance%20Scores.ipynb).
- [TF MoDISco Nanog](examples/H1ESC_Nanog_gkmsvm/TF%20MoDISco%20Nanog.ipynb): a self-contained example notebook that uses pre-computed importance scores and an empirically generated null distribution (both generated by a gkm-SVM) as input. Scores were generated using gkmexplain as illustrated in [this notebook](examples/H1ESC_Nanog_gkmsvm/Generate%20Importance%20Scores.ipynb).

A technical note describing version 0.4.4.2-alpha is available at [https://arxiv.org/abs/1811.00416](https://arxiv.org/abs/1811.00416).

4 changes: 4 additions & 0 deletions examples/H1ESC_Nanog_gkmsvm/.gitignore
@@ -0,0 +1,4 @@
**/*.gz
**/*.txt
**/*.fa
lsgkm
289 changes: 289 additions & 0 deletions examples/H1ESC_Nanog_gkmsvm/Generate Importance Scores.ipynb
@@ -0,0 +1,289 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## This notebook demonstrates how to get importance scores for use with TF-MoDISco using GkmExplain\n",
"\n",
"It relies on a gkm-SVM model pretrained on Nanog H1ESC ChIP-seq"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cloning into 'lsgkm'...\n",
"remote: Enumerating objects: 4, done.\u001b[K\n",
"remote: Counting objects: 100% (4/4), done.\u001b[K\n",
"remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
"remote: Total 293 (delta 0), reused 1 (delta 0), pack-reused 289\u001b[K\n",
"Receiving objects: 100% (293/293), 489.52 KiB | 0 bytes/s, done.\n",
"Resolving deltas: 100% (196/196), done.\n",
"Checking connectivity... done.\n",
"/Users/avantishrikumar/Research/modisco/examples/H1ESC_Nanog_gkmsvm/lsgkm/src\n",
"c++ -Wall -Wconversion -O3 -fPIC -c libsvm.cpp\n",
"c++ -Wall -Wconversion -O3 -fPIC -c libsvm_gkm.c\n",
"c++ -Wall -Wconversion -O3 -fPIC gkmtrain.c libsvm.o libsvm_gkm.o -o gkmtrain -lm -lpthread\n",
"c++ -Wall -Wconversion -O3 -fPIC gkmpredict.c libsvm.o libsvm_gkm.o -o gkmpredict -lm -lpthread\n",
"c++ -Wall -Wconversion -O3 -fPIC gkmexplain.c libsvm.o libsvm_gkm.o -o gkmexplain -lm -lpthread\n",
"/Users/avantishrikumar/Research/modisco/examples/H1ESC_Nanog_gkmsvm\n"
]
}
],
"source": [
"from __future__ import print_function, division\n",
"%matplotlib inline\n",
"\n",
"try:\n",
" reload # Python 2.7\n",
"except NameError:\n",
" try:\n",
" from importlib import reload # Python 3.4+\n",
" except ImportError:\n",
" from imp import reload # Python 3.0 - 3.3\n",
" \n",
"#install lsgkm from the kundajelab repo\n",
"!rm -rf lsgkm\n",
"! git clone https://github.com/kundajelab/lsgkm\n",
"% cd lsgkm/src\n",
"! make\n",
"%cd ../.."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Grab the data and pretrained model"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2019-02-11 15:05:27-- https://raw.githubusercontent.com/AvantiShri/model_storage/2e603c/modisco/gkmexplain_scores/positives_test.fa.gz\n",
"Resolving raw.githubusercontent.com... 151.101.188.133\n",
"Connecting to raw.githubusercontent.com|151.101.188.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 75038 (73K) [application/octet-stream]\n",
"Saving to: 'positives_test.fa.gz'\n",
"\n",
"100%[======================================>] 75,038 --.-K/s in 0.01s \n",
"\n",
"2019-02-11 15:05:27 (5.50 MB/s) - 'positives_test.fa.gz' saved [75038/75038]\n",
"\n",
"--2019-02-11 15:05:27-- https://raw.githubusercontent.com/AvantiShri/model_storage/2e603c/modisco/gkmexplain_scores/lsgkm_defaultsettings_t2.model.txt.gz\n",
"Resolving raw.githubusercontent.com... 151.101.188.133\n",
"Connecting to raw.githubusercontent.com|151.101.188.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 655201 (640K) [application/octet-stream]\n",
"Saving to: 'lsgkm_defaultsettings_t2.model.txt.gz'\n",
"\n",
"100%[======================================>] 655,201 --.-K/s in 0.1s \n",
"\n",
"2019-02-11 15:05:27 (5.37 MB/s) - 'lsgkm_defaultsettings_t2.model.txt.gz' saved [655201/655201]\n",
"\n"
]
}
],
"source": [
"#Download things and gunzip them\n",
"!wget https://raw.githubusercontent.com/AvantiShri/model_storage/2e603c/modisco/gkmexplain_scores/positives_test.fa.gz -O positives_test.fa.gz \n",
"!gunzip positives_test.fa.gz\n",
"!wget https://raw.githubusercontent.com/AvantiShri/model_storage/2e603c/modisco/gkmexplain_scores/lsgkm_defaultsettings_t2.model.txt.gz -O lsgkm_defaultsettings_t2.model.txt.gz \n",
"!gunzip lsgkm_defaultsettings_t2.model.txt.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compute the gkmexplain importance scores on positives"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 2019-02-11 15:05:28: Number of threads is set to 1\n",
"INFO 2019-02-11 15:05:28: load model lsgkm_defaultsettings_t2.model.txt\n",
"INFO 2019-02-11 15:05:28: reading... 1000/8873\n",
"INFO 2019-02-11 15:05:28: reading... 2000/8873\n",
"INFO 2019-02-11 15:05:28: reading... 3000/8873\n",
"INFO 2019-02-11 15:05:28: reading... 4000/8873\n",
"INFO 2019-02-11 15:05:29: reading... 5000/8873\n",
"INFO 2019-02-11 15:05:29: reading... 6000/8873\n",
"INFO 2019-02-11 15:05:29: reading... 7000/8873\n",
"INFO 2019-02-11 15:05:29: reading... 8000/8873\n",
"INFO 2019-02-11 15:05:29: write prediction result to gkmexplain_positives_impscores.txt\n",
"INFO 2019-02-11 15:06:07: 100 scored\n",
"INFO 2019-02-11 15:06:44: 200 scored\n",
"INFO 2019-02-11 15:07:22: 300 scored\n",
"INFO 2019-02-11 15:08:01: 400 scored\n",
"INFO 2019-02-11 15:08:38: 500 scored\n",
"INFO 2019-02-11 15:09:16: 600 scored\n",
"INFO 2019-02-11 15:09:55: 700 scored\n",
"INFO 2019-02-11 15:10:31: 800 scored\n",
"INFO 2019-02-11 15:11:09: 900 scored\n",
"INFO 2019-02-11 15:11:31: 960 scored\n",
"INFO 2019-02-11 15:11:31: Number of threads is set to 1\n",
"INFO 2019-02-11 15:11:31: load model lsgkm_defaultsettings_t2.model.txt\n",
"INFO 2019-02-11 15:11:32: reading... 1000/8873\n",
"INFO 2019-02-11 15:11:32: reading... 2000/8873\n",
"INFO 2019-02-11 15:11:32: reading... 3000/8873\n",
"INFO 2019-02-11 15:11:32: reading... 4000/8873\n",
"INFO 2019-02-11 15:11:32: reading... 5000/8873\n",
"INFO 2019-02-11 15:11:33: reading... 6000/8873\n",
"INFO 2019-02-11 15:11:33: reading... 7000/8873\n",
"INFO 2019-02-11 15:11:33: reading... 8000/8873\n",
"INFO 2019-02-11 15:11:33: write prediction result to gkmexplain_positives_hypimpscores.txt\n",
"INFO 2019-02-11 15:12:51: 100 scored\n",
"INFO 2019-02-11 15:14:04: 200 scored\n",
"INFO 2019-02-11 15:15:19: 300 scored\n",
"INFO 2019-02-11 15:16:31: 400 scored\n",
"INFO 2019-02-11 15:17:42: 500 scored\n",
"INFO 2019-02-11 15:18:53: 600 scored\n",
"INFO 2019-02-11 15:20:04: 700 scored\n",
"INFO 2019-02-11 15:21:16: 800 scored\n",
"INFO 2019-02-11 15:22:27: 900 scored\n",
"INFO 2019-02-11 15:23:11: 960 scored\n"
]
}
],
"source": [
"#importance scores on the positives\n",
"# Note that it is not strictly necessary to compute these because they can be extracted from\n",
"# the hypothetical importance scores by doing an elementwise multiplication of the one-hot\n",
"# encoding with the hypothetical importance scores, as demonstrated in the TF-MoDISco notebook\n",
"!lsgkm/src/gkmexplain positives_test.fa lsgkm_defaultsettings_t2.model.txt gkmexplain_positives_impscores.txt\n",
" \n",
"#hypothetical importance scores on the positives\n",
"!lsgkm/src/gkmexplain -m 1 positives_test.fa lsgkm_defaultsettings_t2.model.txt gkmexplain_positives_hypimpscores.txt"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Generate emprical null distribution of importance scores"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For identifying regions of high importance, it is helpful (but not strictly necessary) to supply a null distribution of per-position scores to TF-MoDISco. We will do this by dinucleotide shuffling the sequences and scoring them."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: deeplift in /Users/avantishrikumar/Research/clean_deeplift/deeplift (0.6.9.0)\n",
"Requirement already satisfied: numpy>=1.9 in /Users/avantishrikumar/anaconda/lib/python2.7/site-packages (from deeplift) (1.14.6)\n",
"\u001b[33mYou are using pip version 18.0, however version 19.0.2 is available.\n",
"You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n",
"INFO 2019-02-11 15:28:35: Number of threads is set to 1\n",
"INFO 2019-02-11 15:28:35: load model lsgkm_defaultsettings_t2.model.txt\n",
"INFO 2019-02-11 15:28:35: reading... 1000/8873\n",
"INFO 2019-02-11 15:28:35: reading... 2000/8873\n",
"INFO 2019-02-11 15:28:35: reading... 3000/8873\n",
"INFO 2019-02-11 15:28:36: reading... 4000/8873\n",
"INFO 2019-02-11 15:28:36: reading... 5000/8873\n",
"INFO 2019-02-11 15:28:36: reading... 6000/8873\n",
"INFO 2019-02-11 15:28:36: reading... 7000/8873\n",
"INFO 2019-02-11 15:28:36: reading... 8000/8873\n",
"INFO 2019-02-11 15:28:37: write prediction result to gkmexplain_dnshuff_impscores.txt\n",
"INFO 2019-02-11 15:29:12: 100 scored\n",
"INFO 2019-02-11 15:29:48: 200 scored\n",
"INFO 2019-02-11 15:30:23: 300 scored\n",
"INFO 2019-02-11 15:30:59: 400 scored\n",
"INFO 2019-02-11 15:31:35: 500 scored\n"
]
}
],
"source": [
"#install the deeplift package to use the dinucleotide-shuffling code\n",
"!pip install deeplift\n",
"from deeplift.dinuc_shuffle import dinuc_shuffle\n",
"\n",
"import numpy as np\n",
"import random\n",
"np.random.seed(1234)\n",
"random.seed(1234)\n",
"\n",
"num_dinuc_shuffled_seqs = 500\n",
"#Generate the dinucleotide shuffled sequences and write to a file\n",
"fasta_seqs_no_N = [x.rstrip() for (i,x) in enumerate(open(\"positives_test.fa\"))\n",
" if (i%2==1 and ('N' not in x))]\n",
"open(\"dnshuff_seqs.fa\", 'w').write(\n",
" \"\\n\".join([\">seq\"+str(i)+\"\\n\"+dinuc_shuffle(np.random.choice(fasta_seqs_no_N))\n",
" for i in range(num_dinuc_shuffled_seqs)]))\n",
"\n",
"#Score the dinucleotide shuffled sequences\n",
"!lsgkm/src/gkmexplain dnshuff_seqs.fa lsgkm_defaultsettings_t2.model.txt gkmexplain_dnshuff_impscores.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}