Skip to content

Commit

Permalink
added all changes from annotation-speedups branch
Browse files Browse the repository at this point in the history
  • Loading branch information
“Marcel-Mueck” committed Feb 20, 2024
1 parent ccacfb6 commit 63f8737
Show file tree
Hide file tree
Showing 20 changed files with 2,687 additions and 376 deletions.
1,548 changes: 1,354 additions & 194 deletions deeprvat/annotations/annotations.py

Large diffs are not rendered by default.

273 changes: 273 additions & 0 deletions docs/_static/annotation_rulegraph.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
418 changes: 253 additions & 165 deletions pipelines/annotations.snakefile

Large diffs are not rendered by default.

73 changes: 73 additions & 0 deletions pipelines/config/annotation_colnames_filling_values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
annotation_column_names:
'CADD_RAW' :
'CADD_raw': 0
'PrimateAI' :
'PrimateAI_score' : 0
'SpliceAI_delta_score' :
'SpliceAI_delta_score' : 0
'am_pathogenicity' :
'alphamissense' : 0
'af' :
'combined_UKB_NFE_AF' : 0
'maf_mb' :
'combined_UKB_NFE_AF_MB' : 0
'maf' :
'combined_UKB_NFE_MAF' : 1000
'Condel' :
'condel_score' : 0
'PolyPhen' :
'polyphen_score' : 0
'SIFT' :
'sift_score' : 1
'QKI_hg2' :
'DeepRipe_plus_QKI_lip_hg2' : 0
'QKI_k5' :
'DeepRipe_plus_QKI_clip_k5' : 0
'KHDRBS1_k5' :
'DeepRipe_plus_KHDRBS1_clip_k5' : 0
'ELAVL1_parclip' :
'DeepRipe_plus_ELAVL1_parclip' : 0
'TARDBP_parclip' :
'DeepRipe_plus_TARDBP_parclip' : 0
'HNRNPD_parclip' :
'DeepRipe_plus_HNRNPD_parclip' : 0
'MBNL1_parclip' :
'DeepRipe_plus_MBNL1_parclip' : 0
'QKI_parclip' :
'DeepRipe_plus_QKI_parclip' : 0
'Consequence_splice_acceptor_variant' :
'Consequence_splice_acceptor_variant' : 0
'Consequence_splice_donor_variant' :
'Consequence_splice_donor_variant' : 0
'Consequence_stop_gained' :
'Consequence_stop_gained' : 0
'Consequence_frameshift_variant' :
'Consequence_frameshift_variant' : 0
'Consequence_stop_lost' :
'Consequence_stop_lost' : 0
'Consequence_start_lost' :
'Consequence_start_lost' : 0
'Consequence_inframe_insertion' :
'Consequence_inframe_insertion' : 0
'Consequence_inframe_deletion' :
'Consequence_inframe_deletion' : 0
'Consequence_missense_variant' :
'Consequence_missense_variant' : 0
'Consequence_protein_altering_variant' :
'Consequence_protein_altering_variant' : 0
'Consequence_splice_region_variant' :
'Consequence_splice_region_variant' : 0
'DeepSEA_PC_1' :
'DeepSEA_PC_1' : 0
'DeepSEA_PC_2' :
'DeepSEA_PC_2' : 0
'DeepSEA_PC_3' :
'DeepSEA_PC_3' : 0
'DeepSEA_PC_4' :
'DeepSEA_PC_4' : 0
'DeepSEA_PC_5' :
'DeepSEA_PC_5' : 0
'DeepSEA_PC_6' :
'DeepSEA_PC_6' : 0
'AF' :
'AF' : 0
29 changes: 12 additions & 17 deletions pipelines/config/deeprvat_annotation_config.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
fasta_dir : reference
fasta_file_name : hg38.fa
# Uncomment to use module load for required tools
#bcftools_load_cmd : module load bcftools/1.9
#htslib_load_cmd : module load htslib/1.9
#perl_load_cmd : module load perl/5.20.2
#vep_load_cmd : module load vep/108.1
fasta_dir : reference
fasta_file_name : hg38.fa
gtf_file_name : gencode.v44.annotation.gtf.gz

source_variant_file_pattern : test_vcf_data_c{chr}_b{block}
source_variant_file_type: 'vcf.gz'
Expand All @@ -18,23 +19,17 @@ source_variant_dir : input_dir/vcf
anno_tmp_dir : output_dir/annotations/tmp
anno_dir : output_dir/annotations

vep_cache_dir : repo_dir/ensembl-vep/cache/vep109
vep_cache_dir : repo_dir/ensembl-vep/cache/
vep_plugin_dir : repo_dir/ensembl-vep/Plugins
spliceAI_snv_file : annotation_data/spliceAI/spliceai_scores.raw.snv.hg38.vcf.gz
spliceAI_indel_file : annotation_data/spliceAI/spliceai_scores.raw.indel.hg38.vcf.gz
primateAI_file : annotation_data/primateAI/PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz
cadd_snv_file : annotation_data/cadd/whole_genome_SNVs.tsv.gz
cadd_indel_file : annotation_data/cadd/gnomad.genomes.r3.0.indel.tsv.gz
absplice_repo_dir : repo_dir/absplice
deeprvat_repo_dir : deeprvat_repo_dir
deeprvat_repo_dir : ../..
kipoiveff_repo_dir : repo_dir/kipoi-veff2
faatpipe_repo_dir : repo_dir/faatpipe
vep_repo_dir : repo_dir/ensembl-vep
vep_plugin_repo : repo_dir/VEP_plugins
variant_file_path : preprocessing_workdir/norm/variants/variants.tsv.gz
pybedtools_tmp_path : output_dir/annotations/tmp/pybedtools
n_jobs_deepripe : 32
n_cores_merge_absplice : 32
n_cores_absplice : 32
deepsea_pca_pickle_filepath : annotations/deepSea_pca/pca.pkl
deepsea_pca_n_components: 100
preprocessing_workdir : preprocessing_workdir
additional_vep_plugin_cmds:
cadd : CADD,annotation_data/cadd/whole_genome_SNVs.tsv.gz,annotation_data/cadd/gnomad.genomes.r3.0.indel.tsv.gz
spliceAI : SpliceAI,snv=annotation_data/spliceAI/spliceai_scores.raw.snv.hg38.vcf.gz,indel=annotation_data/spliceAI/spliceai_scores.raw.indel.hg38.vcf.gz
primateAI : PrimateAI,annotation_data/primateAI/PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz
condel: Condel,repo_dir/ensembl-vep/Plugins/config/Condel/config,s,2
alphamissense : AlphaMissense,file=annotation_data/AlphaMissense/AlphaMissense_hg38.tsv.gz
33 changes: 33 additions & 0 deletions pipelines/resources/absplice.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
name: absplice
channels:
- conda-forge
dependencies:
- python==3.8
- cython
- tensorflow
- pytest
- setuptools
- scipy
- pandas
- tqdm
- click
- pip
- pyarrow
- pandas
- numpy==1.23
# needed for splicemap
- seaborn
- scikit-learn
- bioconda::pyfaidx
- bioconda::pyranges>=0.0.71
- bioconda::cyvcf2==0.30.16
- bioconda::tabix
- bioconda::snakemake==7.26.0
- bioconda::spliceai
- bioconda::kipoiseq>=0.7
- deepdiff
- pip:
- mmsplice
- interpret == 0.2.7
- interpret-core == 0.2.7
- git+https://github.com/gagneurlab/splicemap.git
16 changes: 16 additions & 0 deletions pipelines/resources/absplice_config_cat.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
gene_map: ../../absplice/precomputed/GENE_MAP.tsv.gz

# mapping of variants to individuals, result of variant filtering on provided vcfs
variant_sample_map: ../data/resources/analysis_files/variant_sample_map/{vcf_id}_variant_sample_map.csv

cat_count_table:
raw: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_count_table/tissue_cat={tissue_cat}_count_table_raw.csv
updated: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_count_table/tissue_cat={tissue_cat}_count_table.csv
cat_outliers:
qual_filtered:
junction_level: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_outliers/qual_filtered/tissue_cat={tissue_cat}_junction_level_signif.csv
gene_level: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_outliers/qual_filtered/tissue_cat={tissue_cat}_gene_level.csv
combine_gene_junction:
gene_junction_signif: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_outliers/combine_gene_junction/gene_junction_signif/tissue_cat={tissue_cat}_signif.csv
outlier_with_variant: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_outliers/combine_gene_junction/outlier_with_variant/{vcf_id}_tissue_cat={tissue_cat}_outlier_with_variant.csv
minus_log10_pval: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_outliers/combine_gene_junction/minus_log10_pval/{vcf_id}_tissue_cat={tissue_cat}_FRASER_pval.csv
83 changes: 83 additions & 0 deletions pipelines/resources/absplice_config_download.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
fasta:
hg19:
url: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz
file: 'GRCh37.primary_assembly.genome.fa'
hg38:
url: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/GRCh38.primary_assembly.genome.fa.gz
file: 'GRCh38.primary_assembly.genome.fa'

gtf:
hg19:
url: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/GRCh37_mapping/gencode.v39lift37.annotation.gtf.gz
file: 'gencode.v39lift37.annotation.gtf'
coding_genes: 'hg19_coding_genes.csv'
hg38:
url: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/gencode.v39.annotation.gtf.gz
file: 'gencode.v39.annotation.gtf'
coding_genes: 'hg38_coding_genes.csv'

splicemap_dir: 'splicemap_{genome}/'

splicemap:
psi3: 'splicemap_{genome}/{tissue}_splicemap_psi3_method=kn_event_filter=median_cutoff.csv.gz'
psi5: 'splicemap_{genome}/{tissue}_splicemap_psi5_method=kn_event_filter=median_cutoff.csv.gz'

spliceai_rocksdb:
hg19: 'spliceAI_grch37_chr{chromosome}.db'
hg38: 'spliceAI_grch38_chr{chromosome}.db'
chromosomes: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']

gnomad_rocksdb:
hg19: 'gnomad_maf_db_hg19/gnomad_maf_db_2.1.1.db'
hg38: 'gnomad_maf_db_hg38/gnomad_maf_db_3.1.2.db'

all_available_splicemap_tissues:
- Adipose_Subcutaneous
- Adipose_Visceral_Omentum
- Adrenal_Gland
- Artery_Aorta
- Artery_Coronary
- Artery_Tibial
- Brain_Amygdala
- Brain_Anterior_cingulate_cortex_BA24
- Brain_Caudate_basal_ganglia
- Brain_Cerebellar_Hemisphere
- Brain_Cerebellum
- Brain_Cortex
- Brain_Frontal_Cortex_BA9
- Brain_Hippocampus
- Brain_Hypothalamus
- Brain_Nucleus_accumbens_basal_ganglia
- Brain_Putamen_basal_ganglia
- Brain_Spinal_cord_cervical_c_1
- Brain_Substantia_nigra
- Breast_Mammary_Tissue
- Cells_Cultured_fibroblasts
- Cells_EBV_transformed_lymphocytes
- Colon_Sigmoid
- Colon_Transverse
- Esophagus_Gastroesophageal_Junction
- Esophagus_Mucosa
- Esophagus_Muscularis
- Heart_Atrial_Appendage
- Heart_Left_Ventricle
- Kidney_Cortex
- Liver
- Lung
- Minor_Salivary_Gland
- Muscle_Skeletal
- Nerve_Tibial
- Ovary
- Pancreas
- Pituitary
- Prostate
- Skin_Not_Sun_Exposed_Suprapubic
- Skin_Sun_Exposed_Lower_leg
- Small_Intestine_Terminal_Ileum
- Spleen
- Stomach
- Testis
- Thyroid
- Uterus
- Vagina
- Whole_Blood
9 changes: 9 additions & 0 deletions pipelines/resources/absplice_config_pred.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
splicing_pred:
mmsplice_splicemap: '{genome}/model_scores_from_absplice_features/{vcf_id}_MMSplice_SpliceMap.csv'
spliceai_vcf: '{genome}/model_scores_from_absplice_features/{vcf_id}_SpliceAI.vcf'
spliceai: '{genome}/model_scores_from_absplice_features/{vcf_id}_SpliceAI.csv'
absplice_dna: '{genome}/dna/{vcf_id}_AbSplice_DNA.csv'
delta_psi_inferred_from_cat: '{genome}/model_scores_from_absplice_features/{vcf_id}_tissue_cat={tissue_cat}_delta_psi_inferred_from_cat.csv'
cat_outliers: '{genome}/model_scores_from_absplice_features/{vcf_id}_tissue_cat={tissue_cat}_FRASER_pval.csv'
absplice_rna: '{genome}/{vcf_id}_tissue_cat={tissue_cat}_AbSplice_RNA.csv'
absplice_rna_with_dna_info: '{genome}/{vcf_id}_tissue_cat={tissue_cat}_AbSplice_all_info.csv'
8 changes: 8 additions & 0 deletions pipelines/resources/absplice_dna.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from absplice import SplicingOutlierResult

# Snakemake script body. `snakemake` is not defined in this file; it is
# presumably injected by Snakemake's `script:` directive -- confirm in the rule.
smk_input = snakemake.input

# Build the result object from the rule's MMSplice+SpliceMap and SpliceAI inputs.
outlier_result = SplicingOutlierResult(
    df_mmsplice=smk_input['mmsplice_splicemap'],
    df_spliceai=smk_input['spliceai'],
)

# Run the AbSplice-DNA prediction, forwarding the rule's `extra_info` param.
extra_info = snakemake.params['extra_info']
outlier_result.predict_absplice_dna(extra_info=extra_info)

# The prediction table is read from the `_absplice_dna` attribute after the
# call above and written to the rule's `absplice_dna` output via `to_csv`.
predictions = outlier_result._absplice_dna
predictions.to_csv(snakemake.output['absplice_dna'])
Loading

0 comments on commit 63f8737

Please sign in to comment.