Skip to content

Commit

Permalink
Use the absplice.yaml conda environment file instead of a named conda env to create AbSplice scores
Browse files Browse the repository at this point in the history
  • Loading branch information
Mück committed May 24, 2024
1 parent cd7037a commit 0c97bd5
Show file tree
Hide file tree
Showing 11 changed files with 41 additions and 32 deletions.
20 changes: 11 additions & 9 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1210,9 +1210,10 @@ def merge_abscores(
annotations = annotations.rename(columns={"Gene": "gene_id"})
annotations.drop_duplicates(inplace=True, subset=["gene_id", "id"])
original_len = len(annotations)

all_ids = set(annotations.id)
all_absplice_scores.drop_duplicates(subset = ["chrom", "pos", "ref", "alt", "gene_id"], inplace = True)
logger.info("Merging")
merged = pd.merge(
annotations = pd.merge(
annotations,
all_absplice_scores,
validate="1:1",
Expand All @@ -1221,24 +1222,25 @@ def merge_abscores(
)

logger.info("Sanity checking merge")
assert len(merged) == original_len
assert len(annotations) == original_len
assert set(annotations.id) == all_ids
logger.info(
f"len of merged after dropping duplicates: {len(merged.drop_duplicates(subset=['id', 'gene_id']))}"
f"len of merged after dropping duplicates: {len(annotations.drop_duplicates(subset=['id', 'gene_id']))}"
)
logger.info(f"len of merged without dropping duplicates: {len(merged)}")
logger.info(f"len of merged without dropping duplicates: {len(annotations)}")

assert len(merged.drop_duplicates(subset=["id", "gene_id"])) == len(merged)
assert len(annotations.drop_duplicates(subset=["id", "gene_id"])) == len(annotations)

logger.info(
f'Filling {merged["AbSplice_DNA"].isna().sum()} '
f'Filling {annotations["AbSplice_DNA"].isna().sum()} '
"missing AbSplice values with 0"
)
merged["AbSplice_DNA"] = merged["AbSplice_DNA"].fillna(0)
annotations["AbSplice_DNA"] = annotations["AbSplice_DNA"].fillna(0)

annotation_out_file = out_file

logger.info(f"Writing to {annotation_out_file}")
merged.to_parquet(annotation_out_file, engine="pyarrow")
annotations.to_parquet(annotation_out_file, engine="pyarrow")


pd.options.mode.chained_assignment = None
Expand Down
3 changes: 1 addition & 2 deletions pipelines/annotations.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ kipoi_repo_dir = Path(config["kipoiveff_repo_dir"])
ncores_addis = int(config.get("n_jobs_addids") or 32)

# init absplice
absplice_repo_dir = Path(config["absplice_repo_dir"])
n_cores_absplice = int(config.get("n_cores_absplice") or 4)
ncores_merge_absplice = int(config.get("n_cores_merge_absplice") or 8)
ncores_agg_absplice = int(config.get("ncores_agg_absplice") or 4)
Expand Down Expand Up @@ -154,7 +153,7 @@ file_stems = [

absplice_download_dir = (
config.get("absplice_download_dir")
or absplice_repo_dir / "example" / "data" / "resources" / "downloaded_files"
or anno_tmp_dir / "absplice"
)
absplice_output_dir = config.get("absplice_output_dir", anno_tmp_dir / "absplice")
vcf_id = anno_tmp_dir / "{file_stem}"
Expand Down
1 change: 0 additions & 1 deletion pipelines/config/deeprvat_annotation_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ anno_dir : output_dir/annotations

vep_cache_dir : repo_dir/ensembl-vep/cache/
vep_plugin_dir : repo_dir/ensembl-vep/Plugins
absplice_repo_dir : repo_dir/absplice
deeprvat_repo_dir : ../..
kipoiveff_repo_dir : repo_dir/kipoi-veff2
faatpipe_repo_dir : repo_dir/faatpipe
Expand Down
2 changes: 1 addition & 1 deletion pipelines/resources/absplice.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ dependencies:
- interpret == 0.2.7
- interpret-core == 0.2.7
- git+https://github.com/gagneurlab/splicemap.git@9e9831f32c221e850e26757a5f1c132dcd565640
- git+https://github.com/gagneurlab/absplice.git@e4060e63cca074ac938ea28850a38290c7e8b198
- -e git+https://github.com/gagneurlab/absplice.git@5d8b3fbe3ab1cc6ebd3a68857a50b48671753362#egg=absplice
8 changes: 4 additions & 4 deletions pipelines/resources/absplice_download.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ rule download_human_fasta:
output:
Path(absplice_download_dir) / config_download["fasta"][genome]["file"],
conda:
"absplice"
"./absplice.yaml"
shell:
"wget -O - {params} | gunzip -c > {output}"

Expand All @@ -75,7 +75,7 @@ rule download_splicemaps:
splicemap_psi5=Path(absplice_download_dir)
/ config_download["splicemap"]["psi5"],
conda:
"absplice"
"./absplice.yaml"
shell:
"splicemap_download --version {params.version} --splicemap_dir {params.dirname} --tissues {wildcards.tissue}"

Expand Down Expand Up @@ -103,7 +103,7 @@ if absplice_main_conf["AbSplice_RNA"] == True:
output:
Path(absplice_download_dir) / config_download["gtf"][genome]["file"],
conda:
"absplice"
"./absplice.yaml"
shell:
"wget -O - {params} | gunzip -c > {output}"

Expand All @@ -119,7 +119,7 @@ if absplice_main_conf["AbSplice_RNA"] == True:
coding_genes=Path(absplice_download_dir)
/ config_download["gtf"][genome]["coding_genes"],
conda:
"absplice"
"./absplice.yaml"
resources:
mem_mb=lambda wildcards, attempt: attempt * 16000,
script:
Expand Down
6 changes: 3 additions & 3 deletions pipelines/resources/absplice_splicing_pred_DNA.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ rule mmsplice_splicemap:
mem_mb=30_000,
threads=4,
conda:
"absplice"
"./absplice.yaml"
output:
result=Path(absplice_output_dir)
/ config_pred["splicing_pred"]["mmsplice_splicemap"],
Expand Down Expand Up @@ -113,7 +113,7 @@ else:
spliceai_csv=Path(absplice_output_dir)
/ config_pred["splicing_pred"]["spliceai"],
conda:
"absplice"
"./absplice.yaml"
run:
from absplice.utils import read_spliceai_vcf

Expand All @@ -130,7 +130,7 @@ rule absplice_dna:
params:
extra_info=absplice_main_conf["extra_info_dna"],
conda:
"absplice"
"./absplice.yaml"
output:
absplice_dna=absplice_output_dir
/ "{genome}"
Expand Down
26 changes: 14 additions & 12 deletions pipelines/resources/environment_spliceai_rocksdb.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,21 @@ channels:
dependencies:
- python==3.9.15
- python-rocksdb==0.7.0
- rocksdb==6.13.3
- tensorflow-gpu==2.6.2
- cudnn==8.8.0.121
- cudatoolkit==11.0.3
- pyarrow==6.0.1
- tqdm==4.65.0
- pip==23.2.1
- setuptools==68.0.0
- click==8.1.6
- pooch==1.7.0
- bioconda::snakemake==7.31.0
- rocksdb==6.29.5
- tensorflow-gpu==2.14.0
- cudnn==8.9.7.29
- cudatoolkit==11.8.0
- pyarrow==12.0.1
- tqdm==4.66.4
- pip==24.0
- pandas=2.2.2
- setuptools==70.0.0
- click==8.1.7
- pooch==1.8.1
- numpy=1.26.4
- bioconda::snakemake==7.32.4
- bioconda::kipoiseq==0.7.1
- bioconda::spliceai==1.3.1
- bioconda::cyvcf2==0.30.16
- bioconda::cyvcf2==0.30.28
- pip:
- git+https://github.com/gagneurlab/spliceai_rocksdb.git@3c40d6e61b19e907e802c979c76d52ea5c41c1d5
7 changes: 7 additions & 0 deletions tests/annotations/test_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,13 @@ def test_aggregate_abscores(
"vep_deepripe_deepsea.parquet",
"vep_deepripe_deepsea_absplice.parquet",
),
(
"merge_absplice_scores_exons",
"abSplice_score_file.parquet",
"vep_deepripe_deepsea.parquet",
"vep_deepripe_deepsea_absplice.parquet",
),
],
)
def test_merge_absplice_scores(
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 0c97bd5

Please sign in to comment.