From 21a8e465d68f2526c12832e270be1a453c7f44af Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Tue, 30 Jan 2024 08:45:50 +0100 Subject: [PATCH] Remove unused directories in preprocessing pipeline (#49) --- docs/annotations.md | 2 +- docs/preprocessing.md | 22 ++++++------------- .../config/deeprvat_preprocess_config.yaml | 17 +++++--------- pipelines/preprocessing/preprocess.snakefile | 2 -- 4 files changed, 14 insertions(+), 29 deletions(-) diff --git a/docs/annotations.md b/docs/annotations.md index 6ba208b5..a9532220 100644 --- a/docs/annotations.md +++ b/docs/annotations.md @@ -22,7 +22,7 @@ BCFtools as well as HTSlib should be installed on the machine, will be installed by the pipeline together with the [plugins](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html) for primateAI and spliceAI. Annotation data for CADD, spliceAI and primateAI should be downloaded. The path to the data may be specified in the corresponding [config file](https://github.com/PMBio/deeprvat/blob/main/pipelines/config/deeprvat_annotation_config.yaml). Download path: -- [CADD](http://cadd.gs.washington.edu/download): "All possible SNVs of GRCh38/hg38" and "gnomad.genomes.r3.0.indel.tsv.gz" incl. their Tabix Indices +- [CADD](https://cadd.bihealth.org/download): "All possible SNVs of GRCh38/hg38" and "gnomad.genomes.r3.0.indel.tsv.gz" incl. 
their Tabix Indices - [SpliceAI](https://basespace.illumina.com/s/otSPW8hnhaZR): "genome_scores_v1.3"/"spliceai_scores.raw.snv.hg38.vcf.gz" and "spliceai_scores.raw.indel.hg38.vcf.gz" - [PrimateAI](https://basespace.illumina.com/s/yYGFdGih1rXL) PrimateAI supplementary data/"PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz" diff --git a/docs/preprocessing.md b/docs/preprocessing.md index 6db444d8..502b1e62 100644 --- a/docs/preprocessing.md +++ b/docs/preprocessing.md @@ -50,17 +50,18 @@ An example file is included in this repo: [example config](https://github.com/PM # What chromosomes should be processed included_chromosomes : [21,22] +# The format of the name of the "raw" vcf files +vcf_files_list: vcf_files_list.txt + +# Number of threads to use in the preprocessing script, separate from snakemake threads +preprocess_threads: 16 + # If you need to run a cmd to load bcf and samtools specify it here, see example bcftools_load_cmd : # module load bcftools/1.10.2 && samtools_load_cmd : # module load samtools/1.9 && # Path to where you want to write results and intermediate data working_dir: workdir -# Path to ukbb data -data_dir: data - -# These paths are all relative to the data dir -metadata_dir_name: metadata # These paths are all relative to the working dir # Here will the finished preprocessed files end up @@ -75,23 +76,14 @@ sparse_dir_name : sparse # Expected to be found in working_dir/reference_dir reference_fasta_file : GRCh38.primary_assembly.genome.fa -# The format of the name of the "raw" vcf files -vcf_files_list: vcf_files_list.txt - -# Number of threads to use in the preprocessing script, separate from snakemake threads -preprocess_threads: 16 - # You can specify a different zcat cmd for example gzcat here, default zcat -zcat_cmd: gzcat +zcat_cmd: ``` The config above would use the following directory structure: ```shell parent_directory -|-- data -| |-- metadata -| `-- vcf `-- workdir |-- norm | |-- bcf diff --git 
a/pipelines/config/deeprvat_preprocess_config.yaml b/pipelines/config/deeprvat_preprocess_config.yaml index 1a0173a9..0f1b146c 100644 --- a/pipelines/config/deeprvat_preprocess_config.yaml +++ b/pipelines/config/deeprvat_preprocess_config.yaml @@ -1,17 +1,18 @@ # What chromosomes should be processed included_chromosomes : [21,22] +# The format of the name of the "raw" vcf files +vcf_files_list: vcf_files_list.txt + +# Number of threads to use in the preprocessing script, separate from snakemake threads +preprocess_threads: 16 + # If you need to run a cmd to load bcf and samtools specify it here, see example bcftools_load_cmd : # module load bcftools/1.10.2 && samtools_load_cmd : # module load samtools/1.9 && # Path to where you want to write results and intermediate data working_dir: workdir -# Path to ukbb data -data_dir: data - -# These paths are all relative to the data dir -metadata_dir_name: metadata # These paths are all relative to the working dir # Here will the finished preprocessed files end up @@ -26,11 +27,5 @@ sparse_dir_name : sparse # Expected to be found in working_dir/reference_dir reference_fasta_file : GRCh38.primary_assembly.genome.fa -# The format of the name of the "raw" vcf files -vcf_files_list: vcf_files_list.txt - -# Number of threads to use in the preprocessing script, separate from snakemake threads -preprocess_threads: 16 - # You can specify a different zcat cmd for example gzcat here, default zcat zcat_cmd: \ No newline at end of file diff --git a/pipelines/preprocessing/preprocess.snakefile b/pipelines/preprocessing/preprocess.snakefile index 87320690..bc9e4702 100644 --- a/pipelines/preprocessing/preprocess.snakefile +++ b/pipelines/preprocessing/preprocess.snakefile @@ -11,9 +11,7 @@ zcat_cmd = config.get("zcat_cmd") or "zcat" preprocessing_cmd = "deeprvat_preprocess" working_dir = Path(config["working_dir"]) -data_dir = Path(config["data_dir"]) preprocessed_dir = working_dir / config["preprocessed_dir_name"] -metadata_dir = 
data_dir / config["metadata_dir_name"] reference_dir = working_dir / config["reference_dir_name"] preprocess_threads = config["preprocess_threads"]