Remove unused directories in preprocessing pipeline (#49)

PMBio · Jan 30, 2024 · 21a8e46 · 21a8e46
1 parent fc14c51
commit 21a8e46
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 29 deletions.
diff --git a/docs/annotations.md b/docs/annotations.md
@@ -22,7 +22,7 @@ BCFtools as well as HTSlib should be installed on the machine,
 
 will be installed by the pipeline together with the [plugins](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html) for primateAI and spliceAI. Annotation data for CADD, spliceAI and primateAI should be downloaded. The path to the data may be specified in the corresponding [config file](https://github.com/PMBio/deeprvat/blob/main/pipelines/config/deeprvat_annotation_config.yaml). 
 Download path:
-- [CADD](http://cadd.gs.washington.edu/download): "All possible SNVs of GRCh38/hg38" and "gnomad.genomes.r3.0.indel.tsv.gz" incl. their  Tabix Indices
+- [CADD](https://cadd.bihealth.org/download): "All possible SNVs of GRCh38/hg38" and "gnomad.genomes.r3.0.indel.tsv.gz" incl. their  Tabix Indices
 - [SpliceAI](https://basespace.illumina.com/s/otSPW8hnhaZR): "genome_scores_v1.3"/"spliceai_scores.raw.snv.hg38.vcf.gz" and "spliceai_scores.raw.indel.hg38.vcf.gz" 
 - [PrimateAI](https://basespace.illumina.com/s/yYGFdGih1rXL) PrimateAI supplementary data/"PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz"
 

diff --git a/docs/preprocessing.md b/docs/preprocessing.md
@@ -50,17 +50,18 @@ An example file is included in this repo: [example config](https://github.com/PM
 # What chromosomes should be processed
 included_chromosomes : [21,22]
 
+# The format of the name of the "raw" vcf files
+vcf_files_list: vcf_files_list.txt
+
+# Number of threads to use in the preprocessing script, separate from snakemake threads
+preprocess_threads: 16
+
 # If you need to run a cmd to load bcf and samtools specify it here, see example
 bcftools_load_cmd : # module load bcftools/1.10.2 &&
 samtools_load_cmd : # module load samtools/1.9 &&
 
 # Path to where you want to write results and intermediate data
 working_dir: workdir
-# Path to ukbb data
-data_dir: data
-
-# These paths are all relative to the data dir
-metadata_dir_name: metadata
 
 # These paths are all relative to the working dir
 # The finished preprocessed files will end up here
@@ -75,23 +76,14 @@ sparse_dir_name : sparse
 # Expected to be found in working_dir/reference_dir
 reference_fasta_file : GRCh38.primary_assembly.genome.fa
 
-# The format of the name of the "raw" vcf files
-vcf_files_list: vcf_files_list.txt
-
-# Number of threads to use in the preprocessing script, separate from snakemake threads
-preprocess_threads: 16
-
 # You can specify a different zcat cmd here, for example gzcat; the default is zcat
-zcat_cmd: gzcat
+zcat_cmd:
    ```
 
 The config above would use the following directory structure:
 
 ```shell
 parent_directory
-|-- data
-|   |-- metadata
-|   `-- vcf
 `-- workdir
     |-- norm
     |   |-- bcf

diff --git a/pipelines/config/deeprvat_preprocess_config.yaml b/pipelines/config/deeprvat_preprocess_config.yaml
@@ -1,17 +1,18 @@
 # What chromosomes should be processed
 included_chromosomes : [21,22]
 
+# The format of the name of the "raw" vcf files
+vcf_files_list: vcf_files_list.txt
+
+# Number of threads to use in the preprocessing script, separate from snakemake threads
+preprocess_threads: 16
+
 # If you need to run a cmd to load bcf and samtools specify it here, see example
 bcftools_load_cmd : # module load bcftools/1.10.2 &&
 samtools_load_cmd : # module load samtools/1.9 &&
 
 # Path to where you want to write results and intermediate data
 working_dir: workdir
-# Path to ukbb data
-data_dir: data
-
-# These paths are all relative to the data dir
-metadata_dir_name: metadata
 
 # These paths are all relative to the working dir
 # The finished preprocessed files will end up here
@@ -26,11 +27,5 @@ sparse_dir_name : sparse
 # Expected to be found in working_dir/reference_dir
 reference_fasta_file : GRCh38.primary_assembly.genome.fa
 
-# The format of the name of the "raw" vcf files
-vcf_files_list: vcf_files_list.txt
-
-# Number of threads to use in the preprocessing script, separate from snakemake threads
-preprocess_threads: 16
-
 # You can specify a different zcat cmd here, for example gzcat; the default is zcat
 zcat_cmd:
diff --git a/pipelines/preprocessing/preprocess.snakefile b/pipelines/preprocessing/preprocess.snakefile
@@ -11,9 +11,7 @@ zcat_cmd = config.get("zcat_cmd") or "zcat"
 preprocessing_cmd = "deeprvat_preprocess"
 
 working_dir = Path(config["working_dir"])
-data_dir = Path(config["data_dir"])
 preprocessed_dir = working_dir / config["preprocessed_dir_name"]
-metadata_dir = data_dir / config["metadata_dir_name"]
 reference_dir = working_dir / config["reference_dir_name"]
 
 preprocess_threads = config["preprocess_threads"]