diff --git a/CITATION.cff b/CITATION.cff index d7123bb8..2ff1a871 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -22,6 +22,9 @@ authors: - given-names: Kayla family-names: Meyer orcid: 'https://orcid.org/0009-0003-5063-5266' + - given-names: Felix + family-names: Munzlinger + orcid: 'https://orcid.org/0009-0005-1407-8145' - given-names: Felix family-names: Brechtmann orcid: 'https://orcid.org/0000-0002-0110-152X' diff --git a/docs/_static/preprocess_rulegraph_no_qc.svg b/docs/_static/preprocess_rulegraph_no_qc.svg index 0fbb55ba..3edc0a0e 100644 --- a/docs/_static/preprocess_rulegraph_no_qc.svg +++ b/docs/_static/preprocess_rulegraph_no_qc.svg @@ -4,25 +4,25 @@ - - + + snakemake_dag - + 0 - + all 1 - + combine_genotypes - + 1->0 @@ -30,7 +30,7 @@ 2 - + preprocess_no_qc @@ -42,17 +42,17 @@ 3 - + add_variant_ids - + 3->0 - + 3->2 @@ -60,7 +60,7 @@ 4 - + concatenate_variants @@ -69,117 +69,99 @@ - - -10 - + + +9 + create_parquet_variant_ids - - -4->10 + + +4->9 5 - -variants + +variants 5->4 - - + + 6 - -normalize + +normalize 6->5 - - + + - - -11 - + + +10 + sparsify - - -6->11 - - + + +6->10 + + 7 - + extract_samples - + 7->2 - + 7->6 - - + + 8 - -uppercase_fasta + +index_fasta - + 8->6 - - - - - -9 - -index_fasta - - - -8->9 - - - - - -9->6 - - + + - - -10->0 + + +9->0 - - -10->2 + + +9->2 - - -11->2 + + +10->2 diff --git a/docs/_static/preprocess_rulegraph_with_qc.svg b/docs/_static/preprocess_rulegraph_with_qc.svg index 120f2ce8..5c3d86e5 100644 --- a/docs/_static/preprocess_rulegraph_with_qc.svg +++ b/docs/_static/preprocess_rulegraph_with_qc.svg @@ -4,266 +4,248 @@ - - + + snakemake_dag - + 0 - -all + +all 1 - -combine_genotypes + +combine_genotypes - + 1->0 - - + + 2 - + preprocess_with_qc 2->1 - - + + 3 - -add_variant_ids + +add_variant_ids - + 3->0 - - + + - + 3->2 - - + + 4 - -concatenate_variants + +concatenate_variants 4->3 - - + + - - -10 - -create_parquet_variant_ids + + +9 + +create_parquet_variant_ids - - -4->10 - - + + +4->9 + + 5 - -variants + +variants 5->4 - - + + 6 - -normalize + +normalize 6->5 - - + + + + + +10 + +sparsify + + + +6->10 + + 11 - -sparsify + +qc_varmiss - + 6->11 - - + + 12 - -qc_varmiss + +qc_hwe - + 6->12 - - + + 13 - -qc_hwe + +qc_read_depth - + 6->13 - - + + 14 - -qc_read_depth + +qc_allelic_imbalance - + 6->14 - - - - - -15 - -qc_allelic_imbalance - - - -6->15 - - + + 7 - -extract_samples + +extract_samples - + 7->2 - + - + 7->6 - - + + 8 - -uppercase_fasta + +index_fasta - + 8->6 - - - - - -9 - -index_fasta - - - -8->9 - - + + - - -9->6 - - + + +9->0 + + - - -10->0 - - + + +9->2 + + - + 10->2 - - + + - + 11->2 - - + + - + 12->2 - - + + - + 13->2 - - + + - + 14->2 - - + + - - -15->2 - - - - - -16 - + + +15 + create_excluded_samples_dir - - -16->2 + + +15->2 diff --git a/pipelines/preprocessing/preprocess.snakefile b/pipelines/preprocessing/preprocess.snakefile index 16f20536..87320690 100644 --- a/pipelines/preprocessing/preprocess.snakefile +++ b/pipelines/preprocessing/preprocess.snakefile @@ -18,9 +18,8 @@ reference_dir = working_dir / config["reference_dir_name"] preprocess_threads = config["preprocess_threads"] -src_fasta_file = reference_dir / config["reference_fasta_file"] -fasta_file_uppercase = reference_dir / f'{Path(config["reference_fasta_file"]).stem}_upper.fa' -fasta_index_file = f"{fasta_file_uppercase}.fai" +fasta_file = reference_dir / config["reference_fasta_file"] +fasta_index_file = reference_dir / f"{config['reference_fasta_file']}.fai" norm_dir = working_dir / config["norm_dir_name"] sparse_dir = norm_dir / config["sparse_dir_name"] @@ -65,7 +64,7 @@ rule combine_genotypes: rule normalize: input: samplefile=norm_dir / "samples_chr.csv", - fasta=fasta_file_uppercase, + fasta=fasta_file, fastaindex=fasta_index_file, params: vcf_file=lambda wildcards: vcf_look_up[wildcards.vcf_stem], @@ -77,20 +76,12 @@ rule normalize: rule index_fasta: input: - fasta=fasta_file_uppercase, + fasta=fasta_file, output: fasta_index_file, shell: f"{load_samtools} samtools faidx {{input.fasta}}" -rule uppercase_fasta: - input: - fasta=src_fasta_file, - output: - fasta_file_uppercase, - shell: - "awk '!/^>/ {{print toupper($0)}} /^>/ {{print}}'" + " {input.fasta} > {output}" - rule sparsify: input: