From 143eb806cb96381b9093dcb6ea5199e71976198b Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Tue, 28 May 2024 12:31:37 -0400
Subject: [PATCH 01/36] makes illumina_manifest_file optional in the config

---
 src/cgr_gwas_qc/models/config/reference_files.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/cgr_gwas_qc/models/config/reference_files.py b/src/cgr_gwas_qc/models/config/reference_files.py
index 98941c6b..cc04b0cd 100644
--- a/src/cgr_gwas_qc/models/config/reference_files.py
+++ b/src/cgr_gwas_qc/models/config/reference_files.py
@@ -17,7 +17,9 @@ class ReferenceFiles(BaseModel):
     """

-    illumina_manifest_file: Path = Field(..., description="Path to the Illumina provided BPM file.")
+    illumina_manifest_file: Optional[Path] = Field(
+        None, description="Path to the Illumina provided BPM file."
+    )

     illumina_cluster_file: Optional[Path] = Field(
         None, description="Path to the array cluster EGT file."

From 8a28efbb3827b8865b38bcc63721116f214ae041 Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Wed, 29 May 2024 16:23:16 -0400
Subject: [PATCH 02/36] Uses snp_array name instead of bpm to name *.abf.txt

Avoids depending on the BPM file to name the allele B frequency (abf) file.
---
 src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk b/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk
index 3603bd0a..4141c2d6 100644
--- a/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk
+++ b/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk
@@ -21,7 +21,7 @@ rule pull_b_allele_freq_from_1kg:
         population=cfg.config.software_params.contam_population,
     output:
         abf_file="sample_level/{}.{}.abf.txt".format(
-            cfg.config.reference_files.illumina_manifest_file.stem,
+            cfg.config.snp_array,
             cfg.config.software_params.contam_population,
         ),
     script:

From 36cfdaba7c9161ef99702d6e17c0b63338f3579d Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Mon, 10 Jun 2024 14:09:56 -0400
Subject: [PATCH 03/36] Adds a new idat_intensity workflow that separates
 median IDAT intensity retrieval from the verifyIDintensity checks bundled
 into contamination.smk

---
 .../workflow/sub_workflows/idat_intensity.smk | 119 ++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk

diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk b/src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk
new file mode 100644
index 00000000..791c3f1e
--- /dev/null
+++ b/src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk
@@ -0,0 +1,119 @@
+import pandas as pd
+
+from cgr_gwas_qc import load_config
+
+cfg = load_config()
+
+
+################################################################################
+# Contamination Targets
+################################################################################
+targets = [
+    "sample_level/contamination/median_idat_intensity.csv",
+]
+
+
+rule all_contamination:
+    input:
+        targets,
+
+
+################################################################################
+# PHONY Rules
+################################################################################
+# NOTE: Because of job grouping it is cleaner to wrap the various CLI utilities in
+# their own python script. This makes using conda more complicated. Instead of
+# installing all of the python dependencies in each environment, I will just
+# pass the conda environment and activate it internally.
However, we want to make +# sure that snakemake creates the environments, so these PHONY rules will make +# sure that the conda env exists. +localrules: + illuminaio_conda, + + +rule illuminaio_conda: + output: + temp(".illuminaio_env_built"), + conda: + cfg.conda("illuminaio") + shell: + "touch {output[0]}" + + +################################################################################ +# Workflow Rules +################################################################################ +if config.get("cluster_mode", False): + + localrules: + agg_median_idat_intensity, + + rule grouped_median_idat_intensity: + """Calculate median intensity of Red and Green channels.""" + input: + sample_sheet_csv="cgr_sample_sheet.csv", + _=rules.illuminaio_conda.output[0], + params: + grp="{grp}", + idat_red_pattern=lambda _: cfg.config.user_files.idat_pattern.red, + idat_green_pattern=lambda _: cfg.config.user_files.idat_pattern.green, + rscript=cfg.scripts("median_idat_intensity.R"), + conda_env=cfg.conda("illuminaio"), + notemp=config.get("notemp", False), + output: + temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, + time_hr=lambda wildcards, attempt: 4 * attempt, + script: + "../scripts/grouped_median_idat_intensity.py" + + rule agg_median_idat_intensity: + input: + expand(rules.grouped_median_idat_intensity.output[0], grp=cfg.cluster_groups), + params: + notemp=config.get("notemp", False), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_median_idat_intensity.py" + +else: + + rule per_sample_median_idat_intensity: + """Calculate median intensity of Red and Green channels.""" + input: + red=lambda wc: cfg.expand( + cfg.config.user_files.idat_pattern.red, + query=f"Sample_ID == '{wc.Sample_ID}'", + )[0], + green=lambda wc: cfg.expand( + cfg.config.user_files.idat_pattern.green, + query=f"Sample_ID == '{wc.Sample_ID}'", + )[0], + _=rules.illuminaio_conda.output[0], + params: + sample_id="{Sample_ID}", + rscript=cfg.scripts("median_idat_intensity.R"), + conda_env=cfg.conda("illuminaio"), + output: + temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"), + resources: + mem_mb=lambda wildcards, attempt: attempt * 1024, + script: + "../scripts/median_idat_intensity.py" + + rule agg_median_idat_intensity: + input: + cfg.expand(rules.per_sample_median_idat_intensity.output[0]), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_median_idat_intensity.py" From 3c8259ead550cc3685d576db87dc0da964d64712 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Mon, 15 Jul 2024 14:37:25 -0400 Subject: [PATCH 04/36] Adds vcf2abf.py and vcf2adpc.py as prep scripts for contamination checks with vcf entry --- src/cgr_gwas_qc/workflow/scripts/vcf2abf.py | 142 +++++++++++++++++++ src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py | 142 +++++++++++++++++++ 2 files changed, 284 insertions(+) create mode 100755 src/cgr_gwas_qc/workflow/scripts/vcf2abf.py create mode 100755 src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py new file mode 100755 index 00000000..900fe31c --- /dev/null +++ 
b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+# coding: utf-8

+# In[1]:


+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import pandas as pd
+import typer
+from pysam import VariantFile
+
+app = typer.Typer(add_completion=False)
+
+
+# In[2]:
+
+
+def is_biallelic_snp(rec):
+    return len(rec.ref) == 1 and len(rec.alts[0]) == 1
+
+
+# In[27]:
+
+
+@app.command()
+def main(
+    vcf_file: Path = typer.Argument(
+        ..., help="Path to the sample VCF.", exists=True, readable=True
+    ),
+    kgvcf_file: Path = typer.Argument(
+        ..., help="Path to the 1000 genomes VCF file to get ABF.", exists=True, readable=True
+    ),
+    population: str = typer.Argument(
+        ...,
+        help="Which population in VCF file to use for ABF [Not implemented; global AF used currently].",
+    ),
+    abf_file: Optional[Path] = typer.Argument(
+        None, help="Output file name. If none given it will use the `VCF_FILE + .abf.txt`."
+    ),
+):
+    """Pull out the B allele frequency for a given subpopulation.
+
+    Takes variants from the provided `vcf_file` and tries to find their B
+    allele frequency in the provided `kgvcf_file`. The B allele frequencies are
+    based on the allele frequency for the provided subpopulation
+    `population`. If the provided subpopulation doesn't exist, it will output
+    a list of available populations. If a variant is not found in the
+    `kgvcf_file`, its ABF will be set to "NA".
+    """
+
+    if abf_file is None:
+        abf_file = vcf_file.with_suffix(".abf.txt")
+
+    # vcf_file='/scratch/rajwanir2/data_catalog/QC-test/results/GSAMD-24v1-0-contamseries/vcf/GSAMD-24v1-0-contamseries.bcf'
+    # abf_file='/scratch/rajwanir2/data_catalog/QC-test/GSAMD-24v1-0.new.abf'
+    # kg_vcf='/DCEG/CGF/Bioinformatics/Production/data/thousG/hg38/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz'
+
+    # retrieving needed info from the input vcf
+    vcf = VariantFile(vcf_file, drop_samples=True)
+    vcf_info = []
+    for rec in vcf:
+        vcf_info.append(
+            [
+                rec.id,
+                rec.chrom,
+                rec.pos,
+                rec.ref[0],
+                rec.alts[0],
+                is_biallelic_snp(rec),
+                rec.info["GC_SCORE"],
+                rec.info["ALLELE_A"],
+            ]
+        )
+
+    vcf_info = pd.DataFrame(
+        vcf_info,
+        columns=["id", "chrom", "pos", "ref", "alt", "is_biallelic_snp", "gc_score", "allele_a"],
+    )
+    vcf_info["is_chrompos_uniq"] = ~vcf_info.duplicated(
+        subset=["chrom", "pos"], keep=False
+    )  # if the input variants are split, this will catch the multi-allelics
+
+    # fetching B-allele frequencies from the kgvcf
+    kg_vcf = VariantFile(kgvcf_file, drop_samples=True)
+    kg_info = []
+    for i in range(vcf_info.shape[0]):
+        if kg_vcf.is_valid_reference_name(vcf_info.chrom[i]):
+            for rec in kg_vcf.fetch(
+                contig=vcf_info.chrom[i], start=vcf_info.pos[i] - 1, stop=vcf_info.pos[i]
+            ):
+                if (
+                    is_biallelic_snp(rec)
+                    & (vcf_info.is_biallelic_snp[i])
+                    & (vcf_info.ref[i] == rec.ref[0])
+                    & (vcf_info.alt[i] == rec.alts[0])
+                    & (vcf_info.is_chrompos_uniq[i])
+                ):
+                    kg_info.append(
+                        [
+                            rec.chrom,
+                            rec.pos,
+                            rec.ref[0],
+                            rec.alts[0],
+                            rec.info["AC"][0],
+                            rec.info["AN"],
+                        ]
+                    )
+                # else:
+                #     kg_info.append([np.nan,np.nan])
+        # else:
+        #     kg_info.append([np.nan,np.nan])
+    kg_info = pd.DataFrame(kg_info, columns=["chrom", "pos", "ref", "alt", "ac", "an"])
+
+    # left-joining the kgvcf B-allele freq to input vcf metrics.
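+    # (the left join keeps every input marker; markers without a 1KG match keep
+    # NaN allele counts and end up as "NA" in the abf output)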
+    vcf_info = vcf_info.merge(
+        right=kg_info, how="left", sort=False, on=["chrom", "pos", "ref", "alt"]
+    )
+    # Flip the population AF into an Illumina B-allele frequency using ALLELE_A
+    vcf_info["af"] = abs(vcf_info["allele_a"] - (vcf_info["ac"] / vcf_info["an"]))
+    # Set AF to NA if the GenTrain score is not > 0 (can happen when the EGT file is an older one)
+    vcf_info["af"].where(vcf_info.gc_score > 0, np.nan, inplace=True)
+    # writing abf file
+    vcf_info[["id", "af"]].to_csv(
+        abf_file, sep="\t", index=False, header=["SNP_ID", "ABF"], na_rep="NA"
+    )
+
+
+# In[ ]:
+
+
+if __name__ == "__main__":
+    if "snakemake" in locals():
+        defaults = {}
+        defaults.update({k: Path(v) for k, v in snakemake.input.items()})  # type: ignore # noqa
+        defaults.update({k: Path(v) for k, v in snakemake.output.items()})  # type: ignore # noqa
+        defaults.update({k: v for k, v in snakemake.params.items()})  # type: ignore # noqa
+        main(**defaults)  # type: ignore
+    else:
+        app()
diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py
new file mode 100755
index 00000000..1bd75e42
--- /dev/null
+++ b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[1]:
+
+
+"""Extracts Illumina scores from a multisample VCF for a target sample and writes them into the single-sample adpc.bin format."""
+from pathlib import Path
+from typing import Generator
+
+import numpy as np
+import pandas as pd
+import typer
+from pysam import VariantFile
+
+# from more_itertools import unzip
+from cgr_gwas_qc.parsers.illumina import AdpcRecord, AdpcWriter
+
+# In[ ]:
+
+
+app = typer.Typer(add_completion=False)
+
+
+@app.command()
+def main(
+    vcf_file: Path = typer.Argument(
+        ..., help="Path to a multisample VCF file with needed scores.", exists=True, readable=True
+    ),
+    target_sample: str = typer.Argument(..., help="Sample name.", exists=True, readable=True),
+    outfile: Path = typer.Argument(
+        ..., help="Path to output the adpc.bin file.", file_okay=True, writable=True
+    ),
+) -> None:
+    """Extracts Illumina scores from a multisample VCF for a target sample and writes them into the single-sample adpc.bin format.
+
+    The adpc.bin format includes::
+
+        x_raw: Raw intensities for allele A
+        y_raw: Raw intensities for allele B
+        x_norm: Normalized intensities for allele A
+        y_norm: Normalized intensities for allele B
+        genotype_score: The genotype clustering confidence score
+        genotype: The called genotype (0: AA, 1: AB, 2: BB, 3: unknown or missing)
+    """
+
+    vcf = VariantFile(vcf_file)
+    sample_fields_to_fetch = ["X", "Y", "NORMX", "NORMY", "GT"]
+    info_fields_to_fetch = ["ALLELE_A", "ALLELE_B", "GC_SCORE"]
+
+    renaming_scheme = {
+        "X": "x_raw",
+        "Y": "y_raw",
+        "NORMX": "x_norm",
+        "NORMY": "y_norm",
+        "GC_SCORE": "genotype_score",
+    }
+
+    vcf_info = []
+    for rec in vcf:
+        vcf_info.append(
+            [rec.info.get(key) for key in info_fields_to_fetch]
+            + [rec.samples[target_sample].get(key) for key in sample_fields_to_fetch]
+        )
+    vcf_info = pd.DataFrame(vcf_info, columns=info_fields_to_fetch + sample_fields_to_fetch)
+
+    # convert GT to illumina genotype:
+    vcf_info = pd.concat(
+        [
+            vcf_info.reset_index(drop=True),
+            pd.DataFrame(vcf_info["GT"].tolist(), columns=["GT1", "GT2"]),
+        ],
+        axis=1,
+    )
+    vcf_info["genotype"] = np.nan  # setting missing by default.
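+    # GT holds allele indices (0=REF, 1=ALT), while the ALLELE_A/ALLELE_B INFO
+    # fields written by gtc2vcf hold the index of Illumina's A and B allele;
+    # the comparisons below map each diploid call onto the A/B genotype labels.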
+    vcf_info.loc[
+        (vcf_info["GT1"] == vcf_info["ALLELE_A"]) & (vcf_info["GT2"] == vcf_info["ALLELE_A"]),
+        "genotype",
+    ] = "AA"
+    vcf_info.loc[
+        (vcf_info["GT1"] == vcf_info["ALLELE_B"]) & (vcf_info["GT2"] == vcf_info["ALLELE_B"]),
+        "genotype",
+    ] = "BB"
+    vcf_info.loc[(vcf_info["GT1"] != vcf_info["GT2"]), "genotype"] = "AB"
+    vcf_info.loc[(vcf_info["GT1"].isna()) | (vcf_info["GT2"].isna()), "genotype"] = np.nan
+
+    # renaming for consistency with gtc2adpc
+    vcf_info.rename(columns=renaming_scheme, inplace=True)
+
+    # encode illumina genotypes per adpc.bin encoding
+    vcf_info = vcf_info.replace(
+        {"genotype": {"AA": 0, "AB": 1, "BA": 1, "BB": 2, "A": 3, "B": 3}}
+    ).fillna({"genotype": 3})
+    vcf_info["genotype_score"] = vcf_info["genotype_score"].where(vcf_info["x_raw"] != 0, 0)
+
+    # writing adpc.bin file
+    with AdpcWriter(outfile) as fh:
+        for record in vcf_to_adpc_record(vcf_info):
+            fh.write(record)
+
+
+# In[2]:
+
+
+## generator converting dataframe rows into adpc records
+def vcf_to_adpc_record(vcf_info) -> Generator[AdpcRecord, None, None]:
+    for x_raw, y_raw, x_norm, y_norm, genotype_score, genotype in zip(
+        vcf_info["x_raw"],
+        vcf_info["y_raw"],
+        vcf_info["x_norm"],
+        vcf_info["y_norm"],
+        vcf_info["genotype_score"],
+        vcf_info["genotype"],
+    ):
+        yield AdpcRecord(x_raw, y_raw, x_norm, y_norm, genotype_score, genotype)
+
+
+# In[ ]:
+
+
+# def adpc_as_df(adpc):
+#     i=0
+#     adpc_records = []
+#     with AdpcReader(adpc) as fh:
+#         for record in fh:
+#             # if i <10000:
+#             adpc_records.append(record)
+#             i+=1
+#     return(pd.DataFrame(adpc_records,columns=['x_raw', 'y_raw', 'x_norm', 'y_norm', 'genotype_score', 'genotype']))
+
+
+# In[ ]:
+
+
+if __name__ == "__main__":
+    if "snakemake" in locals():
+        defaults = {}
+        defaults.update({k: Path(v) for k, v in snakemake.input.items()})  # type: ignore # noqa
+        defaults.update({"outfile": Path(snakemake.output[0])})  # type: ignore # noqa
+        main(**defaults)
+    else:
+        app()

From a6cb033b96c6ffcc9f583a5a3d0e50c81bc7c558 Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Mon, 15 Jul 2024 15:49:55 -0400
Subject: [PATCH 05/36] Creates a BCF entry point.

Modifies the entry_points.smk to create a BCF entry point by simply
converting the BCF to a plink BED. Testing and validation yet to be done.
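
A minimal sketch of how a project would opt into this entry point, assuming
the remaining UserFiles fields stay optional (the BCF path below is
hypothetical):

    from pathlib import Path

    from cgr_gwas_qc.models.config.user_files import UserFiles

    # Register an aggregated BCF; entry_points.smk branches on this being set.
    user_files = UserFiles(bcf=Path("/path/to/aggregated_samples.bcf"))
    assert user_files.bcf is not None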
--- src/cgr_gwas_qc/models/config/user_files.py | 6 ++ .../workflow/sub_workflows/entry_points.smk | 95 +++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/src/cgr_gwas_qc/models/config/user_files.py b/src/cgr_gwas_qc/models/config/user_files.py index d795265e..b40eaa02 100644 --- a/src/cgr_gwas_qc/models/config/user_files.py +++ b/src/cgr_gwas_qc/models/config/user_files.py @@ -89,6 +89,12 @@ class UserFiles(BaseModel): description="The full path to an aggregated FAM file if the sample level GTC files are not available.", ) + # BCF + bcf: Optional[Path] = Field( + None, + description="The full path to an aggregated BCF/VCF file perferably encoding the GenCall scores.", + ) + @validator("gtc_pattern") def validate_gtc_pattern(cls, v): if v is None: diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk b/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk index 46903516..5baf412f 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk @@ -26,6 +26,64 @@ targets = [ ] +rule all_entry_points: + input: + targets, + + +################################################################################ +# PHONY Rules +################################################################################ +# NOTE: Because of job grouping it is cleaner to wrap the various CLI utilities in +# their own python script. This makes using conda more complicated. Instead of +# installing all of the python dependencies in each environment, I will just +# pass the conda environment and activate it internal. However, we want to make +# sure that snakemake creates the environments, so these PHONY rules will make +# sure that the conda env exists. +localrules: + plink_conda, + + +rule plink_conda: + output: + temp(".plink_env_built"), + conda: + cfg.conda("plink2") + shell: + "touch {output[0]}" + + +################################################################################ +# Workflow Rules +################################################################################ +"""Entry points into the QC workflow. + +This module contains all the different data entry points into the QC +workflow. The most common case is a set of sample level GTC files provided by +the user. + +All entry points should create: + + - sample_level/samples.bed + - sample_level/samples.bim + - sample_level/samples.fam +""" +from cgr_gwas_qc import load_config +from cgr_gwas_qc.parsers import sample_sheet + +cfg = load_config() + + +################################################################################ +# Entry Points Targets +################################################################################ +targets = [ + "sample_level/samples.bed", + "sample_level/samples.bim", + "sample_level/samples.fam", +] + + rule all_entry_points: input: targets, @@ -177,3 +235,40 @@ elif cfg.config.user_files.bed and cfg.config.user_files.bim and cfg.config.user "ln -s $(readlink -f {input.bed}) {output.bed} && " "ln -s $(readlink -f {input.bim}) {output.bim} && " "ln -s $(readlink -f {input.fam}) {output.fam}" + +elif cfg.config.user_files.bcf: + + ################################################################################ + # Aggregated BCF + ################################################################################ + localrules: + convert_bcf_to_plink_bed, + + rule convert_bcf_to_plink_bed: + """Converts BCF to plink BED file + + Path to aggregated BCF file is expected in user_files in config. 
The expected + VCF should be created using BCFtools/gtc2vcf. The BCF will be converted to BED + file for processing. + """ + input: + bcf=cfg.config.user_files.bcf, + params: + prefix="sample_level/samples", + output: + bed="sample_level/samples.bed", + bim="sample_level/samples.bim", + fam="sample_level/samples.fam", + log: + "sample_level/samples.log", + conda: + cfg.conda("plink2") + resources: + mem_mb=lambda wildcards, attempt: 1024 * 8 * attempt, + shell: + "plink " + "--allow-extra-chr 0 --keep-allele-order --double-id " + "--bcf {input.bcf} " + "--make-bed " + "--out {params.prefix} " + "--memory {resources.mem_mb}" From 0146c1b88ecaed855b76b5f556e1a3617bb0fdec Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Mon, 15 Jul 2024 16:04:19 -0400 Subject: [PATCH 06/36] Removes IDAT intensity checks from contamination.smk A previous commit puts them in a separate idat_intensity.smk --- .../workflow/sub_workflows/contamination.smk | 85 +------------------ 1 file changed, 2 insertions(+), 83 deletions(-) diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk index 832a6662..aa6f1d4c 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk @@ -9,7 +9,6 @@ cfg = load_config() # Contamination Targets ################################################################################ targets = [ - "sample_level/contamination/median_idat_intensity.csv", "sample_level/contamination/verifyIDintensity.csv", ] @@ -42,19 +41,9 @@ use rule pull_b_allele_freq_from_1kg from thousand_genomes # sure that snakemake creates the environments, so these PHONY rules will make # sure that the conda env exists. localrules: - illuminaio_conda, verifyidintensity_conda, -rule illuminaio_conda: - output: - temp(".illuminaio_env_built"), - conda: - cfg.conda("illuminaio") - shell: - "touch {output[0]}" - - rule verifyidintensity_conda: output: temp(".verifyidintensity_env_built"), @@ -70,43 +59,8 @@ rule verifyidintensity_conda: if config.get("cluster_mode", False): localrules: - agg_median_idat_intensity, agg_verifyidintensity, - rule grouped_median_idat_intensity: - """Calculate median intensity of Red and Green channels.""" - input: - sample_sheet_csv="cgr_sample_sheet.csv", - _=rules.illuminaio_conda.output[0], - params: - grp="{grp}", - idat_red_pattern=lambda _: cfg.config.user_files.idat_pattern.red, - idat_green_pattern=lambda _: cfg.config.user_files.idat_pattern.green, - rscript=cfg.scripts("median_idat_intensity.R"), - conda_env=cfg.conda("illuminaio"), - notemp=config.get("notemp", False), - output: - temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), - threads: 12 - resources: - mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, - time_hr=lambda wildcards, attempt: 4 * attempt, - script: - "../scripts/grouped_median_idat_intensity.py" - - rule agg_median_idat_intensity: - input: - expand(rules.grouped_median_idat_intensity.output[0], grp=cfg.cluster_groups), - params: - notemp=config.get("notemp", False), - output: - "sample_level/contamination/median_idat_intensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt ** 2, - script: - "../scripts/agg_median_idat_intensity.py" - rule grouped_contamination: """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. 
@@ -157,47 +111,12 @@ if config.get("cluster_mode", False): "sample_level/contamination/verifyIDintensity.csv", resources: mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt ** 2, + time_hr=lambda wildcards, attempt: attempt**2, script: "../scripts/agg_verifyidintensity.py" - else: - rule per_sample_median_idat_intensity: - """Calculate median intensity of Red and Green channels.""" - input: - red=lambda wc: cfg.expand( - cfg.config.user_files.idat_pattern.red, - query=f"Sample_ID == '{wc.Sample_ID}'", - )[0], - green=lambda wc: cfg.expand( - cfg.config.user_files.idat_pattern.green, - query=f"Sample_ID == '{wc.Sample_ID}'", - )[0], - _=rules.illuminaio_conda.output[0], - params: - sample_id="{Sample_ID}", - rscript=cfg.scripts("median_idat_intensity.R"), - conda_env=cfg.conda("illuminaio"), - output: - temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"), - resources: - mem_mb=lambda wildcards, attempt: attempt * 1024, - script: - "../scripts/median_idat_intensity.py" - - rule agg_median_idat_intensity: - input: - cfg.expand(rules.per_sample_median_idat_intensity.output[0]), - output: - "sample_level/contamination/median_idat_intensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt ** 2, - script: - "../scripts/agg_median_idat_intensity.py" - rule per_sample_gtc_to_adpc: """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. @@ -256,6 +175,6 @@ else: "sample_level/contamination/verifyIDintensity.csv", resources: mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt ** 2, + time_hr=lambda wildcards, attempt: attempt**2, script: "../scripts/agg_verifyidintensity.py" From 0dd09f1611e52a7dbdc89d4d639d67d8094cd6e7 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Tue, 16 Jul 2024 12:14:27 -0400 Subject: [PATCH 07/36] Adds rules contamination.smk to do contamination checks with bcf input. --- .../workflow/modules/thousand_genomes.smk | 25 ++ src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py | 3 +- .../workflow/sub_workflows/contamination.smk | 375 ++++++++++++------ 3 files changed, 281 insertions(+), 122 deletions(-) diff --git a/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk b/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk index 4141c2d6..f871a34e 100644 --- a/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk +++ b/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk @@ -28,6 +28,31 @@ rule pull_b_allele_freq_from_1kg: "../scripts/bpm2abf.py" +rule pull_b_allele_freq_from_1kg_bcfinput: + """Pulls the population level allele frequencies from the 1KG project. + + ``verifyIDintensity`` requires population level allele frequencies + for its model. Here we use a custom script to pull out the allele B + frequencies (ABF) from the 1000 genomes project (1KG). To do this we + take each marker from the manifest file (BPM) and pull out ABF in the + 1KG ``.vcf`` from the ``INFO`` column. The script allows pulling out + allele frequencies for different super populations but defaults to + ``AF`` which ignores super population. 
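+
+    Unlike the BPM-based rule above, the markers here come from the
+    user-provided aggregated VCF/BCF. Note that the 1KG frequency
+    (``AC``/``AN``) is an ALT-allele frequency; the vcf2abf.py script flips
+    it into a B-allele frequency whenever Illumina's A allele is the ALT
+    allele (per the ``ALLELE_A`` INFO field).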
+ """ + input: + vcf_file=cfg.config.user_files.bcf, + kgvcf_file=cfg.config.reference_files.thousand_genome_vcf, + params: + population=cfg.config.software_params.contam_population, + output: + abf_file="sample_level/{}.{}.abf.txt".format( + cfg.config.snp_array, + cfg.config.software_params.contam_population, + ), + script: + "../scripts/vcf2abf.py" + + rule update_snps_to_1kg_rsID: """Update SNP IDs to rsID from the 1KG project. diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py index 1bd75e42..6037f0a6 100755 --- a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py +++ b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py @@ -27,7 +27,7 @@ def main( vcf_file: Path = typer.Argument( ..., help="Path to a multisample VCF file with needed scores.", exists=True, readable=True ), - target_sample: str = typer.Argument(..., help="Sample name.", exists=True, readable=True), + target_sample: str = typer.Argument(..., help="Sample name."), outfile: Path = typer.Argument( ..., help="Path to output the adpc.bin file.", file_okay=True, writable=True ), @@ -136,6 +136,7 @@ def vcf_to_adpc_record(vcf_info) -> Generator[AdpcRecord, None, None]: if "snakemake" in locals(): defaults = {} defaults.update({k: Path(v) for k, v in snakemake.input.items()}) # type: ignore # noqa + defaults.update({k: v for k, v in snakemake.params.items()}) # type: ignore # noqa defaults.update({"outfile": Path(snakemake.output[0])}) # type: ignore # noqa main(**defaults) else: diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk index aa6f1d4c..d537880e 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk @@ -28,7 +28,16 @@ module thousand_genomes: {} -use rule pull_b_allele_freq_from_1kg from thousand_genomes +if cfg.config.user_files.bcf: + + use rule pull_b_allele_freq_from_1kg_bcfinput from thousand_genomes as pull_b_allele_freq_from_1kg with: + input: + vcf_file=cfg.config.user_files.bcf, + kgvcf_file=cfg.config.reference_files.thousand_genome_vcf, + +else: + + use rule pull_b_allele_freq_from_1kg from thousand_genomes ################################################################################ @@ -56,125 +65,249 @@ rule verifyidintensity_conda: ################################################################################ # Workflow Rules ################################################################################ -if config.get("cluster_mode", False): - - localrules: - agg_verifyidintensity, - - rule grouped_contamination: - """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. - - This is the format required by ``verifyIDintensity``. The script also - runs some sanity checks (intensities and normalized intensities > 0; - genotypes are one of {0, 1, 2, 3}) while processing each file. - - .. warning:: - This is a submission hot spot creating 1 job per sample. - - Find contaminated samples using allele intensities. - - Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the - population. - - .. warning:: - This is a submission hot spot creating 1 job per sample. - - .. note:: - Here we are running ``verifyIDintensity`` in single sample mode. This software also has a - multi-sample mode which may be faster and give better estimates. The problem with - multi-sample mode is that it only works when you have a "large" number of samples. 
- """ - input: - sample_sheet_csv="cgr_sample_sheet.csv", - bpm_file=cfg.config.reference_files.illumina_manifest_file, - abf_file=rules.pull_b_allele_freq_from_1kg.output.abf_file, - _=rules.verifyidintensity_conda.output[0], - params: - grp="{grp}", - gtc_pattern=lambda _: cfg.config.user_files.gtc_pattern, - snps=cfg.config.num_snps, - conda_env=cfg.conda("verifyidintensity"), - notemp=config.get("notemp", False), - output: - temp("sample_level/contamination/{grp}/verifyIDintensity.csv"), - threads: 12 - resources: - mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, - time_hr=lambda wildcards, attempt: 4 * attempt, - script: - "../scripts/grouped_contamination.py" - - rule agg_verifyidintensity: - input: - expand(rules.grouped_contamination.output[0], grp=cfg.cluster_groups), - output: - "sample_level/contamination/verifyIDintensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt**2, - script: - "../scripts/agg_verifyidintensity.py" +# If BCF file is input +if cfg.config.user_files.bcf: + if config.get("cluster_mode", False): + + localrules: + agg_verifyidintensity, + + rule grouped_contamination: + """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. + + This is the format required by ``verifyIDintensity``. The script also + runs some sanity checks (intensities and normalized intensities > 0; + genotypes are one of {0, 1, 2, 3}) while processing each file. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + + Find contaminated samples using allele intensities. + + Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the + population. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + + .. note:: + Here we are running ``verifyIDintensity`` in single sample mode. This software also has a + multi-sample mode which may be faster and give better estimates. The problem with + multi-sample mode is that it only works when you have a "large" number of samples. + """ + input: + sample_sheet_csv="cgr_sample_sheet.csv", + bpm_file=cfg.config.reference_files.illumina_manifest_file, + abf_file=rules.pull_b_allele_freq_from_1kg.output.abf_file, + _=rules.verifyidintensity_conda.output[0], + params: + grp="{grp}", + gtc_pattern=lambda _: cfg.config.user_files.gtc_pattern, + snps=cfg.config.num_snps, + conda_env=cfg.conda("verifyidintensity"), + notemp=config.get("notemp", False), + output: + temp("sample_level/contamination/{grp}/verifyIDintensity.csv"), + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, + time_hr=lambda wildcards, attempt: 4 * attempt, + script: + "../scripts/grouped_contamination.py" + + rule agg_verifyidintensity: + input: + expand(rules.grouped_contamination.output[0], grp=cfg.cluster_groups), + output: + "sample_level/contamination/verifyIDintensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_verifyidintensity.py" + + else: + + rule per_sample_vcf_to_adpc: + """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. + + This is the format required by ``verifyIDintensity``. The script also + runs some sanity checks (intensities and normalized intensities > 0; + genotypes are one of {0, 1, 2, 3}) while processing each file. + + .. warning:: + This is a submission hot spot creating 1 job per sample. 
+ """ + input: + vcf_file=cfg.config.user_files.bcf, + params: + target_sample="{Sample_ID}", + output: + temp("sample_level/per_sample_adpc/{Sample_ID}.adpc.bin"), + resources: + mem_mb=lambda wildcards, attempt: attempt * 1024, + script: + "../scripts/vcf2adpc.py" + + rule per_sample_contamination_verifyIDintensity: + """Find contaminated samples using allele intensities. + + Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the + population. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + + .. note:: + Here we are running ``verifyIDintensity`` in single sample mode. This software also has a + multi-sample mode which may be faster and give better estimates. The problem with + multi-sample mode is that it only works when you have a "large" number of samples. + """ + input: + adpc=rules.per_sample_vcf_to_adpc.output[0], + abf=rules.pull_b_allele_freq_from_1kg.output.abf_file, + _=rules.verifyidintensity_conda.output[0], + params: + snps=cfg.config.num_snps, + conda_env=cfg.conda("verifyidintensity"), + output: + temp("sample_level/per_sample_contamination_test/{Sample_ID}.contam.out"), + resources: + mem_mb=lambda wildcards, attempt: attempt * 1024, + script: + "../scripts/verifyidintensity.py" + + rule agg_verifyidintensity: + input: + cfg.expand(rules.per_sample_contamination_verifyIDintensity.output[0]), + output: + "sample_level/contamination/verifyIDintensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_verifyidintensity.py" else: - - rule per_sample_gtc_to_adpc: - """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. - - This is the format required by ``verifyIDintensity``. The script also - runs some sanity checks (intensities and normalized intensities > 0; - genotypes are one of {0, 1, 2, 3}) while processing each file. - - .. warning:: - This is a submission hot spot creating 1 job per sample. - """ - input: - gtc_file=lambda wc: cfg.expand( - cfg.config.user_files.gtc_pattern, - query=f"Sample_ID == '{wc.Sample_ID}'", - )[0], - bpm_file=cfg.config.reference_files.illumina_manifest_file, - output: - temp("sample_level/per_sample_adpc/{Sample_ID}.adpc.bin"), - resources: - mem_mb=lambda wildcards, attempt: attempt * 1024, - script: - "../scripts/gtc2adpc.py" - - rule per_sample_contamination_verifyIDintensity: - """Find contaminated samples using allele intensities. - - Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the - population. - - .. warning:: - This is a submission hot spot creating 1 job per sample. - - .. note:: - Here we are running ``verifyIDintensity`` in single sample mode. This software also has a - multi-sample mode which may be faster and give better estimates. The problem with - multi-sample mode is that it only works when you have a "large" number of samples. 
- """ - input: - adpc=rules.per_sample_gtc_to_adpc.output[0], - abf=rules.pull_b_allele_freq_from_1kg.output.abf_file, - _=rules.verifyidintensity_conda.output[0], - params: - snps=cfg.config.num_snps, - conda_env=cfg.conda("verifyidintensity"), - output: - temp("sample_level/per_sample_contamination_test/{Sample_ID}.contam.out"), - resources: - mem_mb=lambda wildcards, attempt: attempt * 1024, - script: - "../scripts/verifyidintensity.py" - - rule agg_verifyidintensity: - input: - cfg.expand(rules.per_sample_contamination_verifyIDintensity.output[0]), - output: - "sample_level/contamination/verifyIDintensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt**2, - script: - "../scripts/agg_verifyidintensity.py" + if config.get("cluster_mode", False): + + localrules: + agg_verifyidintensity, + + rule grouped_contamination: + """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. + + This is the format required by ``verifyIDintensity``. The script also + runs some sanity checks (intensities and normalized intensities > 0; + genotypes are one of {0, 1, 2, 3}) while processing each file. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + + Find contaminated samples using allele intensities. + + Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the + population. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + + .. note:: + Here we are running ``verifyIDintensity`` in single sample mode. This software also has a + multi-sample mode which may be faster and give better estimates. The problem with + multi-sample mode is that it only works when you have a "large" number of samples. + """ + input: + sample_sheet_csv="cgr_sample_sheet.csv", + bpm_file=cfg.config.reference_files.illumina_manifest_file, + abf_file=rules.pull_b_allele_freq_from_1kg.output.abf_file, + _=rules.verifyidintensity_conda.output[0], + params: + grp="{grp}", + gtc_pattern=lambda _: cfg.config.user_files.gtc_pattern, + snps=cfg.config.num_snps, + conda_env=cfg.conda("verifyidintensity"), + notemp=config.get("notemp", False), + output: + temp("sample_level/contamination/{grp}/verifyIDintensity.csv"), + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, + time_hr=lambda wildcards, attempt: 4 * attempt, + script: + "../scripts/grouped_contamination.py" + + rule agg_verifyidintensity: + input: + expand(rules.grouped_contamination.output[0], grp=cfg.cluster_groups), + output: + "sample_level/contamination/verifyIDintensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_verifyidintensity.py" + + else: + + rule per_sample_gtc_to_adpc: + """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. + + This is the format required by ``verifyIDintensity``. The script also + runs some sanity checks (intensities and normalized intensities > 0; + genotypes are one of {0, 1, 2, 3}) while processing each file. + + .. warning:: + This is a submission hot spot creating 1 job per sample. 
+        """
+        input:
+            gtc_file=lambda wc: cfg.expand(
+                cfg.config.user_files.gtc_pattern,
+                query=f"Sample_ID == '{wc.Sample_ID}'",
+            )[0],
+            bpm_file=cfg.config.reference_files.illumina_manifest_file,
+        output:
+            temp("sample_level/per_sample_adpc/{Sample_ID}.adpc.bin"),
+        resources:
+            mem_mb=lambda wildcards, attempt: attempt * 1024,
+        script:
+            "../scripts/gtc2adpc.py"
+
+    rule per_sample_contamination_verifyIDintensity:
+        """Find contaminated samples using allele intensities.
+
+        Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the
+        population.
+
+        .. warning::
+            This is a submission hot spot creating 1 job per sample.
+
+        .. note::
+            Here we are running ``verifyIDintensity`` in single sample mode. This software also has a
+            multi-sample mode which may be faster and give better estimates. The problem with
+            multi-sample mode is that it only works when you have a "large" number of samples.
+        """
+        input:
+            adpc=rules.per_sample_gtc_to_adpc.output[0],
+            abf=rules.pull_b_allele_freq_from_1kg.output.abf_file,
+            _=rules.verifyidintensity_conda.output[0],
+        params:
+            snps=cfg.config.num_snps,
+            conda_env=cfg.conda("verifyidintensity"),
+        output:
+            temp("sample_level/per_sample_contamination_test/{Sample_ID}.contam.out"),
+        resources:
+            mem_mb=lambda wildcards, attempt: attempt * 1024,
+        script:
+            "../scripts/verifyidintensity.py"
+
+    rule agg_verifyidintensity:
+        input:
+            cfg.expand(rules.per_sample_contamination_verifyIDintensity.output[0]),
+        output:
+            "sample_level/contamination/verifyIDintensity.csv",
+        resources:
+            mem_gb=lambda wildcards, attempt: attempt * 4,
+            time_hr=lambda wildcards, attempt: attempt**2,
+        script:
+            "../scripts/agg_verifyidintensity.py"

From 81697d6f6acb5df40bbad0d983cfb733084600ee Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Tue, 16 Jul 2024 15:11:41 -0400
Subject: [PATCH 08/36] Enables contamination check with bcf input in cluster
 mode.

Modifies contamination.smk and grouped_contamination.py to enable
contamination check in cluster mode.
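
For reference, the per-sample unit of work that the grouped script fans out
across the process pool looks roughly like this (sample ID and paths are
hypothetical):

    from pathlib import Path

    from cgr_gwas_qc.workflow.scripts import vcf2adpc

    # Pull one sample's scores out of the shared BCF and write them as a
    # single-sample adpc.bin for verifyIDintensity.
    vcf2adpc.main(
        vcf_file=Path("samples.bcf"),
        target_sample="SB00001",
        outfile=Path("SB00001.adpc.bin"),
    )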
---
 .../workflow/scripts/grouped_contamination.py | 68 +++++++++++++++----
 .../workflow/sub_workflows/contamination.smk  |  6 +-
 2 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py b/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py
index a8f79749..7c8083be 100644
--- a/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py
+++ b/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py
@@ -1,26 +1,42 @@
 import shutil
 from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
-from typing import List
+from typing import List, Optional

+import typer
 from snakemake.rules import expand

+from cgr_gwas_qc import load_config
 from cgr_gwas_qc.parsers import sample_sheet
 from cgr_gwas_qc.typing import PathLike
-from cgr_gwas_qc.workflow.scripts import agg_verifyidintensity, gtc2adpc, verifyidintensity
+from cgr_gwas_qc.workflow.scripts import (
+    agg_verifyidintensity,
+    gtc2adpc,
+    vcf2adpc,
+    verifyidintensity,
+)

+cfg = load_config()
+app = typer.Typer(add_completion=False)

+@app.command()
 def main(
-    sample_sheet_csv: PathLike,
-    bpm_file: PathLike,
-    abf_file: PathLike,
-    grp: str,
-    gtc_pattern: str,
-    snps: int,
-    conda_env: PathLike,
-    outfile: PathLike,
-    notemp: bool = False,
-    threads: int = 8,
+    sample_sheet_csv: PathLike = typer.Argument(..., help="Path to cgr_sample_sheet.csv"),
+    bpm_file: Optional[PathLike] = typer.Argument(
+        ..., help="Path to the BPM file [optional; used with GTC input]"
+    ),
+    abf_file: PathLike = typer.Argument(..., help="Allele B frequency file"),
+    vcf_file: Optional[PathLike] = typer.Argument(
+        ..., help="Path to the VCF/BCF file [optional; used with VCF/BCF input]"
+    ),
+    grp: str = typer.Argument(..., help="cluster group"),
+    gtc_pattern: Optional[str] = typer.Argument(..., help="gtc_pattern"),
+    snps: int = typer.Argument(..., help="number of SNPs"),
+    conda_env: PathLike = typer.Argument(..., help="conda environment"),
+    outfile: PathLike = typer.Argument(..., help="output file name"),
+    notemp: bool = typer.Argument(False, help="No temporary files should be created"),
+    threads: int = typer.Argument(8, help="number of threads"),
 ):
     ss = sample_sheet.read(sample_sheet_csv).query(f"cluster_group == '{grp}'")
@@ -37,8 +53,12 @@ def main(
     adpc_pattern = (tmp_adpc / "{Sample_ID}.adpc.bin").as_posix()
     contam_pattern = (tmp_contam / "{Sample_ID}.txt").as_posix()

-    # Convert GTC to ADPC
-    convert_gtc_to_adpc(ss, bpm_file, gtc_pattern, adpc_pattern, threads)
+    # If a BCF/VCF file is the input, convert it to per-sample ADPC files
+    if cfg.config.user_files.bcf:
+        convert_vcf_to_adpc(ss, vcf_file, adpc_pattern, threads)
+    # Otherwise, convert GTC to ADPC (the default)
+    else:
+        convert_gtc_to_adpc(ss, bpm_file, gtc_pattern, adpc_pattern, threads)

     # Estimate Contamination
     contam_files = estimate_contamination(
@@ -71,6 +91,26 @@ def convert_gtc_to_adpc(ss, bpm_file, gtc_pattern, outfile_pattern, threads):
     assert all(f.exists() for f in outfiles)


+def convert_vcf_to_adpc(ss, vcf_file, outfile_pattern, threads):
+    with ProcessPoolExecutor(threads) as executor:
+        futures = [
+            executor.submit(
+                vcf2adpc.main,
+                vcf_file,
+                record.Sample_ID,
+                outfile_pattern.format(**record.to_dict()),
+            )
+            for _, record in ss.iterrows()
+        ]
+
+        # Wait for results
+        [res.result() for res in futures]
+
+    # Check the expected output files
+    outfiles = [Path(f) for f in expand(outfile_pattern, **ss.to_dict("list"))]
+    assert all(f.exists() for f in outfiles)
+
+
 def estimate_contamination(
     ss, adpc_pattern, abf_file,
outfile_pattern, snps, conda_env, threads ) -> List[Path]: diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk index d537880e..ca520cd9 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk @@ -73,7 +73,8 @@ if cfg.config.user_files.bcf: agg_verifyidintensity, rule grouped_contamination: - """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. + """Extracts normalized intensities and other metrics from an aggregated + VCF file and writes to Illumina ADPC.BIN per sample. This is the format required by ``verifyIDintensity``. The script also runs some sanity checks (intensities and normalized intensities > 0; @@ -97,12 +98,11 @@ if cfg.config.user_files.bcf: """ input: sample_sheet_csv="cgr_sample_sheet.csv", - bpm_file=cfg.config.reference_files.illumina_manifest_file, + vcf_file=cfg.config.user_files.bcf, abf_file=rules.pull_b_allele_freq_from_1kg.output.abf_file, _=rules.verifyidintensity_conda.output[0], params: grp="{grp}", - gtc_pattern=lambda _: cfg.config.user_files.gtc_pattern, snps=cfg.config.num_snps, conda_env=cfg.conda("verifyidintensity"), notemp=config.get("notemp", False), From 56dd2459f94e252c93dabedf3484fe21827dc9fc Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Mon, 22 Jul 2024 10:42:18 -0400 Subject: [PATCH 09/36] Adds capability to infer median intensity directly from input BCF/VCF. Avoids the dependency on IDAT files for calculating median intensity with VCF/BCF input. Adds scripts for both in per-sample and grouped/cluster mode. Modifies the intensity workflow to execute appropriately if VCF/BCF input. --- .../scripts/grouped_intensity_from_vcf.py | 75 +++++++ .../scripts/median_intensity_from_vcf.py | 54 +++++ .../workflow/sub_workflows/idat_intensity.smk | 209 ++++++++++++------ 3 files changed, 265 insertions(+), 73 deletions(-) create mode 100755 src/cgr_gwas_qc/workflow/scripts/grouped_intensity_from_vcf.py create mode 100755 src/cgr_gwas_qc/workflow/scripts/median_intensity_from_vcf.py diff --git a/src/cgr_gwas_qc/workflow/scripts/grouped_intensity_from_vcf.py b/src/cgr_gwas_qc/workflow/scripts/grouped_intensity_from_vcf.py new file mode 100755 index 00000000..9bfd1411 --- /dev/null +++ b/src/cgr_gwas_qc/workflow/scripts/grouped_intensity_from_vcf.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[5]: + + +import shutil +from concurrent.futures import ProcessPoolExecutor +from pathlib import Path +from typing import List + +from snakemake.rules import expand + +from cgr_gwas_qc.parsers import sample_sheet +from cgr_gwas_qc.typing import PathLike +from cgr_gwas_qc.workflow.scripts import agg_median_idat_intensity, median_intensity_from_vcf + +# In[ ]: + + +def main( + vcf_file: PathLike, + sample_sheet_csv: PathLike, + grp: str, + outfile: PathLike, + notemp: bool = False, + threads: int = 8, +): + ss = sample_sheet.read(sample_sheet_csv).query(f"cluster_group == '{grp}'") + tmp_dir = Path(outfile).parent / "temp_median_idat" + tmp_dir.mkdir(exist_ok=True, parents=True) + + intensity_files = calculate_median_intensity_from_vcf(ss, vcf_file, tmp_dir, threads) + + agg_median_idat_intensity.main(intensity_files, outfile) + + if not notemp: + shutil.rmtree(tmp_dir) + + +# In[10]: + + +def calculate_median_intensity_from_vcf(ss, vcf_file, outdir, threads) -> List[Path]: + outfile_pattern = (outdir / "{Sample_ID}.csv").as_posix() + with ProcessPoolExecutor(threads) as executor: + futures = [ + 
executor.submit( + median_intensity_from_vcf.main, + vcf_file, + record.Sample_ID, + outfile_pattern.format(**record.to_dict()), + ) + for _, record in ss.iterrows() + ] + + # Wait for results + [res.result() for res in futures] + + # Check and Return files + outfiles = [Path(f) for f in expand(outfile_pattern, **ss.to_dict("list"))] + assert all(f.exists() for f in outfiles) + return outfiles + + +# In[6]: + + +if __name__ == "__main__" and "snakemake" in locals(): + main( + **{k: v for k, v in snakemake.input.items()}, # type: ignore # noqa + **{k: v for k, v in snakemake.params.items()}, # type: ignore # noqa + outfile=snakemake.output[0], # type: ignore # noqa + threads=snakemake.threads or 2, # type: ignore # noqa + ) diff --git a/src/cgr_gwas_qc/workflow/scripts/median_intensity_from_vcf.py b/src/cgr_gwas_qc/workflow/scripts/median_intensity_from_vcf.py new file mode 100755 index 00000000..5e541fd2 --- /dev/null +++ b/src/cgr_gwas_qc/workflow/scripts/median_intensity_from_vcf.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[ ]: + + +from pathlib import Path +from statistics import median + +import numpy as np +import pandas as pd +import typer +from pysam import VariantFile + +app = typer.Typer(add_completion=False) + + +@app.command() +def main( + vcf_file: Path = typer.Argument( + ..., + help="Path to a multisample BCF/VCF file with needed scores.", + exists=True, + readable=True, + ), + sample_id: str = typer.Argument(..., help="Sample_ID. Should match as in VCF"), + outfile: Path = typer.Argument( + ..., + help="Path to output file to write per sample median intensity.", + file_okay=True, + writable=True, + ), +) -> None: + fields_to_fetch = ["X", "Y"] + vcf = VariantFile(vcf_file) + + # loops over all records in the VCF, fetches the 'X' and 'Y' intensity values, sums them and stores them in the list + # of 'sample_values'.z + sample_values = [] + for rec in vcf: + sample_values.append(np.sum([rec.samples[sample_id].get(key) for key in fields_to_fetch])) + vcf.close() + + median_intensity = median(sample_values) + df = pd.DataFrame({"Sample_ID": [sample_id], "median_intensity": [median_intensity]}) + df.to_csv(outfile, index=False) + + +if __name__ == "__main__" and "snakemake" in locals(): + main( + **{k: v for k, v in snakemake.input.items() if not k.startswith("_")}, # type: ignore # noqa + **{k: v for k, v in snakemake.params.items()}, # type: ignore # noqa + outfile=snakemake.output[0], # type: ignore # noqa + ) diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk b/src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk index 791c3f1e..73cb1d6e 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk @@ -6,7 +6,7 @@ cfg = load_config() ################################################################################ -# Contamination Targets +# Intensity Targets ################################################################################ targets = [ "sample_level/contamination/median_idat_intensity.csv", @@ -43,77 +43,140 @@ rule illuminaio_conda: ################################################################################ # Workflow Rules ################################################################################ -if config.get("cluster_mode", False): - - localrules: - agg_median_idat_intensity, - - rule grouped_median_idat_intensity: - """Calculate median intensity of Red and Green channels.""" - input: - sample_sheet_csv="cgr_sample_sheet.csv", 
- _=rules.illuminaio_conda.output[0], - params: - grp="{grp}", - idat_red_pattern=lambda _: cfg.config.user_files.idat_pattern.red, - idat_green_pattern=lambda _: cfg.config.user_files.idat_pattern.green, - rscript=cfg.scripts("median_idat_intensity.R"), - conda_env=cfg.conda("illuminaio"), - notemp=config.get("notemp", False), - output: - temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), - threads: 12 - resources: - mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, - time_hr=lambda wildcards, attempt: 4 * attempt, - script: - "../scripts/grouped_median_idat_intensity.py" - - rule agg_median_idat_intensity: - input: - expand(rules.grouped_median_idat_intensity.output[0], grp=cfg.cluster_groups), - params: - notemp=config.get("notemp", False), - output: - "sample_level/contamination/median_idat_intensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt**2, - script: - "../scripts/agg_median_idat_intensity.py" +if cfg.config.user_files.bcf: + if config.get("cluster_mode", False): + + localrules: + agg_median_idat_intensity, + + rule grouped_median_intensity_from_vcf: + """Calculate median intensity from raw intensities using VCF/BCF input.""" + input: + sample_sheet_csv="cgr_sample_sheet.csv", + vcf_file=cfg.config.user_files.bcf, + params: + grp="{grp}", + notemp=config.get("notemp", False), + output: + temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, + time_hr=lambda wildcards, attempt: 4 * attempt, + script: + "../scripts/grouped_intensity_from_vcf.py" + + rule agg_median_idat_intensity: + input: + expand(rules.grouped_median_intensity_from_vcf.output[0], grp=cfg.cluster_groups), + params: + notemp=config.get("notemp", False), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_median_idat_intensity.py" + + else: + + rule per_sample_median_intensity_from_vcf: + """Calculate median intensity from raw intensities using VCF/BCF input.""" + input: + vcf_file=cfg.config.user_files.bcf, + params: + sample_id="{Sample_ID}", + output: + temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"), + resources: + mem_mb=lambda wildcards, attempt: attempt * 1024, + script: + "../scripts/median_intensity_from_vcf.py" + + rule agg_median_idat_intensity: + input: + cfg.expand(rules.per_sample_median_intensity_from_vcf.output[0]), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_median_idat_intensity.py" else: - - rule per_sample_median_idat_intensity: - """Calculate median intensity of Red and Green channels.""" - input: - red=lambda wc: cfg.expand( - cfg.config.user_files.idat_pattern.red, - query=f"Sample_ID == '{wc.Sample_ID}'", - )[0], - green=lambda wc: cfg.expand( - cfg.config.user_files.idat_pattern.green, - query=f"Sample_ID == '{wc.Sample_ID}'", - )[0], - _=rules.illuminaio_conda.output[0], - params: - sample_id="{Sample_ID}", - rscript=cfg.scripts("median_idat_intensity.R"), - conda_env=cfg.conda("illuminaio"), - output: - temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"), - resources: - mem_mb=lambda wildcards, attempt: attempt * 1024, - 
script: - "../scripts/median_idat_intensity.py" - - rule agg_median_idat_intensity: - input: - cfg.expand(rules.per_sample_median_idat_intensity.output[0]), - output: - "sample_level/contamination/median_idat_intensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt**2, - script: - "../scripts/agg_median_idat_intensity.py" + if config.get("cluster_mode", False): + + localrules: + agg_median_idat_intensity, + + rule grouped_median_idat_intensity: + """Calculate median intensity of Red and Green channels.""" + input: + sample_sheet_csv="cgr_sample_sheet.csv", + _=rules.illuminaio_conda.output[0], + params: + grp="{grp}", + idat_red_pattern=lambda _: cfg.config.user_files.idat_pattern.red, + idat_green_pattern=lambda _: cfg.config.user_files.idat_pattern.green, + rscript=cfg.scripts("median_idat_intensity.R"), + conda_env=cfg.conda("illuminaio"), + notemp=config.get("notemp", False), + output: + temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, + time_hr=lambda wildcards, attempt: 4 * attempt, + script: + "../scripts/grouped_median_idat_intensity.py" + + rule agg_median_idat_intensity: + input: + expand(rules.grouped_median_idat_intensity.output[0], grp=cfg.cluster_groups), + params: + notemp=config.get("notemp", False), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_median_idat_intensity.py" + + else: + + rule per_sample_median_idat_intensity: + """Calculate median intensity of Red and Green channels.""" + input: + red=lambda wc: cfg.expand( + cfg.config.user_files.idat_pattern.red, + query=f"Sample_ID == '{wc.Sample_ID}'", + )[0], + green=lambda wc: cfg.expand( + cfg.config.user_files.idat_pattern.green, + query=f"Sample_ID == '{wc.Sample_ID}'", + )[0], + _=rules.illuminaio_conda.output[0], + params: + sample_id="{Sample_ID}", + rscript=cfg.scripts("median_idat_intensity.R"), + conda_env=cfg.conda("illuminaio"), + output: + temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"), + resources: + mem_mb=lambda wildcards, attempt: attempt * 1024, + script: + "../scripts/median_idat_intensity.py" + + rule agg_median_idat_intensity: + input: + cfg.expand(rules.per_sample_median_idat_intensity.output[0]), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_median_idat_intensity.py" From 0aaaa9b144baae3ef6eca7fbdd7512b911c81f60 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Mon, 22 Jul 2024 12:05:25 -0400 Subject: [PATCH 10/36] Adds the distinction of contamination and intensity checks in the snakefile and sample_qc subworkflow Other than existing 'use_contamination' checks, also adds 'intensity_retreived' and 'contamination_checked' tests which simply tests specifically if output csv files were created regardless of configs/entry point to feed them to sample_qc. 
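
In spirit, each new test reduces to a file-existence check along these lines
(the function name mirrors the quoted test name and is hypothetical):

    from pathlib import Path

    def intensity_retreived() -> bool:
        # True once the aggregated median-intensity table exists, regardless
        # of which entry point or config produced it.
        return Path(
            "sample_level/contamination/median_idat_intensity.csv"
        ).exists()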
--- src/cgr_gwas_qc/workflow/Snakefile | 14 +- .../sub_workflows/intensity_check.smk | 182 ++++++++++++++++++ .../workflow/sub_workflows/sample_qc.smk | 10 +- 3 files changed, 203 insertions(+), 3 deletions(-) create mode 100644 src/cgr_gwas_qc/workflow/sub_workflows/intensity_check.smk diff --git a/src/cgr_gwas_qc/workflow/Snakefile b/src/cgr_gwas_qc/workflow/Snakefile index cf38277b..92fccd24 100644 --- a/src/cgr_gwas_qc/workflow/Snakefile +++ b/src/cgr_gwas_qc/workflow/Snakefile @@ -58,10 +58,22 @@ if sample_qc.use_contamination: use rule * from contamination - targets.extend(contamination.targets) +if sample_qc.use_contamination: + + module intensity_check: + snakefile: + cfg.subworkflow("intensity_check") + config: + config + + use rule * from intensity_check + + targets.extend(intensity_check.targets) + + module subject_qc: snakefile: cfg.subworkflow("subject_qc") diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/intensity_check.smk b/src/cgr_gwas_qc/workflow/sub_workflows/intensity_check.smk new file mode 100644 index 00000000..73cb1d6e --- /dev/null +++ b/src/cgr_gwas_qc/workflow/sub_workflows/intensity_check.smk @@ -0,0 +1,182 @@ +import pandas as pd + +from cgr_gwas_qc import load_config + +cfg = load_config() + + +################################################################################ +# Intensity Targets +################################################################################ +targets = [ + "sample_level/contamination/median_idat_intensity.csv", +] + + +rule all_contamination: + input: + targets, + + +################################################################################ +# PHONY Rules +################################################################################ +# NOTE: Because of job grouping it is cleaner to wrap the various CLI utilities in +# their own python script. This makes using conda more complicated. Instead of +# installing all of the python dependencies in each environment, I will just +# pass the conda environment and activate it internal. However, we want to make +# sure that snakemake creates the environments, so these PHONY rules will make +# sure that the conda env exists. 
+localrules: + illuminaio_conda, + + +rule illuminaio_conda: + output: + temp(".illuminaio_env_built"), + conda: + cfg.conda("illuminaio") + shell: + "touch {output[0]}" + + +################################################################################ +# Workflow Rules +################################################################################ +if cfg.config.user_files.bcf: + if config.get("cluster_mode", False): + + localrules: + agg_median_idat_intensity, + + rule grouped_median_intensity_from_vcf: + """Calculate median intensity from raw intensities using VCF/BCF input.""" + input: + sample_sheet_csv="cgr_sample_sheet.csv", + vcf_file=cfg.config.user_files.bcf, + params: + grp="{grp}", + notemp=config.get("notemp", False), + output: + temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, + time_hr=lambda wildcards, attempt: 4 * attempt, + script: + "../scripts/grouped_intensity_from_vcf.py" + + rule agg_median_idat_intensity: + input: + expand(rules.grouped_median_intensity_from_vcf.output[0], grp=cfg.cluster_groups), + params: + notemp=config.get("notemp", False), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_median_idat_intensity.py" + + else: + + rule per_sample_median_intensity_from_vcf: + """Calculate median intensity from raw intensities using VCF/BCF input.""" + input: + vcf_file=cfg.config.user_files.bcf, + params: + sample_id="{Sample_ID}", + output: + temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"), + resources: + mem_mb=lambda wildcards, attempt: attempt * 1024, + script: + "../scripts/median_intensity_from_vcf.py" + + rule agg_median_idat_intensity: + input: + cfg.expand(rules.per_sample_median_intensity_from_vcf.output[0]), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_median_idat_intensity.py" + +else: + if config.get("cluster_mode", False): + + localrules: + agg_median_idat_intensity, + + rule grouped_median_idat_intensity: + """Calculate median intensity of Red and Green channels.""" + input: + sample_sheet_csv="cgr_sample_sheet.csv", + _=rules.illuminaio_conda.output[0], + params: + grp="{grp}", + idat_red_pattern=lambda _: cfg.config.user_files.idat_pattern.red, + idat_green_pattern=lambda _: cfg.config.user_files.idat_pattern.green, + rscript=cfg.scripts("median_idat_intensity.R"), + conda_env=cfg.conda("illuminaio"), + notemp=config.get("notemp", False), + output: + temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, + time_hr=lambda wildcards, attempt: 4 * attempt, + script: + "../scripts/grouped_median_idat_intensity.py" + + rule agg_median_idat_intensity: + input: + expand(rules.grouped_median_idat_intensity.output[0], grp=cfg.cluster_groups), + params: + notemp=config.get("notemp", False), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_median_idat_intensity.py" + + else: + + rule per_sample_median_idat_intensity: + """Calculate median intensity of Red 
and Green channels."""
+            input:
+                red=lambda wc: cfg.expand(
+                    cfg.config.user_files.idat_pattern.red,
+                    query=f"Sample_ID == '{wc.Sample_ID}'",
+                )[0],
+                green=lambda wc: cfg.expand(
+                    cfg.config.user_files.idat_pattern.green,
+                    query=f"Sample_ID == '{wc.Sample_ID}'",
+                )[0],
+                _=rules.illuminaio_conda.output[0],
+            params:
+                sample_id="{Sample_ID}",
+                rscript=cfg.scripts("median_idat_intensity.R"),
+                conda_env=cfg.conda("illuminaio"),
+            output:
+                temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"),
+            resources:
+                mem_mb=lambda wildcards, attempt: attempt * 1024,
+            script:
+                "../scripts/median_idat_intensity.py"
+
+        rule agg_median_idat_intensity:
+            input:
+                cfg.expand(rules.per_sample_median_idat_intensity.output[0]),
+            output:
+                "sample_level/contamination/median_idat_intensity.csv",
+            resources:
+                mem_gb=lambda wildcards, attempt: attempt * 4,
+                time_hr=lambda wildcards, attempt: attempt**2,
+            script:
+                "../scripts/agg_median_idat_intensity.py"
diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
index f6ee2231..490a825b 100644
--- a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
+++ b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
@@ -11,6 +11,10 @@ use_contamination = (
     and cfg.config.workflow_params.remove_contam
 )
 
+idat_intensity_retrieved = Path("sample_level/contamination/median_idat_intensity.csv").is_file()
+
+contamination_checked = Path("sample_level/contamination/verifyIDintensity.csv").is_file()
+
 
 localrules:
     all_sample_qc,
@@ -196,7 +200,7 @@ use rule miss from plink as plink_call_rate_post2 with:
 # -------------------------------------------------------------------------------
 # Contamination
 # -------------------------------------------------------------------------------
-if use_contamination:
+if use_contamination or (idat_intensity_retrieved and contamination_checked):
 
     localrules:
         sample_contamination_verifyIDintensity,
@@ -485,7 +489,7 @@ else:
 
 
 def _contam(wildcards):
     uf, wf = cfg.config.user_files, cfg.config.workflow_params
-    if use_contamination:
+    if use_contamination or (idat_intensity_retrieved and contamination_checked):
        return rules.sample_contamination_verifyIDintensity.output[0]
    return []
@@ -494,6 +498,8 @@ def _intensity(wildcards):
     uf, wf = cfg.config.user_files, cfg.config.workflow_params
     if use_contamination:
         return "sample_level/contamination/median_idat_intensity.csv"
+    elif idat_intensity_retrieved:
+        return "sample_level/contamination/median_idat_intensity.csv"
     return []

From fa2bcf583ea0bf09d4040868d261428241c4a0bf Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Fri, 16 Aug 2024 14:41:49 -0400
Subject: [PATCH 11/36] Corrects conditions so that with BCF input the
 intensity and contamination checks run by default

---
 src/cgr_gwas_qc/workflow/Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cgr_gwas_qc/workflow/Snakefile b/src/cgr_gwas_qc/workflow/Snakefile
index 92fccd24..1aa2a824 100644
--- a/src/cgr_gwas_qc/workflow/Snakefile
+++ b/src/cgr_gwas_qc/workflow/Snakefile
@@ -48,7 +48,7 @@ use rule * from sample_qc
 targets.extend(sample_qc.targets)
 
 
-if sample_qc.use_contamination:
+if sample_qc.use_contamination or cfg.config.user_files.bcf:
 
     module contamination:
         snakefile:
@@ -61,7 +61,7 @@ if sample_qc.use_contamination:
 
-if sample_qc.use_contamination:
+if sample_qc.use_contamination or cfg.config.user_files.bcf:
 
     module intensity_check:
         snakefile:

From 
df4e9f27bf599c79d269a2cf1020663f38f44d8f Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Wed, 11 Sep 2024 14:34:02 -0400 Subject: [PATCH 12/36] Renames variable vcf_file to bcf_file when bcf file is refered. --- src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk | 2 +- src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py | 8 ++++---- src/cgr_gwas_qc/workflow/scripts/vcf2abf.py | 6 +++--- src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk b/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk index f871a34e..ac36f41e 100644 --- a/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk +++ b/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk @@ -40,7 +40,7 @@ rule pull_b_allele_freq_from_1kg_bcfinput: ``AF`` which ignores super population. """ input: - vcf_file=cfg.config.user_files.bcf, + bcf_file=cfg.config.user_files.bcf, kgvcf_file=cfg.config.reference_files.thousand_genome_vcf, params: population=cfg.config.software_params.contam_population, diff --git a/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py b/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py index 7c8083be..e438f075 100644 --- a/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py +++ b/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py @@ -27,7 +27,7 @@ def main( ..., help="Path to bpm_file[optional, to be used if GTC input]" ), abf_file: PathLike = typer.Argument(..., help="Allele B frequency file"), - vcf_file: Optional[PathLike] = typer.Argument( + bcf_file: Optional[PathLike] = typer.Argument( ..., help="vcf_file[optional, to be used if VCF/BCF input]" ), grp: str = typer.Argument(..., help="cluster_grp"), @@ -55,7 +55,7 @@ def main( # If BCF file is input, Convert VCF to ADPC per sample if cfg.config.user_files.bcf: - convert_vcf_to_adpc(ss, vcf_file, adpc_pattern, threads) + convert_vcf_to_adpc(ss, bcf_file, adpc_pattern, threads) # Otherwise, Convert GTC to ADPC - default else: convert_gtc_to_adpc(ss, bpm_file, gtc_pattern, adpc_pattern, threads) @@ -91,12 +91,12 @@ def convert_gtc_to_adpc(ss, bpm_file, gtc_pattern, outfile_pattern, threads): assert all(f.exists() for f in outfiles) -def convert_vcf_to_adpc(ss, vcf_file, outfile_pattern, threads): +def convert_vcf_to_adpc(ss, bcf_file, outfile_pattern, threads): with ProcessPoolExecutor(threads) as executor: futures = [ executor.submit( vcf2adpc.main, - vcf_file, + bcf_file, record.Sample_ID, outfile_pattern.format(**record.to_dict()), ) diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py index 900fe31c..7056f76c 100755 --- a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py +++ b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py @@ -27,7 +27,7 @@ def is_biallelic_snp(rec): @app.command() def main( - vcf_file: Path = typer.Argument( + bcf_file: Path = typer.Argument( ..., help="Path to the sample VCF.", exists=True, readable=True ), kgvcf_file: Path = typer.Argument( @@ -52,14 +52,14 @@ def main( """ if abf_file is None: - abf_file = vcf_file.with_suffix(".abf.txt") + abf_file = bcf_file.with_suffix(".abf.txt") # vcf_file='/scratch/rajwanir2/data_catalog/QC-test/results/GSAMD-24v1-0-contamseries/vcf/GSAMD-24v1-0-contamseries.bcf' # abf_file='/scratch/rajwanir2/data_catalog/QC-test/GSAMD-24v1-0.new.abf' # kg_vcf='/DCEG/CGF/Bioinformatics/Production/data/thousG/hg38/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz' # retrieving needed info from the 
input vcf - vcf = VariantFile(vcf_file, drop_samples=True) + vcf = VariantFile(bcf_file, drop_samples=True) vcf_info = [] for rec in vcf: vcf_info.append( diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk index ca520cd9..c4a142b4 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk @@ -32,7 +32,7 @@ if cfg.config.user_files.bcf: use rule pull_b_allele_freq_from_1kg_bcfinput from thousand_genomes as pull_b_allele_freq_from_1kg with: input: - vcf_file=cfg.config.user_files.bcf, + bcf_file=cfg.config.user_files.bcf, kgvcf_file=cfg.config.reference_files.thousand_genome_vcf, else: @@ -98,7 +98,7 @@ if cfg.config.user_files.bcf: """ input: sample_sheet_csv="cgr_sample_sheet.csv", - vcf_file=cfg.config.user_files.bcf, + bcf_file=cfg.config.user_files.bcf, abf_file=rules.pull_b_allele_freq_from_1kg.output.abf_file, _=rules.verifyidintensity_conda.output[0], params: @@ -139,7 +139,7 @@ if cfg.config.user_files.bcf: This is a submission hot spot creating 1 job per sample. """ input: - vcf_file=cfg.config.user_files.bcf, + bcf_file=cfg.config.user_files.bcf, params: target_sample="{Sample_ID}", output: From 52f4a892f3bf40daba3ff84a05b324dc5a52a77b Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Thu, 12 Sep 2024 10:38:12 -0400 Subject: [PATCH 13/36] Removes the duplicated subworkflow file for intensity checks. Removes idat_intensity.smk and keeps intensity_check.smk --- .../workflow/sub_workflows/idat_intensity.smk | 182 ------------------ 1 file changed, 182 deletions(-) delete mode 100644 src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk b/src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk deleted file mode 100644 index 73cb1d6e..00000000 --- a/src/cgr_gwas_qc/workflow/sub_workflows/idat_intensity.smk +++ /dev/null @@ -1,182 +0,0 @@ -import pandas as pd - -from cgr_gwas_qc import load_config - -cfg = load_config() - - -################################################################################ -# Intensity Targets -################################################################################ -targets = [ - "sample_level/contamination/median_idat_intensity.csv", -] - - -rule all_contamination: - input: - targets, - - -################################################################################ -# PHONY Rules -################################################################################ -# NOTE: Because of job grouping it is cleaner to wrap the various CLI utilities in -# their own python script. This makes using conda more complicated. Instead of -# installing all of the python dependencies in each environment, I will just -# pass the conda environment and activate it internal. However, we want to make -# sure that snakemake creates the environments, so these PHONY rules will make -# sure that the conda env exists. 
-localrules: - illuminaio_conda, - - -rule illuminaio_conda: - output: - temp(".illuminaio_env_built"), - conda: - cfg.conda("illuminaio") - shell: - "touch {output[0]}" - - -################################################################################ -# Workflow Rules -################################################################################ -if cfg.config.user_files.bcf: - if config.get("cluster_mode", False): - - localrules: - agg_median_idat_intensity, - - rule grouped_median_intensity_from_vcf: - """Calculate median intensity from raw intensities using VCF/BCF input.""" - input: - sample_sheet_csv="cgr_sample_sheet.csv", - vcf_file=cfg.config.user_files.bcf, - params: - grp="{grp}", - notemp=config.get("notemp", False), - output: - temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), - threads: 12 - resources: - mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, - time_hr=lambda wildcards, attempt: 4 * attempt, - script: - "../scripts/grouped_intensity_from_vcf.py" - - rule agg_median_idat_intensity: - input: - expand(rules.grouped_median_intensity_from_vcf.output[0], grp=cfg.cluster_groups), - params: - notemp=config.get("notemp", False), - output: - "sample_level/contamination/median_idat_intensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt**2, - script: - "../scripts/agg_median_idat_intensity.py" - - else: - - rule per_sample_median_intensity_from_vcf: - """Calculate median intensity from raw intensities using VCF/BCF input.""" - input: - vcf_file=cfg.config.user_files.bcf, - params: - sample_id="{Sample_ID}", - output: - temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"), - resources: - mem_mb=lambda wildcards, attempt: attempt * 1024, - script: - "../scripts/median_intensity_from_vcf.py" - - rule agg_median_idat_intensity: - input: - cfg.expand(rules.per_sample_median_intensity_from_vcf.output[0]), - output: - "sample_level/contamination/median_idat_intensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt**2, - script: - "../scripts/agg_median_idat_intensity.py" - -else: - if config.get("cluster_mode", False): - - localrules: - agg_median_idat_intensity, - - rule grouped_median_idat_intensity: - """Calculate median intensity of Red and Green channels.""" - input: - sample_sheet_csv="cgr_sample_sheet.csv", - _=rules.illuminaio_conda.output[0], - params: - grp="{grp}", - idat_red_pattern=lambda _: cfg.config.user_files.idat_pattern.red, - idat_green_pattern=lambda _: cfg.config.user_files.idat_pattern.green, - rscript=cfg.scripts("median_idat_intensity.R"), - conda_env=cfg.conda("illuminaio"), - notemp=config.get("notemp", False), - output: - temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), - threads: 12 - resources: - mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, - time_hr=lambda wildcards, attempt: 4 * attempt, - script: - "../scripts/grouped_median_idat_intensity.py" - - rule agg_median_idat_intensity: - input: - expand(rules.grouped_median_idat_intensity.output[0], grp=cfg.cluster_groups), - params: - notemp=config.get("notemp", False), - output: - "sample_level/contamination/median_idat_intensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt**2, - script: - "../scripts/agg_median_idat_intensity.py" - - else: - - rule per_sample_median_idat_intensity: - """Calculate median intensity of Red 
and Green channels."""
-            input:
-                red=lambda wc: cfg.expand(
-                    cfg.config.user_files.idat_pattern.red,
-                    query=f"Sample_ID == '{wc.Sample_ID}'",
-                )[0],
-                green=lambda wc: cfg.expand(
-                    cfg.config.user_files.idat_pattern.green,
-                    query=f"Sample_ID == '{wc.Sample_ID}'",
-                )[0],
-                _=rules.illuminaio_conda.output[0],
-            params:
-                sample_id="{Sample_ID}",
-                rscript=cfg.scripts("median_idat_intensity.R"),
-                conda_env=cfg.conda("illuminaio"),
-            output:
-                temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"),
-            resources:
-                mem_mb=lambda wildcards, attempt: attempt * 1024,
-            script:
-                "../scripts/median_idat_intensity.py"
-
-        rule agg_median_idat_intensity:
-            input:
-                cfg.expand(rules.per_sample_median_idat_intensity.output[0]),
-            output:
-                "sample_level/contamination/median_idat_intensity.csv",
-            resources:
-                mem_gb=lambda wildcards, attempt: attempt * 4,
-                time_hr=lambda wildcards, attempt: attempt**2,
-            script:
-                "../scripts/agg_median_idat_intensity.py"

From f8ba2529ffa85b6a7e2f9b76eeeca1ae5e8cdfa8 Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Thu, 12 Sep 2024 10:54:43 -0400
Subject: [PATCH 14/36] Updates the docstring in contamination.smk to indicate
 when the BCF file is used.

---
 src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk
index c4a142b4..15be96d1 100644
--- a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk
+++ b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk
@@ -74,7 +74,7 @@ if cfg.config.user_files.bcf:
 
     rule grouped_contamination:
         """Extracts normalized intensities and other metrics from an aggregated
-        VCF file and writes to Illumina ADPC.BIN per sample.
+        BCF file and writes to Illumina ADPC.BIN per sample.
 
         This is the format required by ``verifyIDintensity``. The script also
         runs some sanity checks (intensities and normalized intensities > 0;
@@ -129,7 +129,8 @@ if cfg.config.user_files.bcf:
 else:
 
     rule per_sample_vcf_to_adpc:
-        """Converts a sample's GTC/BPM to an Illumina ADPC.BIN.
+        """From an aggregated BCF input file, extracts normalized intensities and
+        other metrics for the target sample and writes to Illumina ADPC.BIN per sample.
 
         This is the format required by ``verifyIDintensity``. The script also
         runs some sanity checks (intensities and normalized intensities > 0;

From 20c0a47bc89ba057a4c242e0c98d4ac7dc448231 Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Thu, 12 Sep 2024 11:30:10 -0400
Subject: [PATCH 15/36] Renames input variable to bcf_file.

Renames vcf_file to bcf_file to explicitly indicate that a BCF is the input.
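For context on the format these scripts emit, a minimal sketch using the repository's own adpc parsers (the numeric values are invented; the positional field order follows the AdpcRecord(x_raw, y_raw, x_norm, y_norm, genotype_score, genotype) call used in vcf2adpc.py):

    from pathlib import Path

    from cgr_gwas_qc.parsers.illumina import AdpcRecord, AdpcWriter

    # One record per SNP: raw intensities, normalized intensities, a genotype
    # score, and the called genotype (0: AA, 1: AB, 2: BB, 3: unknown/missing).
    record = AdpcRecord(1042, 388, 0.91, 0.33, 0.87, 0)

    with AdpcWriter(Path("sample.adpc.bin")) as fh:
        fh.write(record)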
---
 src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py
index 6037f0a6..37ebdaf1 100755
--- a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py
+++ b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py
@@ -24,7 +24,7 @@
 
 @app.command()
 def main(
-    vcf_file: Path = typer.Argument(
+    bcf_file: Path = typer.Argument(
        ..., help="Path to a multisample VCF file with needed scores.", exists=True, readable=True
    ),
    target_sample: str = typer.Argument(..., help="Sample name."),
@@ -44,7 +44,7 @@ def main(
        genotype: The called genotype (0: AA, 1: AB, 2: BB, 3: unknown or missing)
    """
 
-    vcf = VariantFile(vcf_file)
+    vcf = VariantFile(bcf_file)
 
    sample_fields_to_fetch = ["X", "Y", "NORMX", "NORMY", "GT"]
    info_fieilds_to_fetch = ["ALLELE_A", "ALLELE_B", "GC_SCORE"]

From fa2bcf583ea0bf09d4040868d261428241c4a0bf Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Thu, 12 Sep 2024 13:40:49 -0400
Subject: [PATCH 16/36] Imports snakemake params all at once in
 grouped_contamination.py

Snakemake params were previously imported by explicit name. Changed to import
all inputs and params in an unnamed fashion through dict comprehensions. This
allows seamless compatibility whether the GTC or the BCF entry point is used.
---
 .../workflow/scripts/grouped_contamination.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py b/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py
index e438f075..3341d7fd 100644
--- a/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py
+++ b/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py
@@ -138,12 +138,13 @@ def estimate_contamination(
 
 if __name__ == "__main__" and "snakemake" in locals():
     main(
-        **{k: v for k, v in snakemake.input.items() if not k.startswith("_")},  # type: ignore # noqa
-        grp=snakemake.params.grp,  # type: ignore # noqa
-        gtc_pattern=snakemake.params.gtc_pattern,  # type: ignore # noqa
-        snps=int(snakemake.params.snps),  # type: ignore # noqa
-        conda_env=snakemake.params.conda_env,  # type: ignore # noqa
-        notemp=bool(snakemake.params.notemp),  # type: ignore # noqa
-        outfile=snakemake.output[0],  # type: ignore # noqa
-        threads=snakemake.threads or 2,  # type: ignore # noqa
+        defaults = {}
+        defaults.update({k: Path(v) for k, v in snakemake.input.items() if not k.startswith("_")})  # type: ignore # noqa
+        defaults.update({k: v for k, v in snakemake.params.items()})  # type: ignore # noqa
+        # gtc_pattern=snakemake.params.gtc_pattern,  # type: ignore # noqa
+        # snps=int(snakemake.params.snps),  # type: ignore # noqa
+        # conda_env=snakemake.params.conda_env,  # type: ignore # noqa
+        # notemp=bool(snakemake.params.notemp),  # type: ignore # noqa
+        defaults.update({"outfile": Path(snakemake.output[0])})  # type: ignore # noqa
+        defaults.update({"threads": snakemake.threads[0] or 2 })  # type: ignore # noqa
     )

From 967ff885451d4661c11a7a3312a0a877df3dd4ef Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Fri, 13 Sep 2024 12:26:03 -0400
Subject: [PATCH 17/36] Removes an accidentally duplicated header in
 entry_points

The first few lines of entry_points.smk were duplicated during a copy/paste.
This removes the duplicated lines.
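As a reference for the forwarding pattern PATCH 16 aims at, here is the working form, which PATCH 23 below settles on (copied from that diff rather than invented): every named snakemake input and param is splatted into main() as keyword arguments, so the same script body serves both the GTC and the BCF entry points without renaming anything:

    if __name__ == "__main__" and "snakemake" in locals():
        main(
            # Forward every named input (skipping the "_" conda sentinels) and
            # every param, instead of naming each argument one by one.
            **{k: v for k, v in snakemake.input.items() if not k.startswith("_")},
            **{k: v for k, v in snakemake.params.items()},
            outfile=snakemake.output[0],
            threads=snakemake.threads or 2,
        )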
---
 .../workflow/sub_workflows/entry_points.smk | 58 -------------------
 1 file changed, 58 deletions(-)

diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk b/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk
index 5baf412f..07b9a2c1 100644
--- a/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk
+++ b/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk
@@ -26,64 +26,6 @@ targets = [
 ]
 
 
-rule all_entry_points:
-    input:
-        targets,
-
-
-################################################################################
-# PHONY Rules
-################################################################################
-# NOTE: Because of job grouping it is cleaner to wrap the various CLI utilities in
-# their own python script. This makes using conda more complicated. Instead of
-# installing all of the python dependencies in each environment, I will just
-# pass the conda environment and activate it internal. However, we want to make
-# sure that snakemake creates the environments, so these PHONY rules will make
-# sure that the conda env exists.
-localrules:
-    plink_conda,
-
-
-rule plink_conda:
-    output:
-        temp(".plink_env_built"),
-    conda:
-        cfg.conda("plink2")
-    shell:
-        "touch {output[0]}"
-
-
-################################################################################
-# Workflow Rules
-################################################################################
-"""Entry points into the QC workflow.
-
-This module contains all the different data entry points into the QC
-workflow. The most common case is a set of sample level GTC files provided by
-the user.
-
-All entry points should create:
-
-    - sample_level/samples.bed
-    - sample_level/samples.bim
-    - sample_level/samples.fam
-"""
-from cgr_gwas_qc import load_config
-from cgr_gwas_qc.parsers import sample_sheet
-
-cfg = load_config()
-
-
-################################################################################
-# Entry Points Targets
-################################################################################
-targets = [
-    "sample_level/samples.bed",
-    "sample_level/samples.bim",
-    "sample_level/samples.fam",
-]
-
-
 rule all_entry_points:
     input:
         targets,

From 0224aae39b6dac8b576b80344d0c35f372c9561a Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Fri, 13 Sep 2024 13:52:57 -0400
Subject: [PATCH 18/36] Shifts to using the IGC score instead of the gentrain
 score in vcf2adpc.py for contamination checks.

Previously GC_SCORE was added to the adpc.bin, which carried the dependency
that a cluster EGT file had to be used when preparing the VCF/BCF. The IGC
score is encoded in the GTC, so it does not depend on a cluster EGT file. The
contamination scores should also be more similar to those from the GTC input.
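A minimal pysam sketch of where the two scores live in a gtc2vcf-style BCF (the file name and sample name are placeholders): IGC is a per-sample FORMAT field, while GC_SCORE is a per-site INFO field that is only populated when a cluster EGT file was supplied during conversion:

    from pysam import VariantFile

    vcf = VariantFile("samples.bcf")
    for rec in vcf:
        igc = rec.samples["Sample_001"].get("IGC")  # per-sample GenCall score
        gc_score = rec.info.get("GC_SCORE")  # per-site GenTrain score; may be absent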
---
 src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py
index 37ebdaf1..ab3c2c9e 100755
--- a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py
+++ b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py
@@ -45,15 +45,15 @@ def main(
     """
 
     vcf = VariantFile(bcf_file)
 
-    sample_fields_to_fetch = ["X", "Y", "NORMX", "NORMY", "GT"]
-    info_fieilds_to_fetch = ["ALLELE_A", "ALLELE_B", "GC_SCORE"]
+    sample_fields_to_fetch = ["X", "Y", "NORMX", "NORMY", "GT", "IGC"]
+    info_fieilds_to_fetch = ["ALLELE_A", "ALLELE_B"]
 
     renaming_scheme = {
         "X": "x_raw",
         "Y": "y_raw",
         "NORMX": "x_norm",
         "NORMY": "y_norm",
-        "GC_SCORE": "genotype_score",
+        "IGC": "genotype_score",
     }
 
     vcf_info = []

From 53b4e33dd53fecf8f9735035a5cf1b1511ecbb4d Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Fri, 13 Sep 2024 15:06:16 -0400
Subject: [PATCH 19/36] Adds a config suffix validator for bcf_file.

---
 src/cgr_gwas_qc/models/config/user_files.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/cgr_gwas_qc/models/config/user_files.py b/src/cgr_gwas_qc/models/config/user_files.py
index b40eaa02..1011b513 100644
--- a/src/cgr_gwas_qc/models/config/user_files.py
+++ b/src/cgr_gwas_qc/models/config/user_files.py
@@ -95,6 +95,13 @@ class UserFiles(BaseModel):
         description="The full path to an aggregated BCF/VCF file perferably encoding the GenCall scores.",
     )
 
+    @validator("bcf")
+    def validate_bcf(cls, v):
+        if v.suffix == ".bcf":
+            return v
+        else:
+            raise ValueError("BCF suffix should be *.bcf")
+
     @validator("gtc_pattern")
     def validate_gtc_pattern(cls, v):
         if v is None:

From 416f2665048b1229d3278ce9bf2dd1afcdb8816b Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Fri, 13 Sep 2024 16:53:30 -0400
Subject: [PATCH 20/36] Removes some debugging lines from vcf2abf.py

---
 src/cgr_gwas_qc/workflow/scripts/vcf2abf.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py
index 7056f76c..e7b7e158 100755
--- a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py
+++ b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py
@@ -54,10 +54,6 @@ def main(
     if abf_file is None:
         abf_file = bcf_file.with_suffix(".abf.txt")
 
-    # vcf_file='/scratch/rajwanir2/data_catalog/QC-test/results/GSAMD-24v1-0-contamseries/vcf/GSAMD-24v1-0-contamseries.bcf'
-    # abf_file='/scratch/rajwanir2/data_catalog/QC-test/GSAMD-24v1-0.new.abf'
-    # kg_vcf='/DCEG/CGF/Bioinformatics/Production/data/thousG/hg38/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz'
-
     # retrieving needed info from the input vcf
     vcf = VariantFile(bcf_file, drop_samples=True)
     vcf_info = []

From bbe500687a40c7b825dab9a65e338e979cabb2fa Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Fri, 13 Sep 2024 17:00:26 -0400
Subject: [PATCH 21/36] Shifts to using IGC as the genotype score for
 preparing abf using vcf2abf.

Consistent with vcf2adpc.py. Now both use IGC and should work with a VCF/BCF
prepared with the gtc2vcf workflow.
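To make the vcf2abf arithmetic concrete, here is a worked example of the "pseudo way to switch from population AF to Illumina A/B frequency" visible in the later diffs (af = abs(ALLELE_A - AC/AN)); the numbers are invented, and ALLELE_A is assumed to follow gtc2vcf's coding (0 = REF is Illumina allele A, 1 = ALT is allele A):

    alt_freq = 120 / 1000  # AC/AN, i.e. the ALT-allele frequency from the 1KG sites VCF

    # ALLELE_A == 0: allele B is ALT, so the B-allele frequency is alt_freq itself.
    abf_when_a_is_ref = abs(0 - alt_freq)  # 0.12

    # ALLELE_A == 1: allele B is REF, so the frequency flips.
    abf_when_a_is_alt = abs(1 - alt_freq)  # 0.88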
--- src/cgr_gwas_qc/workflow/scripts/vcf2abf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py index e7b7e158..ecc48a31 100755 --- a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py +++ b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py @@ -66,7 +66,7 @@ def main( rec.ref[0], rec.alts[0], is_biallelic_snp(rec), - rec.info["GC_SCORE"], + rec.info["IGC"], rec.info["ALLELE_A"], ] ) From 76c830449efa44b2b855e791775c9a65ea610da2 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Mon, 16 Sep 2024 06:07:08 -0400 Subject: [PATCH 22/36] Removes commented lines from vcf2adpc.py and vcf2abf.py --- src/cgr_gwas_qc/workflow/scripts/vcf2abf.py | 11 ----------- src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py | 11 ----------- 2 files changed, 22 deletions(-) diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py index ecc48a31..562584ff 100755 --- a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py +++ b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py @@ -1,8 +1,6 @@ #!/usr/bin/env python # coding: utf-8 -# In[1]: - from pathlib import Path from typing import Optional @@ -15,16 +13,10 @@ app = typer.Typer(add_completion=False) -# In[2]: - - def is_biallelic_snp(rec): return len(rec.ref) == 1 and len(rec.alts[0]) == 1 -# In[27]: - - @app.command() def main( bcf_file: Path = typer.Argument( @@ -124,9 +116,6 @@ def main( ) -# In[ ]: - - if __name__ == "__main__": if "snakemake" in locals(): defaults = {} diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py index ab3c2c9e..6be1d5ec 100755 --- a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py +++ b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py @@ -1,8 +1,6 @@ #!/usr/bin/env python # coding: utf-8 -# In[1]: - """Extracts Illumina scores from a multisample VCF for target sample and writes into a single sample adpc.bin format.""" from pathlib import Path @@ -16,9 +14,6 @@ # from more_itertools import unzip from cgr_gwas_qc.parsers.illumina import AdpcRecord, AdpcWriter -# In[ ]: - - app = typer.Typer(add_completion=False) @@ -99,9 +94,6 @@ def main( fh.write(record) -# In[2]: - - ## function to adpc records def vcf_to_adpc_record(vcf_info) -> Generator[AdpcRecord, None, None]: for x_raw, y_raw, x_norm, y_norm, genotype_score, genotype in zip( @@ -129,9 +121,6 @@ def vcf_to_adpc_record(vcf_info) -> Generator[AdpcRecord, None, None]: # return(pd.DataFrame(adpc_records,columns=['x_raw', 'y_raw', 'x_norm', 'y_norm', 'genotype_score', 'genotype'])) -# In[ ]: - - if __name__ == "__main__": if "snakemake" in locals(): defaults = {} From d840118a94349d085ae42ea9521ec294337d243f Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Mon, 16 Sep 2024 06:57:49 -0400 Subject: [PATCH 23/36] fixes a black syntax issue in importing snakemake params in grouped_contamatination.py --- .../workflow/scripts/grouped_contamination.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py b/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py index 3341d7fd..07787f47 100644 --- a/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py +++ b/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py @@ -138,13 +138,8 @@ def estimate_contamination( if __name__ == "__main__" and "snakemake" in locals(): main( - defaults = {} - defaults.update({k: Path(v) for k, v in snakemake.input.items() if not k.startswith("_")}) # type: ignore # 
noqa
-        defaults.update({k: v for k, v in snakemake.params.items()})  # type: ignore # noqa
-        # gtc_pattern=snakemake.params.gtc_pattern,  # type: ignore # noqa
-        # snps=int(snakemake.params.snps),  # type: ignore # noqa
-        # conda_env=snakemake.params.conda_env,  # type: ignore # noqa
-        # notemp=bool(snakemake.params.notemp),  # type: ignore # noqa
-        defaults.update({"outfile": Path(snakemake.output[0])})  # type: ignore # noqa
-        defaults.update({"threads": snakemake.threads[0] or 2 })  # type: ignore # noqa
+        **{k: v for k, v in snakemake.input.items() if not k.startswith("_")},  # type: ignore # noqa
+        **{k: v for k, v in snakemake.params.items()},  # type: ignore # noqa
+        outfile=snakemake.output[0],  # type: ignore # noqa
+        threads=snakemake.threads or 2,  # type: ignore # noqa
     )

From 2c526697083aed1e3f8f33df44eda4344133e2d6 Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Fri, 20 Sep 2024 12:43:01 -0400
Subject: [PATCH 24/36] Modifies vcf2abf to not use gentrain score in
 preparing abf.

Earlier, the gentrain score was used to mark the AF as NA if the score was
negative. This change stops using it, to ensure compatibility with a BCF
prepared by the gtc2vcf workflow.
---
 src/cgr_gwas_qc/workflow/scripts/vcf2abf.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py
index 562584ff..c379c06b 100755
--- a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py
+++ b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py
@@ -5,7 +5,7 @@
 from pathlib import Path
 from typing import Optional
 
-import numpy as np
+# import numpy as np
 import pandas as pd
 import typer
 from pysam import VariantFile
@@ -58,14 +58,23 @@ def main(
                 rec.ref[0],
                 rec.alts[0],
                 is_biallelic_snp(rec),
-                rec.info["IGC"],
+                # rec.info["IGC"],
                 rec.info["ALLELE_A"],
             ]
         )
 
     vcf_info = pd.DataFrame(
         vcf_info,
-        columns=["id", "chrom", "pos", "ref", "alt", "is_biallelic_snp", "gc_score", "allele_a"],
+        columns=[
+            "id",
+            "chrom",
+            "pos",
+            "ref",
+            "alt",
+            "is_biallelic_snp",
+            # "gc_score",
+            "allele_a",
+        ],
     )
     vcf_info["is_chrompos_uniq"] = (
         vcf_info.duplicated(subset=["chrom", "pos"], keep=False) == False
@@ -109,7 +119,7 @@ def main(
     # A pseudo way to switch from population AF to Illumina A/B frequency
     vcf_info["af"] = abs(vcf_info["allele_a"] - (vcf_info["ac"] / vcf_info["an"]))
     # Setting AF to na if gentrain score is not >0, if this is egt file was an older one
-    vcf_info["af"].where(vcf_info.gc_score > 0, np.nan, inplace=True)
+    # vcf_info["af"].where(vcf_info.gc_score > 0, np.nan, inplace=True)
     # writing abf file
     vcf_info[["id", "af"]].to_csv(
         abf_file, sep="\t", index=False, header=["SNP_ID", "ABF"], na_rep="NA"

From 4e993bcfccff4931a5c606460af235cd11b1d4f0 Mon Sep 17 00:00:00 2001
From: rajwanir2
Date: Mon, 23 Sep 2024 13:49:47 -0400
Subject: [PATCH 25/36] Includes a None case in the BCF config validator to
 work with unit testing.
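A sketch of the behavior this enables in tests (the constructor calls are schematic: other UserFiles fields are omitted, and pydantic coerces the strings to Path before the validator sees them). Passing bcf=None explicitly, as unit tests do, still invokes the validator, which is why the None branch in the diff below is needed:

    from pydantic import ValidationError

    from cgr_gwas_qc.models.config.user_files import UserFiles

    UserFiles(bcf=None)  # OK: the validator now passes None through
    UserFiles(bcf="samples.bcf")  # OK: the suffix check passes

    try:
        UserFiles(bcf="samples.vcf")  # wrong suffix for this entry point
    except ValidationError as err:
        print(err)  # "BCF suffix should be *.bcf"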
--- src/cgr_gwas_qc/models/config/user_files.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cgr_gwas_qc/models/config/user_files.py b/src/cgr_gwas_qc/models/config/user_files.py index 1011b513..9cc3e2f3 100644 --- a/src/cgr_gwas_qc/models/config/user_files.py +++ b/src/cgr_gwas_qc/models/config/user_files.py @@ -97,7 +97,9 @@ class UserFiles(BaseModel): @validator("bcf") def validate_bcf(cls, v): - if v.suffix == ".bcf": + if v is None: + return v + elif v.suffix == ".bcf": return v else: raise ValueError("BCF suffix should be *.bcf") From 605006c38adff5662182db42ac8c372f3ec1ccf0 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Mon, 23 Sep 2024 13:51:10 -0400 Subject: [PATCH 26/36] Adds the app mode in median_intensity_from_vcf to work directly as stand-alone. --- .../scripts/median_intensity_from_vcf.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/cgr_gwas_qc/workflow/scripts/median_intensity_from_vcf.py b/src/cgr_gwas_qc/workflow/scripts/median_intensity_from_vcf.py index 5e541fd2..8b5bcf3e 100755 --- a/src/cgr_gwas_qc/workflow/scripts/median_intensity_from_vcf.py +++ b/src/cgr_gwas_qc/workflow/scripts/median_intensity_from_vcf.py @@ -1,8 +1,6 @@ #!/usr/bin/env python # coding: utf-8 -# In[ ]: - from pathlib import Path from statistics import median @@ -35,7 +33,7 @@ def main( vcf = VariantFile(vcf_file) # loops over all records in the VCF, fetches the 'X' and 'Y' intensity values, sums them and stores them in the list - # of 'sample_values'.z + # of 'sample_values' sample_values = [] for rec in vcf: sample_values.append(np.sum([rec.samples[sample_id].get(key) for key in fields_to_fetch])) @@ -46,9 +44,12 @@ def main( df.to_csv(outfile, index=False) -if __name__ == "__main__" and "snakemake" in locals(): - main( - **{k: v for k, v in snakemake.input.items() if not k.startswith("_")}, # type: ignore # noqa - **{k: v for k, v in snakemake.params.items()}, # type: ignore # noqa - outfile=snakemake.output[0], # type: ignore # noqa - ) +if __name__ == "__main__": + if "snakemake" in locals(): + main( + **{k: v for k, v in snakemake.input.items() if not k.startswith("_")}, # type: ignore # noqa + **{k: v for k, v in snakemake.params.items()}, # type: ignore # noqa + outfile=snakemake.output[0], # type: ignore # noqa + ) + else: + app() From b69a3fc18d28c68991f1a48c7561be922d05f37f Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Mon, 23 Sep 2024 13:59:19 -0400 Subject: [PATCH 27/36] Adds the new unit test scripts relevent to BCF entry --- .../scripts/test_median_intensity_from_vcf.py | 27 ++++++++++++++ tests/workflow/scripts/test_vcf2abf.py | 37 +++++++++++++++++++ tests/workflow/scripts/test_vcf2adpc.py | 33 +++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 tests/workflow/scripts/test_median_intensity_from_vcf.py create mode 100755 tests/workflow/scripts/test_vcf2abf.py create mode 100644 tests/workflow/scripts/test_vcf2adpc.py diff --git a/tests/workflow/scripts/test_median_intensity_from_vcf.py b/tests/workflow/scripts/test_median_intensity_from_vcf.py new file mode 100644 index 00000000..542b889e --- /dev/null +++ b/tests/workflow/scripts/test_median_intensity_from_vcf.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# coding: utf-8 + +"""Test calculation of median intensity from BCF + +We need to assess whether the intensity meets the miniumum threshold. For this purpose, we typically check median IDAT intensity. Here we attempt to test the same with BCF as input file. 
+""" + +import pandas as pd +import pytest +from numpy import isclose + +from cgr_gwas_qc.workflow.scripts import median_intensity_from_vcf + + +@pytest.mark.regression +def test_median_intensity_from_vcf_fake_data(fake_data_cache, tmp_path): + obs_median_int = tmp_path / "test_median_int.csv" + median_intensity_from_vcf.main( + fake_data_cache / "cgr/bcf/small_genotype.bcf", "small_genotype", obs_median_int + ) + + expected_median_int = fake_data_cache / "cgr/bcf/small_genotype_median_int.csv" + + expected_median_int = pd.read_csv(expected_median_int) + obs_median_int = pd.read_csv(obs_median_int) + assert isclose(expected_median_int.median_intensity, obs_median_int.median_intensity) diff --git a/tests/workflow/scripts/test_vcf2abf.py b/tests/workflow/scripts/test_vcf2abf.py new file mode 100755 index 00000000..784ea549 --- /dev/null +++ b/tests/workflow/scripts/test_vcf2abf.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# coding: utf-8 + +"""Test the conversion of BCF to ABF. + +For contamination checking we need to have the population level allele +frequencies. We pull these allele frequencies from the 1KG project for each +SNP in the BCF. +""" + +import pandas as pd +import pytest + +from cgr_gwas_qc.workflow.scripts import vcf2abf + + +@pytest.mark.regression +def test_vcf2abf_fake_data(fake_data_cache, tmp_path): + obs_abf = tmp_path / "test.abf.txt" + vcf2abf.main( + fake_data_cache / "cgr/bcf/small_genotype.bcf", + fake_data_cache / "1KG/small_1KG.vcf.gz", + "AF", + obs_abf, + ) + + expected_abf = fake_data_cache / "cgr/bcf/small_genotype.abf.txt" + + obs_abf = pd.read_csv(obs_abf, sep="\t") + expected_abf = pd.read_csv(expected_abf, sep="\t") + + comparision = expected_abf.merge( + obs_abf, how="left", on="SNP_ID", suffixes=("_expected", "_obs") + ) + comparision = comparision.dropna() # excluding the na from the test + prop_very_different = (abs(comparision.ABF_expected - comparision.ABF_obs) > 1e-6).mean() + assert 0.01 > prop_very_different # less than 1% are more than 0.000001 different diff --git a/tests/workflow/scripts/test_vcf2adpc.py b/tests/workflow/scripts/test_vcf2adpc.py new file mode 100644 index 00000000..a7c0c9e1 --- /dev/null +++ b/tests/workflow/scripts/test_vcf2adpc.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# coding: utf-8 + +"""Test the extraction of sample scores from BCF and writing to adpc.bin. + +For contamination checking we need to have the sample scores written into adpc.bin to serve as input for verifyIDintensity. 
+""" + +import pytest +from numpy import isclose + +from cgr_gwas_qc.parsers.illumina import AdpcReader +from cgr_gwas_qc.workflow.scripts import vcf2adpc + + +@pytest.mark.regression +def test_vcf2adpc_fake_data(fake_data_cache, tmp_path): + obs_adpc = tmp_path / "test.adpc.bin" + vcf2adpc.main(fake_data_cache / "cgr/bcf/small_genotype.bcf", "small_genotype", obs_adpc) + + expected_adpc = fake_data_cache / "cgr/bcf/small_genotype.adpc.bin" + + with AdpcReader(expected_adpc) as expected, AdpcReader(obs_adpc) as obs: + for exp_row, obs_row in zip(expected, obs): + # Integers should be the exactly the same + assert exp_row.x_raw == obs_row.x_raw + assert exp_row.y_raw == obs_row.y_raw + assert exp_row.genotype == obs_row.genotype + + # Floats should be really close + assert isclose(exp_row.x_norm, obs_row.x_norm) + assert isclose(exp_row.y_norm, obs_row.y_norm) + assert isclose(exp_row.genotype_score, obs_row.genotype_score) From d1fd658779d0c18864f8a3050c3058acb8f08885 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Tue, 24 Sep 2024 12:16:38 -0400 Subject: [PATCH 28/36] docs: updates docstring to show bcf as a user_file in config --- src/cgr_gwas_qc/models/config/user_files.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/cgr_gwas_qc/models/config/user_files.py b/src/cgr_gwas_qc/models/config/user_files.py index 9cc3e2f3..00566626 100644 --- a/src/cgr_gwas_qc/models/config/user_files.py +++ b/src/cgr_gwas_qc/models/config/user_files.py @@ -35,9 +35,19 @@ class UserFiles(BaseModel): bim: /path/to/samples.bim fam: /path/to/samples.fam + or + + .. code-block:: yaml + + user_files: + output_pattern: '{prefix}/{file_type}.{ext}' + bcf: /path/to/samples.bcf + .. note:: - The IDAT/GTC patterns, PED/MAP paths, and BED/BIM/FAM paths are mutually exclusive. + The BCF file, IDAT/GTC patterns, PED/MAP paths, BED/BIM/FAM paths are all mutually exclusive. You should only provide one set of patterns/paths. + + """ output_pattern: str = Field( From 5ef948d96822b638d709bc34f315fff8aa5fd491 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Tue, 24 Sep 2024 12:18:32 -0400 Subject: [PATCH 29/36] docs: Updates the entry_point documentation to show BCF as an entry_point. --- docs/sub_workflows/entry_points.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/sub_workflows/entry_points.rst b/docs/sub_workflows/entry_points.rst index 5a77ced4..7c2852fd 100644 --- a/docs/sub_workflows/entry_points.rst +++ b/docs/sub_workflows/entry_points.rst @@ -15,6 +15,7 @@ Entry Points Sub-workflow - ``user_files.bed`` - ``user_files.bim`` - ``user_files.fam`` + - ``user_files.bcf`` **Major Outputs**: @@ -22,7 +23,7 @@ Entry Points Sub-workflow - ``sample_level/samples.bim`` - ``sample_level/samples.fam`` -There are three paths we can take to create these files: +There are four paths we can take to create these files: 1. If GTC files are provided using ``user_files.gtc_pattern`` then we will @@ -34,3 +35,4 @@ There are three paths we can take to create these files: 2. If an aggregated PED/MAP is provided using ``user_files.ped`` and ``user_files.map`` then we will convert the PED/MAP to BED/BIM/FAM. 3. If an aggregated BED/BIM/FAM is provided using ``user_files.bed``, ``user_files.bim``, ``user_files.fam`` then we will create a symbolic link. +4. If an aggregated BCF file is provided using ``user_files.bcf`` then we will convert the BCF to BED/BIM/FAM. 
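As a rough illustration of entry path 4 (the actual rule lives in entry_points.smk and is not shown in this series), a plink2 call along these lines would produce the three sample-level files; the flags are standard plink2 options, and multiallelic filtering and sample metadata handling are glossed over:

    import subprocess

    # Convert the aggregated BCF into PLINK binary files
    # (sample_level/samples.bed/.bim/.fam).
    subprocess.run(
        ["plink2", "--bcf", "samples.bcf", "--make-bed", "--out", "sample_level/samples"],
        check=True,
    )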
From 949443e8acad4d137126f1a9576075875436ee69 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Tue, 24 Sep 2024 12:20:30 -0400 Subject: [PATCH 30/36] docs: Updates contamination subworkflow documentation to show processing with BCF input. --- docs/static/bcf_contamination.svg | 80 ++++++++++++++++++++++++++++ docs/static/gtc_contamination.svg | 80 ++++++++++++++++++++++++++++ docs/sub_workflows/contamination.rst | 31 +++++------ 3 files changed, 176 insertions(+), 15 deletions(-) create mode 100755 docs/static/bcf_contamination.svg create mode 100755 docs/static/gtc_contamination.svg diff --git a/docs/static/bcf_contamination.svg b/docs/static/bcf_contamination.svg new file mode 100755 index 00000000..982a4260 --- /dev/null +++ b/docs/static/bcf_contamination.svg @@ -0,0 +1,80 @@ + + + + + + +snakemake_dag + +Aggregated BCF input + + +0 + +all_contamination + + + +1 + +agg_verifyidintensity + + + +1->0 + + + + + +2 + +per_sample_contamination_verifyIDintensity + + + +2->1 + + + + + +3 + +per_sample_vcf_to_adpc + + + +3->2 + + + + + +4 + +pull_b_allele_freq_from_1kg + + + +4->2 + + + + + +5 + +verifyidintensity_conda + + + +5->2 + + + + + diff --git a/docs/static/gtc_contamination.svg b/docs/static/gtc_contamination.svg new file mode 100755 index 00000000..e4cf2451 --- /dev/null +++ b/docs/static/gtc_contamination.svg @@ -0,0 +1,80 @@ + + + + + + +snakemake_dag + +GTC input + + +0 + +all_contamination + + + +1 + +agg_verifyidintensity + + + +1->0 + + + + + +2 + +per_sample_contamination_verifyIDintensity + + + +2->1 + + + + + +3 + +per_sample_gtc_to_adpc + + + +3->2 + + + + + +4 + +pull_b_allele_freq_from_1kg + + + +4->2 + + + + + +5 + +verifyidintensity_conda + + + +5->2 + + + + + diff --git a/docs/sub_workflows/contamination.rst b/docs/sub_workflows/contamination.rst index 974c06be..2d1d0f11 100644 --- a/docs/sub_workflows/contamination.rst +++ b/docs/sub_workflows/contamination.rst @@ -6,27 +6,28 @@ Contamination Sub-workflow **Workflow File**: https://github.com/NCI-CGR/GwasQcPipeline/blob/default/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk +**Major Outputs**: + + - ``sample_level/..abf.txt`` B allele frequencies from the 1000 genomes. + - ``sample_level/contamination/verifyIDintensity.csv`` aggregated table of contamination scores. + **Config Options**: see :ref:`config-yaml` for more details - ``reference_files.thousand_genome_vcf`` - ``reference_files.thousand_genome_tbi`` - - ``user_files.gtc_pattern`` - - ``user_files.idat_pattern`` + - ``user_files.bcf`` or ( ``reference_files.illumina_manifest_file`` and ``user_files.gtc_pattern`` ) - ``software_params.contam_population`` -**Major Outputs**: - - - ``sample_level/..abf.txt`` B allele frequencies from the 1000 genomes. - - ``sample_level/contamination/median_idat_intensity.csv`` aggregated table of median IDAT intensities. - - ``sample_level/contamination/verifyIDintensity.csv`` aggregated table of contamination scores. +|bcf_input_contamination| |gtc_input_contamination| +.. |gtc_input_contamination| image:: ../static/gtc_contamination.svg + :width: 45% -.. figure:: ../static/contamination.png - :name: fig-contamination-workflow +.. |bcf_input_contamination| image:: ../static/bcf_contamination.svg + :width: 45% - The contamination sub-workflow. - This workflow will estimate contamination using verifyIDintensity on each sample individually. - It requires that you have GTC/IDAT files. - It first pulls B-allele frequencies from the 1000 Genomes VCF file. 
- It then estimate contamination for each sample and aggregates these results. - Finally, it also estimates the per sample median IDAT intensity, which is used to filter contamination results in the :ref:`sample-qc` +The contamination sub-workflow. +This workflow will estimate contamination using verifyIDintensity on each sample individually. +It requires that you have aggregated BCF or GTC files. +It first pulls B-allele frequencies from the 1000 Genomes VCF file. +It then estimate contamination for each sample and aggregates these results. From f28fc9c839771dfa77f9b627b2d52a7a97d9445e Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Tue, 24 Sep 2024 12:22:03 -0400 Subject: [PATCH 31/36] docs: Adds the documentation for new intensity subworkflow and shows processing for both IDAT and BCF input. --- docs/static/bcf_intensity.svg | 44 ++++++++++++++++++++ docs/static/idat_intensity.svg | 56 ++++++++++++++++++++++++++ docs/sub_workflows/intensity_check.rst | 27 +++++++++++++ 3 files changed, 127 insertions(+) create mode 100755 docs/static/bcf_intensity.svg create mode 100755 docs/static/idat_intensity.svg create mode 100755 docs/sub_workflows/intensity_check.rst diff --git a/docs/static/bcf_intensity.svg b/docs/static/bcf_intensity.svg new file mode 100755 index 00000000..bcca12ce --- /dev/null +++ b/docs/static/bcf_intensity.svg @@ -0,0 +1,44 @@ + + + + + + +snakemake_dag + +Aggregated BCF input + + +0 + +all_contamination + + + +1 + +agg_median_idat_intensity + + + +1->0 + + + + + +2 + +per_sample_median_intensity_from_vcf + + + +2->1 + + + + + diff --git a/docs/static/idat_intensity.svg b/docs/static/idat_intensity.svg new file mode 100755 index 00000000..1b1aa12d --- /dev/null +++ b/docs/static/idat_intensity.svg @@ -0,0 +1,56 @@ + + + + + + +snakemake_dag + +IDAT input + + +0 + +all_contamination + + + +1 + +agg_median_idat_intensity + + + +1->0 + + + + + +2 + +per_sample_median_idat_intensity + + + +2->1 + + + + + +3 + +illuminaio_conda + + + +3->2 + + + + + diff --git a/docs/sub_workflows/intensity_check.rst b/docs/sub_workflows/intensity_check.rst new file mode 100755 index 00000000..de33aade --- /dev/null +++ b/docs/sub_workflows/intensity_check.rst @@ -0,0 +1,27 @@ +.. _intensity: + +Intensity check Sub-workflow +============================ + +**Workflow File**: + https://github.com/NCI-CGR/GwasQcPipeline/blob/default/src/cgr_gwas_qc/workflow/sub_workflows/intensity_check.smk + +**Config Options**: see :ref:`config-yaml` for more details + + - ``user_files.bcf`` or ``user_files.idat_pattern`` + +**Major Outputs**: + + - ``sample_level/contamination/median_idat_intensity.csv`` aggregated table of median IDAT intensities. + +|bcf_input_intensity| |idat_input_intensity| + +.. |idat_input_intensity| image:: ../static/idat_intensity.svg + :width: 45% + +.. |bcf_input_intensity| image:: ../static/bcf_intensity.svg + :width: 45% + +The Intenstiy check sub-workflow. +This workflow will caclucate the median intensity for each sample individually. +It requires that you have an aggregated BCF or IDAT files. From d42a035abadb0d673d66dbf04d0b29ccaa6e9a20 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Tue, 24 Sep 2024 12:24:04 -0400 Subject: [PATCH 32/36] docs: Updates index to include intensity_check subworkflow. 
--- docs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.rst b/docs/index.rst index f53e93d1..89d3b560 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -29,6 +29,7 @@ This lets us take advantage of snakemake_'s amazing workflow management system, :maxdepth: 1 sub_workflows/entry_points + sub_workflows/intensity_check sub_workflows/contamination sub_workflows/sample_qc sub_workflows/subject_qc From 3ea4d4cafa9acd5fa106cbee4c7673a288532bc2 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Wed, 25 Sep 2024 15:51:22 -0400 Subject: [PATCH 33/36] docs: corrects spellings in documentation --- docs/sub_workflows/contamination.rst | 2 +- docs/sub_workflows/intensity_check.rst | 4 ++-- tests/workflow/scripts/test_median_intensity_from_vcf.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/sub_workflows/contamination.rst b/docs/sub_workflows/contamination.rst index 2d1d0f11..4b833dda 100644 --- a/docs/sub_workflows/contamination.rst +++ b/docs/sub_workflows/contamination.rst @@ -30,4 +30,4 @@ The contamination sub-workflow. This workflow will estimate contamination using verifyIDintensity on each sample individually. It requires that you have aggregated BCF or GTC files. It first pulls B-allele frequencies from the 1000 Genomes VCF file. -It then estimate contamination for each sample and aggregates these results. +It then estimates contamination for each sample and aggregates these results. diff --git a/docs/sub_workflows/intensity_check.rst b/docs/sub_workflows/intensity_check.rst index de33aade..25bbb4d1 100755 --- a/docs/sub_workflows/intensity_check.rst +++ b/docs/sub_workflows/intensity_check.rst @@ -22,6 +22,6 @@ Intensity check Sub-workflow .. |bcf_input_intensity| image:: ../static/bcf_intensity.svg :width: 45% -The Intenstiy check sub-workflow. -This workflow will caclucate the median intensity for each sample individually. +The Intensity check sub-workflow. +This workflow will calculate the median intensity for each sample individually. It requires that you have an aggregated BCF or IDAT files. diff --git a/tests/workflow/scripts/test_median_intensity_from_vcf.py b/tests/workflow/scripts/test_median_intensity_from_vcf.py index 542b889e..3c650985 100644 --- a/tests/workflow/scripts/test_median_intensity_from_vcf.py +++ b/tests/workflow/scripts/test_median_intensity_from_vcf.py @@ -3,7 +3,7 @@ """Test calculation of median intensity from BCF -We need to assess whether the intensity meets the miniumum threshold. For this purpose, we typically check median IDAT intensity. Here we attempt to test the same with BCF as input file. +We need to assess whether the intensity meets the minimum threshold. For this purpose, we typically check median IDAT intensity. Here we attempt to test the same with BCF as input file. """ import pandas as pd From 4bcf9e67ec55730b6bc9e5fac169658f006215d6 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Wed, 25 Sep 2024 15:59:31 -0400 Subject: [PATCH 34/36] Removes jupyter comment lines from scripts. 
--- .../workflow/scripts/grouped_intensity_from_vcf.py | 10 ---------- src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py | 14 -------------- 2 files changed, 24 deletions(-) diff --git a/src/cgr_gwas_qc/workflow/scripts/grouped_intensity_from_vcf.py b/src/cgr_gwas_qc/workflow/scripts/grouped_intensity_from_vcf.py index 9bfd1411..f3ea4363 100755 --- a/src/cgr_gwas_qc/workflow/scripts/grouped_intensity_from_vcf.py +++ b/src/cgr_gwas_qc/workflow/scripts/grouped_intensity_from_vcf.py @@ -1,8 +1,6 @@ #!/usr/bin/env python # coding: utf-8 -# In[5]: - import shutil from concurrent.futures import ProcessPoolExecutor @@ -15,8 +13,6 @@ from cgr_gwas_qc.typing import PathLike from cgr_gwas_qc.workflow.scripts import agg_median_idat_intensity, median_intensity_from_vcf -# In[ ]: - def main( vcf_file: PathLike, @@ -38,9 +34,6 @@ def main( shutil.rmtree(tmp_dir) -# In[10]: - - def calculate_median_intensity_from_vcf(ss, vcf_file, outdir, threads) -> List[Path]: outfile_pattern = (outdir / "{Sample_ID}.csv").as_posix() with ProcessPoolExecutor(threads) as executor: @@ -63,9 +56,6 @@ def calculate_median_intensity_from_vcf(ss, vcf_file, outdir, threads) -> List[P return outfiles -# In[6]: - - if __name__ == "__main__" and "snakemake" in locals(): main( **{k: v for k, v in snakemake.input.items()}, # type: ignore # noqa diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py index 6be1d5ec..58799ce7 100755 --- a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py +++ b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py @@ -107,20 +107,6 @@ def vcf_to_adpc_record(vcf_info) -> Generator[AdpcRecord, None, None]: yield AdpcRecord(x_raw, y_raw, x_norm, y_norm, genotype_score, genotype) -# In[ ]: - - -# def adpc_as_df(adpc): -# i=0 -# adpc_records = [] -# with AdpcReader(adpc) as fh: -# for record in fh: -# # if i <10000: -# adpc_records.append(record) -# i+=1 -# return(pd.DataFrame(adpc_records,columns=['x_raw', 'y_raw', 'x_norm', 'y_norm', 'genotype_score', 'genotype'])) - - if __name__ == "__main__": if "snakemake" in locals(): defaults = {} From 37de67c731447a0c1e606e22b49958eb76cd5b03 Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Wed, 25 Sep 2024 16:39:54 -0400 Subject: [PATCH 35/36] docs: updates the docstring for user_files in config section to include BCF. --- src/cgr_gwas_qc/models/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cgr_gwas_qc/models/config/__init__.py b/src/cgr_gwas_qc/models/config/__init__.py index 0b4d23e9..08070e4b 100644 --- a/src/cgr_gwas_qc/models/config/__init__.py +++ b/src/cgr_gwas_qc/models/config/__init__.py @@ -106,7 +106,7 @@ class Config(BaseModel): user_files: UserFiles = Field( ..., description="User file namespace. " - "User files include the user provided genotype data in IDAT/GTC, PED/MAP, or BED/BIM/FAM formats.", + "User files include the user provided genotype data in BCF, IDAT/GTC, PED/MAP, BED/BIM/FAM formats.", ) software_params: SoftwareParams = Field( From 58ed006a25e49a6c4a8810b75e8989d667b6c28a Mon Sep 17 00:00:00 2001 From: rajwanir2 Date: Wed, 25 Sep 2024 16:42:18 -0400 Subject: [PATCH 36/36] docs: redo updates the user_files docstring to include BCF. 
--- src/cgr_gwas_qc/models/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cgr_gwas_qc/models/config/__init__.py b/src/cgr_gwas_qc/models/config/__init__.py index 08070e4b..2b415e2e 100644 --- a/src/cgr_gwas_qc/models/config/__init__.py +++ b/src/cgr_gwas_qc/models/config/__init__.py @@ -106,7 +106,7 @@ class Config(BaseModel): user_files: UserFiles = Field( ..., description="User file namespace. " - "User files include the user provided genotype data in BCF, IDAT/GTC, PED/MAP, BED/BIM/FAM formats.", + "User files include the user provided genotype data in BCF, IDAT/GTC, PED/MAP or BED/BIM/FAM formats.", ) software_params: SoftwareParams = Field(