diff --git a/docs/index.rst b/docs/index.rst
index f53e93d1..89d3b560 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -29,6 +29,7 @@ This lets us take advantage of snakemake_'s amazing workflow management system,
    :maxdepth: 1

    sub_workflows/entry_points
+   sub_workflows/intensity_check
    sub_workflows/contamination
    sub_workflows/sample_qc
    sub_workflows/subject_qc
diff --git a/docs/static/bcf_contamination.svg b/docs/static/bcf_contamination.svg
new file mode 100755
index 00000000..982a4260
[new SVG, 80 lines: snakemake DAG titled "Aggregated BCF input" — per_sample_vcf_to_adpc, pull_b_allele_freq_from_1kg, and verifyidintensity_conda feed per_sample_contamination_verifyIDintensity -> agg_verifyidintensity -> all_contamination]
diff --git a/docs/static/bcf_intensity.svg b/docs/static/bcf_intensity.svg
new file mode 100755
index 00000000..bcca12ce
[new SVG, 44 lines: snakemake DAG titled "Aggregated BCF input" — per_sample_median_intensity_from_vcf -> agg_median_idat_intensity -> all_contamination]
diff --git a/docs/static/gtc_contamination.svg b/docs/static/gtc_contamination.svg
new file mode 100755
index 00000000..e4cf2451
[new SVG, 80 lines: snakemake DAG titled "GTC input" — per_sample_gtc_to_adpc, pull_b_allele_freq_from_1kg, and verifyidintensity_conda feed per_sample_contamination_verifyIDintensity -> agg_verifyidintensity -> all_contamination]
diff --git a/docs/static/idat_intensity.svg b/docs/static/idat_intensity.svg
new file mode 100755
index 00000000..1b1aa12d
[new SVG, 56 lines: snakemake DAG titled "IDAT input" — illuminaio_conda -> per_sample_median_idat_intensity -> agg_median_idat_intensity -> all_contamination]
diff --git a/docs/sub_workflows/contamination.rst b/docs/sub_workflows/contamination.rst
index 974c06be..4b833dda 100644
--- a/docs/sub_workflows/contamination.rst
+++ b/docs/sub_workflows/contamination.rst
@@ -6,27 +6,28 @@ Contamination Sub-workflow

 **Workflow File**: https://github.com/NCI-CGR/GwasQcPipeline/blob/default/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk

+**Major Outputs**:
+
+    - ``sample_level/..abf.txt`` B allele frequencies from the 1000 genomes.
+    - ``sample_level/contamination/verifyIDintensity.csv`` aggregated table of contamination scores.
+
 **Config Options**: see :ref:`config-yaml` for more details

     - ``reference_files.thousand_genome_vcf``
     - ``reference_files.thousand_genome_tbi``
-    - ``user_files.gtc_pattern``
-    - ``user_files.idat_pattern``
+    - ``user_files.bcf`` or ( ``reference_files.illumina_manifest_file`` and ``user_files.gtc_pattern`` )
     - ``software_params.contam_population``

-**Major Outputs**:
-
-    - ``sample_level/..abf.txt`` B allele frequencies from the 1000 genomes.
-    - ``sample_level/contamination/median_idat_intensity.csv`` aggregated table of median IDAT intensities.
-    - ``sample_level/contamination/verifyIDintensity.csv`` aggregated table of contamination scores.
+|bcf_input_contamination| |gtc_input_contamination|
+
+.. |gtc_input_contamination| image:: ../static/gtc_contamination.svg
+    :width: 45%

-.. figure:: ../static/contamination.png
-    :name: fig-contamination-workflow
+.. |bcf_input_contamination| image:: ../static/bcf_contamination.svg
+    :width: 45%

-    The contamination sub-workflow.
-    This workflow will estimate contamination using verifyIDintensity on each sample individually.
-    It requires that you have GTC/IDAT files.
-    It first pulls B-allele frequencies from the 1000 Genomes VCF file.
-    It then estimate contamination for each sample and aggregates these results.
-    Finally, it also estimates the per sample median IDAT intensity, which is used to filter contamination results in the :ref:`sample-qc`
+The contamination sub-workflow.
+This workflow will estimate contamination using verifyIDintensity on each sample individually.
+It requires that you have an aggregated BCF file or per-sample GTC files.
+It first pulls B-allele frequencies from the 1000 Genomes VCF file.
+It then estimates contamination for each sample and aggregates these results.
diff --git a/docs/sub_workflows/entry_points.rst b/docs/sub_workflows/entry_points.rst
index 5a77ced4..7c2852fd 100644
--- a/docs/sub_workflows/entry_points.rst
+++ b/docs/sub_workflows/entry_points.rst
@@ -15,6 +15,7 @@ Entry Points Sub-workflow
     - ``user_files.bed``
     - ``user_files.bim``
     - ``user_files.fam``
+    - ``user_files.bcf``

 **Major Outputs**:

@@ -22,7 +23,7 @@ Entry Points Sub-workflow
     - ``sample_level/samples.bim``
     - ``sample_level/samples.fam``

-There are three paths we can take to create these files:
+There are four paths we can take to create these files:

 1. If GTC files are provided using ``user_files.gtc_pattern`` then we will

@@ -34,3 +35,4 @@ There are three paths we can take to create these files:
 2. If an aggregated PED/MAP is provided using ``user_files.ped`` and ``user_files.map`` then we will convert the PED/MAP to BED/BIM/FAM.
 3. If an aggregated BED/BIM/FAM is provided using ``user_files.bed``, ``user_files.bim``, ``user_files.fam`` then we will create a symbolic link.
+4. If an aggregated BCF file is provided using ``user_files.bcf`` then we will convert the BCF to BED/BIM/FAM.
diff --git a/docs/sub_workflows/intensity_check.rst b/docs/sub_workflows/intensity_check.rst
new file mode 100755
index 00000000..25bbb4d1
--- /dev/null
+++ b/docs/sub_workflows/intensity_check.rst
@@ -0,0 +1,27 @@
+.. _intensity:
+
+Intensity Check Sub-workflow
+============================
+
+**Workflow File**:
+    https://github.com/NCI-CGR/GwasQcPipeline/blob/default/src/cgr_gwas_qc/workflow/sub_workflows/intensity_check.smk
+
+**Config Options**: see :ref:`config-yaml` for more details
+
+    - ``user_files.bcf`` or ``user_files.idat_pattern``
+
+**Major Outputs**:
+
+    - ``sample_level/contamination/median_idat_intensity.csv`` aggregated table of median IDAT intensities.
+
+|bcf_input_intensity| |idat_input_intensity|
+
+.. |idat_input_intensity| image:: ../static/idat_intensity.svg
+    :width: 45%
+
+.. |bcf_input_intensity| image:: ../static/bcf_intensity.svg
+    :width: 45%
+
+The intensity check sub-workflow.
+This workflow will calculate the median intensity for each sample individually.
+It requires that you have an aggregated BCF file or per-sample IDAT files.
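+
+Before running with a BCF, it can help to confirm that the file actually carries
+the intensity fields this pipeline reads. A minimal sketch (``pysam`` assumed;
+for the intensity check only ``X``/``Y`` are strictly needed, while the
+contamination sub-workflow also reads ``NORMX``, ``NORMY``, ``GT``, and ``IGC``):
+
+.. code-block:: python
+
+    from pysam import VariantFile
+
+    REQUIRED = {"X", "Y", "NORMX", "NORMY", "GT", "IGC"}
+
+    vcf = VariantFile("/path/to/samples.bcf")
+    missing = REQUIRED - set(vcf.header.formats.keys())
+    if missing:
+        raise SystemExit(f"BCF is missing FORMAT fields: {sorted(missing)}")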
diff --git a/docs/sub_workflows/subject_qc.rst b/docs/sub_workflows/subject_qc.rst
index def14c1a..6536d395 100644
--- a/docs/sub_workflows/subject_qc.rst
+++ b/docs/sub_workflows/subject_qc.rst
@@ -52,7 +52,7 @@ Selecting Subject Representative

 During :ref:`sample-qc` we build the ``sample_level/sample_qc.csv``.
 In this table we have a flag ``is_subject_representative`` to indicate if a sample was selected to represent the subject.
-The ``subject_level?subject_qc.csv`` is simply the ``sample_level/sample_qc.csv`` but only with subject representatives.
+The ``subject_level/subject_qc.csv`` is simply the ``sample_level/sample_qc.csv`` but only with subject representatives.
 We then pull out these samples from ``sample_level/call_rate_2/samples.{bed,bim,fam}`` and rename them to subject IDs.

 Population Level Analysis
diff --git a/src/cgr_gwas_qc/models/config/__init__.py b/src/cgr_gwas_qc/models/config/__init__.py
index 0b4d23e9..2b415e2e 100644
--- a/src/cgr_gwas_qc/models/config/__init__.py
+++ b/src/cgr_gwas_qc/models/config/__init__.py
@@ -106,7 +106,7 @@ class Config(BaseModel):
     user_files: UserFiles = Field(
         ...,
         description="User file namespace. "
-        "User files include the user provided genotype data in IDAT/GTC, PED/MAP, or BED/BIM/FAM formats.",
+        "User files include the user provided genotype data in BCF, IDAT/GTC, PED/MAP, or BED/BIM/FAM formats.",
     )

     software_params: SoftwareParams = Field(
diff --git a/src/cgr_gwas_qc/models/config/reference_files.py b/src/cgr_gwas_qc/models/config/reference_files.py
index 98941c6b..cc04b0cd 100644
--- a/src/cgr_gwas_qc/models/config/reference_files.py
+++ b/src/cgr_gwas_qc/models/config/reference_files.py
@@ -17,7 +17,9 @@ class ReferenceFiles(BaseModel):
     """

-    illumina_manifest_file: Path = Field(..., description="Path to the Illumina provided BPM file.")
+    illumina_manifest_file: Optional[Path] = Field(
+        None, description="Path to the Illumina provided BPM file."
+    )

     illumina_cluster_file: Optional[Path] = Field(
         None, description="Path to the array cluster EGT file."
diff --git a/src/cgr_gwas_qc/models/config/user_files.py b/src/cgr_gwas_qc/models/config/user_files.py
index d795265e..00566626 100644
--- a/src/cgr_gwas_qc/models/config/user_files.py
+++ b/src/cgr_gwas_qc/models/config/user_files.py
@@ -35,9 +35,19 @@ class UserFiles(BaseModel):
             bim: /path/to/samples.bim
             fam: /path/to/samples.fam

+    or
+
+    .. code-block:: yaml
+
+        user_files:
+            output_pattern: '{prefix}/{file_type}.{ext}'
+            bcf: /path/to/samples.bcf
+
     .. note::
-        The IDAT/GTC patterns, PED/MAP paths, and BED/BIM/FAM paths are mutually exclusive.
+        The BCF file, IDAT/GTC patterns, PED/MAP paths, and BED/BIM/FAM paths are all mutually exclusive.
         You should only provide one set of patterns/paths.
+ + """ output_pattern: str = Field( @@ -89,6 +99,21 @@ class UserFiles(BaseModel): description="The full path to an aggregated FAM file if the sample level GTC files are not available.", ) + # BCF + bcf: Optional[Path] = Field( + None, + description="The full path to an aggregated BCF/VCF file perferably encoding the GenCall scores.", + ) + + @validator("bcf") + def validate_bcf(cls, v): + if v is None: + return v + elif v.suffix == ".bcf": + return v + else: + raise ValueError("BCF suffix should be *.bcf") + @validator("gtc_pattern") def validate_gtc_pattern(cls, v): if v is None: diff --git a/src/cgr_gwas_qc/workflow/Snakefile b/src/cgr_gwas_qc/workflow/Snakefile index cf38277b..1aa2a824 100644 --- a/src/cgr_gwas_qc/workflow/Snakefile +++ b/src/cgr_gwas_qc/workflow/Snakefile @@ -48,7 +48,7 @@ use rule * from sample_qc targets.extend(sample_qc.targets) -if sample_qc.use_contamination: +if sample_qc.use_contamination or cfg.config.user_files.bcf: module contamination: snakefile: @@ -58,10 +58,22 @@ if sample_qc.use_contamination: use rule * from contamination - targets.extend(contamination.targets) +if sample_qc.use_contamination or cfg.config.user_files.bcf: + + module intensity_check: + snakefile: + cfg.subworkflow("intensity_check") + config: + config + + use rule * from intensity_check + + targets.extend(intensity_check.targets) + + module subject_qc: snakefile: cfg.subworkflow("subject_qc") diff --git a/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk b/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk index 3603bd0a..ac36f41e 100644 --- a/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk +++ b/src/cgr_gwas_qc/workflow/modules/thousand_genomes.smk @@ -21,13 +21,38 @@ rule pull_b_allele_freq_from_1kg: population=cfg.config.software_params.contam_population, output: abf_file="sample_level/{}.{}.abf.txt".format( - cfg.config.reference_files.illumina_manifest_file.stem, + cfg.config.snp_array, cfg.config.software_params.contam_population, ), script: "../scripts/bpm2abf.py" +rule pull_b_allele_freq_from_1kg_bcfinput: + """Pulls the population level allele frequencies from the 1KG project. + + ``verifyIDintensity`` requires population level allele frequencies + for its model. Here we use a custom script to pull out the allele B + frequencies (ABF) from the 1000 genomes project (1KG). To do this we + take each marker from the manifest file (BPM) and pull out ABF in the + 1KG ``.vcf`` from the ``INFO`` column. The script allows pulling out + allele frequencies for different super populations but defaults to + ``AF`` which ignores super population. + """ + input: + bcf_file=cfg.config.user_files.bcf, + kgvcf_file=cfg.config.reference_files.thousand_genome_vcf, + params: + population=cfg.config.software_params.contam_population, + output: + abf_file="sample_level/{}.{}.abf.txt".format( + cfg.config.snp_array, + cfg.config.software_params.contam_population, + ), + script: + "../scripts/vcf2abf.py" + + rule update_snps_to_1kg_rsID: """Update SNP IDs to rsID from the 1KG project. 
diff --git a/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py b/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py
index a8f79749..07787f47 100644
--- a/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py
+++ b/src/cgr_gwas_qc/workflow/scripts/grouped_contamination.py
@@ -1,26 +1,42 @@
 import shutil
 from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
-from typing import List
+from typing import List, Optional

+import typer
 from snakemake.rules import expand

 from cgr_gwas_qc.parsers import sample_sheet
 from cgr_gwas_qc.typing import PathLike
-from cgr_gwas_qc.workflow.scripts import agg_verifyidintensity, gtc2adpc, verifyidintensity
+from cgr_gwas_qc.workflow.scripts import (
+    agg_verifyidintensity,
+    gtc2adpc,
+    vcf2adpc,
+    verifyidintensity,
+)

+app = typer.Typer(add_completion=False)


+@app.command()
 def main(
-    sample_sheet_csv: PathLike,
-    bpm_file: PathLike,
-    abf_file: PathLike,
-    grp: str,
-    gtc_pattern: str,
-    snps: int,
-    conda_env: PathLike,
-    outfile: PathLike,
-    notemp: bool = False,
-    threads: int = 8,
+    sample_sheet_csv: PathLike = typer.Argument(..., help="Path to the cgr sample sheet"),
+    bpm_file: Optional[PathLike] = typer.Argument(
+        None, help="Path to the BPM file (optional; used with GTC input)"
+    ),
+    abf_file: PathLike = typer.Argument(..., help="Allele B frequency file"),
+    bcf_file: Optional[PathLike] = typer.Argument(
+        None, help="Path to the aggregated BCF file (optional; used with BCF/VCF input)"
+    ),
+    grp: str = typer.Argument(..., help="cluster group"),
+    gtc_pattern: Optional[str] = typer.Argument(
+        None, help="GTC file pattern (optional; used with GTC input)"
+    ),
+    snps: int = typer.Argument(..., help="number of SNPs"),
+    conda_env: PathLike = typer.Argument(..., help="conda environment"),
+    outfile: PathLike = typer.Argument(..., help="output file name"),
+    notemp: bool = typer.Argument(False, help="do not create temporary files"),
+    threads: int = typer.Argument(8, help="number of threads"),
 ):

     ss = sample_sheet.read(sample_sheet_csv).query(f"cluster_group == '{grp}'")
@@ -37,8 +53,12 @@ def main(
     adpc_pattern = (tmp_adpc / "{Sample_ID}.adpc.bin").as_posix()
     contam_pattern = (tmp_contam / "{Sample_ID}.txt").as_posix()

-    # Convert GTC to ADPC
-    convert_gtc_to_adpc(ss, bpm_file, gtc_pattern, adpc_pattern, threads)
+    # If a BCF file is the input, convert VCF to ADPC per sample
+    if bcf_file:
+        convert_vcf_to_adpc(ss, bcf_file, adpc_pattern, threads)
+    # Otherwise convert GTC to ADPC (the default)
+    else:
+        convert_gtc_to_adpc(ss, bpm_file, gtc_pattern, adpc_pattern, threads)

     # Estimate Contamination
     contam_files = estimate_contamination(
@@ -71,6 +91,26 @@ def convert_gtc_to_adpc(ss, bpm_file, gtc_pattern, outfile_pattern, threads):
     assert all(f.exists() for f in outfiles)


+def convert_vcf_to_adpc(ss, bcf_file, outfile_pattern, threads):
+    with ProcessPoolExecutor(threads) as executor:
+        futures = [
+            executor.submit(
+                vcf2adpc.main,
+                bcf_file,
+                record.Sample_ID,
+                outfile_pattern.format(**record.to_dict()),
+            )
+            for _, record in ss.iterrows()
+        ]
+
+        # Wait for results
+        [res.result() for res in futures]
+
+    # Check the expected output files
+    outfiles = [Path(f) for f in expand(outfile_pattern, **ss.to_dict("list"))]
+    assert all(f.exists() for f in outfiles)
+
+
 def estimate_contamination(
     ss, adpc_pattern, abf_file, outfile_pattern, snps, conda_env, threads
 ) -> List[Path]:
@@ -99,11 +139,7 @@ def estimate_contamination(

 if __name__ == "__main__" and "snakemake" in locals():
     main(
         **{k: v for k, v in
snakemake.input.items() if not k.startswith("_")}, # type: ignore # noqa - grp=snakemake.params.grp, # type: ignore # noqa - gtc_pattern=snakemake.params.gtc_pattern, # type: ignore # noqa - snps=int(snakemake.params.snps), # type: ignore # noqa - conda_env=snakemake.params.conda_env, # type: ignore # noqa - notemp=bool(snakemake.params.notemp), # type: ignore # noqa + **{k: v for k, v in snakemake.params.items()}, # type: ignore # noqa outfile=snakemake.output[0], # type: ignore # noqa threads=snakemake.threads or 2, # type: ignore # noqa ) diff --git a/src/cgr_gwas_qc/workflow/scripts/grouped_intensity_from_vcf.py b/src/cgr_gwas_qc/workflow/scripts/grouped_intensity_from_vcf.py new file mode 100755 index 00000000..f3ea4363 --- /dev/null +++ b/src/cgr_gwas_qc/workflow/scripts/grouped_intensity_from_vcf.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +# coding: utf-8 + + +import shutil +from concurrent.futures import ProcessPoolExecutor +from pathlib import Path +from typing import List + +from snakemake.rules import expand + +from cgr_gwas_qc.parsers import sample_sheet +from cgr_gwas_qc.typing import PathLike +from cgr_gwas_qc.workflow.scripts import agg_median_idat_intensity, median_intensity_from_vcf + + +def main( + vcf_file: PathLike, + sample_sheet_csv: PathLike, + grp: str, + outfile: PathLike, + notemp: bool = False, + threads: int = 8, +): + ss = sample_sheet.read(sample_sheet_csv).query(f"cluster_group == '{grp}'") + tmp_dir = Path(outfile).parent / "temp_median_idat" + tmp_dir.mkdir(exist_ok=True, parents=True) + + intensity_files = calculate_median_intensity_from_vcf(ss, vcf_file, tmp_dir, threads) + + agg_median_idat_intensity.main(intensity_files, outfile) + + if not notemp: + shutil.rmtree(tmp_dir) + + +def calculate_median_intensity_from_vcf(ss, vcf_file, outdir, threads) -> List[Path]: + outfile_pattern = (outdir / "{Sample_ID}.csv").as_posix() + with ProcessPoolExecutor(threads) as executor: + futures = [ + executor.submit( + median_intensity_from_vcf.main, + vcf_file, + record.Sample_ID, + outfile_pattern.format(**record.to_dict()), + ) + for _, record in ss.iterrows() + ] + + # Wait for results + [res.result() for res in futures] + + # Check and Return files + outfiles = [Path(f) for f in expand(outfile_pattern, **ss.to_dict("list"))] + assert all(f.exists() for f in outfiles) + return outfiles + + +if __name__ == "__main__" and "snakemake" in locals(): + main( + **{k: v for k, v in snakemake.input.items()}, # type: ignore # noqa + **{k: v for k, v in snakemake.params.items()}, # type: ignore # noqa + outfile=snakemake.output[0], # type: ignore # noqa + threads=snakemake.threads or 2, # type: ignore # noqa + ) diff --git a/src/cgr_gwas_qc/workflow/scripts/median_intensity_from_vcf.py b/src/cgr_gwas_qc/workflow/scripts/median_intensity_from_vcf.py new file mode 100755 index 00000000..8b5bcf3e --- /dev/null +++ b/src/cgr_gwas_qc/workflow/scripts/median_intensity_from_vcf.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# coding: utf-8 + + +from pathlib import Path +from statistics import median + +import numpy as np +import pandas as pd +import typer +from pysam import VariantFile + +app = typer.Typer(add_completion=False) + + +@app.command() +def main( + vcf_file: Path = typer.Argument( + ..., + help="Path to a multisample BCF/VCF file with needed scores.", + exists=True, + readable=True, + ), + sample_id: str = typer.Argument(..., help="Sample_ID. 
Must match the sample name in the VCF header."),
+    outfile: Path = typer.Argument(
+        ...,
+        help="Path to the output file for the per-sample median intensity.",
+        file_okay=True,
+        writable=True,
+    ),
+) -> None:
+    fields_to_fetch = ["X", "Y"]
+    vcf = VariantFile(vcf_file)
+
+    # Loop over all records in the VCF, fetch the 'X' and 'Y' intensity values,
+    # and store their per-record sum in `sample_values`.
+    sample_values = []
+    for rec in vcf:
+        sample_values.append(np.sum([rec.samples[sample_id].get(key) for key in fields_to_fetch]))
+    vcf.close()
+
+    median_intensity = median(sample_values)
+    df = pd.DataFrame({"Sample_ID": [sample_id], "median_intensity": [median_intensity]})
+    df.to_csv(outfile, index=False)
+
+
+if __name__ == "__main__":
+    if "snakemake" in locals():
+        main(
+            **{k: v for k, v in snakemake.input.items() if not k.startswith("_")},  # type: ignore # noqa
+            **{k: v for k, v in snakemake.params.items()},  # type: ignore # noqa
+            outfile=snakemake.output[0],  # type: ignore # noqa
+        )
+    else:
+        app()
diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py
new file mode 100755
index 00000000..c379c06b
--- /dev/null
+++ b/src/cgr_gwas_qc/workflow/scripts/vcf2abf.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+
+from pathlib import Path
+from typing import Optional
+
+import pandas as pd
+import typer
+from pysam import VariantFile
+
+app = typer.Typer(add_completion=False)
+
+
+def is_biallelic_snp(rec):
+    return len(rec.ref) == 1 and len(rec.alts[0]) == 1
+
+
+@app.command()
+def main(
+    bcf_file: Path = typer.Argument(
+        ..., help="Path to the sample VCF.", exists=True, readable=True
+    ),
+    kgvcf_file: Path = typer.Argument(
+        ..., help="Path to the 1000 genomes VCF file to get ABF.", exists=True, readable=True
+    ),
+    population: str = typer.Argument(
+        ...,
+        help="Which population in the VCF file to use for ABF [not implemented; the global AF is currently used].",
+    ),
+    abf_file: Optional[Path] = typer.Argument(
+        None,
+        help="Output file name. If not given, the input name with an `.abf.txt` suffix is used.",
+    ),
+):
+    """Pull out the B allele frequency for a given subpopulation.
+
+    Takes variants from the provided `bcf_file` and tries to find their B
+    allele frequency in the provided `kgvcf_file`. The B allele frequencies
+    are derived from the population allele frequency (``AC``/``AN``).
+    Selecting a specific subpopulation via `population` is not yet
+    implemented; the global frequency is used. If a variant cannot be
+    matched in the `kgvcf_file`, its ABF is set to "NA".
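+
+    Illustrative output (tab-separated; values hypothetical)::
+
+        SNP_ID  ABF
+        rs0001  0.2143
+        rs0002  NA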
+ """ + + if abf_file is None: + abf_file = bcf_file.with_suffix(".abf.txt") + + # retrieving needed info from the input vcf + vcf = VariantFile(bcf_file, drop_samples=True) + vcf_info = [] + for rec in vcf: + vcf_info.append( + [ + rec.id, + rec.chrom, + rec.pos, + rec.ref[0], + rec.alts[0], + is_biallelic_snp(rec), + # rec.info["IGC"], + rec.info["ALLELE_A"], + ] + ) + + vcf_info = pd.DataFrame( + vcf_info, + columns=[ + "id", + "chrom", + "pos", + "ref", + "alt", + "is_biallelic_snp", + # "gc_score", + "allele_a", + ], + ) + vcf_info["is_chrompos_uniq"] = ( + vcf_info.duplicated(subset=["chrom", "pos"], keep=False) == False + ) # if the input vairants are split, this will catch the multi-allelics + + # fetching B-allele frequencies from the kgvcf + kg_vcf = VariantFile(kgvcf_file, drop_samples=True) + kg_info = [] + for i in range(vcf_info.shape[0]): + if kg_vcf.is_valid_reference_name(vcf_info.chrom[i]): + for rec in kg_vcf.fetch( + contig=vcf_info.chrom[i], start=vcf_info.pos[i] - 1, stop=vcf_info.pos[i] + ): + if ( + is_biallelic_snp(rec) + & (vcf_info.is_biallelic_snp[i]) + & (vcf_info.ref[i] == rec.ref[0]) + & (vcf_info.alt[i] == rec.alts[0]) + & (vcf_info.is_chrompos_uniq[i]) + ): + kg_info.append( + [ + rec.chrom, + rec.pos, + rec.ref[0], + rec.alts[0], + rec.info["AC"][0], + rec.info["AN"], + ] + ) + # else: + # kg_info.append([np.nan,np.nan]) + # else: + # kg_info.append([np.nan,np.nan]) + kg_info = pd.DataFrame(kg_info, columns=["chrom", "pos", "ref", "alt", "ac", "an"]) + + # left-joining the kgvcf B-allele freq to input vcf metrics. + vcf_info = vcf_info.merge( + right=kg_info, how="left", sort=False, on=["chrom", "pos", "ref", "alt"] + ) + # A pseudo way to switch from population AF to Illumina A/B frequency + vcf_info["af"] = abs(vcf_info["allele_a"] - (vcf_info["ac"] / vcf_info["an"])) + # Setting AF to na if gentrain score is not >0, if this is egt file was an older one + # vcf_info["af"].where(vcf_info.gc_score > 0, np.nan, inplace=True) + # writing abf file + vcf_info[["id", "af"]].to_csv( + abf_file, sep="\t", index=False, header=["SNP_ID", "ABF"], na_rep="NA" + ) + + +if __name__ == "__main__": + if "snakemake" in locals(): + defaults = {} + defaults.update({k: Path(v) for k, v in snakemake.input.items()}) # type: ignore # noqa + defaults.update({k: Path(v) for k, v in snakemake.output.items()}) # type: ignore # noqa + defaults.update({k: v for k, v in snakemake.params.items()}) # type: ignore # noqa + main(**defaults) # type: ignore + else: + app() diff --git a/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py new file mode 100755 index 00000000..58799ce7 --- /dev/null +++ b/src/cgr_gwas_qc/workflow/scripts/vcf2adpc.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python +# coding: utf-8 + + +"""Extracts Illumina scores from a multisample VCF for target sample and writes into a single sample adpc.bin format.""" +from pathlib import Path +from typing import Generator + +import numpy as np +import pandas as pd +import typer +from pysam import VariantFile + +# from more_itertools import unzip +from cgr_gwas_qc.parsers.illumina import AdpcRecord, AdpcWriter + +app = typer.Typer(add_completion=False) + + +@app.command() +def main( + bcf_file: Path = typer.Argument( + ..., help="Path to a multisample VCF file with needed scores.", exists=True, readable=True + ), + target_sample: str = typer.Argument(..., help="Sample name."), + outfile: Path = typer.Argument( + ..., help="Path to output the adpc.bin file.", file_okay=True, writable=True + ), 
+) -> None:
+    """Extract Illumina scores from a multisample VCF for a target sample and write them in the single-sample adpc.bin format.
+
+    The adpc.bin format includes::
+
+        x_raw: Raw intensities for allele A
+        y_raw: Raw intensities for allele B
+        x_norm: Normalized intensities for allele A
+        y_norm: Normalized intensities for allele B
+        genotype_score: The genotype clustering confidence score
+        genotype: The called genotype (0: AA, 1: AB, 2: BB, 3: unknown or missing)
+    """
+
+    vcf = VariantFile(bcf_file)
+    sample_fields_to_fetch = ["X", "Y", "NORMX", "NORMY", "GT", "IGC"]
+    info_fields_to_fetch = ["ALLELE_A", "ALLELE_B"]
+
+    renaming_scheme = {
+        "X": "x_raw",
+        "Y": "y_raw",
+        "NORMX": "x_norm",
+        "NORMY": "y_norm",
+        "IGC": "genotype_score",
+    }
+
+    vcf_info = []
+    for rec in vcf:
+        vcf_info.append(
+            [rec.info.get(key) for key in info_fields_to_fetch]
+            + [rec.samples[target_sample].get(key) for key in sample_fields_to_fetch]
+        )
+    vcf_info = pd.DataFrame(vcf_info, columns=info_fields_to_fetch + sample_fields_to_fetch)
+
+    # convert GT to an Illumina A/B genotype
+    vcf_info = pd.concat(
+        [
+            vcf_info.reset_index(drop=True),
+            pd.DataFrame(vcf_info["GT"].tolist(), columns=["GT1", "GT2"]),
+        ],
+        axis=1,
+    )
+    vcf_info["genotype"] = np.nan  # missing by default
+    vcf_info.loc[
+        (vcf_info["GT1"] == vcf_info["ALLELE_A"]) & (vcf_info["GT2"] == vcf_info["ALLELE_A"]),
+        "genotype",
+    ] = "AA"
+    vcf_info.loc[
+        (vcf_info["GT1"] == vcf_info["ALLELE_B"]) & (vcf_info["GT2"] == vcf_info["ALLELE_B"]),
+        "genotype",
+    ] = "BB"
+    vcf_info.loc[(vcf_info["GT1"] != vcf_info["GT2"]), "genotype"] = "AB"
+    vcf_info.loc[(vcf_info["GT1"].isna()) | (vcf_info["GT2"].isna()), "genotype"] = np.nan
+
+    # renaming for consistency with gtc2adpc
+    vcf_info.rename(columns=renaming_scheme, inplace=True)
+
+    # encode Illumina genotypes per the adpc.bin encoding
+    vcf_info = vcf_info.replace(
+        {"genotype": {"AA": 0, "AB": 1, "BA": 1, "BB": 2, "A": 3, "B": 3}}
+    ).fillna({"genotype": 3})
+    # zero out the confidence score where the raw intensity is zero
+    vcf_info["genotype_score"] = vcf_info["genotype_score"].where(vcf_info["x_raw"] != 0, 0)
+
+    # write the adpc.bin file
+    with AdpcWriter(outfile) as fh:
+        for record in vcf_to_adpc_record(vcf_info):
+            fh.write(record)
+
+
+# generator converting score-table rows into AdpcRecords
+def vcf_to_adpc_record(vcf_info) -> Generator[AdpcRecord, None, None]:
+    for x_raw, y_raw, x_norm, y_norm, genotype_score, genotype in zip(
+        vcf_info["x_raw"],
+        vcf_info["y_raw"],
+        vcf_info["x_norm"],
+        vcf_info["y_norm"],
+        vcf_info["genotype_score"],
+        vcf_info["genotype"],
+    ):
+        yield AdpcRecord(x_raw, y_raw, x_norm, y_norm, genotype_score, genotype)
+
+
+if __name__ == "__main__":
+    if "snakemake" in locals():
+        defaults = {}
+        defaults.update({k: Path(v) for k, v in snakemake.input.items()})  # type: ignore # noqa
+        defaults.update({k: v for k, v in snakemake.params.items()})  # type: ignore # noqa
+        defaults.update({"outfile": Path(snakemake.output[0])})  # type: ignore # noqa
+        main(**defaults)
+    else:
+        app()
diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk
index 832a6662..15be96d1 100644
--- a/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk
+++ b/src/cgr_gwas_qc/workflow/sub_workflows/contamination.smk
@@ -9,7 +9,6 @@ cfg = load_config()
 # Contamination Targets
 ################################################################################
 targets = [
-    "sample_level/contamination/median_idat_intensity.csv",
     "sample_level/contamination/verifyIDintensity.csv",
 ]

@@ -29,7 +28,16 @@ module
thousand_genomes: {} -use rule pull_b_allele_freq_from_1kg from thousand_genomes +if cfg.config.user_files.bcf: + + use rule pull_b_allele_freq_from_1kg_bcfinput from thousand_genomes as pull_b_allele_freq_from_1kg with: + input: + bcf_file=cfg.config.user_files.bcf, + kgvcf_file=cfg.config.reference_files.thousand_genome_vcf, + +else: + + use rule pull_b_allele_freq_from_1kg from thousand_genomes ################################################################################ @@ -42,19 +50,9 @@ use rule pull_b_allele_freq_from_1kg from thousand_genomes # sure that snakemake creates the environments, so these PHONY rules will make # sure that the conda env exists. localrules: - illuminaio_conda, verifyidintensity_conda, -rule illuminaio_conda: - output: - temp(".illuminaio_env_built"), - conda: - cfg.conda("illuminaio") - shell: - "touch {output[0]}" - - rule verifyidintensity_conda: output: temp(".verifyidintensity_env_built"), @@ -67,195 +65,250 @@ rule verifyidintensity_conda: ################################################################################ # Workflow Rules ################################################################################ -if config.get("cluster_mode", False): - - localrules: - agg_median_idat_intensity, - agg_verifyidintensity, - - rule grouped_median_idat_intensity: - """Calculate median intensity of Red and Green channels.""" - input: - sample_sheet_csv="cgr_sample_sheet.csv", - _=rules.illuminaio_conda.output[0], - params: - grp="{grp}", - idat_red_pattern=lambda _: cfg.config.user_files.idat_pattern.red, - idat_green_pattern=lambda _: cfg.config.user_files.idat_pattern.green, - rscript=cfg.scripts("median_idat_intensity.R"), - conda_env=cfg.conda("illuminaio"), - notemp=config.get("notemp", False), - output: - temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), - threads: 12 - resources: - mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, - time_hr=lambda wildcards, attempt: 4 * attempt, - script: - "../scripts/grouped_median_idat_intensity.py" - - rule agg_median_idat_intensity: - input: - expand(rules.grouped_median_idat_intensity.output[0], grp=cfg.cluster_groups), - params: - notemp=config.get("notemp", False), - output: - "sample_level/contamination/median_idat_intensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt ** 2, - script: - "../scripts/agg_median_idat_intensity.py" - - rule grouped_contamination: - """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. - - This is the format required by ``verifyIDintensity``. The script also - runs some sanity checks (intensities and normalized intensities > 0; - genotypes are one of {0, 1, 2, 3}) while processing each file. - - .. warning:: - This is a submission hot spot creating 1 job per sample. - - Find contaminated samples using allele intensities. - - Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the - population. - - .. warning:: - This is a submission hot spot creating 1 job per sample. - - .. note:: - Here we are running ``verifyIDintensity`` in single sample mode. This software also has a - multi-sample mode which may be faster and give better estimates. The problem with - multi-sample mode is that it only works when you have a "large" number of samples. 
- """ - input: - sample_sheet_csv="cgr_sample_sheet.csv", - bpm_file=cfg.config.reference_files.illumina_manifest_file, - abf_file=rules.pull_b_allele_freq_from_1kg.output.abf_file, - _=rules.verifyidintensity_conda.output[0], - params: - grp="{grp}", - gtc_pattern=lambda _: cfg.config.user_files.gtc_pattern, - snps=cfg.config.num_snps, - conda_env=cfg.conda("verifyidintensity"), - notemp=config.get("notemp", False), - output: - temp("sample_level/contamination/{grp}/verifyIDintensity.csv"), - threads: 12 - resources: - mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, - time_hr=lambda wildcards, attempt: 4 * attempt, - script: - "../scripts/grouped_contamination.py" - - rule agg_verifyidintensity: - input: - expand(rules.grouped_contamination.output[0], grp=cfg.cluster_groups), - output: - "sample_level/contamination/verifyIDintensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt ** 2, - script: - "../scripts/agg_verifyidintensity.py" - +# If BCF file is input +if cfg.config.user_files.bcf: + if config.get("cluster_mode", False): + + localrules: + agg_verifyidintensity, + + rule grouped_contamination: + """Extracts normalized intensities and other metrics from an aggregated + BCF file and writes to Illumina ADPC.BIN per sample. + + This is the format required by ``verifyIDintensity``. The script also + runs some sanity checks (intensities and normalized intensities > 0; + genotypes are one of {0, 1, 2, 3}) while processing each file. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + + Find contaminated samples using allele intensities. + + Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the + population. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + + .. note:: + Here we are running ``verifyIDintensity`` in single sample mode. This software also has a + multi-sample mode which may be faster and give better estimates. The problem with + multi-sample mode is that it only works when you have a "large" number of samples. + """ + input: + sample_sheet_csv="cgr_sample_sheet.csv", + bcf_file=cfg.config.user_files.bcf, + abf_file=rules.pull_b_allele_freq_from_1kg.output.abf_file, + _=rules.verifyidintensity_conda.output[0], + params: + grp="{grp}", + snps=cfg.config.num_snps, + conda_env=cfg.conda("verifyidintensity"), + notemp=config.get("notemp", False), + output: + temp("sample_level/contamination/{grp}/verifyIDintensity.csv"), + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, + time_hr=lambda wildcards, attempt: 4 * attempt, + script: + "../scripts/grouped_contamination.py" + + rule agg_verifyidintensity: + input: + expand(rules.grouped_contamination.output[0], grp=cfg.cluster_groups), + output: + "sample_level/contamination/verifyIDintensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_verifyidintensity.py" + + else: + + rule per_sample_vcf_to_adpc: + """From an aggregated BCF input file, extracts normalized intensities and + other metrics for the target sample and writes to Illumina ADPC.BIN per sample. + + This is the format required by ``verifyIDintensity``. The script also + runs some sanity checks (intensities and normalized intensities > 0; + genotypes are one of {0, 1, 2, 3}) while processing each file. + + .. warning:: + This is a submission hot spot creating 1 job per sample. 
+ """ + input: + bcf_file=cfg.config.user_files.bcf, + params: + target_sample="{Sample_ID}", + output: + temp("sample_level/per_sample_adpc/{Sample_ID}.adpc.bin"), + resources: + mem_mb=lambda wildcards, attempt: attempt * 1024, + script: + "../scripts/vcf2adpc.py" + + rule per_sample_contamination_verifyIDintensity: + """Find contaminated samples using allele intensities. + + Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the + population. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + + .. note:: + Here we are running ``verifyIDintensity`` in single sample mode. This software also has a + multi-sample mode which may be faster and give better estimates. The problem with + multi-sample mode is that it only works when you have a "large" number of samples. + """ + input: + adpc=rules.per_sample_vcf_to_adpc.output[0], + abf=rules.pull_b_allele_freq_from_1kg.output.abf_file, + _=rules.verifyidintensity_conda.output[0], + params: + snps=cfg.config.num_snps, + conda_env=cfg.conda("verifyidintensity"), + output: + temp("sample_level/per_sample_contamination_test/{Sample_ID}.contam.out"), + resources: + mem_mb=lambda wildcards, attempt: attempt * 1024, + script: + "../scripts/verifyidintensity.py" + + rule agg_verifyidintensity: + input: + cfg.expand(rules.per_sample_contamination_verifyIDintensity.output[0]), + output: + "sample_level/contamination/verifyIDintensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_verifyidintensity.py" else: - - rule per_sample_median_idat_intensity: - """Calculate median intensity of Red and Green channels.""" - input: - red=lambda wc: cfg.expand( - cfg.config.user_files.idat_pattern.red, - query=f"Sample_ID == '{wc.Sample_ID}'", - )[0], - green=lambda wc: cfg.expand( - cfg.config.user_files.idat_pattern.green, - query=f"Sample_ID == '{wc.Sample_ID}'", - )[0], - _=rules.illuminaio_conda.output[0], - params: - sample_id="{Sample_ID}", - rscript=cfg.scripts("median_idat_intensity.R"), - conda_env=cfg.conda("illuminaio"), - output: - temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"), - resources: - mem_mb=lambda wildcards, attempt: attempt * 1024, - script: - "../scripts/median_idat_intensity.py" - - rule agg_median_idat_intensity: - input: - cfg.expand(rules.per_sample_median_idat_intensity.output[0]), - output: - "sample_level/contamination/median_idat_intensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt ** 2, - script: - "../scripts/agg_median_idat_intensity.py" - - rule per_sample_gtc_to_adpc: - """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. - - This is the format required by ``verifyIDintensity``. The script also - runs some sanity checks (intensities and normalized intensities > 0; - genotypes are one of {0, 1, 2, 3}) while processing each file. - - .. warning:: - This is a submission hot spot creating 1 job per sample. 
- """ - input: - gtc_file=lambda wc: cfg.expand( - cfg.config.user_files.gtc_pattern, - query=f"Sample_ID == '{wc.Sample_ID}'", - )[0], - bpm_file=cfg.config.reference_files.illumina_manifest_file, - output: - temp("sample_level/per_sample_adpc/{Sample_ID}.adpc.bin"), - resources: - mem_mb=lambda wildcards, attempt: attempt * 1024, - script: - "../scripts/gtc2adpc.py" - - rule per_sample_contamination_verifyIDintensity: - """Find contaminated samples using allele intensities. - - Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the - population. - - .. warning:: - This is a submission hot spot creating 1 job per sample. - - .. note:: - Here we are running ``verifyIDintensity`` in single sample mode. This software also has a - multi-sample mode which may be faster and give better estimates. The problem with - multi-sample mode is that it only works when you have a "large" number of samples. - """ - input: - adpc=rules.per_sample_gtc_to_adpc.output[0], - abf=rules.pull_b_allele_freq_from_1kg.output.abf_file, - _=rules.verifyidintensity_conda.output[0], - params: - snps=cfg.config.num_snps, - conda_env=cfg.conda("verifyidintensity"), - output: - temp("sample_level/per_sample_contamination_test/{Sample_ID}.contam.out"), - resources: - mem_mb=lambda wildcards, attempt: attempt * 1024, - script: - "../scripts/verifyidintensity.py" - - rule agg_verifyidintensity: - input: - cfg.expand(rules.per_sample_contamination_verifyIDintensity.output[0]), - output: - "sample_level/contamination/verifyIDintensity.csv", - resources: - mem_gb=lambda wildcards, attempt: attempt * 4, - time_hr=lambda wildcards, attempt: attempt ** 2, - script: - "../scripts/agg_verifyidintensity.py" + if config.get("cluster_mode", False): + + localrules: + agg_verifyidintensity, + + rule grouped_contamination: + """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. + + This is the format required by ``verifyIDintensity``. The script also + runs some sanity checks (intensities and normalized intensities > 0; + genotypes are one of {0, 1, 2, 3}) while processing each file. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + + Find contaminated samples using allele intensities. + + Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the + population. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + + .. note:: + Here we are running ``verifyIDintensity`` in single sample mode. This software also has a + multi-sample mode which may be faster and give better estimates. The problem with + multi-sample mode is that it only works when you have a "large" number of samples. 
+ """ + input: + sample_sheet_csv="cgr_sample_sheet.csv", + bpm_file=cfg.config.reference_files.illumina_manifest_file, + abf_file=rules.pull_b_allele_freq_from_1kg.output.abf_file, + _=rules.verifyidintensity_conda.output[0], + params: + grp="{grp}", + gtc_pattern=lambda _: cfg.config.user_files.gtc_pattern, + snps=cfg.config.num_snps, + conda_env=cfg.conda("verifyidintensity"), + notemp=config.get("notemp", False), + output: + temp("sample_level/contamination/{grp}/verifyIDintensity.csv"), + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, + time_hr=lambda wildcards, attempt: 4 * attempt, + script: + "../scripts/grouped_contamination.py" + + rule agg_verifyidintensity: + input: + expand(rules.grouped_contamination.output[0], grp=cfg.cluster_groups), + output: + "sample_level/contamination/verifyIDintensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_verifyidintensity.py" + + else: + + rule per_sample_gtc_to_adpc: + """Converts a sample's GTC/BPM to an Illumina ADPC.BIN. + + This is the format required by ``verifyIDintensity``. The script also + runs some sanity checks (intensities and normalized intensities > 0; + genotypes are one of {0, 1, 2, 3}) while processing each file. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + """ + input: + gtc_file=lambda wc: cfg.expand( + cfg.config.user_files.gtc_pattern, + query=f"Sample_ID == '{wc.Sample_ID}'", + )[0], + bpm_file=cfg.config.reference_files.illumina_manifest_file, + output: + temp("sample_level/per_sample_adpc/{Sample_ID}.adpc.bin"), + resources: + mem_mb=lambda wildcards, attempt: attempt * 1024, + script: + "../scripts/gtc2adpc.py" + + rule per_sample_contamination_verifyIDintensity: + """Find contaminated samples using allele intensities. + + Uses ``verifyIDintensity`` to find samples with allele intensities that deviate from the + population. + + .. warning:: + This is a submission hot spot creating 1 job per sample. + + .. note:: + Here we are running ``verifyIDintensity`` in single sample mode. This software also has a + multi-sample mode which may be faster and give better estimates. The problem with + multi-sample mode is that it only works when you have a "large" number of samples. 
+ """ + input: + adpc=rules.per_sample_gtc_to_adpc.output[0], + abf=rules.pull_b_allele_freq_from_1kg.output.abf_file, + _=rules.verifyidintensity_conda.output[0], + params: + snps=cfg.config.num_snps, + conda_env=cfg.conda("verifyidintensity"), + output: + temp("sample_level/per_sample_contamination_test/{Sample_ID}.contam.out"), + resources: + mem_mb=lambda wildcards, attempt: attempt * 1024, + script: + "../scripts/verifyidintensity.py" + + rule agg_verifyidintensity: + input: + cfg.expand(rules.per_sample_contamination_verifyIDintensity.output[0]), + output: + "sample_level/contamination/verifyIDintensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_verifyidintensity.py" diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk b/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk index 46903516..07b9a2c1 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/entry_points.smk @@ -177,3 +177,40 @@ elif cfg.config.user_files.bed and cfg.config.user_files.bim and cfg.config.user "ln -s $(readlink -f {input.bed}) {output.bed} && " "ln -s $(readlink -f {input.bim}) {output.bim} && " "ln -s $(readlink -f {input.fam}) {output.fam}" + +elif cfg.config.user_files.bcf: + + ################################################################################ + # Aggregated BCF + ################################################################################ + localrules: + convert_bcf_to_plink_bed, + + rule convert_bcf_to_plink_bed: + """Converts BCF to plink BED file + + Path to aggregated BCF file is expected in user_files in config. The expected + VCF should be created using BCFtools/gtc2vcf. The BCF will be converted to BED + file for processing. + """ + input: + bcf=cfg.config.user_files.bcf, + params: + prefix="sample_level/samples", + output: + bed="sample_level/samples.bed", + bim="sample_level/samples.bim", + fam="sample_level/samples.fam", + log: + "sample_level/samples.log", + conda: + cfg.conda("plink2") + resources: + mem_mb=lambda wildcards, attempt: 1024 * 8 * attempt, + shell: + "plink " + "--allow-extra-chr 0 --keep-allele-order --double-id " + "--bcf {input.bcf} " + "--make-bed " + "--out {params.prefix} " + "--memory {resources.mem_mb}" diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/intensity_check.smk b/src/cgr_gwas_qc/workflow/sub_workflows/intensity_check.smk new file mode 100644 index 00000000..73cb1d6e --- /dev/null +++ b/src/cgr_gwas_qc/workflow/sub_workflows/intensity_check.smk @@ -0,0 +1,182 @@ +import pandas as pd + +from cgr_gwas_qc import load_config + +cfg = load_config() + + +################################################################################ +# Intensity Targets +################################################################################ +targets = [ + "sample_level/contamination/median_idat_intensity.csv", +] + + +rule all_contamination: + input: + targets, + + +################################################################################ +# PHONY Rules +################################################################################ +# NOTE: Because of job grouping it is cleaner to wrap the various CLI utilities in +# their own python script. This makes using conda more complicated. Instead of +# installing all of the python dependencies in each environment, I will just +# pass the conda environment and activate it internal. 
However, we want to make +# sure that snakemake creates the environments, so these PHONY rules will make +# sure that the conda env exists. +localrules: + illuminaio_conda, + + +rule illuminaio_conda: + output: + temp(".illuminaio_env_built"), + conda: + cfg.conda("illuminaio") + shell: + "touch {output[0]}" + + +################################################################################ +# Workflow Rules +################################################################################ +if cfg.config.user_files.bcf: + if config.get("cluster_mode", False): + + localrules: + agg_median_idat_intensity, + + rule grouped_median_intensity_from_vcf: + """Calculate median intensity from raw intensities using VCF/BCF input.""" + input: + sample_sheet_csv="cgr_sample_sheet.csv", + vcf_file=cfg.config.user_files.bcf, + params: + grp="{grp}", + notemp=config.get("notemp", False), + output: + temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, + time_hr=lambda wildcards, attempt: 4 * attempt, + script: + "../scripts/grouped_intensity_from_vcf.py" + + rule agg_median_idat_intensity: + input: + expand(rules.grouped_median_intensity_from_vcf.output[0], grp=cfg.cluster_groups), + params: + notemp=config.get("notemp", False), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_median_idat_intensity.py" + + else: + + rule per_sample_median_intensity_from_vcf: + """Calculate median intensity from raw intensities using VCF/BCF input.""" + input: + vcf_file=cfg.config.user_files.bcf, + params: + sample_id="{Sample_ID}", + output: + temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"), + resources: + mem_mb=lambda wildcards, attempt: attempt * 1024, + script: + "../scripts/median_intensity_from_vcf.py" + + rule agg_median_idat_intensity: + input: + cfg.expand(rules.per_sample_median_intensity_from_vcf.output[0]), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + script: + "../scripts/agg_median_idat_intensity.py" + +else: + if config.get("cluster_mode", False): + + localrules: + agg_median_idat_intensity, + + rule grouped_median_idat_intensity: + """Calculate median intensity of Red and Green channels.""" + input: + sample_sheet_csv="cgr_sample_sheet.csv", + _=rules.illuminaio_conda.output[0], + params: + grp="{grp}", + idat_red_pattern=lambda _: cfg.config.user_files.idat_pattern.red, + idat_green_pattern=lambda _: cfg.config.user_files.idat_pattern.green, + rscript=cfg.scripts("median_idat_intensity.R"), + conda_env=cfg.conda("illuminaio"), + notemp=config.get("notemp", False), + output: + temp("sample_level/contamination/{grp}/median_idat_intensity.csv"), + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 1024 * 12 * attempt, + time_hr=lambda wildcards, attempt: 4 * attempt, + script: + "../scripts/grouped_median_idat_intensity.py" + + rule agg_median_idat_intensity: + input: + expand(rules.grouped_median_idat_intensity.output[0], grp=cfg.cluster_groups), + params: + notemp=config.get("notemp", False), + output: + "sample_level/contamination/median_idat_intensity.csv", + resources: + mem_gb=lambda wildcards, attempt: attempt * 4, + time_hr=lambda wildcards, attempt: attempt**2, + 
script:
+                "../scripts/agg_median_idat_intensity.py"
+
+    else:
+
+        rule per_sample_median_idat_intensity:
+            """Calculate median intensity of Red and Green channels."""
+            input:
+                red=lambda wc: cfg.expand(
+                    cfg.config.user_files.idat_pattern.red,
+                    query=f"Sample_ID == '{wc.Sample_ID}'",
+                )[0],
+                green=lambda wc: cfg.expand(
+                    cfg.config.user_files.idat_pattern.green,
+                    query=f"Sample_ID == '{wc.Sample_ID}'",
+                )[0],
+                _=rules.illuminaio_conda.output[0],
+            params:
+                sample_id="{Sample_ID}",
+                rscript=cfg.scripts("median_idat_intensity.R"),
+                conda_env=cfg.conda("illuminaio"),
+            output:
+                temp("sample_level/contamination/per_sample_median_idat_intensity/{Sample_ID}.csv"),
+            resources:
+                mem_mb=lambda wildcards, attempt: attempt * 1024,
+            script:
+                "../scripts/median_idat_intensity.py"
+
+        rule agg_median_idat_intensity:
+            input:
+                cfg.expand(rules.per_sample_median_idat_intensity.output[0]),
+            output:
+                "sample_level/contamination/median_idat_intensity.csv",
+            resources:
+                mem_gb=lambda wildcards, attempt: attempt * 4,
+                time_hr=lambda wildcards, attempt: attempt**2,
+            script:
+                "../scripts/agg_median_idat_intensity.py"
diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
index f6ee2231..490a825b 100644
--- a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
+++ b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
@@ -11,6 +11,10 @@ use_contamination = (
     and cfg.config.workflow_params.remove_contam
 )

+idat_intensity_retrieved = Path("sample_level/contamination/median_idat_intensity.csv").is_file()
+
+contamination_checked = Path("sample_level/contamination/verifyIDintensity.csv").is_file()
+

 localrules:
     all_sample_qc,
@@ -196,7 +200,7 @@ use rule miss from plink as plink_call_rate_post2 with:
 # -------------------------------------------------------------------------------
 # Contamination
 # -------------------------------------------------------------------------------
-if use_contamination:
+if use_contamination or (idat_intensity_retrieved and contamination_checked):

     localrules:
         sample_contamination_verifyIDintensity,
@@ -485,7 +489,7 @@ else:

 def _contam(wildcards):
     uf, wf = cfg.config.user_files, cfg.config.workflow_params
-    if use_contamination:
+    if use_contamination or (idat_intensity_retrieved and contamination_checked):
         return rules.sample_contamination_verifyIDintensity.output[0]
     return []

@@ -494,6 +498,8 @@ def _intensity(wildcards):
     uf, wf = cfg.config.user_files, cfg.config.workflow_params
     if use_contamination:
         return "sample_level/contamination/median_idat_intensity.csv"
+    elif idat_intensity_retrieved:
+        return "sample_level/contamination/median_idat_intensity.csv"
     return []

diff --git a/tests/data b/tests/data
index 5def846f..0574c4b2 160000
--- a/tests/data
+++ b/tests/data
@@ -1 +1 @@
-Subproject commit 5def846ff2b0459b3bee4711a32481e14c5e2f58
+Subproject commit 0574c4b2a37b6d9ce8ea2764a4bff24bc30af4d5
diff --git a/tests/workflow/scripts/test_median_intensity_from_vcf.py b/tests/workflow/scripts/test_median_intensity_from_vcf.py
new file mode 100644
index 00000000..3c650985
--- /dev/null
+++ b/tests/workflow/scripts/test_median_intensity_from_vcf.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+"""Test calculation of median intensity from a BCF.
+
+We need to assess whether sample intensity meets the minimum threshold. For this
+purpose we typically check the median IDAT intensity; here we test the same
+calculation with a BCF as the input file.
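+
+The script under test writes a one-row CSV with the columns ``Sample_ID`` and
+``median_intensity``; the regression check below compares that numeric column
+against a cached expected file.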
+""" + +import pandas as pd +import pytest +from numpy import isclose + +from cgr_gwas_qc.workflow.scripts import median_intensity_from_vcf + + +@pytest.mark.regression +def test_median_intensity_from_vcf_fake_data(fake_data_cache, tmp_path): + obs_median_int = tmp_path / "test_median_int.csv" + median_intensity_from_vcf.main( + fake_data_cache / "cgr/bcf/small_genotype.bcf", "small_genotype", obs_median_int + ) + + expected_median_int = fake_data_cache / "cgr/bcf/small_genotype_median_int.csv" + + expected_median_int = pd.read_csv(expected_median_int) + obs_median_int = pd.read_csv(obs_median_int) + assert isclose(expected_median_int.median_intensity, obs_median_int.median_intensity) diff --git a/tests/workflow/scripts/test_vcf2abf.py b/tests/workflow/scripts/test_vcf2abf.py new file mode 100755 index 00000000..784ea549 --- /dev/null +++ b/tests/workflow/scripts/test_vcf2abf.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# coding: utf-8 + +"""Test the conversion of BCF to ABF. + +For contamination checking we need to have the population level allele +frequencies. We pull these allele frequencies from the 1KG project for each +SNP in the BCF. +""" + +import pandas as pd +import pytest + +from cgr_gwas_qc.workflow.scripts import vcf2abf + + +@pytest.mark.regression +def test_vcf2abf_fake_data(fake_data_cache, tmp_path): + obs_abf = tmp_path / "test.abf.txt" + vcf2abf.main( + fake_data_cache / "cgr/bcf/small_genotype.bcf", + fake_data_cache / "1KG/small_1KG.vcf.gz", + "AF", + obs_abf, + ) + + expected_abf = fake_data_cache / "cgr/bcf/small_genotype.abf.txt" + + obs_abf = pd.read_csv(obs_abf, sep="\t") + expected_abf = pd.read_csv(expected_abf, sep="\t") + + comparision = expected_abf.merge( + obs_abf, how="left", on="SNP_ID", suffixes=("_expected", "_obs") + ) + comparision = comparision.dropna() # excluding the na from the test + prop_very_different = (abs(comparision.ABF_expected - comparision.ABF_obs) > 1e-6).mean() + assert 0.01 > prop_very_different # less than 1% are more than 0.000001 different diff --git a/tests/workflow/scripts/test_vcf2adpc.py b/tests/workflow/scripts/test_vcf2adpc.py new file mode 100644 index 00000000..a7c0c9e1 --- /dev/null +++ b/tests/workflow/scripts/test_vcf2adpc.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# coding: utf-8 + +"""Test the extraction of sample scores from BCF and writing to adpc.bin. + +For contamination checking we need to have the sample scores written into adpc.bin to serve as input for verifyIDintensity. +""" + +import pytest +from numpy import isclose + +from cgr_gwas_qc.parsers.illumina import AdpcReader +from cgr_gwas_qc.workflow.scripts import vcf2adpc + + +@pytest.mark.regression +def test_vcf2adpc_fake_data(fake_data_cache, tmp_path): + obs_adpc = tmp_path / "test.adpc.bin" + vcf2adpc.main(fake_data_cache / "cgr/bcf/small_genotype.bcf", "small_genotype", obs_adpc) + + expected_adpc = fake_data_cache / "cgr/bcf/small_genotype.adpc.bin" + + with AdpcReader(expected_adpc) as expected, AdpcReader(obs_adpc) as obs: + for exp_row, obs_row in zip(expected, obs): + # Integers should be the exactly the same + assert exp_row.x_raw == obs_row.x_raw + assert exp_row.y_raw == obs_row.y_raw + assert exp_row.genotype == obs_row.genotype + + # Floats should be really close + assert isclose(exp_row.x_norm, obs_row.x_norm) + assert isclose(exp_row.y_norm, obs_row.y_norm) + assert isclose(exp_row.genotype_score, obs_row.genotype_score)