From f9a4f2edc2fac3667e866193d99fc0dffc11a648 Mon Sep 17 00:00:00 2001 From: Kevin Liao Date: Fri, 12 Apr 2024 15:49:43 -0400 Subject: [PATCH 1/3] issue237 --- src/cgr_gwas_qc/reporting/constants.py | 10 +- .../workflow/scripts/plot_ancestry.py | 39 +++++++- .../workflow/scripts/plot_ancestry_grafpop.py | 95 ------------------- .../scripts/plot_autosomal_heterozygosity.py | 12 ++- .../workflow/scripts/plot_call_rate.py | 9 +- .../workflow/scripts/plot_chrx_inbreeding.py | 28 ++++-- src/cgr_gwas_qc/workflow/scripts/plot_pca.py | 9 +- .../workflow/scripts/sample_qc_table.py | 24 +++-- .../workflow/sub_workflows/sample_qc.smk | 28 +++--- .../workflow/sub_workflows/subject_qc.smk | 6 -- 10 files changed, 116 insertions(+), 144 deletions(-) delete mode 100644 src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py diff --git a/src/cgr_gwas_qc/reporting/constants.py b/src/cgr_gwas_qc/reporting/constants.py index 88144661..a9715531 100644 --- a/src/cgr_gwas_qc/reporting/constants.py +++ b/src/cgr_gwas_qc/reporting/constants.py @@ -1,7 +1,15 @@ import pandas as pd CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"]) -CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"] # red # blue # gray # green +CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"] # red # blue # gray #gold + +# Assign labels to colors for plotting consistency +CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], +} SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"]) diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py index 4327333a..374459c9 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py @@ -31,7 +31,11 @@ def main(sample_qc: Path, outfile: Path): def load_sample_data(sample_qc: Path) -> pd.DataFrame: - return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"]) + return ( + sample_qc_table.read(sample_qc) + .query("is_subject_representative") + .dropna(subset=["EUR", "AFR", "ASN"]) + ) def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): @@ -42,17 +46,21 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): fig, tax = ternary.figure(scale=1) # Set scale 0 to 1 fig.set_size_inches(6, 5) - # Plot cases and controls separately + # Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible case = sample.query("case_control == 'Case'") if case.shape[0] > 0: case_color = CASE_CONTROL_COLORS[0] tax.scatter( - case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults + case[["EUR", "AFR", "ASN"]].values, + color=case_color, + label="Case", + zorder=4, + **style_defaults ) control = sample.query("case_control == 'Control'") if control.shape[0] > 0: - control_color = CASE_CONTROL_COLORS[1] + control_color = CASE_CONTROL_COLORS[1] # blue tax.scatter( control[["EUR", "AFR", "ASN"]].values, color=control_color, @@ -60,6 +68,29 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): **style_defaults ) + # Issue 237: Add samples if they are neither case or control. + project_qc = sample.query("case_control == 'QC'") + if project_qc.shape[0] > 0: + project_qc_color = CASE_CONTROL_COLORS[2] # Yellow + tax.scatter( + project_qc[["EUR", "AFR", "ASN"]].values, + color=project_qc_color, + label="QC", + **style_defaults + ) + + unknown = sample.query( + "case_control != 'Control' and case_control != 'Case' and case_control != 'QC'" + ) + if unknown.shape[0] > 0: + unknown_color = CASE_CONTROL_COLORS[3] # Gray + tax.scatter( + unknown[["EUR", "AFR", "ASN"]].values, + color=unknown_color, + label="Unknown", + **style_defaults + ) + # Add plot elements multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments tax.boundary(linewidth=0.5) diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py deleted file mode 100644 index 2a0d7242..00000000 --- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -plot_ancestry.py ----------------- - -This script plots the triangle plot of ancestries from GRAF ancestry -estimates. - -Output: - ``sample_level/ancestry.png`` - -""" -import os -from pathlib import Path -from typing import Optional - -import pandas as pd -import seaborn as sns -import ternary -import typer - -from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS -from cgr_gwas_qc.workflow.scripts import sample_qc_table - -app = typer.Typer(add_completion=False) - - -@app.command() -def main(sample_qc: Path, outfile: Path): - sample = load_sample_data(sample_qc) - plot(sample, outfile) - - -def load_sample_data(sample_qc: Path) -> pd.DataFrame: - return sample_qc_table.read(sample_qc).dropna(subset=["E(%)", "F(%)", "A(%)"]) - - -def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): - sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - - # Create plots - style_defaults = dict(linewidth=0, alpha=0.8, s=8) - fig, tax = ternary.figure(scale=1) # Set scale 0 to 1 - fig.set_size_inches(6, 5) - - # Plot cases and controls separately - case = sample.query("case_control == 'Case'") - if case.shape[0] > 0: - case_color = CASE_CONTROL_COLORS[0] - tax.scatter( - case[["E(%)", "F(%)", "A(%)"]].values, color=case_color, label="Case", **style_defaults - ) - - control = sample.query("case_control == 'Control'") - if control.shape[0] > 0: - control_color = CASE_CONTROL_COLORS[1] - tax.scatter( - control[["E(%)", "F(%)", "A(%)"]].values, - color=control_color, - label="Control", - **style_defaults - ) - - # Add plot elements - multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments - tax.boundary(linewidth=0.5) - tax.gridlines(multiple=multiple, color="gray") - tax.ticks(axis="lbr", linewidth=1, multiple=multiple, offset=0.02, tick_formats="%.1f") - - # Set Axis labels - label_defaults = dict(fontsize=12, offset=0.14) - tax.left_axis_label("Asian", **label_defaults) - tax.right_axis_label("African", **label_defaults) - tax.bottom_axis_label("European", **label_defaults) - - # Add legend - tax.legend(title="case_control") - - # Clean-up plot - tax.set_background_color(color="white") - tax.clear_matplotlib_ticks() - tax.get_axes().axis("off") # removes outer square axes - - # Save if given an outfile - if outfile: - tax.savefig(outfile) - - -if __name__ == "__main__": - if "snakemake" in locals(): - defaults = {} - defaults.update({"sample_qc": Path(snakemake.input[0])}) # type: ignore # noqa - defaults.update({"outfile": Path(snakemake.output[0])}) # type: ignore # noqa - main(**defaults) - else: - app() diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py index 72eacc11..caf3dda5 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py @@ -31,7 +31,6 @@ @app.command() def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path): - df = ( read_het(het) .join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left") @@ -50,13 +49,20 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: def plot(df: pd.DataFrame, population: str, threshold: float): sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + fig, ax = plt.subplots(figsize=(6, 6)) sns.scatterplot( x="x_label", y="F", data=df, hue="case_control", - palette=COLORS, + palette=CASE_CONTROL_LABEL_COLORS, ax=ax, alpha=0.8, linewidth=0, @@ -67,7 +73,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float): ax.set_xlabel("Subjects sorted by F") ax.set_ylabel("F") ax.set_ylim(_get_ylim(df.F, threshold)) - ax.set_title(f"{population} Homozygosity F Coefficient") + ax.set_title(f"{population} Heterozygosity F Coefficient") # Move legend plt.legend(loc="upper left") diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py index c248e8e0..3fe23a48 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py @@ -88,9 +88,16 @@ def plot_panel( ) # Set basic defaults so I don't have to repeat myself + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + style_defaults = dict(linewidth=0, alpha=0.8, s=5) sample_defaults = { - **dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample), + **dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample), **style_defaults, } snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults} diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py index 97c4b30a..ecfa444e 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py @@ -18,11 +18,12 @@ import seaborn as sns import typer -# import snakemake - from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS from cgr_gwas_qc.workflow.scripts import sample_qc_table +# import snakemake + + app = typer.Typer(add_completion=False) @@ -30,7 +31,7 @@ def main(sample_qc: Path, outfile: Path, xchr: str): sample = load_sample_data(sample_qc) xchr = str(snakemake.params) # type: ignore # noqa - plot(sample, outfile, xchr) + plot(sample, xchr, outfile) """ @@ -67,16 +68,23 @@ def _update_categories(sr: pd.DataFrame): return sr -def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True): +def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None): sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + # Create plots style_defaults = dict(linewidth=0, alpha=0.8, s=2) defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample) fig, ax = plt.subplots(figsize=(6, 6)) sns.boxplot(ax=ax, showfliers=False, **defaults) sns.stripplot( - ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults + ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults ) # Make boxplot black and white @@ -87,13 +95,13 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool # ax.set_xlabel("Reported Sex") ax.set_ylabel("ChrX Inbreeding Coeff") - xchr = xchr.strip().lower() == "true" - print(type(xchr), " ", xchr) - if xchr: - print("sex chr included", xchr) + xchr_bool = xchr.strip().lower() == "true" + print(type(xchr_bool), " ", xchr_bool) + if xchr_bool: + print("sex chr included", xchr_bool) ax.set_xlabel("Reported Sex") else: - print("No sex chromosome ", xchr) + print("No sex chromosome ", xchr_bool) ax.set_xlabel("No sex chromosome \nSkipping sex condordace") # Add line at 0.5 diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py index 868d79b9..60b5f2ba 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py @@ -50,7 +50,14 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path): def plot(df: pd.DataFrame, population: str) -> sns.PairGrid: sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True) + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + + g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True) g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0) g.map_diag(sns.kdeplot) g.add_legend( diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py index ee39a2e5..c121fb3a 100755 --- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py +++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py @@ -192,7 +192,9 @@ def main( ) add_qc_columns( - sample_qc, remove_contam, remove_rep_discordant, + sample_qc, + remove_contam, + remove_rep_discordant, ) save(sample_qc, outfile) @@ -320,12 +322,9 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame: .. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/ """ - return ( pd.read_csv(file_name, sep="\t") - .assign( - Sample_ID=lambda x: x["Subject"].astype(str) - ) # Issue 216: When subject IDs are numeric reindex fails. This makes sure index Sample_ID will always be as a character + .rename({"Subject": "Sample_ID"}, axis=1) .assign(Ancestry=lambda x: x["Computed population"].str.replace(" ", "_")) .assign(AFR=lambda x: x["P_f (%)"] / 100) .assign(EUR=lambda x: x["P_e (%)"] / 100) @@ -402,7 +401,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram if file_name is None: return pd.DataFrame( - index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"], + index=Sample_IDs, + columns=["Contamination_Rate", "is_contaminated"], ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"}) return ( @@ -445,12 +445,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie def add_qc_columns( - sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, + sample_qc: pd.DataFrame, + remove_contam: bool, + remove_rep_discordant: bool, ) -> pd.DataFrame: add_call_rate_flags(sample_qc) _add_identifiler(sample_qc) _add_analytic_exclusion( - sample_qc, remove_contam, remove_rep_discordant, + sample_qc, + remove_contam, + remove_rep_discordant, ) _add_subject_representative(sample_qc) _add_subject_dropped_from_study(sample_qc) @@ -496,7 +500,9 @@ def reason_string(row: pd.Series) -> str: def _add_analytic_exclusion( - sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, + sample_qc: pd.DataFrame, + remove_contam: bool, + remove_rep_discordant: bool, ) -> pd.DataFrame: """Adds a flag to remove samples based on provided conditions. diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk index 3c0cd1ff..c5bd18ae 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk @@ -2,10 +2,6 @@ from cgr_gwas_qc import load_config cfg = load_config() -PLINK_BIG_MEM = {1: 1024 * 4, 2: 1024 * 64, 3: 1024 * 250} -BIG_TIME = {1: 8, 2: 24, 3: 48} - - use_contamination = ( cfg.config.user_files.idat_pattern and cfg.config.user_files.gtc_pattern @@ -15,6 +11,8 @@ use_contamination = ( localrules: all_sample_qc, + sample_concordance_plink, + sample_concordance_summary, split_sample_concordance, snp_qc_table, sample_level_sexcheck, @@ -65,14 +63,17 @@ module thousand_genomes: config: {} + module plink: snakefile: cfg.modules("plink") + module graf: snakefile: cfg.modules("graf_v2.4") + module grafpop: snakefile: cfg.modules("grafpop") @@ -316,10 +317,6 @@ rule sample_concordance_plink: pi_hat_threshold=cfg.config.software_params.pi_hat_threshold, output: "sample_level/concordance/plink.csv", - resources: - mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt], - time_hr=lambda wildcards, attempt: BIG_TIME[attempt], - script: "../scripts/concordance_table.py" @@ -386,9 +383,8 @@ rule sample_concordance_summary: output: "sample_level/concordance/summary.csv", resources: - mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt], - time_hr=lambda wildcards, attempt: BIG_TIME[attempt], - + mem_mb=lambda wildcards, attempt: 1024 * 2 * attempt, + time_hr=lambda wildcards, attempt: 2 * attempt, script: "../scripts/sample_concordance.py" @@ -413,9 +409,9 @@ use rule grafpop_populations from grafpop as graf_populations with: bed=rules.snp_call_rate_filter_2.output.bed, bim=rules.snp_call_rate_filter_2.output.bim, fam=rules.snp_call_rate_filter_2.output.fam, - #bed="sample_level/call_rate_2/samples.bed", - #bim="sample_level/call_rate_2/samples.bim", - #fam="sample_level/call_rate_2/samples.fam", + # bed="sample_level/call_rate_2/samples.bed", + # bim="sample_level/call_rate_2/samples.bim", + # fam="sample_level/call_rate_2/samples.fam", output: "sample_level/ancestry/grafpop_populations.txt", log: @@ -454,6 +450,7 @@ rule snp_qc_table: sex_chr_included = cfg.config.workflow_params.sex_chr_included if sex_chr_included: print("sex_chr_included ", sex_chr_included) + use rule sexcheck from plink as sample_level_sexcheck with: input: bed=rules.snp_call_rate_filter_1.output.bed, @@ -463,8 +460,10 @@ if sex_chr_included: out_prefix="sample_level/call_rate_1/samples", output: "sample_level/call_rate_1/samples.sexcheck", + else: print("sex_chr_included ", sex_chr_included) + rule sample_level_sexcheck: input: bed=rules.snp_call_rate_filter_1.output.bed, @@ -478,6 +477,7 @@ else: echo "FID IID PEDSEX SNPSEX STATUS F" >> sample_level/call_rate_1/samples.sexcheck """ + def _contam(wildcards): uf, wf = cfg.config.user_files, cfg.config.workflow_params if use_contamination: diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk index 397ce871..0ddf5cd0 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk @@ -8,9 +8,6 @@ import shutil cfg = load_config() -PLINK_BIG_MEM = {1: 1024 * 4, 2: 1024 * 64, 3: 1024 * 250} -BIG_TIME = {1: 8, 2: 24, 3: 48} - localrules: all_subject_qc, @@ -277,9 +274,6 @@ use rule genome from plink as population_level_ibd with: out_prefix="subject_level/{population}/subjects_maf{maf}_ld{ld}_ibd", output: "subject_level/{population}/subjects_maf{maf}_ld{ld}_ibd.genome", - resources: - mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt], - time_hr=lambda wildcards, attempt: BIG_TIME[attempt], rule population_level_concordance_plink: From 39ce76928242feed6bef9fe9fb9ec3cdf52645de Mon Sep 17 00:00:00 2001 From: Kevin Liao Date: Fri, 12 Apr 2024 15:59:43 -0400 Subject: [PATCH 2/3] Revert "issue237" This reverts commit f9a4f2edc2fac3667e866193d99fc0dffc11a648. --- src/cgr_gwas_qc/reporting/constants.py | 10 +- .../workflow/scripts/plot_ancestry.py | 39 +------- .../workflow/scripts/plot_ancestry_grafpop.py | 95 +++++++++++++++++++ .../scripts/plot_autosomal_heterozygosity.py | 12 +-- .../workflow/scripts/plot_call_rate.py | 9 +- .../workflow/scripts/plot_chrx_inbreeding.py | 28 ++---- src/cgr_gwas_qc/workflow/scripts/plot_pca.py | 9 +- .../workflow/scripts/sample_qc_table.py | 24 ++--- .../workflow/sub_workflows/sample_qc.smk | 28 +++--- .../workflow/sub_workflows/subject_qc.smk | 6 ++ 10 files changed, 144 insertions(+), 116 deletions(-) create mode 100644 src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py diff --git a/src/cgr_gwas_qc/reporting/constants.py b/src/cgr_gwas_qc/reporting/constants.py index a9715531..88144661 100644 --- a/src/cgr_gwas_qc/reporting/constants.py +++ b/src/cgr_gwas_qc/reporting/constants.py @@ -1,15 +1,7 @@ import pandas as pd CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"]) -CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"] # red # blue # gray #gold - -# Assign labels to colors for plotting consistency -CASE_CONTROL_LABEL_COLORS = { - "Case": CASE_CONTROL_COLORS[0], - "Control": CASE_CONTROL_COLORS[1], - "QC": CASE_CONTROL_COLORS[2], - "Unknown": CASE_CONTROL_COLORS[3], -} +CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"] # red # blue # gray # green SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"]) diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py index 374459c9..4327333a 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py @@ -31,11 +31,7 @@ def main(sample_qc: Path, outfile: Path): def load_sample_data(sample_qc: Path) -> pd.DataFrame: - return ( - sample_qc_table.read(sample_qc) - .query("is_subject_representative") - .dropna(subset=["EUR", "AFR", "ASN"]) - ) + return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"]) def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): @@ -46,21 +42,17 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): fig, tax = ternary.figure(scale=1) # Set scale 0 to 1 fig.set_size_inches(6, 5) - # Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible + # Plot cases and controls separately case = sample.query("case_control == 'Case'") if case.shape[0] > 0: case_color = CASE_CONTROL_COLORS[0] tax.scatter( - case[["EUR", "AFR", "ASN"]].values, - color=case_color, - label="Case", - zorder=4, - **style_defaults + case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults ) control = sample.query("case_control == 'Control'") if control.shape[0] > 0: - control_color = CASE_CONTROL_COLORS[1] # blue + control_color = CASE_CONTROL_COLORS[1] tax.scatter( control[["EUR", "AFR", "ASN"]].values, color=control_color, @@ -68,29 +60,6 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): **style_defaults ) - # Issue 237: Add samples if they are neither case or control. - project_qc = sample.query("case_control == 'QC'") - if project_qc.shape[0] > 0: - project_qc_color = CASE_CONTROL_COLORS[2] # Yellow - tax.scatter( - project_qc[["EUR", "AFR", "ASN"]].values, - color=project_qc_color, - label="QC", - **style_defaults - ) - - unknown = sample.query( - "case_control != 'Control' and case_control != 'Case' and case_control != 'QC'" - ) - if unknown.shape[0] > 0: - unknown_color = CASE_CONTROL_COLORS[3] # Gray - tax.scatter( - unknown[["EUR", "AFR", "ASN"]].values, - color=unknown_color, - label="Unknown", - **style_defaults - ) - # Add plot elements multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments tax.boundary(linewidth=0.5) diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py new file mode 100644 index 00000000..2a0d7242 --- /dev/null +++ b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py @@ -0,0 +1,95 @@ +""" +plot_ancestry.py +---------------- + +This script plots the triangle plot of ancestries from GRAF ancestry +estimates. + +Output: + ``sample_level/ancestry.png`` + +""" +import os +from pathlib import Path +from typing import Optional + +import pandas as pd +import seaborn as sns +import ternary +import typer + +from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS +from cgr_gwas_qc.workflow.scripts import sample_qc_table + +app = typer.Typer(add_completion=False) + + +@app.command() +def main(sample_qc: Path, outfile: Path): + sample = load_sample_data(sample_qc) + plot(sample, outfile) + + +def load_sample_data(sample_qc: Path) -> pd.DataFrame: + return sample_qc_table.read(sample_qc).dropna(subset=["E(%)", "F(%)", "A(%)"]) + + +def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): + sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper + + # Create plots + style_defaults = dict(linewidth=0, alpha=0.8, s=8) + fig, tax = ternary.figure(scale=1) # Set scale 0 to 1 + fig.set_size_inches(6, 5) + + # Plot cases and controls separately + case = sample.query("case_control == 'Case'") + if case.shape[0] > 0: + case_color = CASE_CONTROL_COLORS[0] + tax.scatter( + case[["E(%)", "F(%)", "A(%)"]].values, color=case_color, label="Case", **style_defaults + ) + + control = sample.query("case_control == 'Control'") + if control.shape[0] > 0: + control_color = CASE_CONTROL_COLORS[1] + tax.scatter( + control[["E(%)", "F(%)", "A(%)"]].values, + color=control_color, + label="Control", + **style_defaults + ) + + # Add plot elements + multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments + tax.boundary(linewidth=0.5) + tax.gridlines(multiple=multiple, color="gray") + tax.ticks(axis="lbr", linewidth=1, multiple=multiple, offset=0.02, tick_formats="%.1f") + + # Set Axis labels + label_defaults = dict(fontsize=12, offset=0.14) + tax.left_axis_label("Asian", **label_defaults) + tax.right_axis_label("African", **label_defaults) + tax.bottom_axis_label("European", **label_defaults) + + # Add legend + tax.legend(title="case_control") + + # Clean-up plot + tax.set_background_color(color="white") + tax.clear_matplotlib_ticks() + tax.get_axes().axis("off") # removes outer square axes + + # Save if given an outfile + if outfile: + tax.savefig(outfile) + + +if __name__ == "__main__": + if "snakemake" in locals(): + defaults = {} + defaults.update({"sample_qc": Path(snakemake.input[0])}) # type: ignore # noqa + defaults.update({"outfile": Path(snakemake.output[0])}) # type: ignore # noqa + main(**defaults) + else: + app() diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py index caf3dda5..72eacc11 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py @@ -31,6 +31,7 @@ @app.command() def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path): + df = ( read_het(het) .join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left") @@ -49,20 +50,13 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: def plot(df: pd.DataFrame, population: str, threshold: float): sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - CASE_CONTROL_LABEL_COLORS = { - "Case": CASE_CONTROL_COLORS[0], - "Control": CASE_CONTROL_COLORS[1], - "QC": CASE_CONTROL_COLORS[2], - "Unknown": CASE_CONTROL_COLORS[3], - } - fig, ax = plt.subplots(figsize=(6, 6)) sns.scatterplot( x="x_label", y="F", data=df, hue="case_control", - palette=CASE_CONTROL_LABEL_COLORS, + palette=COLORS, ax=ax, alpha=0.8, linewidth=0, @@ -73,7 +67,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float): ax.set_xlabel("Subjects sorted by F") ax.set_ylabel("F") ax.set_ylim(_get_ylim(df.F, threshold)) - ax.set_title(f"{population} Heterozygosity F Coefficient") + ax.set_title(f"{population} Homozygosity F Coefficient") # Move legend plt.legend(loc="upper left") diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py index 3fe23a48..c248e8e0 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py @@ -88,16 +88,9 @@ def plot_panel( ) # Set basic defaults so I don't have to repeat myself - CASE_CONTROL_LABEL_COLORS = { - "Case": CASE_CONTROL_COLORS[0], - "Control": CASE_CONTROL_COLORS[1], - "QC": CASE_CONTROL_COLORS[2], - "Unknown": CASE_CONTROL_COLORS[3], - } - style_defaults = dict(linewidth=0, alpha=0.8, s=5) sample_defaults = { - **dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample), + **dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample), **style_defaults, } snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults} diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py index ecfa444e..97c4b30a 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py @@ -18,11 +18,10 @@ import seaborn as sns import typer -from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS -from cgr_gwas_qc.workflow.scripts import sample_qc_table - # import snakemake +from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS +from cgr_gwas_qc.workflow.scripts import sample_qc_table app = typer.Typer(add_completion=False) @@ -31,7 +30,7 @@ def main(sample_qc: Path, outfile: Path, xchr: str): sample = load_sample_data(sample_qc) xchr = str(snakemake.params) # type: ignore # noqa - plot(sample, xchr, outfile) + plot(sample, outfile, xchr) """ @@ -68,23 +67,16 @@ def _update_categories(sr: pd.DataFrame): return sr -def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None): +def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True): sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - CASE_CONTROL_LABEL_COLORS = { - "Case": CASE_CONTROL_COLORS[0], - "Control": CASE_CONTROL_COLORS[1], - "QC": CASE_CONTROL_COLORS[2], - "Unknown": CASE_CONTROL_COLORS[3], - } - # Create plots style_defaults = dict(linewidth=0, alpha=0.8, s=2) defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample) fig, ax = plt.subplots(figsize=(6, 6)) sns.boxplot(ax=ax, showfliers=False, **defaults) sns.stripplot( - ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults + ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults ) # Make boxplot black and white @@ -95,13 +87,13 @@ def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None) # ax.set_xlabel("Reported Sex") ax.set_ylabel("ChrX Inbreeding Coeff") - xchr_bool = xchr.strip().lower() == "true" - print(type(xchr_bool), " ", xchr_bool) - if xchr_bool: - print("sex chr included", xchr_bool) + xchr = xchr.strip().lower() == "true" + print(type(xchr), " ", xchr) + if xchr: + print("sex chr included", xchr) ax.set_xlabel("Reported Sex") else: - print("No sex chromosome ", xchr_bool) + print("No sex chromosome ", xchr) ax.set_xlabel("No sex chromosome \nSkipping sex condordace") # Add line at 0.5 diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py index 60b5f2ba..868d79b9 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py @@ -50,14 +50,7 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path): def plot(df: pd.DataFrame, population: str) -> sns.PairGrid: sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - CASE_CONTROL_LABEL_COLORS = { - "Case": CASE_CONTROL_COLORS[0], - "Control": CASE_CONTROL_COLORS[1], - "QC": CASE_CONTROL_COLORS[2], - "Unknown": CASE_CONTROL_COLORS[3], - } - - g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True) + g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True) g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0) g.map_diag(sns.kdeplot) g.add_legend( diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py index c121fb3a..ee39a2e5 100755 --- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py +++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py @@ -192,9 +192,7 @@ def main( ) add_qc_columns( - sample_qc, - remove_contam, - remove_rep_discordant, + sample_qc, remove_contam, remove_rep_discordant, ) save(sample_qc, outfile) @@ -322,9 +320,12 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame: .. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/ """ + return ( pd.read_csv(file_name, sep="\t") - .rename({"Subject": "Sample_ID"}, axis=1) + .assign( + Sample_ID=lambda x: x["Subject"].astype(str) + ) # Issue 216: When subject IDs are numeric reindex fails. This makes sure index Sample_ID will always be as a character .assign(Ancestry=lambda x: x["Computed population"].str.replace(" ", "_")) .assign(AFR=lambda x: x["P_f (%)"] / 100) .assign(EUR=lambda x: x["P_e (%)"] / 100) @@ -401,8 +402,7 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram if file_name is None: return pd.DataFrame( - index=Sample_IDs, - columns=["Contamination_Rate", "is_contaminated"], + index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"], ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"}) return ( @@ -445,16 +445,12 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie def add_qc_columns( - sample_qc: pd.DataFrame, - remove_contam: bool, - remove_rep_discordant: bool, + sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, ) -> pd.DataFrame: add_call_rate_flags(sample_qc) _add_identifiler(sample_qc) _add_analytic_exclusion( - sample_qc, - remove_contam, - remove_rep_discordant, + sample_qc, remove_contam, remove_rep_discordant, ) _add_subject_representative(sample_qc) _add_subject_dropped_from_study(sample_qc) @@ -500,9 +496,7 @@ def reason_string(row: pd.Series) -> str: def _add_analytic_exclusion( - sample_qc: pd.DataFrame, - remove_contam: bool, - remove_rep_discordant: bool, + sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, ) -> pd.DataFrame: """Adds a flag to remove samples based on provided conditions. diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk index c5bd18ae..3c0cd1ff 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk @@ -2,6 +2,10 @@ from cgr_gwas_qc import load_config cfg = load_config() +PLINK_BIG_MEM = {1: 1024 * 4, 2: 1024 * 64, 3: 1024 * 250} +BIG_TIME = {1: 8, 2: 24, 3: 48} + + use_contamination = ( cfg.config.user_files.idat_pattern and cfg.config.user_files.gtc_pattern @@ -11,8 +15,6 @@ use_contamination = ( localrules: all_sample_qc, - sample_concordance_plink, - sample_concordance_summary, split_sample_concordance, snp_qc_table, sample_level_sexcheck, @@ -63,17 +65,14 @@ module thousand_genomes: config: {} - module plink: snakefile: cfg.modules("plink") - module graf: snakefile: cfg.modules("graf_v2.4") - module grafpop: snakefile: cfg.modules("grafpop") @@ -317,6 +316,10 @@ rule sample_concordance_plink: pi_hat_threshold=cfg.config.software_params.pi_hat_threshold, output: "sample_level/concordance/plink.csv", + resources: + mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt], + time_hr=lambda wildcards, attempt: BIG_TIME[attempt], + script: "../scripts/concordance_table.py" @@ -383,8 +386,9 @@ rule sample_concordance_summary: output: "sample_level/concordance/summary.csv", resources: - mem_mb=lambda wildcards, attempt: 1024 * 2 * attempt, - time_hr=lambda wildcards, attempt: 2 * attempt, + mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt], + time_hr=lambda wildcards, attempt: BIG_TIME[attempt], + script: "../scripts/sample_concordance.py" @@ -409,9 +413,9 @@ use rule grafpop_populations from grafpop as graf_populations with: bed=rules.snp_call_rate_filter_2.output.bed, bim=rules.snp_call_rate_filter_2.output.bim, fam=rules.snp_call_rate_filter_2.output.fam, - # bed="sample_level/call_rate_2/samples.bed", - # bim="sample_level/call_rate_2/samples.bim", - # fam="sample_level/call_rate_2/samples.fam", + #bed="sample_level/call_rate_2/samples.bed", + #bim="sample_level/call_rate_2/samples.bim", + #fam="sample_level/call_rate_2/samples.fam", output: "sample_level/ancestry/grafpop_populations.txt", log: @@ -450,7 +454,6 @@ rule snp_qc_table: sex_chr_included = cfg.config.workflow_params.sex_chr_included if sex_chr_included: print("sex_chr_included ", sex_chr_included) - use rule sexcheck from plink as sample_level_sexcheck with: input: bed=rules.snp_call_rate_filter_1.output.bed, @@ -460,10 +463,8 @@ if sex_chr_included: out_prefix="sample_level/call_rate_1/samples", output: "sample_level/call_rate_1/samples.sexcheck", - else: print("sex_chr_included ", sex_chr_included) - rule sample_level_sexcheck: input: bed=rules.snp_call_rate_filter_1.output.bed, @@ -477,7 +478,6 @@ else: echo "FID IID PEDSEX SNPSEX STATUS F" >> sample_level/call_rate_1/samples.sexcheck """ - def _contam(wildcards): uf, wf = cfg.config.user_files, cfg.config.workflow_params if use_contamination: diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk index 0ddf5cd0..397ce871 100644 --- a/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk +++ b/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk @@ -8,6 +8,9 @@ import shutil cfg = load_config() +PLINK_BIG_MEM = {1: 1024 * 4, 2: 1024 * 64, 3: 1024 * 250} +BIG_TIME = {1: 8, 2: 24, 3: 48} + localrules: all_subject_qc, @@ -274,6 +277,9 @@ use rule genome from plink as population_level_ibd with: out_prefix="subject_level/{population}/subjects_maf{maf}_ld{ld}_ibd", output: "subject_level/{population}/subjects_maf{maf}_ld{ld}_ibd.genome", + resources: + mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt], + time_hr=lambda wildcards, attempt: BIG_TIME[attempt], rule population_level_concordance_plink: From 139faab43d575af9075f32f9c7e04f60aff46ebe Mon Sep 17 00:00:00 2001 From: Kevin Liao Date: Fri, 12 Apr 2024 16:13:48 -0400 Subject: [PATCH 3/3] issue237_precommitted --- src/cgr_gwas_qc/reporting/constants.py | 10 +- .../workflow/scripts/plot_ancestry.py | 39 +++++++- .../workflow/scripts/plot_ancestry_grafpop.py | 95 ------------------- .../scripts/plot_autosomal_heterozygosity.py | 12 ++- .../workflow/scripts/plot_call_rate.py | 9 +- .../workflow/scripts/plot_chrx_inbreeding.py | 28 ++++-- src/cgr_gwas_qc/workflow/scripts/plot_pca.py | 9 +- .../workflow/scripts/sample_qc_table.py | 20 ++-- 8 files changed, 101 insertions(+), 121 deletions(-) delete mode 100644 src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py diff --git a/src/cgr_gwas_qc/reporting/constants.py b/src/cgr_gwas_qc/reporting/constants.py index 88144661..a9715531 100644 --- a/src/cgr_gwas_qc/reporting/constants.py +++ b/src/cgr_gwas_qc/reporting/constants.py @@ -1,7 +1,15 @@ import pandas as pd CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"]) -CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"] # red # blue # gray # green +CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"] # red # blue # gray #gold + +# Assign labels to colors for plotting consistency +CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], +} SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"]) diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py index 4327333a..374459c9 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py @@ -31,7 +31,11 @@ def main(sample_qc: Path, outfile: Path): def load_sample_data(sample_qc: Path) -> pd.DataFrame: - return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"]) + return ( + sample_qc_table.read(sample_qc) + .query("is_subject_representative") + .dropna(subset=["EUR", "AFR", "ASN"]) + ) def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): @@ -42,17 +46,21 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): fig, tax = ternary.figure(scale=1) # Set scale 0 to 1 fig.set_size_inches(6, 5) - # Plot cases and controls separately + # Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible case = sample.query("case_control == 'Case'") if case.shape[0] > 0: case_color = CASE_CONTROL_COLORS[0] tax.scatter( - case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults + case[["EUR", "AFR", "ASN"]].values, + color=case_color, + label="Case", + zorder=4, + **style_defaults ) control = sample.query("case_control == 'Control'") if control.shape[0] > 0: - control_color = CASE_CONTROL_COLORS[1] + control_color = CASE_CONTROL_COLORS[1] # blue tax.scatter( control[["EUR", "AFR", "ASN"]].values, color=control_color, @@ -60,6 +68,29 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): **style_defaults ) + # Issue 237: Add samples if they are neither case or control. + project_qc = sample.query("case_control == 'QC'") + if project_qc.shape[0] > 0: + project_qc_color = CASE_CONTROL_COLORS[2] # Yellow + tax.scatter( + project_qc[["EUR", "AFR", "ASN"]].values, + color=project_qc_color, + label="QC", + **style_defaults + ) + + unknown = sample.query( + "case_control != 'Control' and case_control != 'Case' and case_control != 'QC'" + ) + if unknown.shape[0] > 0: + unknown_color = CASE_CONTROL_COLORS[3] # Gray + tax.scatter( + unknown[["EUR", "AFR", "ASN"]].values, + color=unknown_color, + label="Unknown", + **style_defaults + ) + # Add plot elements multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments tax.boundary(linewidth=0.5) diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py deleted file mode 100644 index 2a0d7242..00000000 --- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -plot_ancestry.py ----------------- - -This script plots the triangle plot of ancestries from GRAF ancestry -estimates. - -Output: - ``sample_level/ancestry.png`` - -""" -import os -from pathlib import Path -from typing import Optional - -import pandas as pd -import seaborn as sns -import ternary -import typer - -from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS -from cgr_gwas_qc.workflow.scripts import sample_qc_table - -app = typer.Typer(add_completion=False) - - -@app.command() -def main(sample_qc: Path, outfile: Path): - sample = load_sample_data(sample_qc) - plot(sample, outfile) - - -def load_sample_data(sample_qc: Path) -> pd.DataFrame: - return sample_qc_table.read(sample_qc).dropna(subset=["E(%)", "F(%)", "A(%)"]) - - -def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): - sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - - # Create plots - style_defaults = dict(linewidth=0, alpha=0.8, s=8) - fig, tax = ternary.figure(scale=1) # Set scale 0 to 1 - fig.set_size_inches(6, 5) - - # Plot cases and controls separately - case = sample.query("case_control == 'Case'") - if case.shape[0] > 0: - case_color = CASE_CONTROL_COLORS[0] - tax.scatter( - case[["E(%)", "F(%)", "A(%)"]].values, color=case_color, label="Case", **style_defaults - ) - - control = sample.query("case_control == 'Control'") - if control.shape[0] > 0: - control_color = CASE_CONTROL_COLORS[1] - tax.scatter( - control[["E(%)", "F(%)", "A(%)"]].values, - color=control_color, - label="Control", - **style_defaults - ) - - # Add plot elements - multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments - tax.boundary(linewidth=0.5) - tax.gridlines(multiple=multiple, color="gray") - tax.ticks(axis="lbr", linewidth=1, multiple=multiple, offset=0.02, tick_formats="%.1f") - - # Set Axis labels - label_defaults = dict(fontsize=12, offset=0.14) - tax.left_axis_label("Asian", **label_defaults) - tax.right_axis_label("African", **label_defaults) - tax.bottom_axis_label("European", **label_defaults) - - # Add legend - tax.legend(title="case_control") - - # Clean-up plot - tax.set_background_color(color="white") - tax.clear_matplotlib_ticks() - tax.get_axes().axis("off") # removes outer square axes - - # Save if given an outfile - if outfile: - tax.savefig(outfile) - - -if __name__ == "__main__": - if "snakemake" in locals(): - defaults = {} - defaults.update({"sample_qc": Path(snakemake.input[0])}) # type: ignore # noqa - defaults.update({"outfile": Path(snakemake.output[0])}) # type: ignore # noqa - main(**defaults) - else: - app() diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py index 72eacc11..caf3dda5 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py @@ -31,7 +31,6 @@ @app.command() def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path): - df = ( read_het(het) .join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left") @@ -50,13 +49,20 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: def plot(df: pd.DataFrame, population: str, threshold: float): sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + fig, ax = plt.subplots(figsize=(6, 6)) sns.scatterplot( x="x_label", y="F", data=df, hue="case_control", - palette=COLORS, + palette=CASE_CONTROL_LABEL_COLORS, ax=ax, alpha=0.8, linewidth=0, @@ -67,7 +73,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float): ax.set_xlabel("Subjects sorted by F") ax.set_ylabel("F") ax.set_ylim(_get_ylim(df.F, threshold)) - ax.set_title(f"{population} Homozygosity F Coefficient") + ax.set_title(f"{population} Heterozygosity F Coefficient") # Move legend plt.legend(loc="upper left") diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py index c248e8e0..3fe23a48 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py @@ -88,9 +88,16 @@ def plot_panel( ) # Set basic defaults so I don't have to repeat myself + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + style_defaults = dict(linewidth=0, alpha=0.8, s=5) sample_defaults = { - **dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample), + **dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample), **style_defaults, } snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults} diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py index 97c4b30a..ecfa444e 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py @@ -18,11 +18,12 @@ import seaborn as sns import typer -# import snakemake - from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS from cgr_gwas_qc.workflow.scripts import sample_qc_table +# import snakemake + + app = typer.Typer(add_completion=False) @@ -30,7 +31,7 @@ def main(sample_qc: Path, outfile: Path, xchr: str): sample = load_sample_data(sample_qc) xchr = str(snakemake.params) # type: ignore # noqa - plot(sample, outfile, xchr) + plot(sample, xchr, outfile) """ @@ -67,16 +68,23 @@ def _update_categories(sr: pd.DataFrame): return sr -def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True): +def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None): sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + # Create plots style_defaults = dict(linewidth=0, alpha=0.8, s=2) defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample) fig, ax = plt.subplots(figsize=(6, 6)) sns.boxplot(ax=ax, showfliers=False, **defaults) sns.stripplot( - ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults + ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults ) # Make boxplot black and white @@ -87,13 +95,13 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool # ax.set_xlabel("Reported Sex") ax.set_ylabel("ChrX Inbreeding Coeff") - xchr = xchr.strip().lower() == "true" - print(type(xchr), " ", xchr) - if xchr: - print("sex chr included", xchr) + xchr_bool = xchr.strip().lower() == "true" + print(type(xchr_bool), " ", xchr_bool) + if xchr_bool: + print("sex chr included", xchr_bool) ax.set_xlabel("Reported Sex") else: - print("No sex chromosome ", xchr) + print("No sex chromosome ", xchr_bool) ax.set_xlabel("No sex chromosome \nSkipping sex condordace") # Add line at 0.5 diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py index 868d79b9..60b5f2ba 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py @@ -50,7 +50,14 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path): def plot(df: pd.DataFrame, population: str) -> sns.PairGrid: sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True) + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + + g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True) g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0) g.map_diag(sns.kdeplot) g.add_legend( diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py index ee39a2e5..5e6a584c 100755 --- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py +++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py @@ -192,7 +192,9 @@ def main( ) add_qc_columns( - sample_qc, remove_contam, remove_rep_discordant, + sample_qc, + remove_contam, + remove_rep_discordant, ) save(sample_qc, outfile) @@ -320,7 +322,6 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame: .. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/ """ - return ( pd.read_csv(file_name, sep="\t") .assign( @@ -402,7 +403,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram if file_name is None: return pd.DataFrame( - index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"], + index=Sample_IDs, + columns=["Contamination_Rate", "is_contaminated"], ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"}) return ( @@ -445,12 +447,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie def add_qc_columns( - sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, + sample_qc: pd.DataFrame, + remove_contam: bool, + remove_rep_discordant: bool, ) -> pd.DataFrame: add_call_rate_flags(sample_qc) _add_identifiler(sample_qc) _add_analytic_exclusion( - sample_qc, remove_contam, remove_rep_discordant, + sample_qc, + remove_contam, + remove_rep_discordant, ) _add_subject_representative(sample_qc) _add_subject_dropped_from_study(sample_qc) @@ -496,7 +502,9 @@ def reason_string(row: pd.Series) -> str: def _add_analytic_exclusion( - sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, + sample_qc: pd.DataFrame, + remove_contam: bool, + remove_rep_discordant: bool, ) -> pd.DataFrame: """Adds a flag to remove samples based on provided conditions.