diff --git a/src/cgr_gwas_qc/reporting/constants.py b/src/cgr_gwas_qc/reporting/constants.py index 88144661..a9715531 100644 --- a/src/cgr_gwas_qc/reporting/constants.py +++ b/src/cgr_gwas_qc/reporting/constants.py @@ -1,7 +1,15 @@ import pandas as pd CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"]) -CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"] # red # blue # gray # green +CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"] # red # blue # gray #gold + +# Assign labels to colors for plotting consistency +CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], +} SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"]) diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py index 4327333a..374459c9 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py @@ -31,7 +31,11 @@ def main(sample_qc: Path, outfile: Path): def load_sample_data(sample_qc: Path) -> pd.DataFrame: - return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"]) + return ( + sample_qc_table.read(sample_qc) + .query("is_subject_representative") + .dropna(subset=["EUR", "AFR", "ASN"]) + ) def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): @@ -42,17 +46,21 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): fig, tax = ternary.figure(scale=1) # Set scale 0 to 1 fig.set_size_inches(6, 5) - # Plot cases and controls separately + # Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible case = sample.query("case_control == 'Case'") if case.shape[0] > 0: case_color = CASE_CONTROL_COLORS[0] tax.scatter( - case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults + case[["EUR", "AFR", "ASN"]].values, + color=case_color, + label="Case", + zorder=4, + **style_defaults ) control = sample.query("case_control == 'Control'") if control.shape[0] > 0: - control_color = CASE_CONTROL_COLORS[1] + control_color = CASE_CONTROL_COLORS[1] # blue tax.scatter( control[["EUR", "AFR", "ASN"]].values, color=control_color, @@ -60,6 +68,29 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): **style_defaults ) + # Issue 237: Add samples if they are neither case or control. + project_qc = sample.query("case_control == 'QC'") + if project_qc.shape[0] > 0: + project_qc_color = CASE_CONTROL_COLORS[2] # Yellow + tax.scatter( + project_qc[["EUR", "AFR", "ASN"]].values, + color=project_qc_color, + label="QC", + **style_defaults + ) + + unknown = sample.query( + "case_control != 'Control' and case_control != 'Case' and case_control != 'QC'" + ) + if unknown.shape[0] > 0: + unknown_color = CASE_CONTROL_COLORS[3] # Gray + tax.scatter( + unknown[["EUR", "AFR", "ASN"]].values, + color=unknown_color, + label="Unknown", + **style_defaults + ) + # Add plot elements multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments tax.boundary(linewidth=0.5) diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py deleted file mode 100644 index 2a0d7242..00000000 --- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -plot_ancestry.py ----------------- - -This script plots the triangle plot of ancestries from GRAF ancestry -estimates. - -Output: - ``sample_level/ancestry.png`` - -""" -import os -from pathlib import Path -from typing import Optional - -import pandas as pd -import seaborn as sns -import ternary -import typer - -from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS -from cgr_gwas_qc.workflow.scripts import sample_qc_table - -app = typer.Typer(add_completion=False) - - -@app.command() -def main(sample_qc: Path, outfile: Path): - sample = load_sample_data(sample_qc) - plot(sample, outfile) - - -def load_sample_data(sample_qc: Path) -> pd.DataFrame: - return sample_qc_table.read(sample_qc).dropna(subset=["E(%)", "F(%)", "A(%)"]) - - -def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None): - sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - - # Create plots - style_defaults = dict(linewidth=0, alpha=0.8, s=8) - fig, tax = ternary.figure(scale=1) # Set scale 0 to 1 - fig.set_size_inches(6, 5) - - # Plot cases and controls separately - case = sample.query("case_control == 'Case'") - if case.shape[0] > 0: - case_color = CASE_CONTROL_COLORS[0] - tax.scatter( - case[["E(%)", "F(%)", "A(%)"]].values, color=case_color, label="Case", **style_defaults - ) - - control = sample.query("case_control == 'Control'") - if control.shape[0] > 0: - control_color = CASE_CONTROL_COLORS[1] - tax.scatter( - control[["E(%)", "F(%)", "A(%)"]].values, - color=control_color, - label="Control", - **style_defaults - ) - - # Add plot elements - multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments - tax.boundary(linewidth=0.5) - tax.gridlines(multiple=multiple, color="gray") - tax.ticks(axis="lbr", linewidth=1, multiple=multiple, offset=0.02, tick_formats="%.1f") - - # Set Axis labels - label_defaults = dict(fontsize=12, offset=0.14) - tax.left_axis_label("Asian", **label_defaults) - tax.right_axis_label("African", **label_defaults) - tax.bottom_axis_label("European", **label_defaults) - - # Add legend - tax.legend(title="case_control") - - # Clean-up plot - tax.set_background_color(color="white") - tax.clear_matplotlib_ticks() - tax.get_axes().axis("off") # removes outer square axes - - # Save if given an outfile - if outfile: - tax.savefig(outfile) - - -if __name__ == "__main__": - if "snakemake" in locals(): - defaults = {} - defaults.update({"sample_qc": Path(snakemake.input[0])}) # type: ignore # noqa - defaults.update({"outfile": Path(snakemake.output[0])}) # type: ignore # noqa - main(**defaults) - else: - app() diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py index 72eacc11..caf3dda5 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py @@ -31,7 +31,6 @@ @app.command() def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path): - df = ( read_het(het) .join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left") @@ -50,13 +49,20 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: def plot(df: pd.DataFrame, population: str, threshold: float): sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + fig, ax = plt.subplots(figsize=(6, 6)) sns.scatterplot( x="x_label", y="F", data=df, hue="case_control", - palette=COLORS, + palette=CASE_CONTROL_LABEL_COLORS, ax=ax, alpha=0.8, linewidth=0, @@ -67,7 +73,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float): ax.set_xlabel("Subjects sorted by F") ax.set_ylabel("F") ax.set_ylim(_get_ylim(df.F, threshold)) - ax.set_title(f"{population} Homozygosity F Coefficient") + ax.set_title(f"{population} Heterozygosity F Coefficient") # Move legend plt.legend(loc="upper left") diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py index c248e8e0..3fe23a48 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py @@ -88,9 +88,16 @@ def plot_panel( ) # Set basic defaults so I don't have to repeat myself + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + style_defaults = dict(linewidth=0, alpha=0.8, s=5) sample_defaults = { - **dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample), + **dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample), **style_defaults, } snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults} diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py index 97c4b30a..ecfa444e 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py @@ -18,11 +18,12 @@ import seaborn as sns import typer -# import snakemake - from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS from cgr_gwas_qc.workflow.scripts import sample_qc_table +# import snakemake + + app = typer.Typer(add_completion=False) @@ -30,7 +31,7 @@ def main(sample_qc: Path, outfile: Path, xchr: str): sample = load_sample_data(sample_qc) xchr = str(snakemake.params) # type: ignore # noqa - plot(sample, outfile, xchr) + plot(sample, xchr, outfile) """ @@ -67,16 +68,23 @@ def _update_categories(sr: pd.DataFrame): return sr -def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True): +def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None): sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + # Create plots style_defaults = dict(linewidth=0, alpha=0.8, s=2) defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample) fig, ax = plt.subplots(figsize=(6, 6)) sns.boxplot(ax=ax, showfliers=False, **defaults) sns.stripplot( - ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults + ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults ) # Make boxplot black and white @@ -87,13 +95,13 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool # ax.set_xlabel("Reported Sex") ax.set_ylabel("ChrX Inbreeding Coeff") - xchr = xchr.strip().lower() == "true" - print(type(xchr), " ", xchr) - if xchr: - print("sex chr included", xchr) + xchr_bool = xchr.strip().lower() == "true" + print(type(xchr_bool), " ", xchr_bool) + if xchr_bool: + print("sex chr included", xchr_bool) ax.set_xlabel("Reported Sex") else: - print("No sex chromosome ", xchr) + print("No sex chromosome ", xchr_bool) ax.set_xlabel("No sex chromosome \nSkipping sex condordace") # Add line at 0.5 diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py index 868d79b9..60b5f2ba 100644 --- a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py +++ b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py @@ -50,7 +50,14 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path): def plot(df: pd.DataFrame, population: str) -> sns.PairGrid: sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper - g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True) + CASE_CONTROL_LABEL_COLORS = { + "Case": CASE_CONTROL_COLORS[0], + "Control": CASE_CONTROL_COLORS[1], + "QC": CASE_CONTROL_COLORS[2], + "Unknown": CASE_CONTROL_COLORS[3], + } + + g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True) g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0) g.map_diag(sns.kdeplot) g.add_legend( diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py index ee39a2e5..5e6a584c 100755 --- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py +++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py @@ -192,7 +192,9 @@ def main( ) add_qc_columns( - sample_qc, remove_contam, remove_rep_discordant, + sample_qc, + remove_contam, + remove_rep_discordant, ) save(sample_qc, outfile) @@ -320,7 +322,6 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame: .. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/ """ - return ( pd.read_csv(file_name, sep="\t") .assign( @@ -402,7 +403,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram if file_name is None: return pd.DataFrame( - index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"], + index=Sample_IDs, + columns=["Contamination_Rate", "is_contaminated"], ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"}) return ( @@ -445,12 +447,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie def add_qc_columns( - sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, + sample_qc: pd.DataFrame, + remove_contam: bool, + remove_rep_discordant: bool, ) -> pd.DataFrame: add_call_rate_flags(sample_qc) _add_identifiler(sample_qc) _add_analytic_exclusion( - sample_qc, remove_contam, remove_rep_discordant, + sample_qc, + remove_contam, + remove_rep_discordant, ) _add_subject_representative(sample_qc) _add_subject_dropped_from_study(sample_qc) @@ -496,7 +502,9 @@ def reason_string(row: pd.Series) -> str: def _add_analytic_exclusion( - sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool, + sample_qc: pd.DataFrame, + remove_contam: bool, + remove_rep_discordant: bool, ) -> pd.DataFrame: """Adds a flag to remove samples based on provided conditions.