Merge pull request #274 from NCI-CGR/issue-237

Issue 237
NCI-CGR · Apr 12, 2024 · 83c4682 · 83c4682
2 parents 45f2764 + 139faab
commit 83c4682
Show file tree

Hide file tree

Showing 8 changed files with 101 additions and 121 deletions.
diff --git a/src/cgr_gwas_qc/reporting/constants.py b/src/cgr_gwas_qc/reporting/constants.py
@@ -1,7 +1,15 @@
 import pandas as pd
 
 CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"])
-CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"]  # red  # blue  # gray  # green
+CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"]  # red  # blue  # gray  #gold
+
+# Assign labels to colors for plotting consistency
+CASE_CONTROL_LABEL_COLORS = {
+    "Case": CASE_CONTROL_COLORS[0],
+    "Control": CASE_CONTROL_COLORS[1],
+    "QC": CASE_CONTROL_COLORS[2],
+    "Unknown": CASE_CONTROL_COLORS[3],
+}
 
 SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"])
 

diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
@@ -31,7 +31,11 @@ def main(sample_qc: Path, outfile: Path):
 
 
 def load_sample_data(sample_qc: Path) -> pd.DataFrame:
-    return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"])
+    return (
+        sample_qc_table.read(sample_qc)
+        .query("is_subject_representative")
+        .dropna(subset=["EUR", "AFR", "ASN"])
+    )
 
 
 def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
@@ -42,24 +46,51 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
     fig, tax = ternary.figure(scale=1)  # Set scale 0 to 1
     fig.set_size_inches(6, 5)
 
-    # Plot cases and controls separately
+    # Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible
     case = sample.query("case_control == 'Case'")
     if case.shape[0] > 0:
         case_color = CASE_CONTROL_COLORS[0]
         tax.scatter(
-            case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults
+            case[["EUR", "AFR", "ASN"]].values,
+            color=case_color,
+            label="Case",
+            zorder=4,
+            **style_defaults
         )
 
     control = sample.query("case_control == 'Control'")
     if control.shape[0] > 0:
-        control_color = CASE_CONTROL_COLORS[1]
+        control_color = CASE_CONTROL_COLORS[1]  # blue
         tax.scatter(
             control[["EUR", "AFR", "ASN"]].values,
             color=control_color,
             label="Control",
             **style_defaults
         )
 
+    # Issue 237: Add samples if they are neither case or control.
+    project_qc = sample.query("case_control == 'QC'")
+    if project_qc.shape[0] > 0:
+        project_qc_color = CASE_CONTROL_COLORS[2]  # Yellow
+        tax.scatter(
+            project_qc[["EUR", "AFR", "ASN"]].values,
+            color=project_qc_color,
+            label="QC",
+            **style_defaults
+        )
+
+    unknown = sample.query(
+        "case_control != 'Control' and case_control != 'Case' and case_control != 'QC'"
+    )
+    if unknown.shape[0] > 0:
+        unknown_color = CASE_CONTROL_COLORS[3]  # Gray
+        tax.scatter(
+            unknown[["EUR", "AFR", "ASN"]].values,
+            color=unknown_color,
+            label="Unknown",
+            **style_defaults
+        )
+
     # Add plot elements
     multiple = 0.1  # Our scale is 0 to 1 and we want 0.1 increments
     tax.boundary(linewidth=0.5)

diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py
@@ -31,7 +31,6 @@
 
 @app.command()
 def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path):
-
     df = (
         read_het(het)
         .join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left")
@@ -50,13 +49,20 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile:
 def plot(df: pd.DataFrame, population: str, threshold: float):
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
     fig, ax = plt.subplots(figsize=(6, 6))
     sns.scatterplot(
         x="x_label",
         y="F",
         data=df,
         hue="case_control",
-        palette=COLORS,
+        palette=CASE_CONTROL_LABEL_COLORS,
         ax=ax,
         alpha=0.8,
         linewidth=0,
@@ -67,7 +73,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float):
     ax.set_xlabel("Subjects sorted by F")
     ax.set_ylabel("F")
     ax.set_ylim(_get_ylim(df.F, threshold))
-    ax.set_title(f"{population} Homozygosity F Coefficient")
+    ax.set_title(f"{population} Heterozygosity F Coefficient")
 
     # Move legend
     plt.legend(loc="upper left")

diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
@@ -88,9 +88,16 @@ def plot_panel(
     )
 
     # Set basic defaults so I don't have to repeat myself
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
     style_defaults = dict(linewidth=0, alpha=0.8, s=5)
     sample_defaults = {
-        **dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample),
+        **dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample),
         **style_defaults,
     }
     snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults}

diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
@@ -18,19 +18,20 @@
 import seaborn as sns
 import typer
 
-# import snakemake
-
 from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
 from cgr_gwas_qc.workflow.scripts import sample_qc_table
 
+# import snakemake
+
+
 app = typer.Typer(add_completion=False)
 
 
 @app.command()
 def main(sample_qc: Path, outfile: Path, xchr: str):
     sample = load_sample_data(sample_qc)
     xchr = str(snakemake.params)  # type: ignore # noqa
-    plot(sample, outfile, xchr)
+    plot(sample, xchr, outfile)
 
 
 """
@@ -67,16 +68,23 @@ def _update_categories(sr: pd.DataFrame):
     return sr
 
 
-def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True):
+def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None):
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
     # Create plots
     style_defaults = dict(linewidth=0, alpha=0.8, s=2)
     defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample)
     fig, ax = plt.subplots(figsize=(6, 6))
     sns.boxplot(ax=ax, showfliers=False, **defaults)
     sns.stripplot(
-        ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults
+        ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults
     )
 
     # Make boxplot black and white
@@ -87,13 +95,13 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool
     # ax.set_xlabel("Reported Sex")
     ax.set_ylabel("ChrX Inbreeding Coeff")
 
-    xchr = xchr.strip().lower() == "true"
-    print(type(xchr), " ", xchr)
-    if xchr:
-        print("sex chr included", xchr)
+    xchr_bool = xchr.strip().lower() == "true"
+    print(type(xchr_bool), " ", xchr_bool)
+    if xchr_bool:
+        print("sex chr included", xchr_bool)
         ax.set_xlabel("Reported Sex")
     else:
-        print("No sex chromosome ", xchr)
+        print("No sex chromosome ", xchr_bool)
         ax.set_xlabel("No sex chromosome \nSkipping sex condordace")
 
     # Add line at 0.5

diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py
@@ -50,7 +50,14 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path):
 def plot(df: pd.DataFrame, population: str) -> sns.PairGrid:
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
-    g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True)
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
+    g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True)
     g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0)
     g.map_diag(sns.kdeplot)
     g.add_legend(

diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
@@ -192,7 +192,9 @@ def main(
     )
 
     add_qc_columns(
-        sample_qc, remove_contam, remove_rep_discordant,
+        sample_qc,
+        remove_contam,
+        remove_rep_discordant,
     )
     save(sample_qc, outfile)
 
@@ -320,7 +322,6 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
     .. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/
 
     """
-
     return (
         pd.read_csv(file_name, sep="\t")
         .assign(
@@ -402,7 +403,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram
 
     if file_name is None:
         return pd.DataFrame(
-            index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
+            index=Sample_IDs,
+            columns=["Contamination_Rate", "is_contaminated"],
         ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})
 
     return (
@@ -445,12 +447,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie
 
 
 def add_qc_columns(
-    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame,
+    remove_contam: bool,
+    remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     add_call_rate_flags(sample_qc)
     _add_identifiler(sample_qc)
     _add_analytic_exclusion(
-        sample_qc, remove_contam, remove_rep_discordant,
+        sample_qc,
+        remove_contam,
+        remove_rep_discordant,
     )
     _add_subject_representative(sample_qc)
     _add_subject_dropped_from_study(sample_qc)
@@ -496,7 +502,9 @@ def reason_string(row: pd.Series) -> str:
 
 
 def _add_analytic_exclusion(
-    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame,
+    remove_contam: bool,
+    remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     """Adds a flag to remove samples based on provided conditions.