From f9a4f2edc2fac3667e866193d99fc0dffc11a648 Mon Sep 17 00:00:00 2001
From: Kevin Liao <liaoks@ccad2.cm.cluster>
Date: Fri, 12 Apr 2024 15:49:43 -0400
Subject: [PATCH 1/3] issue237

---
 src/cgr_gwas_qc/reporting/constants.py        | 10 +-
 .../workflow/scripts/plot_ancestry.py         | 39 +++++++-
 .../workflow/scripts/plot_ancestry_grafpop.py | 95 -------------------
 .../scripts/plot_autosomal_heterozygosity.py  | 12 ++-
 .../workflow/scripts/plot_call_rate.py        |  9 +-
 .../workflow/scripts/plot_chrx_inbreeding.py  | 28 ++++--
 src/cgr_gwas_qc/workflow/scripts/plot_pca.py  |  9 +-
 .../workflow/scripts/sample_qc_table.py       | 24 +++--
 .../workflow/sub_workflows/sample_qc.smk      | 28 +++---
 .../workflow/sub_workflows/subject_qc.smk     |  6 --
 10 files changed, 116 insertions(+), 144 deletions(-)
 delete mode 100644 src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py

diff --git a/src/cgr_gwas_qc/reporting/constants.py b/src/cgr_gwas_qc/reporting/constants.py
index 88144661..a9715531 100644
--- a/src/cgr_gwas_qc/reporting/constants.py
+++ b/src/cgr_gwas_qc/reporting/constants.py
@@ -1,7 +1,15 @@
 import pandas as pd
 
 CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"])
-CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"]  # red  # blue  # gray  # green
+CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"]  # red  # blue  # gray  #gold
+
+# Assign labels to colors for plotting consistency
+CASE_CONTROL_LABEL_COLORS = {
+    "Case": CASE_CONTROL_COLORS[0],
+    "Control": CASE_CONTROL_COLORS[1],
+    "QC": CASE_CONTROL_COLORS[2],
+    "Unknown": CASE_CONTROL_COLORS[3],
+}
 
 SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"])
 
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
index 4327333a..374459c9 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
@@ -31,7 +31,11 @@ def main(sample_qc: Path, outfile: Path):
 
 
 def load_sample_data(sample_qc: Path) -> pd.DataFrame:
-    return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"])
+    return (
+        sample_qc_table.read(sample_qc)
+        .query("is_subject_representative")
+        .dropna(subset=["EUR", "AFR", "ASN"])
+    )
 
 
 def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
@@ -42,17 +46,21 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
     fig, tax = ternary.figure(scale=1)  # Set scale 0 to 1
     fig.set_size_inches(6, 5)
 
-    # Plot cases and controls separately
+    # Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible
     case = sample.query("case_control == 'Case'")
     if case.shape[0] > 0:
         case_color = CASE_CONTROL_COLORS[0]
         tax.scatter(
-            case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults
+            case[["EUR", "AFR", "ASN"]].values,
+            color=case_color,
+            label="Case",
+            zorder=4,
+            **style_defaults
         )
 
     control = sample.query("case_control == 'Control'")
     if control.shape[0] > 0:
-        control_color = CASE_CONTROL_COLORS[1]
+        control_color = CASE_CONTROL_COLORS[1]  # blue
         tax.scatter(
             control[["EUR", "AFR", "ASN"]].values,
             color=control_color,
@@ -60,6 +68,29 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
             **style_defaults
         )
 
+    # Issue 237: Add samples if they are neither case or control.
+    project_qc = sample.query("case_control == 'QC'")
+    if project_qc.shape[0] > 0:
+        project_qc_color = CASE_CONTROL_COLORS[2]  # Yellow
+        tax.scatter(
+            project_qc[["EUR", "AFR", "ASN"]].values,
+            color=project_qc_color,
+            label="QC",
+            **style_defaults
+        )
+
+    unknown = sample.query(
+        "case_control != 'Control' and case_control != 'Case' and case_control != 'QC'"
+    )
+    if unknown.shape[0] > 0:
+        unknown_color = CASE_CONTROL_COLORS[3]  # Gray
+        tax.scatter(
+            unknown[["EUR", "AFR", "ASN"]].values,
+            color=unknown_color,
+            label="Unknown",
+            **style_defaults
+        )
+
     # Add plot elements
     multiple = 0.1  # Our scale is 0 to 1 and we want 0.1 increments
     tax.boundary(linewidth=0.5)
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py
deleted file mode 100644
index 2a0d7242..00000000
--- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""
-plot_ancestry.py
-----------------
-
-This script plots the triangle plot of ancestries from GRAF ancestry
-estimates.
-
-Output:
-    ``sample_level/ancestry.png``
-
-"""
-import os
-from pathlib import Path
-from typing import Optional
-
-import pandas as pd
-import seaborn as sns
-import ternary
-import typer
-
-from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
-from cgr_gwas_qc.workflow.scripts import sample_qc_table
-
-app = typer.Typer(add_completion=False)
-
-
-@app.command()
-def main(sample_qc: Path, outfile: Path):
-    sample = load_sample_data(sample_qc)
-    plot(sample, outfile)
-
-
-def load_sample_data(sample_qc: Path) -> pd.DataFrame:
-    return sample_qc_table.read(sample_qc).dropna(subset=["E(%)", "F(%)", "A(%)"])
-
-
-def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
-    sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
-
-    # Create plots
-    style_defaults = dict(linewidth=0, alpha=0.8, s=8)
-    fig, tax = ternary.figure(scale=1)  # Set scale 0 to 1
-    fig.set_size_inches(6, 5)
-
-    # Plot cases and controls separately
-    case = sample.query("case_control == 'Case'")
-    if case.shape[0] > 0:
-        case_color = CASE_CONTROL_COLORS[0]
-        tax.scatter(
-            case[["E(%)", "F(%)", "A(%)"]].values, color=case_color, label="Case", **style_defaults
-        )
-
-    control = sample.query("case_control == 'Control'")
-    if control.shape[0] > 0:
-        control_color = CASE_CONTROL_COLORS[1]
-        tax.scatter(
-            control[["E(%)", "F(%)", "A(%)"]].values,
-            color=control_color,
-            label="Control",
-            **style_defaults
-        )
-
-    # Add plot elements
-    multiple = 0.1  # Our scale is 0 to 1 and we want 0.1 increments
-    tax.boundary(linewidth=0.5)
-    tax.gridlines(multiple=multiple, color="gray")
-    tax.ticks(axis="lbr", linewidth=1, multiple=multiple, offset=0.02, tick_formats="%.1f")
-
-    # Set Axis labels
-    label_defaults = dict(fontsize=12, offset=0.14)
-    tax.left_axis_label("Asian", **label_defaults)
-    tax.right_axis_label("African", **label_defaults)
-    tax.bottom_axis_label("European", **label_defaults)
-
-    # Add legend
-    tax.legend(title="case_control")
-
-    # Clean-up plot
-    tax.set_background_color(color="white")
-    tax.clear_matplotlib_ticks()
-    tax.get_axes().axis("off")  # removes outer square axes
-
-    # Save if given an outfile
-    if outfile:
-        tax.savefig(outfile)
-
-
-if __name__ == "__main__":
-    if "snakemake" in locals():
-        defaults = {}
-        defaults.update({"sample_qc": Path(snakemake.input[0])})  # type: ignore # noqa
-        defaults.update({"outfile": Path(snakemake.output[0])})  # type: ignore # noqa
-        main(**defaults)
-    else:
-        app()
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py
index 72eacc11..caf3dda5 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py
@@ -31,7 +31,6 @@
 
 @app.command()
 def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path):
-
     df = (
         read_het(het)
         .join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left")
@@ -50,13 +49,20 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile:
 def plot(df: pd.DataFrame, population: str, threshold: float):
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
     fig, ax = plt.subplots(figsize=(6, 6))
     sns.scatterplot(
         x="x_label",
         y="F",
         data=df,
         hue="case_control",
-        palette=COLORS,
+        palette=CASE_CONTROL_LABEL_COLORS,
         ax=ax,
         alpha=0.8,
         linewidth=0,
@@ -67,7 +73,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float):
     ax.set_xlabel("Subjects sorted by F")
     ax.set_ylabel("F")
     ax.set_ylim(_get_ylim(df.F, threshold))
-    ax.set_title(f"{population} Homozygosity F Coefficient")
+    ax.set_title(f"{population} Heterozygosity F Coefficient")
 
     # Move legend
     plt.legend(loc="upper left")
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
index c248e8e0..3fe23a48 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
@@ -88,9 +88,16 @@ def plot_panel(
     )
 
     # Set basic defaults so I don't have to repeat myself
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
     style_defaults = dict(linewidth=0, alpha=0.8, s=5)
     sample_defaults = {
-        **dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample),
+        **dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample),
         **style_defaults,
     }
     snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults}
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
index 97c4b30a..ecfa444e 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
@@ -18,11 +18,12 @@
 import seaborn as sns
 import typer
 
-# import snakemake
-
 from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
 from cgr_gwas_qc.workflow.scripts import sample_qc_table
 
+# import snakemake
+
+
 app = typer.Typer(add_completion=False)
 
 
@@ -30,7 +31,7 @@
 def main(sample_qc: Path, outfile: Path, xchr: str):
     sample = load_sample_data(sample_qc)
     xchr = str(snakemake.params)  # type: ignore # noqa
-    plot(sample, outfile, xchr)
+    plot(sample, xchr, outfile)
 
 
 """
@@ -67,16 +68,23 @@ def _update_categories(sr: pd.DataFrame):
     return sr
 
 
-def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True):
+def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None):
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
     # Create plots
     style_defaults = dict(linewidth=0, alpha=0.8, s=2)
     defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample)
     fig, ax = plt.subplots(figsize=(6, 6))
     sns.boxplot(ax=ax, showfliers=False, **defaults)
     sns.stripplot(
-        ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults
+        ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults
     )
 
     # Make boxplot black and white
@@ -87,13 +95,13 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool
     # ax.set_xlabel("Reported Sex")
     ax.set_ylabel("ChrX Inbreeding Coeff")
 
-    xchr = xchr.strip().lower() == "true"
-    print(type(xchr), " ", xchr)
-    if xchr:
-        print("sex chr included", xchr)
+    xchr_bool = xchr.strip().lower() == "true"
+    print(type(xchr_bool), " ", xchr_bool)
+    if xchr_bool:
+        print("sex chr included", xchr_bool)
         ax.set_xlabel("Reported Sex")
     else:
-        print("No sex chromosome ", xchr)
+        print("No sex chromosome ", xchr_bool)
         ax.set_xlabel("No sex chromosome \nSkipping sex condordace")
 
     # Add line at 0.5
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py
index 868d79b9..60b5f2ba 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py
@@ -50,7 +50,14 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path):
 def plot(df: pd.DataFrame, population: str) -> sns.PairGrid:
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
-    g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True)
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
+    g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True)
     g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0)
     g.map_diag(sns.kdeplot)
     g.add_legend(
diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
index ee39a2e5..c121fb3a 100755
--- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
+++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
@@ -192,7 +192,9 @@ def main(
     )
 
     add_qc_columns(
-        sample_qc, remove_contam, remove_rep_discordant,
+        sample_qc,
+        remove_contam,
+        remove_rep_discordant,
     )
     save(sample_qc, outfile)
 
@@ -320,12 +322,9 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
     .. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/
 
     """
-
     return (
         pd.read_csv(file_name, sep="\t")
-        .assign(
-            Sample_ID=lambda x: x["Subject"].astype(str)
-        )  # Issue 216: When subject IDs are numeric reindex fails. This makes sure index Sample_ID will always be as a character
+        .rename({"Subject": "Sample_ID"}, axis=1)
         .assign(Ancestry=lambda x: x["Computed population"].str.replace(" ", "_"))
         .assign(AFR=lambda x: x["P_f (%)"] / 100)
         .assign(EUR=lambda x: x["P_e (%)"] / 100)
@@ -402,7 +401,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram
 
     if file_name is None:
         return pd.DataFrame(
-            index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
+            index=Sample_IDs,
+            columns=["Contamination_Rate", "is_contaminated"],
         ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})
 
     return (
@@ -445,12 +445,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie
 
 
 def add_qc_columns(
-    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame,
+    remove_contam: bool,
+    remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     add_call_rate_flags(sample_qc)
     _add_identifiler(sample_qc)
     _add_analytic_exclusion(
-        sample_qc, remove_contam, remove_rep_discordant,
+        sample_qc,
+        remove_contam,
+        remove_rep_discordant,
     )
     _add_subject_representative(sample_qc)
     _add_subject_dropped_from_study(sample_qc)
@@ -496,7 +500,9 @@ def reason_string(row: pd.Series) -> str:
 
 
 def _add_analytic_exclusion(
-    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame,
+    remove_contam: bool,
+    remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     """Adds a flag to remove samples based on provided conditions.
 
diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
index 3c0cd1ff..c5bd18ae 100644
--- a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
+++ b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
@@ -2,10 +2,6 @@ from cgr_gwas_qc import load_config
 
 cfg = load_config()
 
-PLINK_BIG_MEM = {1: 1024 * 4, 2: 1024 * 64, 3: 1024 * 250}
-BIG_TIME = {1: 8, 2: 24, 3: 48}
-
-
 use_contamination = (
     cfg.config.user_files.idat_pattern
     and cfg.config.user_files.gtc_pattern
@@ -15,6 +11,8 @@ use_contamination = (
 
 localrules:
     all_sample_qc,
+    sample_concordance_plink,
+    sample_concordance_summary,
     split_sample_concordance,
     snp_qc_table,
     sample_level_sexcheck,
@@ -65,14 +63,17 @@ module thousand_genomes:
     config:
         {}
 
+
 module plink:
     snakefile:
         cfg.modules("plink")
 
+
 module graf:
     snakefile:
         cfg.modules("graf_v2.4")
 
+
 module grafpop:
     snakefile:
         cfg.modules("grafpop")
@@ -316,10 +317,6 @@ rule sample_concordance_plink:
         pi_hat_threshold=cfg.config.software_params.pi_hat_threshold,
     output:
         "sample_level/concordance/plink.csv",
-    resources:
-        mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt],
-        time_hr=lambda wildcards, attempt: BIG_TIME[attempt],
-
     script:
         "../scripts/concordance_table.py"
 
@@ -386,9 +383,8 @@ rule sample_concordance_summary:
     output:
         "sample_level/concordance/summary.csv",
     resources:
-        mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt],
-        time_hr=lambda wildcards, attempt: BIG_TIME[attempt],
-
+        mem_mb=lambda wildcards, attempt: 1024 * 2 * attempt,
+        time_hr=lambda wildcards, attempt: 2 * attempt,
     script:
         "../scripts/sample_concordance.py"
 
@@ -413,9 +409,9 @@ use rule grafpop_populations from grafpop as graf_populations with:
         bed=rules.snp_call_rate_filter_2.output.bed,
         bim=rules.snp_call_rate_filter_2.output.bim,
         fam=rules.snp_call_rate_filter_2.output.fam,
-	#bed="sample_level/call_rate_2/samples.bed",
-        #bim="sample_level/call_rate_2/samples.bim",
-        #fam="sample_level/call_rate_2/samples.fam", 
+    # bed="sample_level/call_rate_2/samples.bed",
+    # bim="sample_level/call_rate_2/samples.bim",
+    # fam="sample_level/call_rate_2/samples.fam",
     output:
         "sample_level/ancestry/grafpop_populations.txt",
     log:
@@ -454,6 +450,7 @@ rule snp_qc_table:
 sex_chr_included = cfg.config.workflow_params.sex_chr_included
 if sex_chr_included:
     print("sex_chr_included ", sex_chr_included)
+
     use rule sexcheck from plink as sample_level_sexcheck with:
         input:
             bed=rules.snp_call_rate_filter_1.output.bed,
@@ -463,8 +460,10 @@ if sex_chr_included:
             out_prefix="sample_level/call_rate_1/samples",
         output:
             "sample_level/call_rate_1/samples.sexcheck",
+
 else:
     print("sex_chr_included ", sex_chr_included)
+
     rule sample_level_sexcheck:
         input:
             bed=rules.snp_call_rate_filter_1.output.bed,
@@ -478,6 +477,7 @@ else:
             echo "FID IID PEDSEX SNPSEX STATUS F" >> sample_level/call_rate_1/samples.sexcheck
             """
 
+
 def _contam(wildcards):
     uf, wf = cfg.config.user_files, cfg.config.workflow_params
     if use_contamination:
diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk
index 397ce871..0ddf5cd0 100644
--- a/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk
+++ b/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk
@@ -8,9 +8,6 @@ import shutil
 
 cfg = load_config()
 
-PLINK_BIG_MEM = {1: 1024 * 4, 2: 1024 * 64, 3: 1024 * 250}
-BIG_TIME = {1: 8, 2: 24, 3: 48}
-
 
 localrules:
     all_subject_qc,
@@ -277,9 +274,6 @@ use rule genome from plink as population_level_ibd with:
         out_prefix="subject_level/{population}/subjects_maf{maf}_ld{ld}_ibd",
     output:
         "subject_level/{population}/subjects_maf{maf}_ld{ld}_ibd.genome",
-    resources:
-        mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt],
-        time_hr=lambda wildcards, attempt: BIG_TIME[attempt],
 
 
 rule population_level_concordance_plink:

From 39ce76928242feed6bef9fe9fb9ec3cdf52645de Mon Sep 17 00:00:00 2001
From: Kevin Liao <liaoks@ccad2.cm.cluster>
Date: Fri, 12 Apr 2024 15:59:43 -0400
Subject: [PATCH 2/3] Revert "issue237"

This reverts commit f9a4f2edc2fac3667e866193d99fc0dffc11a648.
---
 src/cgr_gwas_qc/reporting/constants.py        | 10 +-
 .../workflow/scripts/plot_ancestry.py         | 39 +-------
 .../workflow/scripts/plot_ancestry_grafpop.py | 95 +++++++++++++++++++
 .../scripts/plot_autosomal_heterozygosity.py  | 12 +--
 .../workflow/scripts/plot_call_rate.py        |  9 +-
 .../workflow/scripts/plot_chrx_inbreeding.py  | 28 ++----
 src/cgr_gwas_qc/workflow/scripts/plot_pca.py  |  9 +-
 .../workflow/scripts/sample_qc_table.py       | 24 ++---
 .../workflow/sub_workflows/sample_qc.smk      | 28 +++---
 .../workflow/sub_workflows/subject_qc.smk     |  6 ++
 10 files changed, 144 insertions(+), 116 deletions(-)
 create mode 100644 src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py

diff --git a/src/cgr_gwas_qc/reporting/constants.py b/src/cgr_gwas_qc/reporting/constants.py
index a9715531..88144661 100644
--- a/src/cgr_gwas_qc/reporting/constants.py
+++ b/src/cgr_gwas_qc/reporting/constants.py
@@ -1,15 +1,7 @@
 import pandas as pd
 
 CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"])
-CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"]  # red  # blue  # gray  #gold
-
-# Assign labels to colors for plotting consistency
-CASE_CONTROL_LABEL_COLORS = {
-    "Case": CASE_CONTROL_COLORS[0],
-    "Control": CASE_CONTROL_COLORS[1],
-    "QC": CASE_CONTROL_COLORS[2],
-    "Unknown": CASE_CONTROL_COLORS[3],
-}
+CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"]  # red  # blue  # gray  # green
 
 SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"])
 
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
index 374459c9..4327333a 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
@@ -31,11 +31,7 @@ def main(sample_qc: Path, outfile: Path):
 
 
 def load_sample_data(sample_qc: Path) -> pd.DataFrame:
-    return (
-        sample_qc_table.read(sample_qc)
-        .query("is_subject_representative")
-        .dropna(subset=["EUR", "AFR", "ASN"])
-    )
+    return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"])
 
 
 def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
@@ -46,21 +42,17 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
     fig, tax = ternary.figure(scale=1)  # Set scale 0 to 1
     fig.set_size_inches(6, 5)
 
-    # Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible
+    # Plot cases and controls separately
     case = sample.query("case_control == 'Case'")
     if case.shape[0] > 0:
         case_color = CASE_CONTROL_COLORS[0]
         tax.scatter(
-            case[["EUR", "AFR", "ASN"]].values,
-            color=case_color,
-            label="Case",
-            zorder=4,
-            **style_defaults
+            case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults
         )
 
     control = sample.query("case_control == 'Control'")
     if control.shape[0] > 0:
-        control_color = CASE_CONTROL_COLORS[1]  # blue
+        control_color = CASE_CONTROL_COLORS[1]
         tax.scatter(
             control[["EUR", "AFR", "ASN"]].values,
             color=control_color,
@@ -68,29 +60,6 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
             **style_defaults
         )
 
-    # Issue 237: Add samples if they are neither case or control.
-    project_qc = sample.query("case_control == 'QC'")
-    if project_qc.shape[0] > 0:
-        project_qc_color = CASE_CONTROL_COLORS[2]  # Yellow
-        tax.scatter(
-            project_qc[["EUR", "AFR", "ASN"]].values,
-            color=project_qc_color,
-            label="QC",
-            **style_defaults
-        )
-
-    unknown = sample.query(
-        "case_control != 'Control' and case_control != 'Case' and case_control != 'QC'"
-    )
-    if unknown.shape[0] > 0:
-        unknown_color = CASE_CONTROL_COLORS[3]  # Gray
-        tax.scatter(
-            unknown[["EUR", "AFR", "ASN"]].values,
-            color=unknown_color,
-            label="Unknown",
-            **style_defaults
-        )
-
     # Add plot elements
     multiple = 0.1  # Our scale is 0 to 1 and we want 0.1 increments
     tax.boundary(linewidth=0.5)
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py
new file mode 100644
index 00000000..2a0d7242
--- /dev/null
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py
@@ -0,0 +1,95 @@
+"""
+plot_ancestry.py
+----------------
+
+This script plots the triangle plot of ancestries from GRAF ancestry
+estimates.
+
+Output:
+    ``sample_level/ancestry.png``
+
+"""
+import os
+from pathlib import Path
+from typing import Optional
+
+import pandas as pd
+import seaborn as sns
+import ternary
+import typer
+
+from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
+from cgr_gwas_qc.workflow.scripts import sample_qc_table
+
+app = typer.Typer(add_completion=False)
+
+
+@app.command()
+def main(sample_qc: Path, outfile: Path):
+    sample = load_sample_data(sample_qc)
+    plot(sample, outfile)
+
+
+def load_sample_data(sample_qc: Path) -> pd.DataFrame:
+    return sample_qc_table.read(sample_qc).dropna(subset=["E(%)", "F(%)", "A(%)"])
+
+
+def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
+    sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
+
+    # Create plots
+    style_defaults = dict(linewidth=0, alpha=0.8, s=8)
+    fig, tax = ternary.figure(scale=1)  # Set scale 0 to 1
+    fig.set_size_inches(6, 5)
+
+    # Plot cases and controls separately
+    case = sample.query("case_control == 'Case'")
+    if case.shape[0] > 0:
+        case_color = CASE_CONTROL_COLORS[0]
+        tax.scatter(
+            case[["E(%)", "F(%)", "A(%)"]].values, color=case_color, label="Case", **style_defaults
+        )
+
+    control = sample.query("case_control == 'Control'")
+    if control.shape[0] > 0:
+        control_color = CASE_CONTROL_COLORS[1]
+        tax.scatter(
+            control[["E(%)", "F(%)", "A(%)"]].values,
+            color=control_color,
+            label="Control",
+            **style_defaults
+        )
+
+    # Add plot elements
+    multiple = 0.1  # Our scale is 0 to 1 and we want 0.1 increments
+    tax.boundary(linewidth=0.5)
+    tax.gridlines(multiple=multiple, color="gray")
+    tax.ticks(axis="lbr", linewidth=1, multiple=multiple, offset=0.02, tick_formats="%.1f")
+
+    # Set Axis labels
+    label_defaults = dict(fontsize=12, offset=0.14)
+    tax.left_axis_label("Asian", **label_defaults)
+    tax.right_axis_label("African", **label_defaults)
+    tax.bottom_axis_label("European", **label_defaults)
+
+    # Add legend
+    tax.legend(title="case_control")
+
+    # Clean-up plot
+    tax.set_background_color(color="white")
+    tax.clear_matplotlib_ticks()
+    tax.get_axes().axis("off")  # removes outer square axes
+
+    # Save if given an outfile
+    if outfile:
+        tax.savefig(outfile)
+
+
+if __name__ == "__main__":
+    if "snakemake" in locals():
+        defaults = {}
+        defaults.update({"sample_qc": Path(snakemake.input[0])})  # type: ignore # noqa
+        defaults.update({"outfile": Path(snakemake.output[0])})  # type: ignore # noqa
+        main(**defaults)
+    else:
+        app()
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py
index caf3dda5..72eacc11 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py
@@ -31,6 +31,7 @@
 
 @app.command()
 def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path):
+
     df = (
         read_het(het)
         .join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left")
@@ -49,20 +50,13 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile:
 def plot(df: pd.DataFrame, population: str, threshold: float):
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
-    CASE_CONTROL_LABEL_COLORS = {
-        "Case": CASE_CONTROL_COLORS[0],
-        "Control": CASE_CONTROL_COLORS[1],
-        "QC": CASE_CONTROL_COLORS[2],
-        "Unknown": CASE_CONTROL_COLORS[3],
-    }
-
     fig, ax = plt.subplots(figsize=(6, 6))
     sns.scatterplot(
         x="x_label",
         y="F",
         data=df,
         hue="case_control",
-        palette=CASE_CONTROL_LABEL_COLORS,
+        palette=COLORS,
         ax=ax,
         alpha=0.8,
         linewidth=0,
@@ -73,7 +67,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float):
     ax.set_xlabel("Subjects sorted by F")
     ax.set_ylabel("F")
     ax.set_ylim(_get_ylim(df.F, threshold))
-    ax.set_title(f"{population} Heterozygosity F Coefficient")
+    ax.set_title(f"{population} Homozygosity F Coefficient")
 
     # Move legend
     plt.legend(loc="upper left")
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
index 3fe23a48..c248e8e0 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
@@ -88,16 +88,9 @@ def plot_panel(
     )
 
     # Set basic defaults so I don't have to repeat myself
-    CASE_CONTROL_LABEL_COLORS = {
-        "Case": CASE_CONTROL_COLORS[0],
-        "Control": CASE_CONTROL_COLORS[1],
-        "QC": CASE_CONTROL_COLORS[2],
-        "Unknown": CASE_CONTROL_COLORS[3],
-    }
-
     style_defaults = dict(linewidth=0, alpha=0.8, s=5)
     sample_defaults = {
-        **dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample),
+        **dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample),
         **style_defaults,
     }
     snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults}
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
index ecfa444e..97c4b30a 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
@@ -18,11 +18,10 @@
 import seaborn as sns
 import typer
 
-from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
-from cgr_gwas_qc.workflow.scripts import sample_qc_table
-
 # import snakemake
 
+from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
+from cgr_gwas_qc.workflow.scripts import sample_qc_table
 
 app = typer.Typer(add_completion=False)
 
@@ -31,7 +30,7 @@
 def main(sample_qc: Path, outfile: Path, xchr: str):
     sample = load_sample_data(sample_qc)
     xchr = str(snakemake.params)  # type: ignore # noqa
-    plot(sample, xchr, outfile)
+    plot(sample, outfile, xchr)
 
 
 """
@@ -68,23 +67,16 @@ def _update_categories(sr: pd.DataFrame):
     return sr
 
 
-def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None):
+def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True):
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
-    CASE_CONTROL_LABEL_COLORS = {
-        "Case": CASE_CONTROL_COLORS[0],
-        "Control": CASE_CONTROL_COLORS[1],
-        "QC": CASE_CONTROL_COLORS[2],
-        "Unknown": CASE_CONTROL_COLORS[3],
-    }
-
     # Create plots
     style_defaults = dict(linewidth=0, alpha=0.8, s=2)
     defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample)
     fig, ax = plt.subplots(figsize=(6, 6))
     sns.boxplot(ax=ax, showfliers=False, **defaults)
     sns.stripplot(
-        ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults
+        ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults
     )
 
     # Make boxplot black and white
@@ -95,13 +87,13 @@ def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None)
     # ax.set_xlabel("Reported Sex")
     ax.set_ylabel("ChrX Inbreeding Coeff")
 
-    xchr_bool = xchr.strip().lower() == "true"
-    print(type(xchr_bool), " ", xchr_bool)
-    if xchr_bool:
-        print("sex chr included", xchr_bool)
+    xchr = xchr.strip().lower() == "true"
+    print(type(xchr), " ", xchr)
+    if xchr:
+        print("sex chr included", xchr)
         ax.set_xlabel("Reported Sex")
     else:
-        print("No sex chromosome ", xchr_bool)
+        print("No sex chromosome ", xchr)
         ax.set_xlabel("No sex chromosome \nSkipping sex condordace")
 
     # Add line at 0.5
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py
index 60b5f2ba..868d79b9 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py
@@ -50,14 +50,7 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path):
 def plot(df: pd.DataFrame, population: str) -> sns.PairGrid:
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
-    CASE_CONTROL_LABEL_COLORS = {
-        "Case": CASE_CONTROL_COLORS[0],
-        "Control": CASE_CONTROL_COLORS[1],
-        "QC": CASE_CONTROL_COLORS[2],
-        "Unknown": CASE_CONTROL_COLORS[3],
-    }
-
-    g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True)
+    g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True)
     g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0)
     g.map_diag(sns.kdeplot)
     g.add_legend(
diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
index c121fb3a..ee39a2e5 100755
--- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
+++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
@@ -192,9 +192,7 @@ def main(
     )
 
     add_qc_columns(
-        sample_qc,
-        remove_contam,
-        remove_rep_discordant,
+        sample_qc, remove_contam, remove_rep_discordant,
     )
     save(sample_qc, outfile)
 
@@ -322,9 +320,12 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
     .. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/
 
     """
+
     return (
         pd.read_csv(file_name, sep="\t")
-        .rename({"Subject": "Sample_ID"}, axis=1)
+        .assign(
+            Sample_ID=lambda x: x["Subject"].astype(str)
+        )  # Issue 216: When subject IDs are numeric reindex fails. This makes sure index Sample_ID will always be as a character
         .assign(Ancestry=lambda x: x["Computed population"].str.replace(" ", "_"))
         .assign(AFR=lambda x: x["P_f (%)"] / 100)
         .assign(EUR=lambda x: x["P_e (%)"] / 100)
@@ -401,8 +402,7 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram
 
     if file_name is None:
         return pd.DataFrame(
-            index=Sample_IDs,
-            columns=["Contamination_Rate", "is_contaminated"],
+            index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
         ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})
 
     return (
@@ -445,16 +445,12 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie
 
 
 def add_qc_columns(
-    sample_qc: pd.DataFrame,
-    remove_contam: bool,
-    remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     add_call_rate_flags(sample_qc)
     _add_identifiler(sample_qc)
     _add_analytic_exclusion(
-        sample_qc,
-        remove_contam,
-        remove_rep_discordant,
+        sample_qc, remove_contam, remove_rep_discordant,
     )
     _add_subject_representative(sample_qc)
     _add_subject_dropped_from_study(sample_qc)
@@ -500,9 +496,7 @@ def reason_string(row: pd.Series) -> str:
 
 
 def _add_analytic_exclusion(
-    sample_qc: pd.DataFrame,
-    remove_contam: bool,
-    remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     """Adds a flag to remove samples based on provided conditions.
 
diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
index c5bd18ae..3c0cd1ff 100644
--- a/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
+++ b/src/cgr_gwas_qc/workflow/sub_workflows/sample_qc.smk
@@ -2,6 +2,10 @@ from cgr_gwas_qc import load_config
 
 cfg = load_config()
 
+PLINK_BIG_MEM = {1: 1024 * 4, 2: 1024 * 64, 3: 1024 * 250}
+BIG_TIME = {1: 8, 2: 24, 3: 48}
+
+
 use_contamination = (
     cfg.config.user_files.idat_pattern
     and cfg.config.user_files.gtc_pattern
@@ -11,8 +15,6 @@ use_contamination = (
 
 localrules:
     all_sample_qc,
-    sample_concordance_plink,
-    sample_concordance_summary,
     split_sample_concordance,
     snp_qc_table,
     sample_level_sexcheck,
@@ -63,17 +65,14 @@ module thousand_genomes:
     config:
         {}
 
-
 module plink:
     snakefile:
         cfg.modules("plink")
 
-
 module graf:
     snakefile:
         cfg.modules("graf_v2.4")
 
-
 module grafpop:
     snakefile:
         cfg.modules("grafpop")
@@ -317,6 +316,10 @@ rule sample_concordance_plink:
         pi_hat_threshold=cfg.config.software_params.pi_hat_threshold,
     output:
         "sample_level/concordance/plink.csv",
+    resources:
+        mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt],
+        time_hr=lambda wildcards, attempt: BIG_TIME[attempt],
+
     script:
         "../scripts/concordance_table.py"
 
@@ -383,8 +386,9 @@ rule sample_concordance_summary:
     output:
         "sample_level/concordance/summary.csv",
     resources:
-        mem_mb=lambda wildcards, attempt: 1024 * 2 * attempt,
-        time_hr=lambda wildcards, attempt: 2 * attempt,
+        mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt],
+        time_hr=lambda wildcards, attempt: BIG_TIME[attempt],
+
     script:
         "../scripts/sample_concordance.py"
 
@@ -409,9 +413,9 @@ use rule grafpop_populations from grafpop as graf_populations with:
         bed=rules.snp_call_rate_filter_2.output.bed,
         bim=rules.snp_call_rate_filter_2.output.bim,
         fam=rules.snp_call_rate_filter_2.output.fam,
-    # bed="sample_level/call_rate_2/samples.bed",
-    # bim="sample_level/call_rate_2/samples.bim",
-    # fam="sample_level/call_rate_2/samples.fam",
+	#bed="sample_level/call_rate_2/samples.bed",
+        #bim="sample_level/call_rate_2/samples.bim",
+        #fam="sample_level/call_rate_2/samples.fam", 
     output:
         "sample_level/ancestry/grafpop_populations.txt",
     log:
@@ -450,7 +454,6 @@ rule snp_qc_table:
 sex_chr_included = cfg.config.workflow_params.sex_chr_included
 if sex_chr_included:
     print("sex_chr_included ", sex_chr_included)
-
     use rule sexcheck from plink as sample_level_sexcheck with:
         input:
             bed=rules.snp_call_rate_filter_1.output.bed,
@@ -460,10 +463,8 @@ if sex_chr_included:
             out_prefix="sample_level/call_rate_1/samples",
         output:
             "sample_level/call_rate_1/samples.sexcheck",
-
 else:
     print("sex_chr_included ", sex_chr_included)
-
     rule sample_level_sexcheck:
         input:
             bed=rules.snp_call_rate_filter_1.output.bed,
@@ -477,7 +478,6 @@ else:
             echo "FID IID PEDSEX SNPSEX STATUS F" >> sample_level/call_rate_1/samples.sexcheck
             """
 
-
 def _contam(wildcards):
     uf, wf = cfg.config.user_files, cfg.config.workflow_params
     if use_contamination:
diff --git a/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk b/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk
index 0ddf5cd0..397ce871 100644
--- a/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk
+++ b/src/cgr_gwas_qc/workflow/sub_workflows/subject_qc.smk
@@ -8,6 +8,9 @@ import shutil
 
 cfg = load_config()
 
+PLINK_BIG_MEM = {1: 1024 * 4, 2: 1024 * 64, 3: 1024 * 250}
+BIG_TIME = {1: 8, 2: 24, 3: 48}
+
 
 localrules:
     all_subject_qc,
@@ -274,6 +277,9 @@ use rule genome from plink as population_level_ibd with:
         out_prefix="subject_level/{population}/subjects_maf{maf}_ld{ld}_ibd",
     output:
         "subject_level/{population}/subjects_maf{maf}_ld{ld}_ibd.genome",
+    resources:
+        mem_mb=lambda wildcards, attempt: PLINK_BIG_MEM[attempt],
+        time_hr=lambda wildcards, attempt: BIG_TIME[attempt],
 
 
 rule population_level_concordance_plink:

From 139faab43d575af9075f32f9c7e04f60aff46ebe Mon Sep 17 00:00:00 2001
From: Kevin Liao <liaoks@ccad2.cm.cluster>
Date: Fri, 12 Apr 2024 16:13:48 -0400
Subject: [PATCH 3/3] issue237_precommitted

---
 src/cgr_gwas_qc/reporting/constants.py        | 10 +-
 .../workflow/scripts/plot_ancestry.py         | 39 +++++++-
 .../workflow/scripts/plot_ancestry_grafpop.py | 95 -------------------
 .../scripts/plot_autosomal_heterozygosity.py  | 12 ++-
 .../workflow/scripts/plot_call_rate.py        |  9 +-
 .../workflow/scripts/plot_chrx_inbreeding.py  | 28 ++++--
 src/cgr_gwas_qc/workflow/scripts/plot_pca.py  |  9 +-
 .../workflow/scripts/sample_qc_table.py       | 20 ++--
 8 files changed, 101 insertions(+), 121 deletions(-)
 delete mode 100644 src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py

diff --git a/src/cgr_gwas_qc/reporting/constants.py b/src/cgr_gwas_qc/reporting/constants.py
index 88144661..a9715531 100644
--- a/src/cgr_gwas_qc/reporting/constants.py
+++ b/src/cgr_gwas_qc/reporting/constants.py
@@ -1,7 +1,15 @@
 import pandas as pd
 
 CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"])
-CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"]  # red  # blue  # gray  # green
+CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"]  # red  # blue  # gray  #gold
+
+# Assign labels to colors for plotting consistency
+CASE_CONTROL_LABEL_COLORS = {
+    "Case": CASE_CONTROL_COLORS[0],
+    "Control": CASE_CONTROL_COLORS[1],
+    "QC": CASE_CONTROL_COLORS[2],
+    "Unknown": CASE_CONTROL_COLORS[3],
+}
 
 SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"])
 
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
index 4327333a..374459c9 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
@@ -31,7 +31,11 @@ def main(sample_qc: Path, outfile: Path):
 
 
 def load_sample_data(sample_qc: Path) -> pd.DataFrame:
-    return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"])
+    return (
+        sample_qc_table.read(sample_qc)
+        .query("is_subject_representative")
+        .dropna(subset=["EUR", "AFR", "ASN"])
+    )
 
 
 def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
@@ -42,17 +46,21 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
     fig, tax = ternary.figure(scale=1)  # Set scale 0 to 1
     fig.set_size_inches(6, 5)
 
-    # Plot cases and controls separately
+    # Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible
     case = sample.query("case_control == 'Case'")
     if case.shape[0] > 0:
         case_color = CASE_CONTROL_COLORS[0]
         tax.scatter(
-            case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults
+            case[["EUR", "AFR", "ASN"]].values,
+            color=case_color,
+            label="Case",
+            zorder=4,
+            **style_defaults
         )
 
     control = sample.query("case_control == 'Control'")
     if control.shape[0] > 0:
-        control_color = CASE_CONTROL_COLORS[1]
+        control_color = CASE_CONTROL_COLORS[1]  # blue
         tax.scatter(
             control[["EUR", "AFR", "ASN"]].values,
             color=control_color,
@@ -60,6 +68,29 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
             **style_defaults
         )
 
+    # Issue 237: Add samples if they are neither case or control.
+    project_qc = sample.query("case_control == 'QC'")
+    if project_qc.shape[0] > 0:
+        project_qc_color = CASE_CONTROL_COLORS[2]  # Yellow
+        tax.scatter(
+            project_qc[["EUR", "AFR", "ASN"]].values,
+            color=project_qc_color,
+            label="QC",
+            **style_defaults
+        )
+
+    unknown = sample.query(
+        "case_control != 'Control' and case_control != 'Case' and case_control != 'QC'"
+    )
+    if unknown.shape[0] > 0:
+        unknown_color = CASE_CONTROL_COLORS[3]  # Gray
+        tax.scatter(
+            unknown[["EUR", "AFR", "ASN"]].values,
+            color=unknown_color,
+            label="Unknown",
+            **style_defaults
+        )
+
     # Add plot elements
     multiple = 0.1  # Our scale is 0 to 1 and we want 0.1 increments
     tax.boundary(linewidth=0.5)
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py b/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py
deleted file mode 100644
index 2a0d7242..00000000
--- a/src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""
-plot_ancestry.py
-----------------
-
-This script plots the triangle plot of ancestries from GRAF ancestry
-estimates.
-
-Output:
-    ``sample_level/ancestry.png``
-
-"""
-import os
-from pathlib import Path
-from typing import Optional
-
-import pandas as pd
-import seaborn as sns
-import ternary
-import typer
-
-from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
-from cgr_gwas_qc.workflow.scripts import sample_qc_table
-
-app = typer.Typer(add_completion=False)
-
-
-@app.command()
-def main(sample_qc: Path, outfile: Path):
-    sample = load_sample_data(sample_qc)
-    plot(sample, outfile)
-
-
-def load_sample_data(sample_qc: Path) -> pd.DataFrame:
-    return sample_qc_table.read(sample_qc).dropna(subset=["E(%)", "F(%)", "A(%)"])
-
-
-def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
-    sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
-
-    # Create plots
-    style_defaults = dict(linewidth=0, alpha=0.8, s=8)
-    fig, tax = ternary.figure(scale=1)  # Set scale 0 to 1
-    fig.set_size_inches(6, 5)
-
-    # Plot cases and controls separately
-    case = sample.query("case_control == 'Case'")
-    if case.shape[0] > 0:
-        case_color = CASE_CONTROL_COLORS[0]
-        tax.scatter(
-            case[["E(%)", "F(%)", "A(%)"]].values, color=case_color, label="Case", **style_defaults
-        )
-
-    control = sample.query("case_control == 'Control'")
-    if control.shape[0] > 0:
-        control_color = CASE_CONTROL_COLORS[1]
-        tax.scatter(
-            control[["E(%)", "F(%)", "A(%)"]].values,
-            color=control_color,
-            label="Control",
-            **style_defaults
-        )
-
-    # Add plot elements
-    multiple = 0.1  # Our scale is 0 to 1 and we want 0.1 increments
-    tax.boundary(linewidth=0.5)
-    tax.gridlines(multiple=multiple, color="gray")
-    tax.ticks(axis="lbr", linewidth=1, multiple=multiple, offset=0.02, tick_formats="%.1f")
-
-    # Set Axis labels
-    label_defaults = dict(fontsize=12, offset=0.14)
-    tax.left_axis_label("Asian", **label_defaults)
-    tax.right_axis_label("African", **label_defaults)
-    tax.bottom_axis_label("European", **label_defaults)
-
-    # Add legend
-    tax.legend(title="case_control")
-
-    # Clean-up plot
-    tax.set_background_color(color="white")
-    tax.clear_matplotlib_ticks()
-    tax.get_axes().axis("off")  # removes outer square axes
-
-    # Save if given an outfile
-    if outfile:
-        tax.savefig(outfile)
-
-
-if __name__ == "__main__":
-    if "snakemake" in locals():
-        defaults = {}
-        defaults.update({"sample_qc": Path(snakemake.input[0])})  # type: ignore # noqa
-        defaults.update({"outfile": Path(snakemake.output[0])})  # type: ignore # noqa
-        main(**defaults)
-    else:
-        app()
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py
index 72eacc11..caf3dda5 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_autosomal_heterozygosity.py
@@ -31,7 +31,6 @@
 
 @app.command()
 def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path):
-
     df = (
         read_het(het)
         .join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left")
@@ -50,13 +49,20 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile:
 def plot(df: pd.DataFrame, population: str, threshold: float):
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
     fig, ax = plt.subplots(figsize=(6, 6))
     sns.scatterplot(
         x="x_label",
         y="F",
         data=df,
         hue="case_control",
-        palette=COLORS,
+        palette=CASE_CONTROL_LABEL_COLORS,
         ax=ax,
         alpha=0.8,
         linewidth=0,
@@ -67,7 +73,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float):
     ax.set_xlabel("Subjects sorted by F")
     ax.set_ylabel("F")
     ax.set_ylim(_get_ylim(df.F, threshold))
-    ax.set_title(f"{population} Homozygosity F Coefficient")
+    ax.set_title(f"{population} Heterozygosity F Coefficient")
 
     # Move legend
     plt.legend(loc="upper left")
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
index c248e8e0..3fe23a48 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
@@ -88,9 +88,16 @@ def plot_panel(
     )
 
     # Set basic defaults so I don't have to repeat myself
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
     style_defaults = dict(linewidth=0, alpha=0.8, s=5)
     sample_defaults = {
-        **dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample),
+        **dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample),
         **style_defaults,
     }
     snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults}
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
index 97c4b30a..ecfa444e 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
@@ -18,11 +18,12 @@
 import seaborn as sns
 import typer
 
-# import snakemake
-
 from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
 from cgr_gwas_qc.workflow.scripts import sample_qc_table
 
+# import snakemake
+
+
 app = typer.Typer(add_completion=False)
 
 
@@ -30,7 +31,7 @@
 def main(sample_qc: Path, outfile: Path, xchr: str):
     sample = load_sample_data(sample_qc)
     xchr = str(snakemake.params)  # type: ignore # noqa
-    plot(sample, outfile, xchr)
+    plot(sample, xchr, outfile)
 
 
 """
@@ -67,16 +68,23 @@ def _update_categories(sr: pd.DataFrame):
     return sr
 
 
-def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True):
+def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None):
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
     # Create plots
     style_defaults = dict(linewidth=0, alpha=0.8, s=2)
     defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample)
     fig, ax = plt.subplots(figsize=(6, 6))
     sns.boxplot(ax=ax, showfliers=False, **defaults)
     sns.stripplot(
-        ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults
+        ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults
     )
 
     # Make boxplot black and white
@@ -87,13 +95,13 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool
     # ax.set_xlabel("Reported Sex")
     ax.set_ylabel("ChrX Inbreeding Coeff")
 
-    xchr = xchr.strip().lower() == "true"
-    print(type(xchr), " ", xchr)
-    if xchr:
-        print("sex chr included", xchr)
+    xchr_bool = xchr.strip().lower() == "true"
+    print(type(xchr_bool), " ", xchr_bool)
+    if xchr_bool:
+        print("sex chr included", xchr_bool)
         ax.set_xlabel("Reported Sex")
     else:
-        print("No sex chromosome ", xchr)
+        print("No sex chromosome ", xchr_bool)
         ax.set_xlabel("No sex chromosome \nSkipping sex condordace")
 
     # Add line at 0.5
diff --git a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py
index 868d79b9..60b5f2ba 100644
--- a/src/cgr_gwas_qc/workflow/scripts/plot_pca.py
+++ b/src/cgr_gwas_qc/workflow/scripts/plot_pca.py
@@ -50,7 +50,14 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path):
 def plot(df: pd.DataFrame, population: str) -> sns.PairGrid:
     sns.set_context("paper")  # use seaborn's context to make sane plot defaults for a paper
 
-    g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True)
+    CASE_CONTROL_LABEL_COLORS = {
+        "Case": CASE_CONTROL_COLORS[0],
+        "Control": CASE_CONTROL_COLORS[1],
+        "QC": CASE_CONTROL_COLORS[2],
+        "Unknown": CASE_CONTROL_COLORS[3],
+    }
+
+    g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True)
     g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0)
     g.map_diag(sns.kdeplot)
     g.add_legend(
diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
index ee39a2e5..5e6a584c 100755
--- a/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
+++ b/src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
@@ -192,7 +192,9 @@ def main(
     )
 
     add_qc_columns(
-        sample_qc, remove_contam, remove_rep_discordant,
+        sample_qc,
+        remove_contam,
+        remove_rep_discordant,
     )
     save(sample_qc, outfile)
 
@@ -320,7 +322,6 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
     .. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/
 
     """
-
     return (
         pd.read_csv(file_name, sep="\t")
         .assign(
@@ -402,7 +403,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram
 
     if file_name is None:
         return pd.DataFrame(
-            index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
+            index=Sample_IDs,
+            columns=["Contamination_Rate", "is_contaminated"],
         ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})
 
     return (
@@ -445,12 +447,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie
 
 
 def add_qc_columns(
-    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame,
+    remove_contam: bool,
+    remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     add_call_rate_flags(sample_qc)
     _add_identifiler(sample_qc)
     _add_analytic_exclusion(
-        sample_qc, remove_contam, remove_rep_discordant,
+        sample_qc,
+        remove_contam,
+        remove_rep_discordant,
     )
     _add_subject_representative(sample_qc)
     _add_subject_dropped_from_study(sample_qc)
@@ -496,7 +502,9 @@ def reason_string(row: pd.Series) -> str:
 
 
 def _add_analytic_exclusion(
-    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame,
+    remove_contam: bool,
+    remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     """Adds a flag to remove samples based on provided conditions.