Skip to content

Commit

Permalink
Merge pull request #274 from NCI-CGR/issue-237
Browse files Browse the repository at this point in the history
Issue 237
  • Loading branch information
kliao12 authored Apr 12, 2024
2 parents 45f2764 + 139faab commit 83c4682
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 121 deletions.
10 changes: 9 additions & 1 deletion src/cgr_gwas_qc/reporting/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
import pandas as pd

CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"])
CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"] # red # blue # gray # green
CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"] # red # blue # gray #gold

# Assign labels to colors for plotting consistency
CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"])

Expand Down
39 changes: 35 additions & 4 deletions src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ def main(sample_qc: Path, outfile: Path):


def load_sample_data(sample_qc: Path) -> pd.DataFrame:
return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"])
return (
sample_qc_table.read(sample_qc)
.query("is_subject_representative")
.dropna(subset=["EUR", "AFR", "ASN"])
)


def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
Expand All @@ -42,24 +46,51 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
fig, tax = ternary.figure(scale=1) # Set scale 0 to 1
fig.set_size_inches(6, 5)

# Plot cases and controls separately
# Plot cases, controls, QC, and unknowns separately. Make sure case is last so most visible
case = sample.query("case_control == 'Case'")
if case.shape[0] > 0:
case_color = CASE_CONTROL_COLORS[0]
tax.scatter(
case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults
case[["EUR", "AFR", "ASN"]].values,
color=case_color,
label="Case",
zorder=4,
**style_defaults
)

control = sample.query("case_control == 'Control'")
if control.shape[0] > 0:
control_color = CASE_CONTROL_COLORS[1]
control_color = CASE_CONTROL_COLORS[1] # blue
tax.scatter(
control[["EUR", "AFR", "ASN"]].values,
color=control_color,
label="Control",
**style_defaults
)

# Issue 237: Add samples if they are neither case nor control.
project_qc = sample.query("case_control == 'QC'")
if project_qc.shape[0] > 0:
project_qc_color = CASE_CONTROL_COLORS[2] # gray
tax.scatter(
project_qc[["EUR", "AFR", "ASN"]].values,
color=project_qc_color,
label="QC",
**style_defaults
)

unknown = sample.query(
"case_control != 'Control' and case_control != 'Case' and case_control != 'QC'"
)
if unknown.shape[0] > 0:
unknown_color = CASE_CONTROL_COLORS[3] # gold
tax.scatter(
unknown[["EUR", "AFR", "ASN"]].values,
color=unknown_color,
label="Unknown",
**style_defaults
)

# Add plot elements
multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments
tax.boundary(linewidth=0.5)
Expand Down
95 changes: 0 additions & 95 deletions src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@

@app.command()
def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path):

df = (
read_het(het)
.join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left")
Expand All @@ -50,13 +49,20 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile:
def plot(df: pd.DataFrame, population: str, threshold: float):
sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper

CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
x="x_label",
y="F",
data=df,
hue="case_control",
palette=COLORS,
palette=CASE_CONTROL_LABEL_COLORS,
ax=ax,
alpha=0.8,
linewidth=0,
Expand All @@ -67,7 +73,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float):
ax.set_xlabel("Subjects sorted by F")
ax.set_ylabel("F")
ax.set_ylim(_get_ylim(df.F, threshold))
ax.set_title(f"{population} Homozygosity F Coefficient")
ax.set_title(f"{population} Heterozygosity F Coefficient")

# Move legend
plt.legend(loc="upper left")
Expand Down
9 changes: 8 additions & 1 deletion src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,16 @@ def plot_panel(
)

# Set basic defaults so I don't have to repeat myself
CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

style_defaults = dict(linewidth=0, alpha=0.8, s=5)
sample_defaults = {
**dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample),
**dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample),
**style_defaults,
}
snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults}
Expand Down
28 changes: 18 additions & 10 deletions src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,20 @@
import seaborn as sns
import typer

# import snakemake

from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
from cgr_gwas_qc.workflow.scripts import sample_qc_table

# import snakemake


app = typer.Typer(add_completion=False)


@app.command()
def main(sample_qc: Path, outfile: Path, xchr: str):
sample = load_sample_data(sample_qc)
xchr = str(snakemake.params) # type: ignore # noqa
plot(sample, outfile, xchr)
plot(sample, xchr, outfile)


"""
Expand Down Expand Up @@ -67,16 +68,23 @@ def _update_categories(sr: pd.DataFrame):
return sr


def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True):
def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None):
sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper

CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

# Create plots
style_defaults = dict(linewidth=0, alpha=0.8, s=2)
defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample)
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(ax=ax, showfliers=False, **defaults)
sns.stripplot(
ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults
ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults
)

# Make boxplot black and white
Expand All @@ -87,13 +95,13 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool
# ax.set_xlabel("Reported Sex")
ax.set_ylabel("ChrX Inbreeding Coeff")

xchr = xchr.strip().lower() == "true"
print(type(xchr), " ", xchr)
if xchr:
print("sex chr included", xchr)
xchr_bool = xchr.strip().lower() == "true"
print(type(xchr_bool), " ", xchr_bool)
if xchr_bool:
print("sex chr included", xchr_bool)
ax.set_xlabel("Reported Sex")
else:
print("No sex chromosome ", xchr)
print("No sex chromosome ", xchr_bool)
ax.set_xlabel("No sex chromosome \nSkipping sex condordace")

# Add line at 0.5
Expand Down
9 changes: 8 additions & 1 deletion src/cgr_gwas_qc/workflow/scripts/plot_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,14 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path):
def plot(df: pd.DataFrame, population: str) -> sns.PairGrid:
sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper

g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True)
CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True)
g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0)
g.map_diag(sns.kdeplot)
g.add_legend(
Expand Down
20 changes: 14 additions & 6 deletions src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,9 @@ def main(
)

add_qc_columns(
sample_qc, remove_contam, remove_rep_discordant,
sample_qc,
remove_contam,
remove_rep_discordant,
)
save(sample_qc, outfile)

Expand Down Expand Up @@ -320,7 +322,6 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
.. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/
"""

return (
pd.read_csv(file_name, sep="\t")
.assign(
Expand Down Expand Up @@ -402,7 +403,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram

if file_name is None:
return pd.DataFrame(
index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
index=Sample_IDs,
columns=["Contamination_Rate", "is_contaminated"],
).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})

return (
Expand Down Expand Up @@ -445,12 +447,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie


def add_qc_columns(
sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
sample_qc: pd.DataFrame,
remove_contam: bool,
remove_rep_discordant: bool,
) -> pd.DataFrame:
add_call_rate_flags(sample_qc)
_add_identifiler(sample_qc)
_add_analytic_exclusion(
sample_qc, remove_contam, remove_rep_discordant,
sample_qc,
remove_contam,
remove_rep_discordant,
)
_add_subject_representative(sample_qc)
_add_subject_dropped_from_study(sample_qc)
Expand Down Expand Up @@ -496,7 +502,9 @@ def reason_string(row: pd.Series) -> str:


def _add_analytic_exclusion(
sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
sample_qc: pd.DataFrame,
remove_contam: bool,
remove_rep_discordant: bool,
) -> pd.DataFrame:
"""Adds a flag to remove samples based on provided conditions.
Expand Down

0 comments on commit 83c4682

Please sign in to comment.