Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 237 #274

Merged
merged 3 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/cgr_gwas_qc/reporting/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
import pandas as pd

CASE_CONTROL_DTYPE = pd.CategoricalDtype(categories=["Case", "Control", "QC", "Unknown"])
CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "#1bfc06"] # red # blue # gray # green
CASE_CONTROL_COLORS = ["#f7022a", "#3e82fc", "gray", "gold"] # red # blue # gray #gold

# Assign labels to colors for plotting consistency
CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"])

Expand Down
39 changes: 35 additions & 4 deletions src/cgr_gwas_qc/workflow/scripts/plot_ancestry.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ def main(sample_qc: Path, outfile: Path):


def load_sample_data(sample_qc: Path) -> pd.DataFrame:
return sample_qc_table.read(sample_qc).dropna(subset=["EUR", "AFR", "ASN"])
return (
sample_qc_table.read(sample_qc)
.query("is_subject_representative")
.dropna(subset=["EUR", "AFR", "ASN"])
)


def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
Expand All @@ -42,24 +46,51 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None):
fig, tax = ternary.figure(scale=1) # Set scale 0 to 1
fig.set_size_inches(6, 5)

# Plot cases and controls separately
# Plot cases, controls, QC, and unknowns separately. Cases are given zorder=4 so they are drawn above the other groups and stay most visible
case = sample.query("case_control == 'Case'")
if case.shape[0] > 0:
case_color = CASE_CONTROL_COLORS[0]
tax.scatter(
case[["EUR", "AFR", "ASN"]].values, color=case_color, label="Case", **style_defaults
case[["EUR", "AFR", "ASN"]].values,
color=case_color,
label="Case",
zorder=4,
**style_defaults
)

control = sample.query("case_control == 'Control'")
if control.shape[0] > 0:
control_color = CASE_CONTROL_COLORS[1]
control_color = CASE_CONTROL_COLORS[1] # blue
tax.scatter(
control[["EUR", "AFR", "ASN"]].values,
color=control_color,
label="Control",
**style_defaults
)

# Issue 237: Also plot samples that are neither case nor control (QC and unknown).
project_qc = sample.query("case_control == 'QC'")
if project_qc.shape[0] > 0:
project_qc_color = CASE_CONTROL_COLORS[2] # gray
tax.scatter(
project_qc[["EUR", "AFR", "ASN"]].values,
color=project_qc_color,
label="QC",
**style_defaults
)

unknown = sample.query(
"case_control != 'Control' and case_control != 'Case' and case_control != 'QC'"
)
if unknown.shape[0] > 0:
unknown_color = CASE_CONTROL_COLORS[3] # gold
tax.scatter(
unknown[["EUR", "AFR", "ASN"]].values,
color=unknown_color,
label="Unknown",
**style_defaults
)

# Add plot elements
multiple = 0.1 # Our scale is 0 to 1 and we want 0.1 increments
tax.boundary(linewidth=0.5)
Expand Down
95 changes: 0 additions & 95 deletions src/cgr_gwas_qc/workflow/scripts/plot_ancestry_grafpop.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@

@app.command()
def main(qc_table: Path, het: Path, population: str, threshold: float, outfile: Path):

df = (
read_het(het)
.join(subject_qc_table.read(qc_table).set_index("Group_By_Subject_ID"), how="left")
Expand All @@ -50,13 +49,20 @@ def main(qc_table: Path, het: Path, population: str, threshold: float, outfile:
def plot(df: pd.DataFrame, population: str, threshold: float):
sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper

CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
x="x_label",
y="F",
data=df,
hue="case_control",
palette=COLORS,
palette=CASE_CONTROL_LABEL_COLORS,
ax=ax,
alpha=0.8,
linewidth=0,
Expand All @@ -67,7 +73,7 @@ def plot(df: pd.DataFrame, population: str, threshold: float):
ax.set_xlabel("Subjects sorted by F")
ax.set_ylabel("F")
ax.set_ylim(_get_ylim(df.F, threshold))
ax.set_title(f"{population} Homozygosity F Coefficient")
ax.set_title(f"{population} Heterozygosity F Coefficient")

# Move legend
plt.legend(loc="upper left")
Expand Down
9 changes: 8 additions & 1 deletion src/cgr_gwas_qc/workflow/scripts/plot_call_rate.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,16 @@ def plot_panel(
)

# Set basic defaults so I don't have to repeat myself
CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

style_defaults = dict(linewidth=0, alpha=0.8, s=5)
sample_defaults = {
**dict(hue="case_control", palette=CASE_CONTROL_COLORS, data=sample),
**dict(hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, data=sample),
**style_defaults,
}
snp_defaults = {**dict(data=snp, palette="gray"), **style_defaults}
Expand Down
28 changes: 18 additions & 10 deletions src/cgr_gwas_qc/workflow/scripts/plot_chrx_inbreeding.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,20 @@
import seaborn as sns
import typer

# import snakemake

from cgr_gwas_qc.reporting import CASE_CONTROL_COLORS
from cgr_gwas_qc.workflow.scripts import sample_qc_table

# import snakemake


app = typer.Typer(add_completion=False)


@app.command()
def main(sample_qc: Path, outfile: Path, xchr: str):
sample = load_sample_data(sample_qc)
xchr = str(snakemake.params) # type: ignore # noqa
plot(sample, outfile, xchr)
plot(sample, xchr, outfile)


"""
Expand Down Expand Up @@ -67,16 +68,23 @@ def _update_categories(sr: pd.DataFrame):
return sr


def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool = True):
def plot(sample: pd.DataFrame, xchr: str, outfile: Optional[os.PathLike] = None):
sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper

CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

# Create plots
style_defaults = dict(linewidth=0, alpha=0.8, s=2)
defaults = dict(x="expected_sex", y="X_inbreeding_coefficient", data=sample)
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(ax=ax, showfliers=False, **defaults)
sns.stripplot(
ax=ax, hue="case_control", palette=CASE_CONTROL_COLORS, **defaults, **style_defaults
ax=ax, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, **defaults, **style_defaults
)

# Make boxplot black and white
Expand All @@ -87,13 +95,13 @@ def plot(sample: pd.DataFrame, outfile: Optional[os.PathLike] = None, xchr: bool
# ax.set_xlabel("Reported Sex")
ax.set_ylabel("ChrX Inbreeding Coeff")

xchr = xchr.strip().lower() == "true"
print(type(xchr), " ", xchr)
if xchr:
print("sex chr included", xchr)
xchr_bool = xchr.strip().lower() == "true"
print(type(xchr_bool), " ", xchr_bool)
if xchr_bool:
print("sex chr included", xchr_bool)
ax.set_xlabel("Reported Sex")
else:
print("No sex chromosome ", xchr)
print("No sex chromosome ", xchr_bool)
ax.set_xlabel("No sex chromosome \nSkipping sex condordace")

# Add line at 0.5
Expand Down
9 changes: 8 additions & 1 deletion src/cgr_gwas_qc/workflow/scripts/plot_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,14 @@ def main(qc_table: Path, eigenvec: Path, population: str, outfile: Path):
def plot(df: pd.DataFrame, population: str) -> sns.PairGrid:
sns.set_context("paper") # use seaborn's context to make sane plot defaults for a paper

g = sns.PairGrid(df, hue="case_control", palette=COLORS, corner=True)
CASE_CONTROL_LABEL_COLORS = {
"Case": CASE_CONTROL_COLORS[0],
"Control": CASE_CONTROL_COLORS[1],
"QC": CASE_CONTROL_COLORS[2],
"Unknown": CASE_CONTROL_COLORS[3],
}

g = sns.PairGrid(df, hue="case_control", palette=CASE_CONTROL_LABEL_COLORS, corner=True)
g.map_lower(sns.scatterplot, s=10, alpha=0.8, linewidth=0)
g.map_diag(sns.kdeplot)
g.add_legend(
Expand Down
20 changes: 14 additions & 6 deletions src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,9 @@ def main(
)

add_qc_columns(
sample_qc, remove_contam, remove_rep_discordant,
sample_qc,
remove_contam,
remove_rep_discordant,
)
save(sample_qc, outfile)

Expand Down Expand Up @@ -320,7 +322,6 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
.. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/

"""

return (
pd.read_csv(file_name, sep="\t")
.assign(
Expand Down Expand Up @@ -402,7 +403,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram

if file_name is None:
return pd.DataFrame(
index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
index=Sample_IDs,
columns=["Contamination_Rate", "is_contaminated"],
).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})

return (
Expand Down Expand Up @@ -445,12 +447,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie


def add_qc_columns(
sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
sample_qc: pd.DataFrame,
remove_contam: bool,
remove_rep_discordant: bool,
) -> pd.DataFrame:
add_call_rate_flags(sample_qc)
_add_identifiler(sample_qc)
_add_analytic_exclusion(
sample_qc, remove_contam, remove_rep_discordant,
sample_qc,
remove_contam,
remove_rep_discordant,
)
_add_subject_representative(sample_qc)
_add_subject_dropped_from_study(sample_qc)
Expand Down Expand Up @@ -496,7 +502,9 @@ def reason_string(row: pd.Series) -> str:


def _add_analytic_exclusion(
sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
sample_qc: pd.DataFrame,
remove_contam: bool,
remove_rep_discordant: bool,
) -> pd.DataFrame:
"""Adds a flag to remove samples based on provided conditions.

Expand Down
Loading