Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bugfix: Convert missing ancestry to Other category for grafpop output (Issue 326) #331

Merged
merged 2 commits into from
Sep 26, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 28 additions & 18 deletions src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,9 @@ def main(
)

add_qc_columns(
sample_qc, remove_contam, remove_rep_discordant,
sample_qc,
remove_contam,
remove_rep_discordant,
)

sample_qc = sample_qc.rename(
Expand Down Expand Up @@ -328,19 +330,20 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
.. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/

"""
return (
pd.read_csv(file_name, sep="\t")
.assign(
Sample_ID=lambda x: x["Subject"].astype(str)
) # Issue 216: reindex fails when subject IDs are numeric, so cast Subject to str to keep the Sample_ID index string-typed.
.assign(Ancestry=lambda x: x["Computed population"].str.replace(" ", "_"))
.assign(AFR=lambda x: x["P_f (%)"] / 100)
.assign(EUR=lambda x: x["P_e (%)"] / 100)
.assign(ASN=lambda x: x["P_a (%)"] / 100)
.set_index("Sample_ID")
.loc[:, ("AFR", "EUR", "ASN", "Ancestry")]
.reindex(Sample_IDs)
)

graf = pd.read_csv(file_name, sep="\t")
graf = graf.assign(
Sample_ID=lambda x: x["Subject"].astype(str)
) # Issue 216: reindex fails when subject IDs are numeric, so cast Subject to str to keep the Sample_ID index string-typed.
graf = graf.assign(Ancestry=lambda x: x["Computed population"].str.replace(" ", "_"))
graf = graf.assign(AFR=lambda x: x["P_f (%)"] / 100)
graf = graf.assign(EUR=lambda x: x["P_e (%)"] / 100)
graf = graf.assign(ASN=lambda x: x["P_a (%)"] / 100)
graf = graf.set_index("Sample_ID")
graf = graf.loc[:, ("AFR", "EUR", "ASN", "Ancestry")]
graf["Ancestry"] = graf["Ancestry"].fillna("Other")
graf = graf.reindex(Sample_IDs)
return graf


def _read_SNPweights(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
Expand Down Expand Up @@ -411,7 +414,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFram

if file_name is None:
return pd.DataFrame(
index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
index=Sample_IDs,
columns=["Contamination_Rate", "is_contaminated"],
).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})

return (
Expand Down Expand Up @@ -454,12 +458,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Serie


def add_qc_columns(
sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
sample_qc: pd.DataFrame,
remove_contam: bool,
remove_rep_discordant: bool,
) -> pd.DataFrame:
add_call_rate_flags(sample_qc)
_add_identifiler(sample_qc)
_add_analytic_exclusion(
sample_qc, remove_contam, remove_rep_discordant,
sample_qc,
remove_contam,
remove_rep_discordant,
)
_add_subject_representative(sample_qc)
_add_subject_dropped_from_study(sample_qc)
Expand Down Expand Up @@ -505,7 +513,9 @@ def reason_string(row: pd.Series) -> str:


def _add_analytic_exclusion(
sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
sample_qc: pd.DataFrame,
remove_contam: bool,
remove_rep_discordant: bool,
) -> pd.DataFrame:
"""Adds a flag to remove samples based on provided conditions.

Expand Down
Loading