Skip to content

Commit

Permalink
Fix multiple merge problems from example run
Browse files Browse the repository at this point in the history
  • Loading branch information
endast committed Oct 18, 2023
1 parent e65e2c6 commit c7a21dc
Showing 1 changed file with 13 additions and 4 deletions.
17 changes: 13 additions & 4 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,12 +783,14 @@ def get_abscores(
all_absplice_scores = pd.read_parquet(ab_splice_agg_score_file)

all_absplice_scores = all_absplice_scores[
["chrom", "pos", "ref", "alt", "gene_id", "AbSplice_DNA"]
["chrom", "pos", "ref", "alt", "gene_id", "AbSplice_DNA", "id"]
]

annotations = pd.read_parquet(current_annotation_file, engine="pyarrow").drop(
columns=["AbSplice_DNA"], errors="ignore"
)
annotations.drop_duplicates(inplace=True,subset=["chrom", "pos", "ref", "alt", "gene_id", "id"])

original_len = len(annotations)

logger.info("Merging")
Expand All @@ -797,12 +799,12 @@ def get_abscores(
all_absplice_scores,
validate="1:1",
how="left",
on=["chrom", "pos", "ref", "alt", "gene_id"],
on=["chrom", "pos", "ref", "alt", "gene_id", "id"],
)

logger.info("Sanity checking merge")
assert len(merged) == original_len
assert merged["censequence_id"].unique().shape[0] == len(merged)
assert len(merged[["gene_id", "id"]].drop_duplicates()) == len(merged)

logger.info(
f'Filling {merged["AbSplice_DNA"].isna().sum()} '
Expand Down Expand Up @@ -904,8 +906,11 @@ def merge_deepsea_pcas(annotation_file: str, deepripe_pca_file: str, out_file: s
deepripe_pcas = pd.read_parquet(deepripe_pca_file)
orig_len = len(annotations)
merged = annotations.merge(
deepripe_pcas, how="left", on=["chrom", "pos", "ref", "alt"]
deepripe_pcas, how="left", on=["chrom", "pos", "ref", "alt", "id"]
)

merged.rename(columns={"Gene": "gene_id"}, inplace=True)

assert len(merged) == orig_len
merged.to_parquet(out_file)

Expand Down Expand Up @@ -1074,6 +1079,10 @@ def merge_annotations(vep_header_line:int,
logger.info(f"reading in {variant_file}")
variants = pd.read_csv(variant_file, sep="\t")

# Variant chromosome names may contain "chr" (e.g. "chr1"); remove it so they can be merged on "chrom"
# TODO: Check whether this is always the case
variants["chrom"] = variants["chrom"].str.replace("chr","")

# Merge the VEP table with the variants (many-to-one)
ca = vep_df.merge(variants, how = "left", on=["chrom", "pos", "ref", "alt"], validate= "m:1")
del vep_df
Expand Down

0 comments on commit c7a21dc

Please sign in to comment.