Skip to content

Commit

Permalink
Merge branch 'annotations-new-features' of github.com:PMBio/deeprvat …
Browse files Browse the repository at this point in the history
…into annotations-new-features

Conflicts:
	deeprvat/annotations/annotations.py
  • Loading branch information
“Marcel-Mueck” committed Apr 5, 2024
2 parents f5a3c9c + 6d2bfc6 commit c2d659f
Showing 1 changed file with 24 additions and 15 deletions.
39 changes: 24 additions & 15 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,7 @@ def deepripe_encode_variant_bedline(bedline, genomefasta, flank_size=75):

return encoded_seqs


def readYamlColumns(annotation_columns_yaml_file):
with open(annotation_columns_yaml_file, "r") as fd:
config = yaml.safe_load(fd)
Expand All @@ -569,12 +570,13 @@ def readYamlColumns(annotation_columns_yaml_file):
column_name_mapping = dict(zip(prior_names, post_names))
fill_value_mapping = dict(zip(post_names, fill_vals))
return prior_names, post_names, fill_vals, column_name_mapping, fill_value_mapping



def get_parquet_columns(parquet_file):
    """
    Return the columns reported for a parquet file.

    Parameters:
    - parquet_file: Handed unchanged to ``ParquetFile``; presumably a path to
      a parquet file (callers in this module pass file paths) -- confirm
      against the ``ParquetFile`` import, which is not visible here.

    Returns:
    - The ``columns`` attribute of the opened ``ParquetFile`` object.
    """
    # Only the column listing is needed, so the ParquetFile handle is not kept.
    return ParquetFile(parquet_file).columns


@click.group()
def cli():
Expand Down Expand Up @@ -1293,7 +1295,9 @@ def merge_deepripe(
@click.argument("deepripe_pca_file", type=click.Path(exists=True))
@click.argument("column_yaml_file", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
def merge_deepsea_pcas(annotation_file: str, deepripe_pca_file: str, column_yaml_file:str, out_file: str):
def merge_deepsea_pcas(
annotation_file: str, deepripe_pca_file: str, column_yaml_file: str, out_file: str
):
"""
Merge deepRiPe PCA scores with an annotation file and save the result.
Expand All @@ -1315,17 +1319,19 @@ def merge_deepsea_pcas(annotation_file: str, deepripe_pca_file: str, column_yaml
Example:
$ python annotations.py merge_deepsea_pcas annotations.parquet deepripe_pca_scores.parquet merged_deepsea_pcas.parquet
"""

pcols = get_parquet_columns(deepripe_pca_file)
anno_cols = get_parquet_columns(annotation_file)
logger.info("reading current annotations")
prior_names,*_ = readYamlColumns(column_yaml_file)
prior_names, *_ = readYamlColumns(column_yaml_file)

DScommonCols = list(set(prior_names).intersection(set(pcols)))
AnnoCommonCols = list(set(prior_names).intersection(set(anno_cols)))
annotations = pd.read_parquet(annotation_file, columns = AnnoCommonCols+['chrom','pos', 'ref','alt','id', 'Gene'])
logger.info("reading PCAs")
deepripe_pcas = pd.read_parquet(deepripe_pca_file, columns=DScommonCols+['chrom','pos', 'ref','alt','id'])
deepripe_pcas = pd.read_parquet(
deepripe_pca_file, columns=DScommonCols + ["chrom", "pos", "ref", "alt", "id"]
)
deepripe_pcas = deepripe_pcas.drop_duplicates(
subset=["chrom", "pos", "ref", "alt", "id"]
)
Expand All @@ -1350,7 +1356,6 @@ def merge_deepsea_pcas(annotation_file: str, deepripe_pca_file: str, column_yaml
merged.to_parquet(out_file)



@cli.command()
@click.argument("in_variants", type=click.Path(exists=True))
@click.argument("out_variants", type=click.Path())
Expand Down Expand Up @@ -1599,7 +1604,7 @@ def concatenate_deepsea(
$ python annotations.py concatenate_deepSEA chr1_block0.CLI.deepseapredict.diff.tsv,chr1_block1.CLI.deepseapredict.diff.tsv,chr1_block2.CLI.deepseapredict.diff.tsv concatenated_output.parquet 4
"""

file_paths = deepsea_files.split(',')
file_paths = deepsea_files.split(",")
logger.info("check if out_file already exists")
if os.path.exists(out_file):
logger.info("file exists, removing existing file")
Expand Down Expand Up @@ -1748,7 +1753,9 @@ def process_deepripe(deepripe_df: pd.DataFrame, column_prefix: str) -> pd.DataFr
return deepripe_df


def process_vep(vep_file: pd.DataFrame, vcf_file : str, vepcols_to_retain: list = []) -> pd.DataFrame:
def process_vep(
vep_file: pd.DataFrame, vcf_file: str, vepcols_to_retain: list = []
) -> pd.DataFrame:
"""
Process the VEP DataFrame, extracting relevant columns and handling data types.
Expand All @@ -1762,9 +1769,11 @@ def process_vep(vep_file: pd.DataFrame, vcf_file : str, vepcols_to_retain: list
Example:
vep_file = process_vep(vep_file, vepcols_to_retain=["additional_col1", "additional_col2"])
"""
vcf_df = pd.read_table(vcf_file, names=['chrom', 'pos', '#Uploaded_variation', 'ref', 'alt'])
vcf_df = pd.read_table(
vcf_file, names=["chrom", "pos", "#Uploaded_variation", "ref", "alt"]
)
if "#Uploaded_variation" in vep_file.columns:
vep_file = vep_file.merge(vcf_df, on = "#Uploaded_variation")
vep_file = vep_file.merge(vcf_df, on="#Uploaded_variation")

if "pos" in vep_file.columns:
vep_file["pos"] = vep_file["pos"].astype(int)
Expand Down Expand Up @@ -1902,7 +1911,6 @@ def process_vep(vep_file: pd.DataFrame, vcf_file : str, vepcols_to_retain: list
return vep_file



@cli.command()
@click.argument("filenames", type=str)
@click.argument("out_file", type=click.Path())
Expand All @@ -1924,7 +1932,7 @@ def concat_annotations(
concat_annotations "annotations/chr1_block0_merged.parquet,annotations/chr1_block1_merged.parquet,annotations/chr1_block2_merged.parquet " "output.parquet")
"""
logger.info("reading pvcf block file")
file_paths = filenames.split(',')
file_paths = filenames.split(",")
for f in tqdm(file_paths):
logger.info(f"processing file {f}")
file = pd.read_parquet(f)
Expand Down Expand Up @@ -2092,11 +2100,12 @@ def select_rename_fill_annotations(
- out_file (str): Path to save the modified annotations file.
"""


logger.info(
f"reading in yaml file containing name and fill value mappings from {annotation_columns_yaml_file}"
)
prior_names, _, _, column_name_mapping, fill_value_mapping = readYamlColumns(annotation_columns_yaml_file)
prior_names, _, _, column_name_mapping, fill_value_mapping = readYamlColumns(
annotation_columns_yaml_file
)
key_cols = ["id", "gene_id"]
anno_df = pd.read_parquet(
annotations_path, columns=list(set(prior_names + key_cols))
Expand Down

0 comments on commit c2d659f

Please sign in to comment.