Skip to content

Commit

Permalink
Merge pull request #681 from biolink/fix_isoforms
Browse files Browse the repository at this point in the history
swaps subject with gene when subject is PR isoform
  • Loading branch information
sierra-moxon authored Jul 24, 2024
2 parents df8f688 + b76b665 commit 0b9336a
Show file tree
Hide file tree
Showing 6 changed files with 4,845 additions and 4,728 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ poetry.lock
pip-log.txt
pip-delete-this-directory.txt

groups/

# Unit test / coverage reports
htmlcov/
.tox/
Expand Down
6 changes: 4 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,17 @@ travis_test:
tests/test_goassociation_model.py tests/test_relations.py \
tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
tests/test_collections.py \
tests/test_gocamgen.py; \
tests/test_gocamgen.py \
tests/test_gpi_isoform_replacement.py; \
else \
pytest tests/test_*local*.py tests/test_*parse*.py tests/test*writer*.py tests/test_qc.py \
tests/test_rdfgen.py tests/test_phenosim_engine.py tests/test_ontol.py \
tests/test_validation_rules.py tests/unit/test_annotation_scorer.py \
tests/test_goassociation_model.py tests/test_relations.py \
tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
tests/test_collections.py \
tests/test_gocamgen.py; \
tests/test_gocamgen.py \
tests/test_gpi_isoform_replacement.py; \
fi

cleandist:
Expand Down
4 changes: 2 additions & 2 deletions bin/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ To test validate.py "validate" command, the command that produces the final GPAD

```bash
poetry install
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset mgi MGI
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset goa_chicken goa
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset mgi MGI --gpad-gpi-output-version 2.0
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset goa_chicken goa --gpad-gpi-output-version 2.0
```


Expand Down
135 changes: 109 additions & 26 deletions bin/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,14 @@
import click
import json
import os
import yaml
import requests
import gzip
import urllib
import shutil
import re
import glob
import logging
import sys
import traceback
from typing import Dict, List
import yamldown

from functools import wraps

# from ontobio.util.user_agent import get_user_agent
from ontobio.model.association import Curie, ExtensionUnit
from ontobio.io.entityparser import GpiParser
from ontobio.ontol_factory import OntologyFactory
from ontobio.io.gafparser import GafParser
from ontobio.io.gpadparser import GpadParser
Expand All @@ -34,7 +26,6 @@
from ontobio.validation import tools
from ontobio.validation import rules


from typing import Dict, Set

# logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.WARNING)
Expand Down Expand Up @@ -224,7 +215,8 @@ def create_parser(config, group, dataset, format="gaf"):

@tools.gzips
def produce_gaf(dataset, source_gaf, ontology_graph, gpipaths=None, paint=False, group="unknown", rule_metadata=None,
goref_metadata=None, ref_species_metadata=None, db_type_name_regex_id_syntax=None, retracted_pub_set=None, db_entities=None, group_idspace=None,
goref_metadata=None, ref_species_metadata=None, db_type_name_regex_id_syntax=None,
retracted_pub_set=None, db_entities=None, group_idspace=None,
format="gaf", suppress_rule_reporting_tags=[], annotation_inferences=None, group_metadata=None,
extensions_constraints=None, rule_contexts=[], gaf_output_version="2.2",
rule_set=assocparser.RuleSet.ALL) -> list[str]:
Expand Down Expand Up @@ -613,7 +605,8 @@ def cli(ctx, verbose):
@click.option("--only-dataset", default=None)
@click.option("--gaf-output-version", default="2.2", type=click.Choice(["2.1", "2.2"]))
@click.option("--rule-set", "-l", "rule_set", default=[assocparser.RuleSet.ALL], multiple=True)
@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False, help="Path to retracted publications file")
@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False,
help="Path to retracted publications file")
def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target, ontology, exclude, base_download_url,
suppress_rule_reporting_tag, skip_existing_files, gaferencer_file, only_dataset, gaf_output_version,
rule_set, retracted_pub_set):
Expand Down Expand Up @@ -676,7 +669,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target

db_entities = metadata.database_entities(absolute_metadata)
group_ids = metadata.groups(absolute_metadata)
extensions_constraints = metadata.extensions_constraints_file(absolute_metadata)
extensions_constraints = metadata.extensions_constraints_file(absolute_metadata)

gaferences = None
if gaferencer_file:
Expand All @@ -685,21 +678,20 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
# Default comes through as single-element tuple
if rule_set == (assocparser.RuleSet.ALL,):
rule_set = assocparser.RuleSet.ALL

db_type_name_regex_id_syntax = metadata.database_type_name_regex_id_syntax(absolute_metadata)

retracted_pubs = None
if retracted_pub_set:
retracted_pubs = metadata.retracted_pub_set(retracted_pub_set)
else:
retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata)
retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata)

for dataset_metadata, source_gaf in downloaded_gaf_sources:
dataset = dataset_metadata["dataset"]
# Set paint to True when the group is "paint".
# This will prevent filtering of IBA (GO_RULE:26) when paint is being treated as a top level group,
# like for paint_other.
click.echo("source_gaf: {}".format(source_gaf))
click.echo("Producing GAF by passing through validation rules... {}".format(dataset))
valid_gaf = produce_gaf(dataset, source_gaf, ontology_graph,
paint=(group == "paint"),
group=group,
Expand All @@ -719,10 +711,14 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
rule_set=rule_set
)[0]

click.echo("Producing GPI from GAF files...")
gpi = produce_gpi(dataset, absolute_target, valid_gaf, ontology_graph, gpad_gpi_output_version)

gpi_list = [gpi]
# Try to find other GPIs in metadata and merge

matching_gpi_path = None
click.echo("Try to find other GPIs in metadata and merge...")

for ds in group_metadata["datasets"]:
# Where type=GPI for the same dataset (e.g. "zfin", "goa_cow")
if ds["type"] == "gpi" and ds["dataset"] == dataset and ds.get("source"):
Expand All @@ -732,6 +728,9 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
matching_gpi_path = unzip_simple(matching_gpi_path)
gpi_list.append(matching_gpi_path)

click.echo("Found the matching gpi path...{}".format(matching_gpi_path))

click.echo("Downloading the noctua and paint GPAD files...")
noctua_gpad_src = check_and_download_mixin_source(noctua_metadata, group_metadata["id"], dataset, target,
base_download_url=base_download_url,
replace_existing_files=not skip_existing_files)
Expand All @@ -740,6 +739,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
replace_existing_files=not skip_existing_files)
if paint_metadata else None)

click.echo("Executing 'make_gpads' in validate.produce with all the assembled GAF files...")
make_gpads(dataset, valid_gaf, products,
ontology_graph, noctua_gpad_src, paint_gaf_src,
gpi, gpad_gpi_output_version)
Expand All @@ -750,9 +750,92 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
rule_metadata=rule_metadata, replace_existing_files=not skip_existing_files,
gaf_output_version=gaf_output_version)

click.echo(end_gaf)
click.echo("Executing the isoform fixing step in validate.produce...")
# run the resulting gaf through one last parse and replace, to handle the isoforms
# see: https://github.com/geneontology/go-site/issues/2291
output_gaf_path = os.path.join(os.path.split(end_gaf)[0], "{}.gaf".format(dataset))
isoform_fixed_gaf = fix_pro_isoforms_in_gaf(end_gaf, matching_gpi_path, ontology_graph, output_gaf_path)
click.echo(isoform_fixed_gaf)

make_ttls(dataset, end_gaf, products, ontology_graph)
click.echo("Creating ttl files...")
make_ttls(dataset, isoform_fixed_gaf, products, ontology_graph)


def fix_pro_isoforms_in_gaf(gaf_file_to_fix: str, gpi_file: str, ontology_graph, output_file_path: str) -> str:
    """
    Given a GAF file and a GPI file, fix the GAF file by converting isoform annotations to gene annotations.

    Each association whose subject is a PR (Protein Ontology) isoform has its subject replaced by the
    encoding gene taken from the GPI file's ``encoded_by`` field; the subject's full_name, synonyms,
    label, and type are reset to the gene's GPI values, and the original isoform is stored back in the
    subject_extensions collection ("Column 17") with an RO:0002327 relation.

    See: https://github.com/geneontology/go-site/issues/2291

    :param gaf_file_to_fix: The path to the GAF file to fix
    :param gpi_file: The path to the GPI file
    :param ontology_graph: The ontology graph to use for parsing the associations
    :param output_file_path: The path to write the fixed GAF file to
    :return: The path to the fixed GAF file (i.e. ``output_file_path``)
    """
    fixed_associations = []
    gpiparser = GpiParser(config=assocparser.AssocParserConfig(ontology=ontology_graph))
    # Parse the GPI file, creating a map of identifiers to GPI entries
    gpis = gpiparser.parse(gpi_file, None)
    gpi_map = {}
    for gpi_entry in gpis:
        gpi_map[gpi_entry.get('id')] = {"encoded_by": gpi_entry.get('encoded_by'),
                                        "full_name": gpi_entry.get('full_name'),
                                        "label": gpi_entry.get('label'),
                                        "synonyms": gpi_entry.get('synonyms'),
                                        # GPI spec says this is single valued, but GpiParser returns this as a list.
                                        "type": gpi_entry.get('type')[0],
                                        "id": gpi_entry.get('id')}

    gafparser = GafParser(config=assocparser.AssocParserConfig(ontology=ontology_graph))

    # these are statistic parameters that record when a substitution is made.
    substitution_count = 0
    no_substitution_count = 0

    # Open the output explicitly so the file handle is closed (and flushed) on exit;
    # previously the handle opened inline in the GafWriter(...) call was never closed.
    with open(output_file_path, "w") as output_file:
        gafwriter = GafWriter(file=output_file, source="test", version=gafparser.version)

        with open(gaf_file_to_fix, "r") as file:
            for line in file:
                annotation = gafparser.parse_line(line)
                for source_assoc in annotation.associations:
                    if isinstance(source_assoc, dict):
                        continue  # skip the header
                    # NOTE(review): startswith("PR") would also match any other namespace that
                    # begins with "PR"; kept as-is pending confirmation that only "PR"
                    # isoform identifiers occur in practice.
                    if source_assoc.subject.id.namespace.startswith("PR"):
                        full_old_identifier = source_assoc.subject.id.namespace + ":" + source_assoc.subject.id.identity
                        old_namespace = source_assoc.subject.id.namespace
                        old_identity = source_assoc.subject.id.identity
                        gene_info = gpi_map.get(full_old_identifier)
                        if gene_info is None:
                            # No GPI entry for this isoform: nothing to map back to a gene,
                            # so leave the association untouched rather than raising KeyError.
                            no_substitution_count += 1
                            fixed_associations.append(source_assoc)
                            continue
                        # TODO: right now we get the FIRST encoded_by result -- this is what the original script from Chris did??
                        encoded_by_parts = gene_info.get("encoded_by")[0].split(":")
                        if encoded_by_parts[0] == "MGI":
                            # MGI identifiers are doubled ("MGI:MGI:12345"), so re-attach the
                            # inner "MGI:" prefix to the identity portion.
                            source_assoc.subject.id.namespace = encoded_by_parts[0]
                            source_assoc.subject.id.identity = "MGI:" + encoded_by_parts[2]
                        else:
                            source_assoc.subject.id.namespace = encoded_by_parts[0]
                            source_assoc.subject.id.identity = encoded_by_parts[1]
                        full_new_identifier = source_assoc.subject.id.namespace + ":" + source_assoc.subject.id.identity
                        source_assoc.subject.full_name = gpi_map[full_new_identifier].get("full_name")
                        source_assoc.subject.label = gpi_map[full_new_identifier].get("label")
                        source_assoc.subject.synonyms = gpi_map[full_new_identifier].get("synonyms")
                        source_assoc.subject.type = gpi_map[full_new_identifier].get("type")

                        # we need to put the isoform currently being swapped, back into "Column 17"
                        # which is a subject_extension member.
                        # BUGFIX: namespace and identity were previously passed swapped.
                        isoform_term = Curie(namespace=old_namespace, identity=old_identity)
                        isoform_relation = Curie(namespace="RO", identity="0002327")
                        new_subject_extension = ExtensionUnit(relation=isoform_relation, term=isoform_term)
                        source_assoc.subject_extensions.append(new_subject_extension)

                        # count the substitution here for reporting later
                        substitution_count += 1
                    else:
                        no_substitution_count += 1

                    fixed_associations.append(source_assoc)

        # Write while the output handle is still open.
        gafwriter.write(fixed_associations)

    click.echo(f"Substituted {substitution_count} entries in {gaf_file_to_fix} "
               f"and left {no_substitution_count} entries unchanged.")
    # BUGFIX: the function previously returned None despite the documented contract;
    # the caller passes this return value on to make_ttls().
    return output_file_path


@cli.command()
Expand Down Expand Up @@ -808,14 +891,14 @@ def paint(group, dataset, metadata, target, ontology):
absolute_target = os.path.abspath(target)
os.makedirs(os.path.join(absolute_target, "groups"), exist_ok=True)
paint_metadata = metadata.dataset_metadata_file(absolute_metadata, "paint")

paint_src_gaf = check_and_download_mixin_source(paint_metadata, dataset, absolute_target)

click.echo("Loading ontology: {}...".format(ontology))
ontology_graph = OntologyFactory().create(ontology)

gpi_path = os.path.join(absolute_target, "groups", dataset, "{}.gpi".format(dataset))
click.echo("Using GPI at {}".format(gpi_path))
click.echo("Using GPI at {}".format(gpi_path))
paint_gaf = produce_gaf("paint_{}".format(dataset), paint_src_gaf, ontology_graph, gpipath=gpi_path)


Expand All @@ -825,7 +908,8 @@ def paint(group, dataset, metadata, target, ontology):
@click.option("--ontology", type=click.Path(), required=True)
@click.option("--gaferencer-file", "-I", type=click.Path(exists=True), default=None, required=False,
help="Path to Gaferencer output to be used for inferences")
@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False, help="Path to retracted publications file")
@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False,
help="Path to retracted publications file")
def rule(metadata_dir, out, ontology, gaferencer_file, retracted_pub_set):
absolute_metadata = os.path.abspath(metadata_dir)

Expand All @@ -840,8 +924,7 @@ def rule(metadata_dir, out, ontology, gaferencer_file, retracted_pub_set):
if retracted_pub_set:
retracted_pubs = metadata.retracted_pub_set(retracted_pub_set)
else:
retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata)

retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata)

click.echo("Found {} GO Rules".format(len(gorule_metadata.keys())))

Expand Down
Loading

0 comments on commit 0b9336a

Please sign in to comment.