diff --git a/bin/ontobio-parse-assocs.py b/bin/ontobio-parse-assocs.py
index beece86b..5f5f9f52 100755
--- a/bin/ontobio-parse-assocs.py
+++ b/bin/ontobio-parse-assocs.py
@@ -81,7 +81,9 @@ def main():
     parser.add_argument("-g", "--gpi", type=str, required=False, default=None,
                         help="GPI file")
     parser.add_argument("-m", "--metadata_dir", type=dir_path, required=False,
-                        help="Path to metadata directory")
+                        help="Path to metadata directory")
+    parser.add_argument("--retracted_pub_set", type=argparse.FileType('r'), required=False,
+                        help="Path to retracted publications file")
     parser.add_argument("-l", "--rule", action="append", required=None, default=[], dest="rule_set",
                         help="Set of rules to be run. Default is no rules to be run, with the exception \
                             of gorule-0000027 and gorule-0000020. See command line documentation in the \
@@ -144,11 +146,20 @@ def main():
         rule_set = assocparser.RuleSet.ALL
 
     goref_metadata = None
-    ref_species_metadata = None
+    ref_species_metadata = None
     if args.metadata_dir:
         absolute_metadata = os.path.abspath(args.metadata_dir)
         goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs"))
         ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id")
+
+    retracted_pub_set = None
+    if args.retracted_pub_set:
+        # argparse.FileType has already opened this file, but only its path is
+        # used below; release the handle instead of leaking it.
+        args.retracted_pub_set.close()
+        retracted_pub_set = metadata.retracted_pub_set(args.retracted_pub_set.name)
+    elif args.metadata_dir:
+        retracted_pub_set = metadata.retracted_pub_set_from_meta(absolute_metadata)
 
     # set configuration
     filtered_evidence_file = open(args.filtered_file, "w") if args.filtered_file else None
@@ -165,6 +176,7 @@ def main():
         gpi_authority_path=args.gpi,
         goref_metadata=goref_metadata,
         ref_species_metadata=ref_species_metadata,
+        retracted_pub_set=retracted_pub_set,
         rule_set=rule_set
     )
     p = None
diff --git a/bin/validate.py b/bin/validate.py
index 0eb8ff83..f604a6c3 100755
--- a/bin/validate.py
+++ b/bin/validate.py
@@ -34,6 +34,7 @@ from ontobio.validation import tools
 from ontobio.validation import rules
 
 
+from typing import Dict, Set
 
 # logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.WARNING)
 
@@ -223,7 +224,7 @@ def create_parser(config, group, dataset, format="gaf"):
 
 @tools.gzips
 def produce_gaf(dataset, source_gaf, ontology_graph, gpipaths=None, paint=False, group="unknown", rule_metadata=None,
-                goref_metadata=None, ref_species_metadata=None, db_entities=None, group_idspace=None,
+                goref_metadata=None, ref_species_metadata=None, retracted_pub_set=None, db_entities=None, group_idspace=None,
                 format="gaf", suppress_rule_reporting_tags=[], annotation_inferences=None, group_metadata=None,
                 extensions_constraints=None, rule_contexts=[], gaf_output_version="2.2",
                 rule_set=assocparser.RuleSet.ALL) -> list[str]:
@@ -237,6 +238,7 @@ def produce_gaf(dataset, source_gaf, ontology_graph, gpipaths=None, paint=False,
         rule_metadata=rule_metadata,
         goref_metadata=goref_metadata,
         ref_species_metadata=ref_species_metadata,
+        retracted_pub_set=retracted_pub_set,
         entity_idspaces=db_entities,
         group_idspace=group_idspace,
         suppress_rule_reporting_tags=suppress_rule_reporting_tags,
@@ -610,9 +612,35 @@ def cli(ctx, verbose):
 @click.option("--only-dataset", default=None)
 @click.option("--gaf-output-version", default="2.2", type=click.Choice(["2.1", "2.2"]))
 @click.option("--rule-set", "-l", "rule_set", default=[assocparser.RuleSet.ALL], multiple=True)
+@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False, help="Path to retracted publications file")
 def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target, ontology, exclude, base_download_url,
             suppress_rule_reporting_tag, skip_existing_files, gaferencer_file, only_dataset, gaf_output_version,
-            rule_set):
+            rule_set, retracted_pub_set):
+    """
+    Produce GAF, GPI, and TTL files for a group.
+
+    This command will download the GAF files for a group, validate them, and then produce GPI and TTL files.
+
+    \f
+    :param ctx: Click context
+    :param group: The group to produce files for
+    :param metadata_dir: The directory containing the metadata files
+    :param gpad: Produce GPAD files
+    :param gpad_gpi_output_version: The version of the GPAD and GPI files to produce
+    :param ttl: Produce TTL files
+    :param target: The directory to put the files in
+    :param ontology: The ontology to use for validation
+    :param exclude: Datasets to exclude
+    :param base_download_url: The base URL to download files from
+    :param suppress_rule_reporting_tag: Tags to suppress in the rule reporting
+    :param skip_existing_files: Skip downloading files that already exist
+    :param gaferencer_file: The path to the Gaferencer output file
+    :param only_dataset: Only process a single dataset
+    :param gaf_output_version: The version of the GAF files to produce
+    :param rule_set: The rule set to use
+    :param retracted_pub_set: The path to the retracted publications file
+    """
+    # Click uses the docstring as --help text; the \f above stops it before the :param list.
     logger.info("Logging is verbose")
     products = {
         "gaf": True,
@@ -650,7 +678,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
 
     db_entities = metadata.database_entities(absolute_metadata)
     group_ids = metadata.groups(absolute_metadata)
-    extensions_constraints = metadata.extensions_constraints_file(absolute_metadata)
+    extensions_constraints = metadata.extensions_constraints_file(absolute_metadata)
 
     gaferences = None
     if gaferencer_file:
@@ -660,6 +688,12 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
     if rule_set == (assocparser.RuleSet.ALL,):
         rule_set = assocparser.RuleSet.ALL
 
+    retracted_pubs = None
+    if retracted_pub_set:
+        retracted_pubs = metadata.retracted_pub_set(retracted_pub_set)
+    else:
+        retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata)
+
     for dataset_metadata, source_gaf in downloaded_gaf_sources:
         dataset = dataset_metadata["dataset"]
         # Set paint to True when the group is "paint".
@@ -772,13 +806,14 @@ def paint(group, dataset, metadata, target, ontology):
     absolute_target = os.path.abspath(target)
     os.makedirs(os.path.join(absolute_target, "groups"), exist_ok=True)
     paint_metadata = metadata.dataset_metadata_file(absolute_metadata, "paint")
+
     paint_src_gaf = check_and_download_mixin_source(paint_metadata, dataset, absolute_target)
 
     click.echo("Loading ontology: {}...".format(ontology))
     ontology_graph = OntologyFactory().create(ontology)
 
     gpi_path = os.path.join(absolute_target, "groups", dataset, "{}.gpi".format(dataset))
-    click.echo("Using GPI at {}".format(gpi_path))
+    click.echo("Using GPI at {}".format(gpi_path))
     paint_gaf = produce_gaf("paint_{}".format(dataset), paint_src_gaf, ontology_graph, gpipath=gpi_path)
 
 
@@ -788,7 +823,8 @@ def paint(group, dataset, metadata, target, ontology):
 @click.option("--ontology", type=click.Path(), required=True)
 @click.option("--gaferencer-file", "-I", type=click.Path(exists=True), default=None, required=False,
               help="Path to Gaferencer output to be used for inferences")
-def rule(metadata_dir, out, ontology, gaferencer_file):
+@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False, help="Path to retracted publications file")
+def rule(metadata_dir, out, ontology, gaferencer_file, retracted_pub_set):
     absolute_metadata = os.path.abspath(metadata_dir)
 
     click.echo("Loading ontology: {}...".format(ontology))
@@ -797,6 +833,12 @@ def rule(metadata_dir, out, ontology, gaferencer_file):
     goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs"))
     gorule_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "rules"))
     ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id")
+    retracted_pubs = None
+    if retracted_pub_set:
+        retracted_pubs = metadata.retracted_pub_set(retracted_pub_set)
+    else:
+        retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata)
+
 
     click.echo("Found {} GO Rules".format(len(gorule_metadata.keys())))
 
@@ -811,6 +853,7 @@ def rule(metadata_dir, out, ontology, gaferencer_file):
         ontology=ontology_graph,
         goref_metadata=goref_metadata,
         ref_species_metadata=ref_species_metadata,
+        retracted_pub_set=retracted_pubs,
         entity_idspaces=db_entities,
         group_idspace=group_ids,
         annotation_inferences=gaferences,
diff --git a/ontobio/__init__.py b/ontobio/__init__.py
index e76ee76b..430adbb8 100644
--- a/ontobio/__init__.py
+++ b/ontobio/__init__.py
@@ -1,6 +1,6 @@
 from __future__ import absolute_import
 
-__version__ = '2.8.24'
+__version__ = '2.8.25'
 
 from .ontol_factory import OntologyFactory
 
diff --git a/ontobio/io/assocparser.py b/ontobio/io/assocparser.py
index 8f00c72e..bf0a490e 100644
--- a/ontobio/io/assocparser.py
+++ b/ontobio/io/assocparser.py
@@ -234,6 +234,7 @@ def __init__(self,
                  ref_species_metadata=None,
                  group_metadata=None,
                  dbxrefs=None,
+                 retracted_pub_set=None,
                  suppress_rule_reporting_tags=[],
                  annotation_inferences=None,
                  extensions_constraints=None,
@@ -258,6 +259,7 @@ def __init__(self,
         self.goref_metadata = goref_metadata
         self.ref_species_metadata = ref_species_metadata
         self.group_metadata = group_metadata
+        self.retracted_pub_set = retracted_pub_set
         self.suppress_rule_reporting_tags = suppress_rule_reporting_tags
         self.annotation_inferences = annotation_inferences
         self.entity_idspaces = entity_idspaces
diff --git a/ontobio/io/qc.py b/ontobio/io/qc.py
index f15e73f0..f08668a2 100644
--- a/ontobio/io/qc.py
+++ b/ontobio/io/qc.py
@@ -421,6 +421,22 @@ def test(self, annotation: association.GoAssociation, config: assocparser.AssocP
                 return self._result(bool(withfrom))
             else:
                 return self._result(True)
+
+class GoRule22(GoRule):
+    """Fail annotations whose supporting references include an ID found in config.retracted_pub_set."""
+
+    def __init__(self):
+        super().__init__("GORULE:0000022", "Check for, and filter, annotations made to retracted publications", FailMode.HARD)
+
+    def test(self, annotation: association.GoAssociation, config: assocparser.AssocParserConfig, group=None) -> TestResult:
+        # When no retracted-publication set is configured there is nothing to check, so the rule passes.
+        if config.retracted_pub_set is not None:
+            references = annotation.evidence.has_supporting_reference
+            for ref in references:
+                ref = str(ref)
+                if ref in config.retracted_pub_set:
+                    return self._result(False)
+        return self._result(True)
 
 
 class GoRule26(GoRule):
@@ -952,6 +968,7 @@ def test(self, annotation: association.GoAssociation, config: assocparser.AssocP
     "GoRule16": GoRule16(),
     "GoRule17": GoRule17(),
     "GoRule18": GoRule18(),
+    "GoRule22": GoRule22(),
     "GoRule26": GoRule26(),
     "GoRule28": GoRule28(),
     "GoRule29": GoRule29(),
diff --git a/ontobio/validation/metadata.py b/ontobio/validation/metadata.py
index 8b370635..b5be0c1d 100644
--- a/ontobio/validation/metadata.py
+++ b/ontobio/validation/metadata.py
@@ -182,7 +182,48 @@ def yaml_set(metadata, yaml_file_name, field) -> Set[str]:
     except Exception as e:
         raise click.ClickException("Could not find or read {}: {}".format(yaml_path, str(e)))
 
-    return set([yaml[field] for yaml in yaml_list])
+    return set([yaml[field] for yaml in yaml_list])
+
+def retracted_pub_set_from_meta(metadata) -> Set:
+    """
+    Load the retracted publication IDs from `retracted-publications.txt` in the metadata directory.
+
+    Returns an empty set when that file does not exist or is not readable.
+    """
+    retracted_path = os.path.join(metadata, "retracted-publications.txt")
+    if os.access(retracted_path, os.R_OK):
+        return retracted_pub_set_use_abspath(retracted_path)
+    else:
+        return set()
+
+def retracted_pub_set(abspath_retracted_file) -> Set:
+    """Load the retracted publication IDs from the given file; the path is made absolute first."""
+    return retracted_pub_set_use_abspath(os.path.abspath(abspath_retracted_file))
+
+def retracted_pub_set_use_abspath(abspath_retracted_file) -> Set:
+    """
+    Parse a retracted-publications file into a set of publication IDs.
+
+    Lines starting with "!" are treated as comments and blank lines are skipped.
+    On every other line only the text before the first comma is kept as the ID.
+
+    :raises click.ClickException: if the file cannot be opened or read
+    """
+    try:
+        retracted_pubs = set()
+        with open(abspath_retracted_file, "r") as f:
+            for line in f:
+                li = line.strip()
+                if li.startswith("!"):
+                    continue
+                # Strip again after cutting at the comma so "PMID:1 , note" yields
+                # "PMID:1", and drop empty IDs so blank lines never add "" to the set.
+                pub_id = li.partition(",")[0].strip()
+                if pub_id:
+                    retracted_pubs.add(pub_id)
+        return retracted_pubs
+    except Exception as e:
+        raise click.ClickException("Could not find or read {}: {}".format(abspath_retracted_file, str(e))) from e
 
 
 
diff --git a/tests/test_qc.py b/tests/test_qc.py
index 90d92368..ef4bd800 100644
--- a/tests/test_qc.py
+++ b/tests/test_qc.py
@@ -354,6 +354,20 @@ def test_go_rule_18():
     test_result = qc.GoRule18().test(assoc, all_rules_config())
     assert test_result.result_type == qc.ResultType.PASS
 
+def test_go_rule22():
+    config = assocparser.AssocParserConfig(
+        ontology=ontology,
+        retracted_pub_set={"RETRACTED:1234","PMID:37772366"},
+        rule_set=assocparser.RuleSet.ALL
+    )
+    assoc = make_annotation(goid="GO:1234567", evidence="IBA", references="PMID:12345").associations[0]
+    test_result = qc.GoRule22().test(assoc, config)
+    assert test_result.result_type == qc.ResultType.PASS
+
+    assoc = make_annotation(goid="GO:1234567", evidence="IBA", references="PMID:37772366").associations[0]
+    test_result = qc.GoRule22().test(assoc, config)
+    assert test_result.result_type == qc.ResultType.ERROR
+
 
 def test_go_rule26():
     config = assocparser.AssocParserConfig(
@@ -819,7 +833,7 @@ def test_all_rules():
     assoc = gafparser.to_association(a).associations[0]
     test_results = qc.test_go_rules(assoc, config).all_results
 
-    assert len(test_results.keys()) == 26
+    assert len(test_results.keys()) == 27
     assert test_results[qc.GoRules.GoRule26.value].result_type == qc.ResultType.PASS
     assert test_results[qc.GoRules.GoRule29.value].result_type == qc.ResultType.PASS
 