From 00e6f563dbfe61453d1f2e34aece494049196e85 Mon Sep 17 00:00:00 2001 From: Corentin <> Date: Mon, 11 Sep 2023 21:44:36 +0200 Subject: [PATCH] ontology conversion class --- notebooks/import_ontology.ipynb | 261 ++++++++++++-------------------- 1 file changed, 97 insertions(+), 164 deletions(-) diff --git a/notebooks/import_ontology.ipynb b/notebooks/import_ontology.ipynb index f2139d7..a20b909 100644 --- a/notebooks/import_ontology.ipynb +++ b/notebooks/import_ontology.ipynb @@ -1,26 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pronto import Ontology\n", - "go = Ontology(\"go.obo\")\n", - "go" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open(\"ms.json\", \"wb\") as f:\n", - " go.dump(f, format=\"json\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -28,59 +7,87 @@ "outputs": [], "source": [ "import json\n", - "with open(\"ms.json\", \"r\") as f:\n", - " go = json.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "go[\"graphs\"][0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "go[\"graphs\"][0][\"nodes\"][0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "edge_dict: dict = {}\n", - "for relationship in go[\"graphs\"][0][\"edges\"]:\n", - " parent_list = edge_dict.get(relationship[\"sub\"].split(\"/\")[-1], [])\n", - " parent_list.append((relationship[\"obj\"].split(\"/\")[-1], relationship[\"pred\"]))\n", - " edge_dict[relationship[\"sub\"].split(\"/\")[-1]] = parent_list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "edge_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for go_term in go[\"graphs\"][0][\"nodes\"]:\n", - " if go_term[\"type\"] != \"CLASS\":\n", - " print(go_term)" + "import random\n", + "from pronto import Ontology, Definition\n", + "\n", + "class ImpatientVocab():\n", + " def __init__(self) -> None:\n", + " self.used_colors: list[str] = []\n", + " self.impatient_json: list[dict] = []\n", + " self.impatient_onto: Ontology = None\n", + " self.list_of_terms: list[str] = []\n", + "\n", + " def load_json(self, path: str) -> list[dict]:\n", + " self.impatient_json = json.load(open(path, \"r\"))\n", + " return self.impatient_json\n", + " \n", + " def load_ontology(self, path: str) -> Ontology:\n", + " self.impatient_onto = Ontology(path)\n", + " return self.impatient_onto\n", + " \n", + " def json_to_onto(self) -> Ontology:\n", + " self.impatient_onto = Ontology()\n", + " for term in self.impatient_json:\n", + " added_term = self.impatient_onto.create_term(term[\"id\"].replace(\"_\", \":\"))\n", + " added_term.name = term[\"text\"]\n", + " for syn in term[\"data\"][\"synonymes\"].split(\",\"):\n", + " if syn != \"\":\n", + " added_term.add_synonym(syn, scope=\"EXACT\")\n", + " if term[\"data\"][\"description\"] != \"\":\n", + " added_term.definition = Definition(term[\"data\"][\"description\"])\n", + " if term[\"parent\"] != \"#\":\n", + " added_term.superclasses().add(self.impatient_onto[term[\"parent\"].replace(\"_\", \":\")])\n", + " \n", + " self.list_of_terms.append(added_term)\n", + " return self.impatient_onto\n", + " \n", + " def onto_to_json(self) -> list[dict]:\n", + " self.impatient_json = []\n", + " index = 0\n", + " for term in self.impatient_onto.terms():\n", + " relationships = []\n", + " for rel in term.superclasses():\n", + " relationships.append(rel.id)\n", + " relationships.pop(0)\n", + " self.impatient_json.append(\n", + " {\n", + " \"id\": term.id.replace(\"_\", \":\"),\n", + " \"text\": term.name,\n", + " \"icon\": True,\n", + " \"data\": {\n", + " \"description\": term.definition if term.definition is not None else \"\",\n", + " \"synonymes\": \",\".join([syn.description for syn in term.synonyms]),\n", + " \"phenotype_datamined\": \"\",\n", + " \"gene_datamined\": \"\",\n", + " \"alternative_language\": term.name,\n", + " \"correlates_with\": \"\",\n", + " \"image_annotation\": True if index == 0 else False,\n", + " \"hex_color\": self._generate_hex_color(),\n", + " \"hpo_datamined\": \"\",\n", + " },\n", + " \"parent\": relationships[0].replace(\"_\", \":\") if relationships != [] else \"#\"\n", + " }\n", + " )\n", + " index += 1\n", + " return self.impatient_json\n", + " \n", + " def _generate_hex_color(self):\n", + " while True:\n", + " # Generate a random hex color\n", + " color = \"#{:06x}\".format(random.randint(0, 0xFFFFFF))\n", + " # Check if the color has already been used\n", + " if color not in self.used_colors:\n", + " # Add the color to the list of used colors and return it\n", + " self.used_colors.append(color)\n", + " return color\n", + " \n", + " def dump_onto(self, path: str) -> None:\n", + " with open(path, \"wb\") as f:\n", + " self.impatient_onto.dump(f, format=\"obo\")\n", + "\n", + " def dump_json(self, path: str) -> None:\n", + " with open(path, \"w\") as f:\n", + " json.dump(self.impatient_json, f, indent=2)" ] }, { @@ -89,101 +96,22 @@ "metadata": {}, "outputs": [], "source": [ - "names: list[str] = []\n", - "id: list[str] = []\n", - "desc: list[str] = []\n", - "synonymes: list[list[str]] = []\n", - "\n", - "for go_term in go[\"graphs\"][0][\"nodes\"]:\n", - " if go_term[\"type\"] == \"CLASS\":\n", - " id.append(go_term[\"id\"].split(\"/\")[-1])\n", - " names.append(go_term[\"lbl\"])\n", - " desc.append(go_term[\"meta\"][\"definition\"][\"val\"])\n", - " synonymes.append([syn[\"val\"] for syn in go_term[\"meta\"][\"synonyms\"]])" + "my_onto = ImpatientVocab()\n", + "my_onto.load_json(\"ontology.json.demo\")\n", + "my_onto.json_to_onto()\n", + "my_onto.dump_onto(\"ontology_imp.obo\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "import jsonschema\n", - "from jsonschema import validate\n", - "\n", - "impatient_json: list[dict] = []\n", - "impatient_json_schema = {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"id\": {\"type\": \"string\"},\n", - " \"text\": {\"type\": \"string\"},\n", - " \"icon\": {\"type\": \"boolean\"},\n", - " \"data\": {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"description\": {\"type\": \"string\"},\n", - " \"synonymes\": {\"type\": \"string\"},\n", - " \"phenotype_datamined\": {\"type\": \"string\"},\n", - " \"gene_datamined\": {\"type\": \"string\"},\n", - " \"alternative_language\": {\"type\": \"string\"},\n", - " \"correlates_with\": {\"type\": \"string\"},\n", - " \"image_annotation\": {\"type\": \"boolean\"},\n", - " \"hex_color\": {\"type\": \"string\", \"pattern\": \"^#[0-9a-fA-F]{6}$\"},\n", - " \"hpo_datamined\": {\"type\": \"string\"},\n", - " },\n", - " \"required\": [\n", - " \"description\",\n", - " \"synonymes\",\n", - " \"phenotype_datamined\",\n", - " \"gene_datamined\",\n", - " \"alternative_language\",\n", - " \"correlates_with\",\n", - " \"image_annotation\",\n", - " \"hex_color\",\n", - " \"hpo_datamined\",\n", - " ],\n", - " },\n", - " \"parent\": {\"type\": \"string\"},\n", - " },\n", - " \"required\": [\"id\", \"text\", \"icon\", \"data\", \"parent\"],\n", - "}\n", - "\n", - "for index in range(len(id)):\n", - " impatient_json.append(\n", - " {\n", - " \"id\": id[index].replace(\"_\", \":\"),\n", - " \"text\": names[index],\n", - " \"icon\": True,\n", - " \"data\": {\n", - " \"description\": desc[index],\n", - " \"synonymes\": ','.join(synonymes[index]),\n", - " \"phenotype_datamined\": \"\",\n", - " \"gene_datamined\": \"\",\n", - " \"alternative_language\": names[index],\n", - " \"correlates_with\": \"\",\n", - " \"image_annotation\": True if index==0 else False,\n", - " \"hex_color\": \"#FFFFFF\",\n", - " \"hpo_datamined\": \"\",\n", - " },\n", - " \"parent\": \"#\",\n", - " }\n", - " )\n", - " \n", - "for child, parent in edge_dict.items():\n", - " try:\n", - " index_term = id.index(child)\n", - " except ValueError:\n", - " print(f\"Term {child} not found in the list of terms\")\n", - " continue\n", - " # Only one parent so yeah we are loosing information.\n", - " impatient_json[index_term][\"parent\"] = parent[0][0].replace(\"_\", \":\")" + "my_onto = ImpatientVocab()\n", + "my_onto.load_ontology(\"goslim_agr.obo\")\n", + "my_onto.onto_to_json()\n", + "my_onto.dump_json(\"obo_to_json_GO.json\")" ] }, { @@ -192,7 +120,10 @@ "metadata": {}, "outputs": [], "source": [ - "json.dump(impatient_json, open(\"impatient.json\", \"w\"))" + "my_onto = ImpatientVocab()\n", + "my_onto.load_ontology(\"ontology_imp.obo\")\n", + "my_onto.onto_to_json()\n", + "my_onto.dump_json(\"obo_to_json_IMP.json\")" ] }, { @@ -201,8 +132,10 @@ "metadata": {}, "outputs": [], "source": [ - "for idx, json_data in enumerate(impatient_json, start=1):\n", - " validate(instance=json_data, schema=impatient_json_schema)" + "my_onto = ImpatientVocab()\n", + "my_onto.load_ontology(\"hp.owl\")\n", + "my_onto.onto_to_json()\n", + "my_onto.dump_json(\"obo_to_json_HPO.json\")" ] } ],