From e749409d17534e52d55de275accd4957d6a980e5 Mon Sep 17 00:00:00 2001
From: callahantiff
Date: Fri, 17 Nov 2023 19:25:31 -0800
Subject: [PATCH] addressing issue #140

---
 Main.py                    |  1 +
 README.rst                 | 17 +------------
 main.ipynb                 | 52 +++++++++++++++++++++++++-------------
 pkt_kg/__version__.py      |  2 +-
 pkt_kg/knowledge_graph.py  |  4 +--
 pkt_kg/utils/data_utils.py |  9 ++++---
 pkt_kg/utils/kg_utils.py   |  2 +-
 7 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/Main.py b/Main.py
index b8d366be..1f44655d 100644
--- a/Main.py
+++ b/Main.py
@@ -69,6 +69,7 @@ def main():
     start = time.time()
     combined_edges = dict(ent.data_files, **ont.data_files)
     # master_edges = CreatesEdgeList(data_files=combined_edges, source_file='resources/resource_info.txt')
+    # master_edges.runs_creates_knowledge_graph_edges(source_file='resources/resource_info.txt', data_files=combined_edges, cpus=cpus)
     master_edges = CreatesEdgeList(data_files=combined_edges, source_file=args.res)
     master_edges.runs_creates_knowledge_graph_edges(source_file=args.res, data_files=combined_edges, cpus=cpus)
     end = time.time(); timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
diff --git a/README.rst b/README.rst
index b7dff45a..3784f536 100644
--- a/README.rst
+++ b/README.rst
@@ -308,22 +308,7 @@ This project is licensed under Apache License 2.0 - see the `LICENSE.md`_ file f
 Citing this Work
 =================
 
-**ISMB Conference Pre-print:**
-
-Callahan TJ, Tripodi IJ, Hunter LE, Baumgartner WA. `A Framework for Automated Construction of Heterogeneous Large-Scale Biomedical Knowledge Graphs `_. bioRxiv. 2020 Jan 1.
-
-
-**Zenodo**
-
-.. code:: bash
-
-   @misc{callahan_tj_2019_3401437,
-     author = {Callahan, TJ},
-     title = {PheKnowLator},
-     year = 2019,
-     doi = {10.5281/zenodo.3401437},
-     url = {https://doi.org/10.5281/zenodo.3401437}}
-
+Please see our preprint: https://arxiv.org/abs/2307.05727
 
 .. |logo| image:: https://user-images.githubusercontent.com/8030363/195494933-d0faba60-5643-4cc6-8a48-41b4a94a7afe.png
    :target: https://github.com/callahantiff/PheKnowLator
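[Note on the Main.py hunk above] The edge-list step now takes its resource file from the CLI (args.res) instead of the hard-coded 'resources/resource_info.txt'. For orientation, a minimal sketch of the download-and-build flow these calls sit in, using only classes and methods that appear elsewhere in this patch; the paths are the repo defaults, cpus=1 is an illustrative placeholder, and the Ray/psutil setup the notebook performs is omitted:

    # Sketch, not the full Main.py: assumes the pkt_kg API as used in this patch.
    from pkt_kg.downloads import OntData, LinkedData
    from pkt_kg.edge_list import CreatesEdgeList

    # download the ontology and edge-data sources listed in the resource files
    ont = OntData('resources/ontology_source_list.txt', 'resources/resource_info.txt')
    ont.downloads_data_from_url()
    edges = LinkedData('resources/edge_source_list.txt', 'resources/resource_info.txt')
    edges.downloads_data_from_url()

    # merge the two download manifests, then build the master edge list
    combined_edges = dict(edges.data_files, **ont.data_files)
    master_edges = CreatesEdgeList(data_files=combined_edges, source_file='resources/resource_info.txt')
    master_edges.runs_creates_knowledge_graph_edges(source_file='resources/resource_info.txt',
                                                    data_files=combined_edges, cpus=1)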
diff --git a/main.ipynb b/main.ipynb
index e9177fc1..8d980101 100644
--- a/main.ipynb
+++ b/main.ipynb
@@ -84,7 +84,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -92,11 +92,14 @@
    "import glob\n",
    "import json\n",
    "import pandas\n",
+    "import psutil\n",
    "import ray\n",
    "import time\n",
    "\n",
    "# import module\n",
-    "from pkt_kg import downloads, edge_list, knowledge_graph"
+    "from pkt_kg.downloads import OntData, LinkedData\n",
+    "from pkt_kg.edge_list import CreatesEdgeList\n",
+    "from pkt_kg.knowledge_graph import FullBuild, PartialBuild, PostClosureBuild"
   ]
  },
  {
@@ -143,9 +146,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ont = pkt.OntData('resources/ontology_source_list.txt', 'resources/resource_info.txt')\n",
-    "ont.downloads_data_from_url()\n",
-    "ont.writes_source_metadata_locally()"
+    "ont = OntData('resources/ontology_source_list.txt', 'resources/resource_info.txt')\n",
+    "ont.downloads_data_from_url()"
   ]
  },
  {
@@ -164,9 +166,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "edges = pkt.LinkedData('resources/edge_source_list.txt', 'resources/resource_info.txt')\n",
-    "edges.downloads_data_from_url()\n",
-    "edges.writes_source_metadata_locally()"
+    "edges = LinkedData('resources/edge_source_list.txt', 'resources/resource_info.txt')\n",
+    "edges.downloads_data_from_url()"
   ]
  },
  {
@@ -229,6 +230,15 @@
    "ray.init()"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "combined_edges"
+  ]
+ },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -259,18 +269,17 @@
    "outputs": [],
    "source": [
     "# # read in master edge dictionary\n",
-    "# master_edges = json.load(open('resources/Master_Edge_List_Dict.json', 'r'))\n",
+    "master_edges = json.load(open('resources/Master_Edge_List_Dict.json', 'r'))\n",
     "\n",
     "# read in relation data\n",
     "relation_data = open('./resources/relations_data/RELATIONS_LABELS.txt').readlines()\n",
     "relation_dict = {x.split('\\t')[0]: x.split('\\t')[1].strip('\\n') for x in relation_data}\n",
     "\n",
     "# print basic stats on each resource\n",
-    "edge_data = [[key,\n",
-    "              relation_dict[master_edges.source_info[key]['edge_relation']],\n",
-    "              ', '.join(master_edges.source_info[key]['edge_list'][0]),\n",
-    "              len(master_edges.source_info[key]['edge_list'])]\n",
-    "             for key in master_edges.source_info.keys()]\n",
+    "edge_data = [[key, master_edges[key]['edge_relation'],\n",
+    "              ', '.join(master_edges[key]['edge_list'][0]),\n",
+    "              len(master_edges[key]['edge_list'])]\n",
+    "             for key in master_edges.keys()]\n",
     "\n",
     "# convert dict to pandas df for nice printing\n",
     "df = pandas.DataFrame(edge_data, columns = ['Edge Type', 'Relation', 'Example Edge', 'Unique Edges']) \n",
@@ -357,9 +366,9 @@
    "source": [
     "# specify input arguments\n",
     "build = 'full'\n",
-    "construction_approach = 'subclass'\n",
-    "add_node_data_to_kg = 'yes'\n",
-    "add_inverse_relations_to_kg = 'yes'\n",
+    "construction_approach = 'instance'\n",
+    "add_node_data_to_kg = 'no'\n",
+    "add_inverse_relations_to_kg = 'no'\n",
     "decode_owl_semantics = 'yes'\n",
     "kg_directory_location = './resources/knowledge_graphs'\n"
   ]
@@ -397,6 +406,15 @@
    "ray.shutdown()"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "kg.ontologies"
+  ]
+ },
  {
   "cell_type": "markdown",
   "metadata": {},
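[Note on the main.ipynb hunks above] Besides the import cleanup, the stats cell now loads resources/Master_Edge_List_Dict.json directly into a plain dict and indexes it as master_edges[key], rather than going through a CreatesEdgeList object's source_info attribute, and the default build arguments switch from the subclass approach (with node data and inverse relations) to the instance approach (without them). A toy version of the stats computation, with a fabricated one-entry dict standing in for the real JSON file; the entry's shape is inferred from the cell above and the identifiers are made up:

    import pandas

    # fabricated stand-in for resources/Master_Edge_List_Dict.json: each key maps
    # to an 'edge_relation' CURIE plus an 'edge_list' of [subject, object] pairs
    master_edges = {'chemical-disease': {'edge_relation': 'RO_0002606',
                                         'edge_list': [['CHEBI_8093', 'DOID_2841'],
                                                       ['CHEBI_8093', 'DOID_9351']]}}

    # same per-resource stats the notebook cell computes
    edge_data = [[key, master_edges[key]['edge_relation'],
                  ', '.join(master_edges[key]['edge_list'][0]),
                  len(master_edges[key]['edge_list'])]
                 for key in master_edges.keys()]
    df = pandas.DataFrame(edge_data, columns=['Edge Type', 'Relation', 'Example Edge', 'Unique Edges'])
    print(df)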
"markdown", "metadata": {}, diff --git a/pkt_kg/__version__.py b/pkt_kg/__version__.py index fa1e0ae0..01866329 100644 --- a/pkt_kg/__version__.py +++ b/pkt_kg/__version__.py @@ -1,2 +1,2 @@ """Current version of package pkt_kg""" -__version__ = "3.1.1" +__version__ = "3.1.2" diff --git a/pkt_kg/knowledge_graph.py b/pkt_kg/knowledge_graph.py index 30a4e937..51f26590 100644 --- a/pkt_kg/knowledge_graph.py +++ b/pkt_kg/knowledge_graph.py @@ -384,7 +384,7 @@ def construct_knowledge_graph(self) -> None: self.graph = Graph().parse(self.merged_ont_kg, format='xml') else: log_str = '*** Merging Ontology Data ***'; print(log_str); logger.info(log_str) - merges_ontologies(self.ontologies, self.merged_ont_kg.split('/')[-1], self.owl_tools) + merges_ontologies(self.ontologies, self.write_location + '/', self.merged_ont_kg.split('/')[-1], self.owl_tools) self.graph.parse(self.merged_ont_kg, format='xml') stats = 'Merged Ontologies {}'.format(derives_graph_statistics(self.graph)); print(stats); logger.info(stats) @@ -561,7 +561,7 @@ def construct_knowledge_graph(self) -> None: self.graph = Graph().parse(self.merged_ont_kg, format='xml') else: log_str = '*** Merging Ontology Data ***'; print(log_str); logger.info(log_str) - merges_ontologies(self.ontologies, self.merged_ont_kg.split('/')[-1], self.owl_tools) + merges_ontologies(self.ontologies, self.write_location + '/', self.merged_ont_kg.split('/')[-1], self.owl_tools) self.graph.parse(self.merged_ont_kg, format='xml') stats = 'Merged Ontologies {}'.format(derives_graph_statistics(self.graph)); print(stats); logger.info(stats) diff --git a/pkt_kg/utils/data_utils.py b/pkt_kg/utils/data_utils.py index 977fbd9a..bb953fa2 100644 --- a/pkt_kg/utils/data_utils.py +++ b/pkt_kg/utils/data_utils.py @@ -471,10 +471,13 @@ def sublist_creator(actors: Union[Dict, List], chunk_size: int) -> List: # update list to return string identifier associated with each list length if isinstance(actors, Dict): - updated_lists = []; used_ids = set() + updated_lists = [] for sub in lists: - sub_list = [[k for k, v in actors.items() if v == x and k not in used_ids][0] for x in sub] - updated_lists += [sub_list]; used_ids |= set(x for y in sub_list for x in y) + sub_list = [] + for x in sub: + temp_list = [k for k, v in actors.items() if v == x][0] + sub_list.append(temp_list); del actors[temp_list] + updated_lists += [sub_list] else: updated_lists = lists return updated_lists diff --git a/pkt_kg/utils/kg_utils.py b/pkt_kg/utils/kg_utils.py index 8373c2e0..c7cb61ab 100644 --- a/pkt_kg/utils/kg_utils.py +++ b/pkt_kg/utils/kg_utils.py @@ -239,7 +239,7 @@ def merges_ontologies(onts: List[str], loc: str, merged: str, None. """ - if not onts: return None + if not onts or len(onts) == 0: return None else: if loc + merged in glob.glob(loc + '/*.owl'): o1, o2 = onts.pop(), loc + merged else: o1, o2 = onts.pop(), onts.pop()