Skip to content

Commit

Permalink
improve handling of multiple ontologies
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Dec 9, 2023
1 parent 7224490 commit b7f5e0d
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 23 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ dependencies = [
"openpyxl",
"qdrant-client[fastembed]",
"owlready2",
"oxrdflib",
]


Expand Down Expand Up @@ -189,6 +190,7 @@ ignore = [
"C901", # too complex
"T201", # do not use print
"B008", # do not perform function calls in argument defaults
"E722", "S110", # Do not use bare `except`
]

[tool.ruff.per-file-ignores]
Expand Down
91 changes: 71 additions & 20 deletions src/csvw_ontomap/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from qdrant_client import QdrantClient
from qdrant_client.http.models import (
Distance,
FieldCondition,
Filter,
MatchText,
PointStruct,
VectorParams,
)
Expand All @@ -24,45 +27,93 @@ def load_vectordb(ontologies: List[str], vectordb_path: str, recreate: bool = Fa
embedding_model = Embedding(model_name=EMBEDDING_MODEL_NAME, max_length=512)
vectordb = QdrantClient(path=vectordb_path)

# Check if vectordb is already loaded
try:
vectors_count = vectordb.get_collection(COLLECTION_NAME).points_count
# all_onto_count = len(list(onto.classes())) + len(list(onto.properties()))
# print(f"{all_onto_count} classes/properties in the ontology. And currently {BOLD}{vectors_count}{END} loaded in the VectorDB")
if vectors_count <= 2:
raise Exception("Not enough vectors")
except Exception:
print(f"Vectors in DB: {vectordb.get_collection(COLLECTION_NAME).points_count}")
except:
recreate = True
# TODO: for each ontology check if there are more vectors than classes/properties
# And skip building if enough vectors for this ontology

if recreate:
print(f"🔄 Recreating VectorDB in {vectordb_path}")
vectordb.recreate_collection(
collection_name=COLLECTION_NAME,
vectors_config=VectorParams(size=EMBEDDING_MODEL_SIZE, distance=Distance.COSINE),
)
for ontology_url in ontologies:
print(f"📚 Loading ontology from {BOLD}{CYAN}{ontology_url}{END}")
onto = get_ontology(ontology_url).load()
# Find labels, generate embeddings, and upload them
upload_concepts(onto.classes(), "class", ontology_url, vectordb, embedding_model)
upload_concepts(onto.properties(), "property", ontology_url, vectordb, embedding_model)

for ontology_url in ontologies:
print(f"\n📚 Loading ontology from {BOLD}{CYAN}{ontology_url}{END}")

onto = get_ontology(ontology_url).load()
# For each ontology check if there are more vectors than classes/properties, and skip building if enough vectors for this ontology
all_onto_count = len(list(onto.classes())) + len(list(onto.properties()))
onto_vector_count = get_vectors_count(vectordb, ontology_url)
print(
f"{all_onto_count} classes/properties in the ontology | {BOLD}{onto_vector_count}{END} loaded in the VectorDB"
)
if onto_vector_count > all_onto_count:
print("⏩ Skip loading")
continue

# Find labels, generate embeddings, and upload them using owlready2
upload_concepts(onto.classes(), "class", ontology_url, vectordb, embedding_model)
upload_concepts(onto.properties(), "property", ontology_url, vectordb, embedding_model)

# NOTE: Try to use oxrdflib to handle large ontologies (600M+)
# g = Graph(store="Oxigraph")
# g.parse(ontology_url)
# q = """
# PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# PREFIX owl: <http://www.w3.org/2002/07/owl#>
# PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

# SELECT *
# WHERE {
# ?class a owl:Class ;
# rdfs:label/skos:prefLabel ?label .
# }
# """
# for r in g.query(q):
# print(r)


def get_vectors_count(vectordb: Any, ontology: str) -> int:
    """Return how many points are already stored in the collection for *ontology*.

    Scrolls the Qdrant collection with a payload filter on the ``ontology``
    field and counts the points returned.
    """
    onto_filter = Filter(should=[FieldCondition(key="ontology", match=MatchText(text=ontology))])
    # Single scroll page with a very large limit so one call covers the
    # whole ontology's points; scroll() returns (points, next_page_offset).
    points, _next_offset = vectordb.scroll(
        collection_name=COLLECTION_NAME,
        scroll_filter=onto_filter,
        limit=999999,
    )
    return len(points)


def upload_concepts(onto_concepts: Any, category: str, ontology_url: str, vectordb: Any, embedding_model: Any) -> None:
"""Generate and upload embeddings for label and description of a list of owlready2 classes/properties"""
print(f"⏳ Generating embeddings for {category}")
# concepts_count = len(list(onto_concepts))
concepts_count = 0
concept_labels = []
concept_uris = []
for concept in onto_concepts:
# print(f"Class URI: {ent.iri}, Label: {ent.label}, Description: {str(ent.description.first())}, Comment: {ent.comment}")
print(concept.label, concept.description, concept.name)
# print(concept.label, concept.comment, concept.name)
if concept.label:
concept_uris.append(concept.iri)
concept_labels.append(str(concept.label.first()))
if concept.description:
try:
if concept.description:
concept_uris.append(concept.iri)
concept_labels.append(str(concept.description[0]))
except:
pass
# try:
if concept.comment:
# print("COMMENT", concept.comment[0])
concept_uris.append(concept.iri)
concept_labels.append(str(concept.description.first()))
concept_labels.append(str(concept.comment[0]))
# except:
# pass
concepts_count += 1
print(f"⏳ Generating {len(concept_uris)} embeddings for {concepts_count} {category}")

# Generate embeddings, and upload them
embeddings = list(embedding_model.embed(concept_labels))
Expand All @@ -75,7 +126,7 @@ def upload_concepts(onto_concepts: Any, category: str, ontology_url: str, vector
)
for i, (uri, label, embedding) in enumerate(zip(concept_uris, concept_labels, embeddings))
]
print(f"{BOLD}{len(class_points)}{END} vectors generated for {len(list(onto_concepts))} {category}")
# print(f"{BOLD}{len(class_points)}{END} vectors generated for {concepts_count} {category}")
vectordb.upsert(collection_name=COLLECTION_NAME, points=class_points)


Expand Down
6 changes: 3 additions & 3 deletions tests/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
from csvw_ontomap import CsvwProfiler, OntomapConfig, __version__

ONTOLOGIES = [
# "https://semanticscience.org/ontology/sio.owl",
# "http://www.lesfleursdunormal.fr/static/_downloads/omop_cdm_v6.owl",
"data/LOINC.ttl",
"https://vemonet.github.io/omop-cdm-owl/ontology.owl",
"https://semanticscience.org/ontology/sio.owl",
# "data/LOINC.ttl",
]


Expand Down

0 comments on commit b7f5e0d

Please sign in to comment.