Skip to content

Commit

Permalink
improve handling of multiple ontologies
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Dec 9, 2023
1 parent 7224490 commit b7f5e0d
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 23 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ dependencies = [
"openpyxl",
"qdrant-client[fastembed]",
"owlready2",
"oxrdflib",
]


Expand Down Expand Up @@ -189,6 +190,7 @@ ignore = [
"C901", # too complex
"T201", # do not use print
"B008", # do not perform function calls in argument defaults
"E722", "S110", # Do not use bare `except`
]

[tool.ruff.per-file-ignores]
Expand Down
91 changes: 71 additions & 20 deletions src/csvw_ontomap/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from qdrant_client import QdrantClient
from qdrant_client.http.models import (
Distance,
FieldCondition,
Filter,
MatchText,
PointStruct,
VectorParams,
)
Expand All @@ -24,45 +27,93 @@ def load_vectordb(ontologies: List[str], vectordb_path: str, recreate: bool = Fa
embedding_model = Embedding(model_name=EMBEDDING_MODEL_NAME, max_length=512)
vectordb = QdrantClient(path=vectordb_path)

# Check if vectordb is already loaded
try:
vectors_count = vectordb.get_collection(COLLECTION_NAME).points_count
# all_onto_count = len(list(onto.classes())) + len(list(onto.properties()))
# print(f"{all_onto_count} classes/properties in the ontology. And currently {BOLD}{vectors_count}{END} loaded in the VectorDB")
if vectors_count <= 2:
raise Exception("Not enough vectors")
except Exception:
print(f"Vectors in DB: {vectordb.get_collection(COLLECTION_NAME).points_count}")
except:
recreate = True
# TODO: for each ontology check if there are more vectors than classes/properties
# And skip building if enough vectors for this ontology

if recreate:
print(f"🔄 Recreating VectorDB in {vectordb_path}")
vectordb.recreate_collection(
collection_name=COLLECTION_NAME,
vectors_config=VectorParams(size=EMBEDDING_MODEL_SIZE, distance=Distance.COSINE),
)
for ontology_url in ontologies:
print(f"📚 Loading ontology from {BOLD}{CYAN}{ontology_url}{END}")
onto = get_ontology(ontology_url).load()
# Find labels, generate embeddings, and upload them
upload_concepts(onto.classes(), "class", ontology_url, vectordb, embedding_model)
upload_concepts(onto.properties(), "property", ontology_url, vectordb, embedding_model)

for ontology_url in ontologies:
print(f"\n📚 Loading ontology from {BOLD}{CYAN}{ontology_url}{END}")

onto = get_ontology(ontology_url).load()
# For each ontology check if there are more vectors than classes/properties, and skip building if enough vectors for this ontology
all_onto_count = len(list(onto.classes())) + len(list(onto.properties()))
onto_vector_count = get_vectors_count(vectordb, ontology_url)
print(
f"{all_onto_count} classes/properties in the ontology | {BOLD}{onto_vector_count}{END} loaded in the VectorDB"
)
if onto_vector_count > all_onto_count:
print("⏩ Skip loading")
continue

# Find labels, generate embeddings, and upload them using owlready2
upload_concepts(onto.classes(), "class", ontology_url, vectordb, embedding_model)
upload_concepts(onto.properties(), "property", ontology_url, vectordb, embedding_model)

# NOTE: Try to use oxrdflib to handle large ontologies (600M+)
# g = Graph(store="Oxigraph")
# g.parse(ontology_url)
# q = """
# PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# PREFIX owl: <http://www.w3.org/2002/07/owl#>
# PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

# SELECT *
# WHERE {
# ?class a owl:Class ;
# rdfs:label/skos:prefLabel ?label .
# }
# """
# for r in g.query(q):
# print(r)


def get_vectors_count(vectordb: Any, ontology: str) -> int:
    """Return how many points are already stored in the collection for *ontology*.

    Scrolls the Qdrant collection with a payload filter on the ``ontology``
    field and counts the points returned.
    """
    onto_filter = Filter(should=[FieldCondition(key="ontology", match=MatchText(text=ontology))])
    # Single scroll page with a very large limit so one call covers the
    # whole ontology's points; scroll() returns (points, next_page_offset).
    points, _next_offset = vectordb.scroll(
        collection_name=COLLECTION_NAME,
        scroll_filter=onto_filter,
        limit=999999,
    )
    return len(points)


def upload_concepts(onto_concepts: Any, category: str, ontology_url: str, vectordb: Any, embedding_model: Any) -> None:
"""Generate and upload embeddings for label and description of a list of owlready2 classes/properties"""
print(f"⏳ Generating embeddings for {category}")
# concepts_count = len(list(onto_concepts))
concepts_count = 0
concept_labels = []
concept_uris = []
for concept in onto_concepts:
# print(f"Class URI: {ent.iri}, Label: {ent.label}, Description: {str(ent.description.first())}, Comment: {ent.comment}")
print(concept.label, concept.description, concept.name)
# print(concept.label, concept.comment, concept.name)
if concept.label:
concept_uris.append(concept.iri)
concept_labels.append(str(concept.label.first()))
if concept.description:
try:
if concept.description:
concept_uris.append(concept.iri)
concept_labels.append(str(concept.description[0]))
except:
pass
# try:
if concept.comment:
# print("COMMENT", concept.comment[0])
concept_uris.append(concept.iri)
concept_labels.append(str(concept.description.first()))
concept_labels.append(str(concept.comment[0]))
# except:
# pass
concepts_count += 1
print(f"⏳ Generating {len(concept_uris)} embeddings for {concepts_count} {category}")

# Generate embeddings, and upload them
embeddings = list(embedding_model.embed(concept_labels))
Expand All @@ -75,7 +126,7 @@ def upload_concepts(onto_concepts: Any, category: str, ontology_url: str, vector
)
for i, (uri, label, embedding) in enumerate(zip(concept_uris, concept_labels, embeddings))
]
print(f"{BOLD}{len(class_points)}{END} vectors generated for {len(list(onto_concepts))} {category}")
# print(f"{BOLD}{len(class_points)}{END} vectors generated for {concepts_count} {category}")
vectordb.upsert(collection_name=COLLECTION_NAME, points=class_points)


Expand Down
6 changes: 3 additions & 3 deletions tests/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
from csvw_ontomap import CsvwProfiler, OntomapConfig, __version__

ONTOLOGIES = [
# "https://semanticscience.org/ontology/sio.owl",
# "http://www.lesfleursdunormal.fr/static/_downloads/omop_cdm_v6.owl",
"data/LOINC.ttl",
"https://vemonet.github.io/omop-cdm-owl/ontology.owl",
"https://semanticscience.org/ontology/sio.owl",
# "data/LOINC.ttl",
]


Expand Down

0 comments on commit b7f5e0d

Please sign in to comment.