From 399bfb1f8175d4a1ff4a18a5eae3a753c72ee692 Mon Sep 17 00:00:00 2001 From: Michael Sekamanya <86433807+mawandm@users.noreply.github.com> Date: Wed, 21 Aug 2024 07:45:34 -0700 Subject: [PATCH] fix(api): refactor document loaders for generic ingestion and extraction (#142) This PR refactors all minio, sharepoint, samba, s3 document loaders - to use generic document loaders. - to have unique document_ids generated from from a combination of datasource and self_reference Closes #143 --- .../core/document_loaders/loader_helper.py | 125 ++++- nesis/api/core/document_loaders/minio.py | 156 ++---- nesis/api/core/document_loaders/runners.py | 4 +- nesis/api/core/document_loaders/s3.py | 435 ++++++++--------- nesis/api/core/document_loaders/samba.py | 458 ++++++++---------- nesis/api/core/document_loaders/sharepoint.py | 437 +++++++---------- nesis/api/core/tasks/document_management.py | 31 +- .../tests/core/document_loaders/test_minio.py | 16 +- .../tests/core/document_loaders/test_s3.py | 262 ++++++++-- .../tests/core/document_loaders/test_samba.py | 199 +++++++- .../core/document_loaders/test_sharepoint.py | 59 ++- .../tests/tasks/test_document_management.py | 92 +--- 12 files changed, 1216 insertions(+), 1058 deletions(-) diff --git a/nesis/api/core/document_loaders/loader_helper.py b/nesis/api/core/document_loaders/loader_helper.py index bfc3127..c15ef54 100644 --- a/nesis/api/core/document_loaders/loader_helper.py +++ b/nesis/api/core/document_loaders/loader_helper.py @@ -1,20 +1,31 @@ -import uuid +import datetime import json -from nesis.api.core.models.entities import Document -from nesis.api.core.util.dateutil import strptime import logging -from nesis.api.core.services.util import get_document, delete_document +import uuid +from typing import Optional, Dict, Any, Callable + +import nesis.api.core.util.http as http +from nesis.api.core.document_loaders.runners import ( + IngestRunner, + ExtractRunner, + RagRunner, +) +from nesis.api.core.models.entities import Document, Datasource +from nesis.api.core.services.util import delete_document +from nesis.api.core.services.util import ( + get_document, +) +from nesis.api.core.util.constants import DEFAULT_DATETIME_FORMAT +from nesis.api.core.util.dateutil import strptime _LOG = logging.getLogger(__name__) def upload_document_to_llm(upload_document, file_metadata, rag_endpoint, http_client): - return _upload_document_to_pgpt( - upload_document, file_metadata, rag_endpoint, http_client - ) + return _upload_document(upload_document, file_metadata, rag_endpoint, http_client) -def _upload_document_to_pgpt(upload_document, file_metadata, rag_endpoint, http_client): +def _upload_document(upload_document, file_metadata, rag_endpoint, http_client): document_id = file_metadata["unique_id"] file_name = file_metadata["name"] @@ -51,3 +62,101 @@ def _upload_document_to_pgpt(upload_document, file_metadata, rag_endpoint, http_ request_object = {"file_name": file_name, "text": upload_document.page_content} response = http_client.post(url=f"{rag_endpoint}/v1/ingest", payload=request_object) return json.loads(response) + + +class DocumentProcessor(object): + def __init__( + self, + config, + http_client: http.HttpClient, + datasource: Datasource, + ): + self._datasource = datasource + + # This is left package public for testing + self._extract_runner: ExtractRunner = Optional[None] + _ingest_runner = IngestRunner(config=config, http_client=http_client) + + if self._datasource.connection.get("destination") is not None: + self._extract_runner = ExtractRunner( + config=config, + http_client=http_client, + destination=self._datasource.connection.get("destination"), + ) + + self._mode = self._datasource.connection.get("mode") or "ingest" + + match self._mode: + case "ingest": + self._ingest_runners: list[RagRunner] = [_ingest_runner] + case "extract": + self._ingest_runners: list[RagRunner] = [self._extract_runner] + case _: + raise ValueError( + f"Invalid mode {self._mode}. Expected 'ingest' or 'extract'" + ) + + def sync( + self, + endpoint: str, + file_path: str, + last_modified: datetime.datetime, + metadata: Dict[str, Any], + store_metadata: Dict[str, Any], + ) -> None: + """ + Here we check if this file has been updated. + If the file has been updated, we delete it from the vector store and re-ingest the new updated file + """ + document_id = str( + uuid.uuid5( + uuid.NAMESPACE_DNS, f"{self._datasource.uuid}:{metadata['self_link']}" + ) + ) + document: Document = get_document(document_id=document_id) + for _ingest_runner in self._ingest_runners: + try: + response_json = _ingest_runner.run( + file_path=file_path, + metadata=metadata, + document_id=None if document is None else document.uuid, + last_modified=last_modified.replace(tzinfo=None).replace( + microsecond=0 + ), + datasource=self._datasource, + ) + except ValueError: + _LOG.warning(f"File {file_path} ingestion failed", exc_info=True) + response_json = None + except UserWarning: + _LOG.warning(f"File {file_path} is already processing") + continue + + if response_json is None: + _LOG.warning("No response from ingest runner received") + continue + + _ingest_runner.save( + document_id=document_id, + datasource_id=self._datasource.uuid, + filename=store_metadata["filename"], + base_uri=endpoint, + rag_metadata=response_json, + store_metadata=store_metadata, + last_modified=last_modified, + ) + + def unsync(self, clean: Callable) -> None: + endpoint = self._datasource.connection.get("endpoint") + + for _ingest_runner in self._ingest_runners: + documents = _ingest_runner.get(base_uri=endpoint) + for document in documents: + store_metadata = document.store_metadata + try: + rag_metadata = document.rag_metadata + except AttributeError: + rag_metadata = document.extract_metadata + + if clean(store_metadata=store_metadata): + _ingest_runner.delete(document=document, rag_metadata=rag_metadata) diff --git a/nesis/api/core/document_loaders/minio.py b/nesis/api/core/document_loaders/minio.py index 30b1b43..aee6318 100644 --- a/nesis/api/core/document_loaders/minio.py +++ b/nesis/api/core/document_loaders/minio.py @@ -1,39 +1,26 @@ -import concurrent -import concurrent.futures import logging -import multiprocessing +import logging import os -import queue import tempfile -from typing import Dict, Any, Optional +from typing import Dict, Any import memcache -import minio from minio import Minio import nesis.api.core.util.http as http -from nesis.api.core.document_loaders.runners import ( - IngestRunner, - ExtractRunner, - RagRunner, -) -from nesis.api.core.models.entities import Document, Datasource -from nesis.api.core.services.util import ( - get_document, - get_documents, -) +from nesis.api.core.document_loaders.loader_helper import DocumentProcessor +from nesis.api.core.models.entities import Datasource from nesis.api.core.util import clean_control, isblank from nesis.api.core.util.concurrency import ( IOBoundPool, as_completed, - BlockingThreadPoolExecutor, ) from nesis.api.core.util.constants import DEFAULT_DATETIME_FORMAT _LOG = logging.getLogger(__name__) -class MinioProcessor(object): +class MinioProcessor(DocumentProcessor): def __init__( self, config, @@ -41,36 +28,12 @@ def __init__( cache_client: memcache.Client, datasource: Datasource, ): + super().__init__(config, http_client, datasource) self._config = config self._http_client = http_client self._cache_client = cache_client self._datasource = datasource - # This is left public for testing - self._extract_runner: ExtractRunner = Optional[None] - _ingest_runner = IngestRunner(config=config, http_client=http_client) - if self._datasource.connection.get("destination") is not None: - self._extract_runner = ExtractRunner( - config=config, - http_client=http_client, - destination=self._datasource.connection.get("destination"), - ) - self._ingest_runners = [] - - self._ingest_runners = [IngestRunner(config=config, http_client=http_client)] - - self._mode = self._datasource.connection.get("mode") or "ingest" - - match self._mode: - case "ingest": - self._ingest_runners: list[RagRunner] = [_ingest_runner] - case "extract": - self._ingest_runners: list[RagRunner] = [self._extract_runner] - case _: - raise ValueError( - f"Invalid mode {self._mode}. Expected 'ingest' or 'extract'" - ) - def run(self, metadata: Dict[str, Any]): connection: Dict[str, str] = self._datasource.connection try: @@ -93,7 +56,6 @@ def run(self, metadata: Dict[str, Any]): ) self._unsync_documents( client=_minio_client, - connection=connection, ) except: _LOG.exception("Error fetching sharepoint documents") @@ -192,52 +154,22 @@ def _sync_document( file_path=file_path, ) - """ - Here we check if this file has been updated. - If the file has been updated, we delete it from the vector store and re-ingest the new updated file - """ - document: Document = get_document(document_id=item.etag) - document_id = None if document is None else document.uuid - - for _ingest_runner in self._ingest_runners: - try: - response_json = _ingest_runner.run( - file_path=file_path, - metadata=metadata, - document_id=document_id, - last_modified=item.last_modified.replace(tzinfo=None).replace( - microsecond=0 - ), - datasource=datasource, - ) - except ValueError: - _LOG.warning(f"File {file_path} ingestion failed", exc_info=True) - response_json = None - except UserWarning: - _LOG.debug(f"File {file_path} is already processing") - return - - if response_json is None: - return - - _ingest_runner.save( - document_id=item.etag, - datasource_id=datasource.uuid, - filename=item.object_name, - base_uri=endpoint, - rag_metadata=response_json, - store_metadata={ - "bucket_name": item.bucket_name, - "object_name": item.object_name, - "etag": item.etag, - "size": item.size, - "last_modified": item.last_modified.strftime( - DEFAULT_DATETIME_FORMAT - ), - "version_id": item.version_id, - }, - last_modified=item.last_modified, - ) + self.sync( + endpoint, + file_path, + item.last_modified, + metadata, + store_metadata={ + "bucket_name": item.bucket_name, + "object_name": item.object_name, + "filename": item.object_name, + "size": item.size, + "last_modified": item.last_modified.strftime( + DEFAULT_DATETIME_FORMAT + ), + "version_id": item.version_id, + }, + ) _LOG.info( f"Done {self._mode}ing object {item.object_name} in bucket {bucket_name}" @@ -254,37 +186,27 @@ def _sync_document( def _unsync_documents( self, client: Minio, - connection: dict, ) -> None: - try: - endpoint = connection.get("endpoint") - - for _ingest_runner in self._ingest_runners: - documents = _ingest_runner.get(base_uri=endpoint) - for document in documents: - store_metadata = document.store_metadata - try: - rag_metadata = document.rag_metadata - except AttributeError: - rag_metadata = document.extract_metadata - bucket_name = store_metadata["bucket_name"] - object_name = store_metadata["object_name"] - try: - client.stat_object( - bucket_name=bucket_name, object_name=object_name - ) - except Exception as ex: - str_ex = str(ex) - if "NoSuchKey" in str_ex and "does not exist" in str_ex: - _ingest_runner.delete( - document=document, rag_metadata=rag_metadata - ) - else: - raise + def clean(**kwargs): + store_metadata = kwargs["store_metadata"] + try: + client.stat_object( + bucket_name=store_metadata["bucket_name"], + object_name=store_metadata["object_name"], + ) + return False + except Exception as ex: + str_ex = str(ex) + if "NoSuchKey" in str_ex and "does not exist" in str_ex: + return True + else: + raise + try: + self.unsync(clean=clean) except: - _LOG.warn("Error fetching and updating documents", exc_info=True) + _LOG.warning("Error fetching and updating documents", exc_info=True) def validate_connection_info(connection: Dict[str, Any]) -> Dict[str, Any]: diff --git a/nesis/api/core/document_loaders/runners.py b/nesis/api/core/document_loaders/runners.py index baf408c..279cb01 100644 --- a/nesis/api/core/document_loaders/runners.py +++ b/nesis/api/core/document_loaders/runners.py @@ -74,14 +74,16 @@ def run( ) -> Union[Dict[str, Any], None]: if document_id is not None: + _LOG.debug(f"Checking if document {document_id} is modified") _is_modified = self._is_modified( document_id=document_id, last_modified=last_modified ) if _is_modified is None or not _is_modified: + _LOG.debug(f"Document {document_id} is not modified") return url = f"{self._rag_endpoint}/v1/extractions/text" - + _LOG.debug(f"Document {document_id} is modified, performing extraction") response = self._http_client.upload( url=url, filepath=file_path, diff --git a/nesis/api/core/document_loaders/s3.py b/nesis/api/core/document_loaders/s3.py index 7d6f274..e46ce6a 100644 --- a/nesis/api/core/document_loaders/s3.py +++ b/nesis/api/core/document_loaders/s3.py @@ -2,12 +2,14 @@ import logging import pathlib import tempfile +from concurrent.futures import as_completed from typing import Dict, Any import boto3 import memcache import nesis.api.core.util.http as http +from nesis.api.core.document_loaders.loader_helper import DocumentProcessor from nesis.api.core.models.entities import Document, Datasource from nesis.api.core.services import util from nesis.api.core.services.util import ( @@ -18,282 +20,223 @@ ingest_file, ) from nesis.api.core.util import clean_control, isblank +from nesis.api.core.util.concurrency import IOBoundPool +from nesis.api.core.util.constants import DEFAULT_DATETIME_FORMAT from nesis.api.core.util.dateutil import strptime _LOG = logging.getLogger(__name__) -def fetch_documents( - datasource: Datasource, - rag_endpoint: str, - http_client: http.HttpClient, - cache_client: memcache.Client, - metadata: Dict[str, Any], -) -> None: - try: - connection = datasource.connection - endpoint = connection.get("endpoint") - access_key = connection.get("user") - secret_key = connection.get("password") - region = connection.get("region") - if all([access_key, secret_key]): - if endpoint: - s3_client = boto3.client( - "s3", - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - region_name=region, - endpoint_url=endpoint, - ) - else: - s3_client = boto3.client( - "s3", - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - region_name=region, - ) - else: - if endpoint: - s3_client = boto3.client( - "s3", region_name=region, endpoint_url=endpoint - ) +class Processor(DocumentProcessor): + def __init__( + self, + config, + http_client: http.HttpClient, + cache_client: memcache.Client, + datasource: Datasource, + ): + super().__init__(config, http_client, datasource) + self._config = config + self._http_client = http_client + self._cache_client = cache_client + self._datasource = datasource + + def run(self, metadata: Dict[str, Any]): + connection: Dict[str, str] = self._datasource.connection + try: + endpoint = connection.get("endpoint") + access_key = connection.get("user") + secret_key = connection.get("password") + region = connection.get("region") + if all([access_key, secret_key]): + if endpoint: + s3_client = boto3.client( + "s3", + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + region_name=region, + endpoint_url=endpoint, + ) + else: + s3_client = boto3.client( + "s3", + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + region_name=region, + ) else: - s3_client = boto3.client("s3", region_name=region) - - _sync_documents( - client=s3_client, - datasource=datasource, - rag_endpoint=rag_endpoint, - http_client=http_client, - cache_client=cache_client, - metadata=metadata, - ) - _unsync_documents( - client=s3_client, - connection=connection, - rag_endpoint=rag_endpoint, - http_client=http_client, - ) - except Exception as ex: - _LOG.exception(f"Error fetching s3 documents - {ex}") - - -def _sync_documents( - client, - datasource: Datasource, - rag_endpoint: str, - http_client: http.HttpClient, - cache_client: memcache.Client, - metadata: dict, -) -> None: + if endpoint: + s3_client = boto3.client( + "s3", region_name=region, endpoint_url=endpoint + ) + else: + s3_client = boto3.client("s3", region_name=region) - try: + self._sync_documents( + client=s3_client, + datasource=self._datasource, + metadata=metadata, + ) + self._unsync_documents( + client=s3_client, + ) + except: + _LOG.exception("Error fetching sharepoint documents") - # Data objects allow us to specify bucket names - connection = datasource.connection - bucket_paths = connection.get("dataobjects") - if bucket_paths is None: - _LOG.warning("No bucket names supplied, so I can't do much") + def _sync_documents( + self, + client, + datasource: Datasource, + metadata: dict, + ) -> None: - bucket_paths_parts = bucket_paths.split(",") + try: - _LOG.info(f"Initializing syncing to endpoint {rag_endpoint}") + # Data objects allow us to specify bucket names + connection = datasource.connection + bucket_paths = connection.get("dataobjects") + if bucket_paths is None: + _LOG.warning("No bucket names supplied, so I can't do much") - for bucket_path in bucket_paths_parts: + bucket_paths_parts = bucket_paths.split(",") + futures = [] + for bucket_path in bucket_paths_parts: - # a/b/c/// should only give [a,b,c] - bucket_path_parts = [ - part for part in bucket_path.split("/") if len(part) != 0 - ] + # a/b/c/// should only give [a,b,c] + bucket_path_parts = [ + part for part in bucket_path.split("/") if len(part) != 0 + ] - path = "/".join(bucket_path_parts[1:]) - bucket_name = bucket_path_parts[0] + path = "/".join(bucket_path_parts[1:]) + bucket_name = bucket_path_parts[0] - paginator = client.get_paginator("list_objects_v2") - page_iterator = paginator.paginate( - Bucket=bucket_name, - Prefix="" if path == "" else f"{path}/", - ) - for result in page_iterator: - if result["KeyCount"] == 0: - continue - # iterate through files - for item in result["Contents"]: - # Paths ending in / are folders so we skip them - if item["Key"].endswith("/"): + paginator = client.get_paginator("list_objects_v2") + page_iterator = paginator.paginate( + Bucket=bucket_name, + Prefix="" if path == "" else f"{path}/", + ) + for result in page_iterator: + if result["KeyCount"] == 0: continue - - endpoint = connection["endpoint"] - self_link = f"{endpoint}/{bucket_name}/{item['Key']}" - _metadata = { - **(metadata or {}), - "file_name": f"{bucket_name}/{item['Key']}", - "self_link": self_link, - } - - """ - We use memcache's add functionality to implement a shared lock to allow for multiple instances - operating - """ - _lock_key = clean_control(f"{__name__}/locks/{self_link}") - if cache_client.add(key=_lock_key, val=_lock_key, time=30 * 60): - try: - _sync_document( - client=client, - datasource=datasource, - rag_endpoint=rag_endpoint, - http_client=http_client, - metadata=_metadata, - bucket_name=bucket_name, - item=item, + # iterate through files + for item in result["Contents"]: + # Paths ending in / are folders so we skip them + if item["Key"].endswith("/"): + continue + futures.append( + IOBoundPool.submit( + self._process_object, + bucket_name, + client, + datasource, + item, + metadata, ) - finally: - cache_client.delete(_lock_key) - else: - _LOG.info(f"Document {self_link} is already processing") - - _LOG.info(f"Completed syncing to endpoint {rag_endpoint}") - - except: - _LOG.warning("Error fetching and updating documents", exc_info=True) - - -def _sync_document( - client, - datasource: Datasource, - rag_endpoint: str, - http_client: http.HttpClient, - metadata: dict, - bucket_name: str, - item, -): - connection = datasource.connection - endpoint = connection["endpoint"] - _metadata = metadata - - with tempfile.NamedTemporaryFile( - dir=tempfile.gettempdir(), - ) as tmp: - key_parts = item["Key"].split("/") - - path_to_tmp = f"{str(pathlib.Path(tmp.name).absolute())}-{key_parts[-1]}" - - try: - _LOG.info(f"Starting syncing object {item['Key']} in bucket {bucket_name}") - # Write item to file - client.download_file(bucket_name, item["Key"], path_to_tmp) - - document: Document = get_document(document_id=item["ETag"]) - if document and document.base_uri == endpoint: - store_metadata = document.store_metadata - if store_metadata and store_metadata.get("last_modified"): - last_modified = store_metadata["last_modified"] - if not strptime(date_string=last_modified).replace( - tzinfo=None - ) < item["LastModified"].replace(tzinfo=None).replace( - microsecond=0 - ): - _LOG.debug( - f"Skipping document {item['Key']} already up to date" ) - return - rag_metadata: dict = document.rag_metadata - if rag_metadata is None: - return - for document_data in rag_metadata.get("data") or []: - try: - util.un_ingest_file( - http_client=http_client, - endpoint=rag_endpoint, - doc_id=document_data["doc_id"], - ) - except: - _LOG.warning( - f"Failed to delete document {document_data['doc_id']}" - ) + for future in as_completed(futures): + try: + future.result() + except: + _LOG.warning(future.exception()) - try: - delete_document(document_id=document.id) - except: - _LOG.warning( - f"Failed to delete document {item.object_name}'s record. Continuing anyway..." - ) + except: + _LOG.warning("Error fetching and updating documents", exc_info=True) + def _process_object(self, bucket_name, client, datasource, item, metadata): + connection = datasource.connection + endpoint = connection["endpoint"] + self_link = f"{endpoint}/{bucket_name}/{item['Key']}" + _metadata = { + **(metadata or {}), + "file_name": f"{bucket_name}/{item['Key']}", + "self_link": self_link, + } + """ + We use memcache's add functionality to implement a shared lock to allow for multiple instances + operating + """ + _lock_key = clean_control(f"{__name__}/locks/{self_link}") + if self._cache_client.add(key=_lock_key, val=_lock_key, time=30 * 60): try: - response = ingest_file( - http_client=http_client, - endpoint=rag_endpoint, + self._sync_document( + client=client, + datasource=datasource, metadata=_metadata, - file_path=path_to_tmp, + bucket_name=bucket_name, + item=item, ) - response_json = json.loads(response) - - except ValueError: - _LOG.warning(f"File {path_to_tmp} ingestion failed", exc_info=True) - response_json = {} - except UserWarning: - _LOG.debug(f"File {path_to_tmp} is already processing") - return - - save_document( - document_id=item["ETag"], - filename=item["Key"], - datasource_id=datasource.uuid, - base_uri=endpoint, - rag_metadata=response_json, - store_metadata={ - "bucket_name": bucket_name, - "object_name": item["Key"], - "etag": item["ETag"], - "size": item["Size"], - "last_modified": str(item["LastModified"]), - }, - last_modified=item["LastModified"], - ) - - _LOG.info(f"Done syncing object {item['Key']} in bucket {bucket_name}") - except Exception as ex: - _LOG.warning( - f"Error when getting and ingesting document {item['Key']} - {ex}" - ) - + finally: + self._cache_client.delete(_lock_key) + else: + _LOG.info(f"Document {self_link} is already processing") + + def _sync_document( + self, + client, + datasource: Datasource, + metadata: dict, + bucket_name: str, + item, + ): + endpoint = datasource.connection["endpoint"] + _metadata = metadata + + with tempfile.NamedTemporaryFile( + dir=tempfile.gettempdir(), + ) as tmp: + key_parts = item["Key"].split("/") + + path_to_tmp = f"{str(pathlib.Path(tmp.name).absolute())}-{key_parts[-1]}" -def _unsync_documents( - client, connection: dict, rag_endpoint: str, http_client: http.HttpClient -) -> None: + try: + _LOG.info( + f"Starting syncing object {item['Key']} in bucket {bucket_name}" + ) + # Write item to file + client.download_file(bucket_name, item["Key"], path_to_tmp) + self.sync( + endpoint, + path_to_tmp, + last_modified=item["LastModified"], + metadata=metadata, + store_metadata={ + "bucket_name": bucket_name, + "object_name": item["Key"], + "filename": item["Key"], + "size": item["Size"], + "last_modified": item["LastModified"].strftime( + DEFAULT_DATETIME_FORMAT + ), + }, + ) - try: - endpoint = connection.get("endpoint") + _LOG.info(f"Done syncing object {item['Key']} in bucket {bucket_name}") + except: + _LOG.warning( + f"Error when getting and ingesting document {item['Key']}", + exc_info=True, + ) - documents = get_documents(base_uri=endpoint) - for document in documents: - store_metadata = document.store_metadata - rag_metadata = document.rag_metadata - bucket_name = store_metadata["bucket_name"] - object_name = store_metadata["object_name"] + def _unsync_documents(self, client) -> None: + def clean(**kwargs): + store_metadata = kwargs["store_metadata"] try: - client.head_object(Bucket=bucket_name, Key=object_name) + client.head_object( + Bucket=store_metadata["bucket_name"], + Key=store_metadata["object_name"], + ) + return False except Exception as ex: - str_ex = str(ex).lower() + str_ex = str(ex) if not ("object" in str_ex and "not found" in str_ex): + return True + else: raise - try: - http_client.deletes( - urls=[ - f"{rag_endpoint}/v1/ingest/documents/{document_data['doc_id']}" - for document_data in rag_metadata.get("data") or [] - ] - ) - _LOG.info(f"Deleting document {document.filename}") - delete_document(document_id=document.id) - except: - _LOG.warning( - f"Failed to delete document {document.filename}", - exc_info=True, - ) - except: - _LOG.warn("Error fetching and updating documents", exc_info=True) + try: + self.unsync(clean=clean) + except: + _LOG.warning("Error fetching and updating documents", exc_info=True) def validate_connection_info(connection: Dict[str, Any]) -> Dict[str, Any]: diff --git a/nesis/api/core/document_loaders/samba.py b/nesis/api/core/document_loaders/samba.py index dd368a6..8cd92da 100644 --- a/nesis/api/core/document_loaders/samba.py +++ b/nesis/api/core/document_loaders/samba.py @@ -1,316 +1,254 @@ -import uuid +import logging import pathlib -import json -import memcache +import uuid +from concurrent.futures import as_completed from datetime import datetime from typing import Dict, Any +import memcache import smbprotocol from smbclient import scandir, stat, shutil -import logging -from nesis.api.core.models.entities import Document -from nesis.api.core.services import util -from nesis.api.core.services.util import ( - save_document, - get_document, - delete_document, - get_documents, - ValidationException, - ingest_file, -) +from nesis.api.core.document_loaders.loader_helper import DocumentProcessor +from nesis.api.core.models.entities import Datasource from nesis.api.core.util import http, clean_control, isblank +from nesis.api.core.util.concurrency import IOBoundPool from nesis.api.core.util.constants import DEFAULT_DATETIME_FORMAT, DEFAULT_SAMBA_PORT -from nesis.api.core.util.dateutil import strptime _LOG = logging.getLogger(__name__) -def fetch_documents( - connection: Dict[str, str], - rag_endpoint: str, - http_client: http.HttpClient, - cache_client: memcache.Client, - metadata: Dict[str, Any], -) -> None: - try: - _sync_samba_documents( - connection=connection, - rag_endpoint=rag_endpoint, - http_client=http_client, - metadata=metadata, - cache_client=cache_client, - ) - except: - _LOG.exception(f"Error syncing documents") - - try: - _unsync_samba_documents( - connection=connection, rag_endpoint=rag_endpoint, http_client=http_client - ) - except Exception as ex: - _LOG.exception(f"Error unsyncing documents") - - -def validate_connection_info(connection: Dict[str, Any]) -> Dict[str, Any]: - port = connection.get("port") or DEFAULT_SAMBA_PORT - _valid_keys = ["port", "endpoint", "user", "password", "dataobjects"] - if not str(port).isnumeric(): - raise ValueError("Port value cannot be non numeric") +class Processor(DocumentProcessor): + def __init__( + self, + config, + http_client: http.HttpClient, + cache_client: memcache.Client, + datasource: Datasource, + ): + super().__init__(config, http_client, datasource) + self._config = config + self._http_client = http_client + self._cache_client = cache_client + self._datasource = datasource + self._futures = [] + + def run(self, metadata: Dict[str, Any]): + connection = self._datasource.connection + try: + self._sync_samba_documents( + metadata=metadata, + ) + except: + _LOG.exception(f"Error syncing documents") - assert not isblank( - connection.get("endpoint") - ), "A valid share address must be supplied" + try: + self._unsync_samba_documents( + connection=connection, + ) + except: + _LOG.exception(f"Error unsyncing documents") - try: - _connect_samba_server(connection) - except Exception as ex: - _LOG.exception( - f"Failed to connect to samba server at {connection['endpoint']}", - ) - raise ValueError(ex) - connection["port"] = port - return { - key: val - for key, val in connection.items() - if key in _valid_keys and not isblank(connection[key]) - } + for future in as_completed(self._futures): + try: + future.result() + except: + _LOG.warning(future.exception()) + def _sync_samba_documents(self, metadata): -def _connect_samba_server(connection): - username = connection.get("user") - password = connection.get("password") - endpoint = connection.get("endpoint") - port = connection.get("port") - next(scandir(endpoint, username=username, password=password, port=port)) + connection = self._datasource.connection + username = connection["user"] + password = connection["password"] + endpoint = connection["endpoint"] + port = connection["port"] + # These are any folder specified to scope the sync to + dataobjects = connection.get("dataobjects") or "" -def _sync_samba_documents( - connection, rag_endpoint, http_client, metadata, cache_client -): + dataobjects_parts = [do.strip() for do in dataobjects.split(",")] - username = connection["user"] - password = connection["password"] - endpoint = connection["endpoint"] - port = connection["port"] - # These are any folder specified to scope the sync to - dataobjects = connection.get("dataobjects") or "" + try: + file_shares = scandir( + endpoint, username=username, password=password, port=port + ) + except Exception as ex: + _LOG.exception( + f"Error while scanning share on samba server {endpoint} - {ex}" + ) + raise - dataobjects_parts = [do.strip() for do in dataobjects.split(",")] + work_dir = f"/tmp/{uuid.uuid4()}" + pathlib.Path(work_dir).mkdir(parents=True) - try: - file_shares = scandir(endpoint, username=username, password=password, port=port) - except Exception as ex: - _LOG.exception(f"Error while scanning share on samba server {endpoint} - {ex}") - raise - - work_dir = f"/tmp/{uuid.uuid4()}" - pathlib.Path(work_dir).mkdir(parents=True) - - for file_share in file_shares: - if ( - len(dataobjects_parts) > 0 - and file_share.is_dir() - and file_share.name not in dataobjects_parts - ): - continue - try: - self_link = file_share.path - _lock_key = clean_control(f"{__name__}/locks/{self_link}") + for file_share in file_shares: + if ( + len(dataobjects_parts) > 0 + and file_share.is_dir() + and file_share.name not in dataobjects_parts + ): + continue + try: - if cache_client.add(key=_lock_key, val=_lock_key, time=30 * 60): _metadata = { **(metadata or {}), "file_name": file_share.path, - "self_link": self_link, } - try: - _process_file( + + self._futures.append( + IOBoundPool.submit( + self._process_file, connection=connection, file_share=file_share, work_dir=work_dir, - http_client=http_client, - rag_endpoint=rag_endpoint, metadata=_metadata, ) - finally: - cache_client.delete(_lock_key) - else: - _LOG.info(f"Document {self_link} is already processing") - except: - _LOG.warn( - f"Error fetching and updating documents from shared_file share {file_share.path} - ", - exc_info=True, - ) + ) - _LOG.info( - f"Completed syncing files from samba server {endpoint} " - f"to endpoint {rag_endpoint}" - ) + except: + _LOG.warning( + f"Error fetching and updating documents from shared_file share {file_share.path} - ", + exc_info=True, + ) + def _process_file(self, connection, file_share, work_dir, metadata): + username = connection["user"] + password = connection["password"] + endpoint = connection["endpoint"] + port = connection["port"] -def _process_file( - connection, file_share, work_dir, http_client, rag_endpoint, metadata -): - username = connection["user"] - password = connection["password"] - endpoint = connection["endpoint"] - port = connection["port"] + if file_share.is_dir(): + if not file_share.name.startswith("."): + dir_files = scandir( + file_share.path, username=username, password=password, port=port + ) + for dir_file in dir_files: + self._process_file( + connection=connection, + file_share=dir_file, + work_dir=work_dir, + metadata=metadata, + ) + return + + file_name = file_share.name + file_stats = stat( + file_share.path, username=username, password=password, port=port + ) + last_change_datetime = datetime.fromtimestamp(file_stats.st_chgtime) - if file_share.is_dir(): - if not file_share.name.startswith("."): - dir_files = scandir( - file_share.path, username=username, password=password, port=port + try: + file_path = f"{work_dir}/{file_share.name}" + file_unique_id = f"{uuid.uuid5(uuid.NAMESPACE_DNS, file_share.path)}" + + _LOG.info( + f"Starting syncing shared_file {file_name} in shared directory share {file_share.path}" ) - for dir_file in dir_files: - _process_file( - connection=connection, - file_share=dir_file, - work_dir=work_dir, - http_client=http_client, - rag_endpoint=rag_endpoint, - metadata=metadata, + + try: + shutil.copyfile( + file_share.path, + file_path, + username=username, + password=password, + port=port, ) - return + self_link = file_share.path + _lock_key = clean_control(f"{__name__}/locks/{self_link}") - file_name = file_share.name - file_stats = stat(file_share.path, username=username, password=password, port=port) - last_change_datetime = datetime.fromtimestamp(file_stats.st_chgtime) + metadata["self_link"] = self_link - try: - file_path = f"{work_dir}/{file_share.name}" - file_unique_id = f"{uuid.uuid5(uuid.NAMESPACE_DNS, file_share.path)}" - - _LOG.info( - f"Starting syncing shared_file {file_name} in shared directory share {file_share.path}" - ) + if self._cache_client.add(key=_lock_key, val=_lock_key, time=30 * 60): + try: + self.sync( + endpoint, + file_path, + last_modified=last_change_datetime, + metadata=metadata, + store_metadata={ + "shared_folder": file_share.name, + "file_path": file_share.path, + "filename": file_share.path, + "file_id": file_unique_id, + "size": file_stats.st_size, + "name": file_name, + "last_modified": last_change_datetime.strftime( + DEFAULT_DATETIME_FORMAT + ), + }, + ) + finally: + self._cache_client.delete(_lock_key) + else: + _LOG.info(f"Document {self_link} is already processing") + + except: + _LOG.warning( + f"Failed to copy contents of shared_file {file_name} from shared location {file_share.path}", + exc_info=True, + ) + return - try: - shutil.copyfile( - file_share.path, - file_path, - username=username, - password=password, - port=port, + _LOG.info( + f"Done syncing shared_file {file_name} in location {file_share.path}" ) except Exception as ex: _LOG.warn( - f"Failed to copy contents of shared_file {file_name} from shared location {file_share.path}", + f"Error when getting and ingesting shared_file {file_name} - {ex}", exc_info=True, ) - return - """ - Here we check if this file has been updated. - If the file has been updated, we delete it from the vector store and re-ingest the new updated file - """ - document: Document = get_document(document_id=file_unique_id) - if document and document.base_uri == endpoint: - store_metadata = document.store_metadata - if store_metadata and store_metadata.get("last_modified"): - if not strptime(date_string=store_metadata["last_modified"]).replace( - tzinfo=None - ) < last_change_datetime.replace(tzinfo=None).replace(microsecond=0): - _LOG.debug(f"Skipping shared_file {file_name} already up to date") - return - rag_metadata: dict = document.rag_metadata - if rag_metadata is None: - return - for document_data in rag_metadata.get("data") or []: - try: - util.un_ingest_file( - http_client=http_client, - endpoint=rag_endpoint, - doc_id=document_data["doc_id"], - ) - except: - _LOG.warn( - f"Failed to delete document {document_data['doc_id']}", - exc_info=True, - ) - try: - delete_document(document_id=file_unique_id) - except: - _LOG.warn( - f"Failed to delete shared_file {file_name}'s record. Continuing anyway...", - exc_info=True, - ) + def _unsync_samba_documents(self, connection): + username = connection["user"] + password = connection["password"] + port = connection["port"] - file_metadata = { - "shared_folder": file_share.name, - "file_path": file_share.path, - "file_id": file_unique_id, - "size": file_stats.st_size, - "name": file_name, - "last_modified": last_change_datetime.strftime(DEFAULT_DATETIME_FORMAT), - } + def clean(**kwargs): + store_metadata = kwargs["store_metadata"] + file_path = store_metadata["file_path"] + try: + stat(file_path, username=username, password=password, port=port) + return False + except Exception as error: + if "No such file" in str(error): + return True + else: + raise try: - response = ingest_file( - http_client=http_client, - endpoint=rag_endpoint, - metadata=metadata, - file_path=file_path, - ) - except UserWarning: - _LOG.debug(f"File {file_path} is already processing") - return - response_json = json.loads(response) - - save_document( - document_id=file_unique_id, - filename=file_share.path, - base_uri=endpoint, - rag_metadata=response_json, - store_metadata=file_metadata, - ) + self.unsync(clean=clean) + except: + _LOG.warning("Error fetching and updating documents", exc_info=True) - _LOG.info(f"Done syncing shared_file {file_name} in location {file_share.path}") - except Exception as ex: - _LOG.warn( - f"Error when getting and ingesting shared_file {file_name} - {ex}", - exc_info=True, - ) - _LOG.info( - f"Completed syncing files from shared_file share {file_share.path} to endpoint {rag_endpoint}" - ) +def _connect_samba_server(connection): + username = connection.get("user") + password = connection.get("password") + endpoint = connection.get("endpoint") + port = connection.get("port") + next(scandir(endpoint, username=username, password=password, port=port)) -def _unsync_samba_documents(connection, rag_endpoint, http_client): - try: - username = connection["user"] - password = connection["password"] - endpoint = connection["endpoint"] - port = connection["port"] - work_dir = f"/tmp/{uuid.uuid4()}" - pathlib.Path(work_dir).mkdir(parents=True) +def validate_connection_info(connection: Dict[str, Any]) -> Dict[str, Any]: + port = connection.get("port") or DEFAULT_SAMBA_PORT + _valid_keys = ["port", "endpoint", "user", "password", "dataobjects"] + if not str(port).isnumeric(): + raise ValueError("Port value cannot be non numeric") - documents = get_documents(base_uri=endpoint) - for document in documents: - store_metadata = document.store_metadata - rag_metadata = document.rag_metadata + assert not isblank( + connection.get("endpoint") + ), "A valid share address must be supplied" - file_path = store_metadata["file_path"] - try: - stat(file_path, username=username, password=password, port=port) - except smbprotocol.exceptions.SMBOSError as error: - if "No such file" not in str(error): - raise - try: - http_client.deletes( - [ - f"{rag_endpoint}/v1/ingest/documents/{document_data['doc_id']}" - for document_data in rag_metadata.get("data") or [] - ] - ) - _LOG.info(f"Deleting document {document.filename}") - delete_document(document_id=document.id) - except: - _LOG.warning( - f"Failed to delete document {document.filename}", - exc_info=True, - ) - _LOG.info(f"Completed unsyncing files from endpoint {rag_endpoint}") - except: - _LOG.warn("Error fetching and updating documents", exc_info=True) + try: + _connect_samba_server(connection) + except Exception as ex: + _LOG.exception( + f"Failed to connect to samba server at {connection['endpoint']}", + ) + raise ValueError(ex) + connection["port"] = port + return { + key: val + for key, val in connection.items() + if key in _valid_keys and not isblank(connection[key]) + } diff --git a/nesis/api/core/document_loaders/sharepoint.py b/nesis/api/core/document_loaders/sharepoint.py index 1ab496a..94f768e 100644 --- a/nesis/api/core/document_loaders/sharepoint.py +++ b/nesis/api/core/document_loaders/sharepoint.py @@ -10,6 +10,7 @@ from office365.sharepoint.client_context import ClientContext from office365.runtime.client_request_exception import ClientRequestException +from nesis.api.core.document_loaders.loader_helper import DocumentProcessor from nesis.api.core.util import http, clean_control, isblank import logging from nesis.api.core.models.entities import Document, Datasource @@ -28,278 +29,195 @@ _LOG = logging.getLogger(__name__) -def fetch_documents( - datasource: Datasource, - rag_endpoint: str, - http_client: http.HttpClient, - metadata: Dict[str, Any], - cache_client: memcache.Client, -) -> None: - try: +class Processor(DocumentProcessor): + def __init__( + self, + config, + http_client: http.HttpClient, + cache_client: memcache.Client, + datasource: Datasource, + ): + super().__init__(config, http_client, datasource) + self._config = config + self._http_client = http_client + self._cache_client = cache_client + self._datasource = datasource + self._futures = [] - connection = datasource.connection - site_url = connection.get("endpoint") - client_id = connection.get("client_id") - tenant = connection.get("tenant_id") - thumbprint = connection.get("thumbprint") - - with tempfile.NamedTemporaryFile(dir=tempfile.gettempdir()) as tmp: - cert_path = f"{str(pathlib.Path(tmp.name).absolute())}-{uuid.uuid4()}.key" - pathlib.Path(cert_path).write_text(connection["certificate"]) - - _sharepoint_context = ClientContext(site_url).with_client_certificate( - tenant=tenant, - client_id=client_id, - thumbprint=thumbprint, - cert_path=cert_path, - ) + def run(self, metadata: Dict[str, Any]): - _sync_sharepoint_documents( - sp_context=_sharepoint_context, - datasource=datasource, - rag_endpoint=rag_endpoint, - http_client=http_client, - metadata=metadata, - cache_client=cache_client, - ) - _unsync_sharepoint_documents( - sp_context=_sharepoint_context, - connection=connection, - rag_endpoint=rag_endpoint, - http_client=http_client, - ) - except Exception as ex: - _LOG.exception(f"Error fetching sharepoint documents - {ex}") + try: + connection = self._datasource.connection + site_url = connection.get("endpoint") + client_id = connection.get("client_id") + tenant = connection.get("tenant_id") + thumbprint = connection.get("thumbprint") -def _sync_sharepoint_documents( - sp_context, datasource, rag_endpoint, http_client, metadata, cache_client -): - try: - _LOG.info(f"Initializing sharepoint syncing to endpoint {rag_endpoint}") + with tempfile.NamedTemporaryFile(dir=tempfile.gettempdir()) as tmp: + cert_path = ( + f"{str(pathlib.Path(tmp.name).absolute())}-{uuid.uuid4()}.key" + ) + pathlib.Path(cert_path).write_text(connection["certificate"]) - if sp_context is None: - raise Exception( - "Sharepoint context is null, cannot proceed with document processing." - ) + _sharepoint_context = ClientContext(site_url).with_client_certificate( + tenant=tenant, + client_id=client_id, + thumbprint=thumbprint, + cert_path=cert_path, + ) + + self._sync_sharepoint_documents( + sp_context=_sharepoint_context, + metadata=metadata, + ) + self._unsync_sharepoint_documents( + sp_context=_sharepoint_context, + ) + except Exception as ex: + _LOG.exception(f"Error fetching sharepoint documents - {ex}") - # Data objects allow us to specify folder names - connection = datasource.connection - sharepoint_folders = connection.get("dataobjects") - if sharepoint_folders is None: - _LOG.warning("Sharepoint folders are specified, so I can't do much") + def _sync_sharepoint_documents(self, sp_context, metadata): + try: - sp_folders = sharepoint_folders.split(",") + if sp_context is None: + raise Exception( + "Sharepoint context is null, cannot proceed with document processing." + ) - root_folder = sp_context.web.default_document_library().root_folder + # Data objects allow us to specify folder names + connection = self._datasource.connection + sharepoint_folders = connection.get("dataobjects") + if sharepoint_folders is None: + _LOG.warning("Sharepoint folders are specified, so I can't do much") - for folder_name in sp_folders: - sharepoint_folder = root_folder.folders.get_by_path(folder_name) + sp_folders = sharepoint_folders.split(",") - if sharepoint_folder is None: - _LOG.warning( - f"Cannot retrieve Sharepoint folder {sharepoint_folder} proceeding to process other folders" - ) - continue + root_folder = sp_context.web.default_document_library().root_folder - _process_folder_files( - sharepoint_folder, - datasource=datasource, - rag_endpoint=rag_endpoint, - http_client=http_client, - metadata=metadata, - cache_client=cache_client, - ) + for folder_name in sp_folders: + sharepoint_folder = root_folder.folders.get_by_path(folder_name) + + if sharepoint_folder is None: + _LOG.warning( + f"Cannot retrieve Sharepoint folder {sharepoint_folder} proceeding to process other folders" + ) + continue - # Recursively get all the child folders - _child_folders_recursive = sharepoint_folder.get_folders( - True - ).execute_query() - for _child_folder in _child_folders_recursive: - _process_folder_files( - _child_folder, - connection=connection, - rag_endpoint=rag_endpoint, - http_client=http_client, + self._process_folder_files( + sharepoint_folder, metadata=metadata, - cache_client=cache_client, ) - _LOG.info(f"Completed syncing to endpoint {rag_endpoint}") - - except Exception as file_ex: - _LOG.exception( - f"Error fetching and updating documents - Error: {file_ex}", exc_info=True - ) - - -def _process_file( - file, datasource: Datasource, rag_endpoint, http_client, metadata, cache_client -): - connection = datasource.connection - site_url = connection.get("endpoint") - parsed_site_url = urlparse(site_url) - site_root_url = "{uri.scheme}://{uri.netloc}".format(uri=parsed_site_url) - self_link = f"{site_root_url}{file.serverRelativeUrl}" - _metadata = { - **(metadata or {}), - "file_name": file.name, - "self_link": self_link, - } - """ - We use memcache's add functionality to implement a shared lock to allow for multiple instances - operating - """ - _lock_key = clean_control(f"{__name__}/locks/{self_link}") - if cache_client.add(key=_lock_key, val=_lock_key, time=30 * 60): - try: - _sync_document( - datasource=datasource, - rag_endpoint=rag_endpoint, - http_client=http_client, - metadata=_metadata, - file=file, + # Recursively get all the child folders + _child_folders_recursive = sharepoint_folder.get_folders( + True + ).execute_query() + for _child_folder in _child_folders_recursive: + self._process_folder_files( + _child_folder, + metadata=metadata, + ) + + except Exception as file_ex: + _LOG.exception( + f"Error fetching and updating documents - Error: {file_ex}", + exc_info=True, ) - finally: - cache_client.delete(_lock_key) - else: - _LOG.info(f"Document {self_link} is already processing") - - -def _process_folder_files( - folder, datasource, rag_endpoint, http_client, metadata, cache_client -): - # process files in folder - _files = folder.get_files(False).execute_query() - for file in _files: - _process_file( - file=file, - datasource=datasource, - rag_endpoint=rag_endpoint, - http_client=http_client, - metadata=metadata, - cache_client=cache_client, - ) - - -def _sync_document( - datasource: Datasource, - rag_endpoint: str, - http_client: http.HttpClient, - metadata: dict, - file, -): - connection = datasource.connection - site_url = connection["endpoint"] - _metadata = metadata - - with tempfile.NamedTemporaryFile( - dir=tempfile.gettempdir(), - ) as tmp: - key_parts = file.serverRelativeUrl.split("/") - - path_to_tmp = f"{str(pathlib.Path(tmp.name).absolute())}-{key_parts[-1]}" - try: - _LOG.info( - f"Starting syncing file {file.name} from {file.serverRelativeUrl}" - ) - # Write item to file - downloaded_file_name = path_to_tmp # os.path.join(path_to_tmp, file.name) - # How can we refine this for efficiency - with open(downloaded_file_name, "wb") as local_file: - file.download(local_file).execute_query() - - document: Document = get_document(document_id=file.unique_id) - if document and document.base_uri == site_url: - store_metadata = document.store_metadata - if store_metadata and store_metadata.get("last_modified"): - last_modified = store_metadata["last_modified"] - if ( - not strptime(date_string=last_modified).replace(tzinfo=None) - < file.time_last_modified - ): - _LOG.debug( - f"Skipping sharepoint document {file.name} already up to date" - ) - return - rag_metadata: dict = document.rag_metadata - if rag_metadata is None: - return - for document_data in rag_metadata.get("data") or []: - try: - un_ingest_file( - http_client=http_client, - endpoint=rag_endpoint, - doc_id=document_data["doc_id"], - ) - except: - _LOG.warning( - f"Failed to delete document {document_data['doc_id']}" - ) - try: - delete_document(document_id=document.id) - except: - _LOG.warning( - f"Failed to delete document {file.name}'s record. Continuing anyway..." - ) + def _process_file( + self, + file, + metadata, + ): + connection = self._datasource.connection + site_url = connection.get("endpoint") + parsed_site_url = urlparse(site_url) + site_root_url = "{uri.scheme}://{uri.netloc}".format(uri=parsed_site_url) + self_link = f"{site_root_url}{file.serverRelativeUrl}" + _metadata = { + **(metadata or {}), + "file_name": file.name, + "self_link": self_link, + } + + """ + We use memcache's add functionality to implement a shared lock to allow for multiple instances + operating + """ + _lock_key = clean_control(f"{__name__}/locks/{self_link}") + if self._cache_client.add(key=_lock_key, val=_lock_key, time=30 * 60): try: - response = ingest_file( - http_client=http_client, - endpoint=rag_endpoint, + self._sync_document( metadata=_metadata, - file_path=downloaded_file_name, + file=file, ) - response_json = json.loads(response) + finally: + self._cache_client.delete(_lock_key) + else: + _LOG.info(f"Document {self_link} is already processing") + + def _process_folder_files(self, folder, metadata): + # process files in folder + _files = folder.get_files(False).execute_query() + for file in _files: + self._process_file( + file=file, + metadata=metadata, + ) - except ValueError: - _LOG.warning( - f"File {downloaded_file_name} ingestion failed", exc_info=True + def _sync_document( + self, + metadata: dict, + file, + ): + connection = self._datasource.connection + site_url = connection["endpoint"] + _metadata = metadata + + with tempfile.NamedTemporaryFile( + dir=tempfile.gettempdir(), + ) as tmp: + key_parts = file.serverRelativeUrl.split("/") + + path_to_tmp = f"{str(pathlib.Path(tmp.name).absolute())}-{key_parts[-1]}" + + try: + _LOG.info( + f"Starting syncing file {file.name} from {file.serverRelativeUrl}" ) - response_json = {} - except UserWarning: - _LOG.debug(f"File {downloaded_file_name} is already processing") - return - - save_document( - document_id=file.unique_id, - filename=file.serverRelativeUrl, - base_uri=site_url, - rag_metadata=response_json, - datasource_id=datasource.uuid, - store_metadata={ - "file_name": file.name, - "file_url": file.serverRelativeUrl, - "etag": file.unique_id, - "size": file.length, - "author": file.author, - "last_modified": file.time_last_modified.strftime( - DEFAULT_DATETIME_FORMAT - ), - }, - last_modified=file.time_last_modified, - ) - _LOG.info(f"Done syncing object {file.name} in at {file.serverRelativeUrl}") - except Exception as ex: - _LOG.warning( - f"Error when getting and ingesting file {file.name}", exc_info=True - ) + # Write item to file + # How can we refine this for efficiency + with open(path_to_tmp, "wb") as local_file: + file.download(local_file).execute_query() -def _unsync_sharepoint_documents(sp_context, http_client, rag_endpoint, connection): + self.sync( + site_url, + path_to_tmp, + last_modified=file.time_last_modified, + metadata=metadata, + store_metadata={ + "filename": file.name, + "file_url": file.serverRelativeUrl, + "etag": file.unique_id, + "size": file.length, + "author": file.author, + "last_modified": file.time_last_modified.strftime( + DEFAULT_DATETIME_FORMAT + ), + }, + ) + except: + _LOG.warning( + f"Error when getting and ingesting file {file.name}", exc_info=True + ) - try: - site_url = connection.get("endpoint") + def _unsync_sharepoint_documents(self, sp_context): - if sp_context is None: - raise Exception( - "Sharepoint context is null, cannot proceed with document processing." - ) + def clean(**kwargs): + store_metadata = kwargs["store_metadata"] - documents = get_documents(base_uri=site_url) - for document in documents: - store_metadata = document.store_metadata - rag_metadata = document.rag_metadata file_url = store_metadata["file_url"] try: # Check that the file still exists on the sharepoint server @@ -308,27 +226,14 @@ def _unsync_sharepoint_documents(sp_context, http_client, rag_endpoint, connecti ).get().execute_query() except ClientRequestException as e: if e.response.status_code == 404: - # File no longer exists on sharepoint server so we need to delete from model - try: - http_client.deletes( - urls=[ - f"{rag_endpoint}/v1/ingest/documents/{document_data['doc_id']}" - for document_data in rag_metadata.get("data") or [] - ] - ) - _LOG.info(f"Deleting document {document.filename}") - delete_document(document_id=document.id) - except: - _LOG.warning( - f"Failed to delete document {document.filename}", - exc_info=True, - ) - except Exception as ex: - _LOG.warning( - f"Failed to retrieve file {file_url} from sharepoint - {ex}" - ) - except: - _LOG.warning("Error fetching and updating documents", exc_info=True) + return True + else: + raise + + try: + self.unsync(clean=clean) + except: + _LOG.warning("Error fetching and updating documents", exc_info=True) def validate_connection_info(connection: Dict[str, Any]) -> Dict[str, Any]: diff --git a/nesis/api/core/tasks/document_management.py b/nesis/api/core/tasks/document_management.py index 890babd..c6e06bd 100644 --- a/nesis/api/core/tasks/document_management.py +++ b/nesis/api/core/tasks/document_management.py @@ -51,28 +51,35 @@ def ingest_datasource(**kwargs) -> None: minio_ingestor.run(metadata=metadata) case DatasourceType.SHAREPOINT: - sharepoint.fetch_documents( - datasource=datasource, - rag_endpoint=rag_endpoint, + + ingestor = sharepoint.Processor( + config=config, http_client=http_client, cache_client=cache_client, - metadata={"datasource": datasource.name}, + datasource=datasource, ) + + ingestor.run(metadata=metadata) case DatasourceType.WINDOWS_SHARE: - samba.fetch_documents( - connection=datasource.connection, - rag_endpoint=rag_endpoint, + + ingestor = samba.Processor( + config=config, http_client=http_client, - metadata={"datasource": datasource.name}, cache_client=cache_client, + datasource=datasource, ) + + ingestor.run(metadata=metadata) + case DatasourceType.S3: - s3.fetch_documents( - datasource=datasource, - rag_endpoint=rag_endpoint, + minio_ingestor = s3.Processor( + config=config, http_client=http_client, - metadata={"datasource": datasource.name}, cache_client=cache_client, + datasource=datasource, ) + + minio_ingestor.run(metadata=metadata) + case _: raise ValueError("Invalid datasource type") diff --git a/nesis/api/tests/core/document_loaders/test_minio.py b/nesis/api/tests/core/document_loaders/test_minio.py index 8148d71..9e51366 100644 --- a/nesis/api/tests/core/document_loaders/test_minio.py +++ b/nesis/api/tests/core/document_loaders/test_minio.py @@ -400,11 +400,17 @@ def test_update_ingest_documents( session.add(datasource) session.commit() - # The document record + self_link = "http://localhost:4566/my-test-bucket/SomeName" + # The document record document = Document( base_uri="http://localhost:4566", - document_id="d41d8cd98f00b204e9800998ecf8427e", + document_id=str( + uuid.uuid5( + uuid.NAMESPACE_DNS, + f"{datasource.uuid}:{self_link}", + ) + ), filename="invalid.pdf", rag_metadata={"data": [{"doc_id": str(uuid.uuid4())}]}, store_metadata={ @@ -449,8 +455,8 @@ def test_update_ingest_documents( ) # The document would be deleted from the rag engine - _, upload_kwargs = http_client.deletes.call_args_list[0] - urls = upload_kwargs["urls"] + _, deletes_kwargs = http_client.deletes.call_args_list[0] + urls = deletes_kwargs["urls"] assert ( urls[0] @@ -474,7 +480,7 @@ def test_update_ingest_documents( { "datasource": "documents", "file_name": "my-test-bucket/SomeName", - "self_link": "http://localhost:4566/my-test-bucket/SomeName", + "self_link": self_link, }, ) diff --git a/nesis/api/tests/core/document_loaders/test_s3.py b/nesis/api/tests/core/document_loaders/test_s3.py index d95f50d..f1c2028 100644 --- a/nesis/api/tests/core/document_loaders/test_s3.py +++ b/nesis/api/tests/core/document_loaders/test_s3.py @@ -13,6 +13,7 @@ import nesis.api.core.services as services import nesis.api.core.document_loaders.s3 as s3 from nesis.api import tests +from nesis.api.core.document_loaders.stores import SqlDocumentStore from nesis.api.core.models import DBSession from nesis.api.core.models import initialize_engine from nesis.api.core.models.entities import ( @@ -107,12 +108,14 @@ def test_sync_documents( ] s3_client.get_paginator.return_value = paginator - s3.fetch_documents( - datasource=datasource, + ingestor = s3.Processor( + config=tests.config, http_client=http_client, - metadata={"datasource": "documents"}, - rag_endpoint="http://localhost:8080", cache_client=cache, + datasource=datasource, + ) + ingestor.run( + metadata={"datasource": "documents"}, ) _, upload_kwargs = http_client.upload.call_args_list[0] @@ -150,7 +153,7 @@ def test_update_sync_documents( "connection": { "endpoint": "http://localhost:4566", "region": "us-east-1", - "dataobjects": "my-test-bucket", + "dataobjects": "some-bucket", }, } @@ -164,17 +167,26 @@ def test_update_sync_documents( session.add(datasource) session.commit() + self_link = "http://localhost:4566/some-bucket/invalid.pdf" + + # The document record document = Document( base_uri="http://localhost:4566", - document_id="d41d8cd98f00b204e9800998ecf8427e", + document_id=str( + uuid.uuid5( + uuid.NAMESPACE_DNS, + f"{datasource.uuid}:{self_link}", + ) + ), filename="invalid.pdf", rag_metadata={"data": [{"doc_id": str(uuid.uuid4())}]}, store_metadata={ "bucket_name": "some-bucket", - "object_name": "file/path.pdf", + "object_name": "invalid.pdf", "last_modified": "2023-07-18 06:40:07", }, - last_modified=datetime.datetime.utcnow(), + last_modified=strptime("2023-07-19 06:40:07"), + datasource_id=datasource.uuid, ) session.add(document) @@ -191,8 +203,8 @@ def test_update_sync_documents( "KeyCount": 1, "Contents": [ { - "Key": "image.jpg", - "LastModified": strptime("2023-07-19 06:40:07"), + "Key": "invalid.pdf", + "LastModified": strptime("2023-07-20 06:40:07"), "ETag": "d41d8cd98f00b204e9800998ecf8427e", "Size": 0, "StorageClass": "STANDARD", @@ -206,22 +218,24 @@ def test_update_sync_documents( ] s3_client.get_paginator.return_value = paginator - s3.fetch_documents( - datasource=datasource, + ingestor = s3.Processor( + config=tests.config, http_client=http_client, - metadata={"datasource": "documents"}, - rag_endpoint="http://localhost:8080", cache_client=cache, + datasource=datasource, + ) + + ingestor.run( + metadata={"datasource": "documents"}, ) # The document would be deleted from the rag engine - _, upload_kwargs = http_client.delete.call_args_list[0] - url = upload_kwargs["url"] + _, deletes_kwargs = http_client.deletes.call_args_list[0] + url = deletes_kwargs["urls"] - assert ( - url - == f"http://localhost:8080/v1/ingest/documents/{document.rag_metadata['data'][0]['doc_id']}" - ) + assert url == [ + f"http://localhost:8080/v1/ingest/documents/{document.rag_metadata['data'][0]['doc_id']}" + ] # And then re-ingested _, upload_kwargs = http_client.upload.call_args_list[0] @@ -231,21 +245,22 @@ def test_update_sync_documents( field = upload_kwargs["field"] assert url == f"http://localhost:8080/v1/ingest/files" - assert file_path.endswith("image.jpg") + assert file_path.endswith("invalid.pdf") assert field == "file" ut.TestCase().assertDictEqual( metadata, { "datasource": "documents", - "file_name": "my-test-bucket/image.jpg", - "self_link": "http://localhost:4566/my-test-bucket/image.jpg", + "file_name": "some-bucket/invalid.pdf", + "self_link": self_link, }, ) # The document has now been updated documents = session.query(Document).all() assert len(documents) == 1 - assert documents[0].store_metadata["last_modified"] == "2023-07-19 06:40:07" + assert documents[0].store_metadata["last_modified"] == "2023-07-20 06:40:07" + assert str(documents[0].last_modified) == "2023-07-20 06:40:07" @mock.patch("nesis.api.core.document_loaders.s3.boto3.client") @@ -260,8 +275,6 @@ def test_unsync_s3_documents( "engine": "s3", "connection": { "endpoint": "http://localhost:4566", - # "user": "test", - # "password": "test", "region": "us-east-1", "dataobjects": "some-non-existing-bucket", }, @@ -299,12 +312,15 @@ def test_unsync_s3_documents( documents = session.query(Document).all() assert len(documents) == 1 - s3.fetch_documents( - datasource=datasource, + ingestor = s3.Processor( + config=tests.config, http_client=http_client, - metadata={"datasource": "documents"}, - rag_endpoint="http://localhost:8080", cache_client=cache, + datasource=datasource, + ) + + ingestor.run( + metadata={"datasource": "documents"}, ) _, upload_kwargs = http_client.deletes.call_args_list[0] @@ -316,3 +332,189 @@ def test_unsync_s3_documents( ) documents = session.query(Document).all() assert len(documents) == 0 + + +@mock.patch("nesis.api.core.document_loaders.s3.boto3.client") +def test_extract_documents( + client: mock.MagicMock, cache: mock.MagicMock, session: Session +) -> None: + destination_sql_url = tests.config["database"]["url"] + # destination_sql_url = "mssql+pymssql://sa:Pa55woR.d12345@localhost:11433/master" + data = { + "name": "s3 documents", + "engine": "s3", + "connection": { + "endpoint": "https://s3.endpoint", + "access_key": "", + "secret_key": "", + "dataobjects": "buckets", + "mode": "extract", + "destination": { + "sql": {"url": destination_sql_url}, + }, + }, + } + + datasource = Datasource( + name=data["name"], + connection=data["connection"], + source_type=DatasourceType.S3, + status=DatasourceStatus.ONLINE, + ) + + session.add(datasource) + session.commit() + + http_client = mock.MagicMock() + http_client.upload.return_value = json.dumps({}) + s3_client = mock.MagicMock() + + client.return_value = s3_client + paginator = mock.MagicMock() + paginator.paginate.return_value = [ + { + "KeyCount": 1, + "Contents": [ + { + "Key": "image.jpg", + "LastModified": strptime("2023-07-18 06:40:07"), + "ETag": '"d41d8cd98f00b204e9800998ecf8427e"', + "Size": 0, + "StorageClass": "STANDARD", + "Owner": { + "DisplayName": "webfile", + "ID": "75aa57f09aa0c8caeab4f8c24e99d10f8e7faeebf76c078efc7c6caea54ba06a", + }, + } + ], + } + ] + s3_client.get_paginator.return_value = paginator + + ingestor = s3.Processor( + config=tests.config, + http_client=http_client, + cache_client=cache, + datasource=datasource, + ) + + extract_store = SqlDocumentStore( + url=data["connection"]["destination"]["sql"]["url"] + ) + + with Session(extract_store._engine) as session: + initial_count = len( + session.query(ingestor._extract_runner._extraction_store.Store) + .filter() + .all() + ) + + ingestor.run( + metadata={"datasource": "documents"}, + ) + + _, upload_kwargs = http_client.upload.call_args_list[0] + url = upload_kwargs["url"] + metadata = upload_kwargs["metadata"] + field = upload_kwargs["field"] + + assert url == "http://localhost:8080/v1/extractions/text" + assert field == "file" + ut.TestCase().assertDictEqual( + metadata, + { + "datasource": "documents", + "file_name": "buckets/image.jpg", + "self_link": "https://s3.endpoint/buckets/image.jpg", + }, + ) + + with Session(extract_store._engine) as session: + all_documents = ( + session.query(ingestor._extract_runner._extraction_store.Store) + .filter() + .all() + ) + assert len(all_documents) == initial_count + 1 + + +@mock.patch("nesis.api.core.document_loaders.s3.boto3.client") +def test_unextract_documents( + client: mock.MagicMock, cache: mock.MagicMock, session: Session +) -> None: + """ + Test deleting of s3 documents from the rag engine if they have been deleted from the s3 bucket + """ + destination_sql_url = tests.config["database"]["url"] + data = { + "name": "s3 documents", + "engine": "s3", + "connection": { + "endpoint": "https://s3.endpoint", + "access_key": "", + "secret_key": "", + "dataobjects": "buckets", + "mode": "extract", + "destination": { + "sql": {"url": destination_sql_url}, + }, + }, + } + datasource = Datasource( + name=data["name"], + connection=data["connection"], + source_type=DatasourceType.MINIO, + status=DatasourceStatus.ONLINE, + ) + + session.add(datasource) + session.commit() + + http_client = mock.MagicMock() + s3_client = mock.MagicMock() + + client.return_value = s3_client + s3_client.head_object.side_effect = Exception("HeadObject Not Found") + + minio_ingestor = s3.Processor( + config=tests.config, + http_client=http_client, + cache_client=cache, + datasource=datasource, + ) + + extract_store = SqlDocumentStore( + url=data["connection"]["destination"]["sql"]["url"] + ) + + with Session(extract_store._engine) as session: + session.query(minio_ingestor._extract_runner._extraction_store.Store).delete() + document = minio_ingestor._extract_runner._extraction_store.Store() + document.base_uri = data["connection"]["endpoint"] + document.uuid = str(uuid.uuid4()) + document.filename = "invalid.pdf" + document.extract_metadata = {"data": [{"doc_id": str(uuid.uuid4())}]} + document.store_metadata = { + "bucket_name": "some-bucket", + "object_name": "file/path.pdf", + } + document.last_modified = datetime.datetime.utcnow() + + session.add(document) + session.commit() + + initial_count = len( + session.query(minio_ingestor._extract_runner._extraction_store.Store) + .filter() + .all() + ) + + minio_ingestor.run( + metadata={"datasource": "documents"}, + ) + + with Session(extract_store._engine) as session: + documents = session.query( + minio_ingestor._extract_runner._extraction_store.Store + ).all() + assert len(documents) == initial_count - 1 diff --git a/nesis/api/tests/core/document_loaders/test_samba.py b/nesis/api/tests/core/document_loaders/test_samba.py index ca65db8..30d2eda 100644 --- a/nesis/api/tests/core/document_loaders/test_samba.py +++ b/nesis/api/tests/core/document_loaders/test_samba.py @@ -1,8 +1,10 @@ +import datetime import json import os import time import unittest as ut import unittest.mock as mock +import uuid import pytest from sqlalchemy.orm.session import Session @@ -14,12 +16,14 @@ from nesis.api.core.models import initialize_engine from nesis.api.core.models.entities import ( Datasource, + Document, ) from nesis.api.core.models.objects import ( DatasourceType, DatasourceStatus, ) +from nesis.api.core.util.dateutil import strptime @pytest.fixture @@ -48,14 +52,12 @@ def configure() -> None: @mock.patch("nesis.api.core.document_loaders.samba.scandir") @mock.patch("nesis.api.core.document_loaders.samba.stat") @mock.patch("nesis.api.core.document_loaders.samba.shutil") -def test_fetch_documents( - shutil, stat, scandir, cache: mock.MagicMock, session: Session -) -> None: +def test_ingest(shutil, stat, scandir, cache: mock.MagicMock, session: Session) -> None: data = { "name": "s3 documents", - "engine": "s3", + "engine": "samba", "connection": { - "endpoint": "https://s3.endpoint", + "endpoint": r"\\Share", "user": "user", "port": "445", "password": "password", @@ -87,12 +89,14 @@ def test_fetch_documents( http_client = mock.MagicMock() http_client.upload.return_value = json.dumps({}) - samba.fetch_documents( - connection=data["connection"], + ingestor = samba.Processor( + config=tests.config, http_client=http_client, - metadata={"datasource": "documents"}, - rag_endpoint="http://localhost:8080", cache_client=cache, + datasource=datasource, + ) + ingestor.run( + metadata={"datasource": "documents"}, ) _, upload_kwargs = http_client.upload.call_args_list[0] @@ -112,3 +116,180 @@ def test_fetch_documents( "self_link": r"\\Share\SomeName", }, ) + + +@mock.patch("nesis.api.core.document_loaders.samba.scandir") +@mock.patch("nesis.api.core.document_loaders.samba.stat") +@mock.patch("nesis.api.core.document_loaders.samba.shutil") +def test_uningest( + shutil, stat, scandir, cache: mock.MagicMock, session: Session +) -> None: + """ + Test deleting of s3 documents from the rag engine if they have been deleted from the s3 bucket + """ + data = { + "name": "s3 documents", + "engine": "windows_share", + "connection": { + "endpoint": r"\\Share", + "user": "user", + "port": "445", + "password": "password", + "dataobjects": "buckets", + }, + } + + datasource = Datasource( + name=data["name"], + connection=data["connection"], + source_type=DatasourceType.SHAREPOINT, + status=DatasourceStatus.ONLINE, + ) + + session.add(datasource) + session.commit() + + document = Document( + base_uri=datasource.connection["endpoint"], + datasource_id=datasource.uuid, + document_id=str(uuid.uuid4()), + filename="invalid.pdf", + rag_metadata={"data": [{"doc_id": str(uuid.uuid4())}]}, + store_metadata={ + "bucket_name": "some-bucket", + "object_name": "file/path.pdf", + "file_path": r"\\Share\file\path.pdf", + }, + last_modified=datetime.datetime.utcnow(), + ) + + session.add(document) + session.commit() + + http_client = mock.MagicMock() + + stat.side_effect = Exception("No such file") + + documents = session.query(Document).all() + assert len(documents) == 1 + + ingestor = samba.Processor( + config=tests.config, + http_client=http_client, + cache_client=cache, + datasource=datasource, + ) + + ingestor.run( + metadata={"datasource": "documents"}, + ) + + _, upload_kwargs = http_client.deletes.call_args_list[0] + urls = upload_kwargs["urls"] + + assert ( + urls[0] + == f"http://localhost:8080/v1/ingest/documents/{document.rag_metadata['data'][0]['doc_id']}" + ) + documents = session.query(Document).all() + assert len(documents) == 0 + + +@mock.patch("nesis.api.core.document_loaders.samba.scandir") +@mock.patch("nesis.api.core.document_loaders.samba.stat") +@mock.patch("nesis.api.core.document_loaders.samba.shutil") +def test_update(shutil, stat, scandir, cache: mock.MagicMock, session: Session) -> None: + """ + Test updating documents if they have been updated at the s3 bucket end. + """ + + data = { + "name": "s3 documents", + "engine": "windows_share", + "connection": { + "endpoint": r"\\Share", + "user": "user", + "port": "445", + "password": "password", + "dataobjects": "buckets", + }, + } + + datasource = Datasource( + name=data["name"], + connection=data["connection"], + source_type=DatasourceType.SHAREPOINT, + status=DatasourceStatus.ONLINE, + ) + + session.add(datasource) + session.commit() + + self_link = "http://localhost:4566/some-bucket/invalid.pdf" + + # The document record + document = Document( + base_uri=r"\\Share", + document_id=str( + uuid.uuid5( + uuid.NAMESPACE_DNS, + rf"{datasource.uuid}:\\Share\file\path.pdf", + ) + ), + filename="invalid.pdf", + rag_metadata={"data": [{"doc_id": str(uuid.uuid4())}]}, + store_metadata={ + "bucket_name": "some-bucket", + "object_name": "invalid.pdf", + "last_modified": "2023-07-18 06:40:07", + "file_path": r"\\Share\file\path.pdf", + }, + last_modified=strptime("2023-07-19 06:40:07"), + datasource_id=datasource.uuid, + ) + + session.add(document) + session.commit() + + share = mock.MagicMock() + share.is_dir.return_value = False + type(share).name = mock.PropertyMock(return_value="SomeName") + type(share).path = mock.PropertyMock(return_value=r"\\Share\file\path.pdf") + scandir.return_value = [share] + + file_stat = mock.MagicMock() + stat.return_value = file_stat + type(file_stat).st_size = mock.PropertyMock(return_value=1) + type(file_stat).st_chgtime = mock.PropertyMock( + return_value=strptime("2023-07-20 06:40:07").timestamp() + ) + + http_client = mock.MagicMock() + http_client.upload.return_value = json.dumps({}) + + ingestor = samba.Processor( + config=tests.config, + http_client=http_client, + cache_client=cache, + datasource=datasource, + ) + ingestor.run( + metadata={"datasource": "documents"}, + ) + + # The document would be deleted from the rag engine + _, deletes_kwargs = http_client.deletes.call_args_list[0] + url = deletes_kwargs["urls"] + + assert url == [ + f"http://localhost:8080/v1/ingest/documents/{document.rag_metadata['data'][0]['doc_id']}" + ] + + # And then re-ingested + _, upload_kwargs = http_client.upload.call_args_list[0] + + # The document has now been updated + documents = session.query(Document).all() + assert len(documents) == 1 + assert documents[0].store_metadata["last_modified"] == "2023-07-20 06:40:07" + assert str(documents[0].last_modified) == "2023-07-20 06:40:07" diff --git a/nesis/api/tests/core/document_loaders/test_sharepoint.py b/nesis/api/tests/core/document_loaders/test_sharepoint.py index 5ad3d91..fcc48ec 100644 --- a/nesis/api/tests/core/document_loaders/test_sharepoint.py +++ b/nesis/api/tests/core/document_loaders/test_sharepoint.py @@ -22,10 +22,11 @@ DatasourceType, DatasourceStatus, ) +from nesis.api.core.util.dateutil import strptime @mock.patch("nesis.api.core.document_loaders.sharepoint.ClientContext") -def test_sync_sharepoint_documents( +def test_ingest( client_context: mock.MagicMock, cache: mock.MagicMock, session: Session ) -> None: data = { @@ -81,12 +82,14 @@ def test_sync_sharepoint_documents( http_client = mock.MagicMock() http_client.upload.return_value = json.dumps({}) - sharepoint.fetch_documents( - datasource=datasource, + ingestor = sharepoint.Processor( + config=tests.config, http_client=http_client, - metadata={"datasource": "documents"}, - rag_endpoint="http://localhost:8080", cache_client=cache, + datasource=datasource, + ) + ingestor.run( + metadata={"datasource": "documents"}, ) _, upload_kwargs = http_client.upload.call_args_list[0] @@ -109,7 +112,7 @@ def test_sync_sharepoint_documents( @mock.patch("nesis.api.core.document_loaders.sharepoint.ClientContext") -def test_sync_updated_sharepoint_documents( +def test_updated( sharepoint_context: mock.MagicMock, cache: mock.MagicMock, session: Session ) -> None: """ @@ -139,10 +142,13 @@ def test_sync_updated_sharepoint_documents( session.add(datasource) session.commit() + self_link = "https://ametnes.sharepoint.com/sites/nesit-test/Shared Documents/sharepoint_file.pdf" document = Document( datasource_id=datasource.uuid, base_uri="https://ametnes.sharepoint.com/sites/nesis-test/", - document_id="edu323-23423-23frs-234232", + document_id=str( + uuid.uuid5(uuid.NAMESPACE_DNS, f"{datasource.uuid}:{self_link}") + ), filename="sharepoint_file.pdf", rag_metadata={"data": [{"doc_id": str(uuid.uuid4())}]}, store_metadata={ @@ -153,7 +159,7 @@ def test_sync_updated_sharepoint_documents( "author": "author_name", "last_modified": "2024-01-10 06:40:07", }, - last_modified=datetime.datetime.utcnow(), + last_modified=strptime("2024-01-10 06:40:07"), ) session.add(document) @@ -180,7 +186,7 @@ def test_sync_updated_sharepoint_documents( ) type(file_mock).time_last_modified = mock.PropertyMock( return_value=datetime.datetime.strptime( - "2024-04-10 06:40:07", "%Y-%m-%d %H:%M:%S" + "2024-04-11 06:40:07", "%Y-%m-%d %H:%M:%S" ) ) type(file_mock).length = mock.PropertyMock(return_value=2023) @@ -195,22 +201,23 @@ def test_sync_updated_sharepoint_documents( http_client = mock.MagicMock() http_client.upload.return_value = json.dumps({}) - sharepoint.fetch_documents( - datasource=datasource, + ingestor = sharepoint.Processor( + config=tests.config, http_client=http_client, - metadata={"datasource": "documents"}, - rag_endpoint="http://localhost:8080", cache_client=cache, + datasource=datasource, + ) + ingestor.run( + metadata={"datasource": "documents"}, ) # The document would be deleted from the rag engine - _, upload_kwargs = http_client.delete.call_args_list[0] - url = upload_kwargs["url"] + _, upload_kwargs = http_client.deletes.call_args_list[0] + urls = upload_kwargs["urls"] - assert ( - url - == f"http://localhost:8080/v1/ingest/documents/{document.rag_metadata['data'][0]['doc_id']}" - ) + assert urls == [ + f"http://localhost:8080/v1/ingest/documents/{document.rag_metadata['data'][0]['doc_id']}" + ] # And then re-ingested _, upload_kwargs = http_client.upload.call_args_list[0] @@ -234,11 +241,11 @@ def test_sync_updated_sharepoint_documents( # The document has now been updated documents = session.query(Document).all() assert len(documents) == 1 - assert documents[0].store_metadata["last_modified"] == "2024-04-10 06:40:07" + assert documents[0].store_metadata["last_modified"] == "2024-04-11 06:40:07" @mock.patch("nesis.api.core.document_loaders.sharepoint.ClientContext") -def test_unsync_sharepoint_documents( +def test_uningest( sharepoint_context: mock.MagicMock, cache: mock.MagicMock, session: Session ) -> None: """ @@ -318,12 +325,14 @@ def test_unsync_sharepoint_documents( documents = session.query(Document).all() assert len(documents) == 1 - sharepoint.fetch_documents( - datasource=datasource, + ingestor = sharepoint.Processor( + config=tests.config, http_client=http_client, - metadata={"datasource": "documents"}, - rag_endpoint="http://localhost:8080", cache_client=cache, + datasource=datasource, + ) + ingestor.run( + metadata={"datasource": "documents"}, ) _, upload_kwargs = http_client.deletes.call_args_list[0] diff --git a/nesis/api/tests/tasks/test_document_management.py b/nesis/api/tests/tasks/test_document_management.py index c8bf7a6..55c9546 100644 --- a/nesis/api/tests/tasks/test_document_management.py +++ b/nesis/api/tests/tasks/test_document_management.py @@ -97,15 +97,9 @@ def test_ingest_datasource_minio( @mock.patch("nesis.api.core.document_loaders.samba.scandir") -@mock.patch("nesis.api.core.tasks.document_management.samba._unsync_samba_documents") -@mock.patch("nesis.api.core.tasks.document_management.samba._sync_samba_documents") +@mock.patch("nesis.api.core.tasks.document_management.samba.Processor") def test_ingest_datasource_samba( - _sync_samba_documents: mock.MagicMock, - _unsync_samba_documents: mock.MagicMock, - scandir, - tc, - cache_client, - http_client, + ingestor: mock.MagicMock, scandir, tc, cache_client, http_client ): """ Test the ingestion happy path @@ -117,7 +111,7 @@ def test_ingest_datasource_samba( password=tests.admin_password, ) datasource: Datasource = create_datasource( - token=admin_user.token, datasource_type="windows_share" + token=admin_user.token, datasource_type="WINDOWS_SHARE" ) ingest_datasource( @@ -127,21 +121,9 @@ def test_ingest_datasource_samba( params={"datasource": {"id": datasource.uuid}}, ) - _, kwargs_sync_samba_documents = _sync_samba_documents.call_args_list[0] - assert ( - kwargs_sync_samba_documents["rag_endpoint"] == tests.config["rag"]["endpoint"] - ) - tc.assertDictEqual(kwargs_sync_samba_documents["connection"], datasource.connection) - tc.assertDictEqual( - kwargs_sync_samba_documents["metadata"], {"datasource": datasource.name} - ) - - _, kwargs_unsync_samba_documents = _unsync_samba_documents.call_args_list[0] - assert ( - kwargs_unsync_samba_documents["rag_endpoint"] == tests.config["rag"]["endpoint"] - ) + _, kwargs_fetch_documents = ingestor.return_value.run.call_args_list[0] tc.assertDictEqual( - kwargs_unsync_samba_documents["connection"], datasource.connection + kwargs_fetch_documents["metadata"], {"datasource": datasource.name} ) @@ -162,17 +144,8 @@ def test_ingest_datasource_invalid_datasource( assert "Invalid datasource" in str(ex_info) -@mock.patch("nesis.api.core.document_loaders.s3.boto3.client") -@mock.patch("nesis.api.core.tasks.document_management.s3._unsync_documents") -@mock.patch("nesis.api.core.tasks.document_management.s3._sync_documents") -def test_ingest_datasource_s3( - _sync_documents: mock.MagicMock, - _unsync_documents: mock.MagicMock, - client: mock.MagicMock(), - tc, - cache_client, - http_client, -): +@mock.patch("nesis.api.core.tasks.document_management.s3.Processor") +def test_ingest_datasource_s3(ingestor: mock.MagicMock, tc, cache_client, http_client): """ Test the ingestion happy path """ @@ -193,37 +166,15 @@ def test_ingest_datasource_s3( params={"datasource": {"id": datasource.uuid}}, ) - _, kwargs_sync_samba_documents = _sync_documents.call_args_list[0] - assert ( - kwargs_sync_samba_documents["rag_endpoint"] == tests.config["rag"]["endpoint"] - ) - assert kwargs_sync_samba_documents.get("datasource") is not None - - tc.assertDictEqual( - kwargs_sync_samba_documents["metadata"], {"datasource": datasource.name} - ) - - _, kwargs_unsync_samba_documents = _unsync_documents.call_args_list[0] - assert ( - kwargs_unsync_samba_documents["rag_endpoint"] == tests.config["rag"]["endpoint"] - ) + _, kwargs_fetch_documents = ingestor.return_value.run.call_args_list[0] tc.assertDictEqual( - kwargs_unsync_samba_documents["connection"], datasource.connection + kwargs_fetch_documents["metadata"], {"datasource": datasource.name} ) -@mock.patch( - "nesis.api.core.tasks.document_management.sharepoint._unsync_sharepoint_documents" -) -@mock.patch( - "nesis.api.core.tasks.document_management.sharepoint._sync_sharepoint_documents" -) +@mock.patch("nesis.api.core.tasks.document_management.sharepoint.Processor") def test_ingest_datasource_sharepoint( - _sync_sharepoint_documents: mock.MagicMock, - _unsync_sharepoint_documents: mock.MagicMock, - tc, - cache_client, - http_client, + ingestor: mock.MagicMock, tc, cache_client, http_client ): """ Test the ingestion happy path @@ -245,24 +196,7 @@ def test_ingest_datasource_sharepoint( params={"datasource": {"id": datasource.uuid}}, ) - _, kwargs_sync_sharepoint_documents = _sync_sharepoint_documents.call_args_list[0] - assert ( - kwargs_sync_sharepoint_documents["rag_endpoint"] - == tests.config["rag"]["endpoint"] - ) - assert kwargs_sync_sharepoint_documents.get("datasource") is not None - - tc.assertDictEqual( - kwargs_sync_sharepoint_documents["metadata"], {"datasource": datasource.name} - ) - - _, kwargs_unsync_sharepoint_documents = _unsync_sharepoint_documents.call_args_list[ - 0 - ] - assert ( - kwargs_unsync_sharepoint_documents["rag_endpoint"] - == tests.config["rag"]["endpoint"] - ) + _, kwargs_fetch_documents = ingestor.return_value.run.call_args_list[0] tc.assertDictEqual( - kwargs_unsync_sharepoint_documents["connection"], datasource.connection + kwargs_fetch_documents["metadata"], {"datasource": datasource.name} )