From decfb0fc2609a856746f9252c28dc77c6502bc7e Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 17 Jun 2024 20:40:10 -0400 Subject: [PATCH 01/28] Add job types --- .../scripts/native/search.py | 6 +- .../initialize-orchestration-db.py | 1 + .../job_orchestration/scheduler/constants.py | 10 ++ .../scheduler/query/query_scheduler.py | 92 ++++++++++--------- .../scheduler/scheduler_data.py | 13 ++- 5 files changed, 75 insertions(+), 47 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/search.py b/components/clp-package-utils/clp_package_utils/scripts/native/search.py index aa261d904..e8fc8da3c 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/search.py @@ -15,7 +15,7 @@ import pymongo from clp_py_utils.clp_config import Database, QUERY_JOBS_TABLE_NAME, ResultsCache from clp_py_utils.sql_adapter import SQL_Adapter -from job_orchestration.scheduler.constants import QueryJobStatus +from job_orchestration.scheduler.constants import QueryJobStatus, QueryJobType from job_orchestration.scheduler.job_config import AggregationConfig, SearchConfig from clp_package_utils.general import ( @@ -111,8 +111,8 @@ def create_and_monitor_job_in_db( ) as db_cursor: # Create job db_cursor.execute( - f"INSERT INTO `{QUERY_JOBS_TABLE_NAME}` (`job_config`) VALUES (%s)", - (msgpack.packb(search_config.dict()),), + f"INSERT INTO `{QUERY_JOBS_TABLE_NAME}` (`job_config`, `type`) VALUES (%s, %s)", + (msgpack.packb(search_config.dict()), QueryJobType.SEARCH), ) db_conn.commit() job_id = db_cursor.lastrowid diff --git a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py index 32a285c42..4899fb85d 100644 --- a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py +++ b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py @@ -97,6 +97,7 @@ def main(argv): f""" CREATE TABLE IF NOT EXISTS `{QUERY_JOBS_TABLE_NAME}` ( `id` INT NOT NULL AUTO_INCREMENT, + `type`INT NOT NULL, `status` INT NOT NULL DEFAULT '{QueryJobStatus.PENDING}', `creation_time` DATETIME(3) NOT NULL DEFAULT CURRENT_TIMESTAMP(3), `num_tasks` INT NOT NULL DEFAULT '0', diff --git a/components/job-orchestration/job_orchestration/scheduler/constants.py b/components/job-orchestration/job_orchestration/scheduler/constants.py index 62f06f0cf..3d813e30f 100644 --- a/components/job-orchestration/job_orchestration/scheduler/constants.py +++ b/components/job-orchestration/job_orchestration/scheduler/constants.py @@ -67,3 +67,13 @@ def __str__(self) -> str: def to_str(self) -> str: return str(self.name) + +class QueryJobType(IntEnum): + SEARCH = 0 + EXTRACT_IR = auto() + + def __str__(self) -> str: + return str(self.value) + + def to_str(self) -> str: + return str(self.name) \ No newline at end of file diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index d8a045f31..51b93fe1a 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -40,7 +40,7 @@ from clp_py_utils.decorators import exception_default_value from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.fs_search_task import search 
-from job_orchestration.scheduler.constants import QueryJobStatus, QueryTaskStatus +from job_orchestration.scheduler.constants import QueryJobStatus, QueryTaskStatus, QueryJobType from job_orchestration.scheduler.job_config import SearchConfig from job_orchestration.scheduler.query.reducer_handler import ( handle_reducer_connection, @@ -102,7 +102,8 @@ def fetch_new_search_jobs(db_conn) -> list: db_cursor.execute( f""" SELECT {QUERY_JOBS_TABLE_NAME}.id as job_id, - {QUERY_JOBS_TABLE_NAME}.job_config + {QUERY_JOBS_TABLE_NAME}.job_config, + {QUERY_JOBS_TABLE_NAME}.type FROM {QUERY_JOBS_TABLE_NAME} WHERE {QUERY_JOBS_TABLE_NAME}.status={QueryJobStatus.PENDING} """ @@ -370,55 +371,62 @@ def handle_pending_search_jobs( reducer_acquisition_tasks = [] - pending_jobs = [ - job for job in active_jobs.values() if InternalJobState.WAITING_FOR_DISPATCH == job.state + pending_search_jobs = [ + job for job in active_jobs.values() if InternalJobState.WAITING_FOR_DISPATCH == job.state and QueryJobType.SEARCH == job.type ] with contextlib.closing(db_conn_pool.connect()) as db_conn: for job in fetch_new_search_jobs(db_conn): job_id = str(job["job_id"]) + job_type = job["type"] + job_config = job["job_config"] + + if QueryJobType.SEARCH == job_type: + # Avoid double-dispatch when a job is WAITING_FOR_REDUCER + if job_id in active_jobs: + continue + + search_config = SearchConfig.parse_obj(msgpack.unpackb(job_config)) + archives_for_search = get_archives_for_search(db_conn, search_config) + if len(archives_for_search) == 0: + if set_job_or_task_status( + db_conn, + QUERY_JOBS_TABLE_NAME, + job_id, + QueryJobStatus.SUCCEEDED, + QueryJobStatus.PENDING, + start_time=datetime.datetime.now(), + num_tasks=0, + duration=0, + ): + logger.info(f"No matching archives, skipping job {job_id}.") + continue + + new_search_job = SearchJob( + id=job_id, + search_config=search_config, + state=InternalJobState.WAITING_FOR_DISPATCH, + num_archives_to_search=len(archives_for_search), + num_archives_searched=0, + remaining_archives_for_search=archives_for_search, + ) - # Avoid double-dispatch when a job is WAITING_FOR_REDUCER - if job_id in active_jobs: - continue - - search_config = SearchConfig.parse_obj(msgpack.unpackb(job["job_config"])) - archives_for_search = get_archives_for_search(db_conn, search_config) - if len(archives_for_search) == 0: - if set_job_or_task_status( - db_conn, - QUERY_JOBS_TABLE_NAME, - job_id, - QueryJobStatus.SUCCEEDED, - QueryJobStatus.PENDING, - start_time=datetime.datetime.now(), - num_tasks=0, - duration=0, - ): - logger.info(f"No matching archives, skipping job {job['job_id']}.") - continue - - new_search_job = SearchJob( - id=job_id, - search_config=search_config, - state=InternalJobState.WAITING_FOR_DISPATCH, - num_archives_to_search=len(archives_for_search), - num_archives_searched=0, - remaining_archives_for_search=archives_for_search, - ) + if search_config.aggregation_config is not None: + new_search_job.search_config.aggregation_config.job_id = int(job_id) + new_search_job.state = InternalJobState.WAITING_FOR_REDUCER + new_search_job.reducer_acquisition_task = asyncio.create_task( + acquire_reducer_for_job(new_search_job) + ) + reducer_acquisition_tasks.append(new_search_job.reducer_acquisition_task) + else: + pending_search_jobs.append(new_search_job) + active_jobs[job_id] = new_search_job - if search_config.aggregation_config is not None: - new_search_job.search_config.aggregation_config.job_id = job["job_id"] - new_search_job.state = InternalJobState.WAITING_FOR_REDUCER - 
new_search_job.reducer_acquisition_task = asyncio.create_task( - acquire_reducer_for_job(new_search_job) - ) - reducer_acquisition_tasks.append(new_search_job.reducer_acquisition_task) else: - pending_jobs.append(new_search_job) - active_jobs[job_id] = new_search_job + logger.error(f"Unexpected job type: {job_type}, skipping job {job_id}") + continue - for job in pending_jobs: + for job in pending_search_jobs: job_id = job.id if ( diff --git a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py index a3aa5f436..5939ba0f2 100644 --- a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py +++ b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py @@ -3,10 +3,10 @@ from enum import auto, Enum from typing import Any, Dict, List, Optional -from job_orchestration.scheduler.constants import CompressionTaskStatus, QueryTaskStatus +from job_orchestration.scheduler.constants import CompressionTaskStatus, QueryTaskStatus, QueryJobType from job_orchestration.scheduler.job_config import SearchConfig from job_orchestration.scheduler.query.reducer_handler import ReducerHandlerMessageQueues -from pydantic import BaseModel, validator +from pydantic import BaseModel, validator, Field class CompressionJob(BaseModel): @@ -37,10 +37,17 @@ class InternalJobState(Enum): class QueryJob(BaseModel): id: str + type: QueryJobType state: InternalJobState start_time: Optional[datetime.datetime] current_sub_job_async_task_result: Optional[Any] + @validator("type") + def valid_type(cls, field): + supported_job = [QueryJobType.SEARCH, QueryJobType.EXTRACT_IR] + if field not in supported_job: + raise ValueError(f'must be one of the following {"|".join(supported_job)}') + return field class SearchJob(QueryJob): search_config: SearchConfig @@ -50,6 +57,8 @@ class SearchJob(QueryJob): reducer_acquisition_task: Optional[asyncio.Task] reducer_handler_msg_queues: Optional[ReducerHandlerMessageQueues] + type: QueryJobType = Field(default=QueryJobType.SEARCH, const=True) + class Config: # To allow asyncio.Task and asyncio.Queue arbitrary_types_allowed = True From e52877ce75141d16312f1def48b3bd886541c414 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 19 Jun 2024 11:50:02 -0400 Subject: [PATCH 02/28] Add base class for SearchQuery --- .../executor/query/fs_search_task.py | 4 +- .../job_orchestration/scheduler/job_config.py | 15 +- .../scheduler/query/query_scheduler.py | 320 ++++++++++-------- .../scheduler/scheduler_data.py | 39 ++- .../webui/imports/api/search/constants.js | 14 + .../api/search/server/QueryJobsDbManager.js | 8 +- 6 files changed, 239 insertions(+), 161 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index f51eae407..ac3d1312f 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -168,7 +168,7 @@ def search( task_id=task_id, status=QueryTaskStatus.FAILED, duration=0, - error_log_path=clo_log_path, + error_log_path=str(clo_log_path), ).dict() update_search_task_metadata( @@ -231,6 +231,6 @@ def sigterm_handler(_signo, _stack_frame): ) if QueryTaskStatus.FAILED == search_status: - search_task_result.error_log_path = clo_log_path + search_task_result.error_log_path = 
str(clo_log_path) return search_task_result.dict() diff --git a/components/job-orchestration/job_orchestration/scheduler/job_config.py b/components/job-orchestration/job_orchestration/scheduler/job_config.py index 93d4ede4e..6968b6dad 100644 --- a/components/job-orchestration/job_orchestration/scheduler/job_config.py +++ b/components/job-orchestration/job_orchestration/scheduler/job_config.py @@ -3,7 +3,7 @@ import typing from pydantic import BaseModel, validator - +from abc import ABC class PathsToCompress(BaseModel): file_paths: typing.List[str] @@ -39,7 +39,18 @@ class AggregationConfig(BaseModel): count_by_time_bucket_size: typing.Optional[int] = None # Milliseconds -class SearchConfig(BaseModel): +class QueryConfig(BaseModel, ABC): + ... + + +class ExtractConfig(QueryConfig): + orig_file_id: str + msg_ix: int + file_split_id: typing.Optional[str] = None + target_size: typing.Optional[int] = None + + +class SearchConfig(QueryConfig): query_string: str max_num_results: int tags: typing.Optional[typing.List[str]] = None diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index 51b93fe1a..434a03f7f 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -24,7 +24,7 @@ import pathlib import sys from pathlib import Path -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Any import celery import msgpack @@ -41,21 +41,21 @@ from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.fs_search_task import search from job_orchestration.scheduler.constants import QueryJobStatus, QueryTaskStatus, QueryJobType -from job_orchestration.scheduler.job_config import SearchConfig +from job_orchestration.scheduler.job_config import SearchConfig, ExtractConfig from job_orchestration.scheduler.query.reducer_handler import ( handle_reducer_connection, ReducerHandlerMessage, ReducerHandlerMessageQueues, ReducerHandlerMessageType, ) -from job_orchestration.scheduler.scheduler_data import InternalJobState, QueryTaskResult, SearchJob +from job_orchestration.scheduler.scheduler_data import InternalJobState, QueryTaskResult, SearchJob, QueryJob, ExtractJob from pydantic import ValidationError # Setup logging logger = get_logger("search-job-handler") # Dictionary of active jobs indexed by job id -active_jobs: Dict[str, SearchJob] = {} +active_jobs: Dict[str, QueryJob] = {} reducer_connection_queue: Optional[asyncio.Queue] = None @@ -91,7 +91,7 @@ async def release_reducer_for_job(job: SearchJob): @exception_default_value(default=[]) -def fetch_new_search_jobs(db_conn) -> list: +def fetch_new_query_jobs(db_conn) -> list: """ Fetches search jobs with status=PENDING from the database. :param db_conn: @@ -112,7 +112,7 @@ def fetch_new_search_jobs(db_conn) -> list: @exception_default_value(default=[]) -def fetch_cancelling_search_jobs(db_conn) -> list: +def fetch_cancelling_query_jobs(db_conn) -> list: """ Fetches search jobs with status=CANCELLING from the database. 
:param db_conn: @@ -175,55 +175,67 @@ def set_job_or_task_status( return rval -async def handle_cancelling_search_jobs(db_conn_pool) -> None: +async def handle_cancelling_query_jobs(db_conn_pool) -> None: global active_jobs with contextlib.closing(db_conn_pool.connect()) as db_conn: - cancelling_jobs = fetch_cancelling_search_jobs(db_conn) - + cancelling_jobs = fetch_cancelling_query_jobs(db_conn) for cancelling_job in cancelling_jobs: job_id = str(cancelling_job["job_id"]) - if job_id in active_jobs: - job = active_jobs.pop(job_id) - cancel_job_except_reducer(job) - # Perform any async tasks last so that it's easier to reason about synchronization - # issues between concurrent tasks - await release_reducer_for_job(job) - else: - continue + job_type = job.type() + if QueryJobType.SEARCH == job_type: + if job_id in active_jobs: + job = active_jobs.pop(job_id) + cancel_job_except_reducer(job) + # Perform any async tasks last so that it's easier to reason about synchronization + # issues between concurrent tasks + await release_reducer_for_job(job) + else: + continue - set_job_or_task_status( - db_conn, - QUERY_TASKS_TABLE_NAME, - job_id, - QueryTaskStatus.CANCELLED, - QueryTaskStatus.PENDING, - duration=0, - ) + set_job_or_task_status( + db_conn, + QUERY_TASKS_TABLE_NAME, + job_id, + QueryTaskStatus.CANCELLED, + QueryTaskStatus.PENDING, + duration=0, + ) - set_job_or_task_status( - db_conn, - QUERY_TASKS_TABLE_NAME, - job_id, - QueryTaskStatus.CANCELLED, - QueryTaskStatus.RUNNING, - duration="TIMESTAMPDIFF(MICROSECOND, start_time, NOW())/1000000.0", - ) + set_job_or_task_status( + db_conn, + QUERY_TASKS_TABLE_NAME, + job_id, + QueryTaskStatus.CANCELLED, + QueryTaskStatus.RUNNING, + duration="TIMESTAMPDIFF(MICROSECOND, start_time, NOW())/1000000.0", + ) - if set_job_or_task_status( - db_conn, - QUERY_JOBS_TABLE_NAME, - job_id, - QueryJobStatus.CANCELLED, - QueryJobStatus.CANCELLING, - duration=(datetime.datetime.now() - job.start_time).total_seconds(), - ): - logger.info(f"Cancelled job {job_id}.") + if set_job_or_task_status( + db_conn, + QUERY_JOBS_TABLE_NAME, + job_id, + QueryJobStatus.CANCELLED, + QueryJobStatus.CANCELLING, + duration=(datetime.datetime.now() - job.start_time).total_seconds(), + ): + logger.info(f"Cancelled job {job_id}.") + else: + logger.error(f"Failed to cancel job {job_id}.") else: - logger.error(f"Failed to cancel job {job_id}.") + logger.error(f"Unexpected job type: {job_type} for cancellation, marking job {job_id} as failed.") + if not set_job_or_task_status( + db_conn, + QUERY_JOBS_TABLE_NAME, + job_id, + QueryJobStatus.FAILED, + duration=(datetime.datetime.now() - job.start_time).total_seconds(), + ): + logger.error(f"Failed to mark job {job_id} as failed.") + -def insert_search_tasks_into_db(db_conn, job_id, archive_ids: List[str]) -> List[int]: +def insert_query_tasks_into_db(db_conn, job_id, archive_ids: List[str]) -> List[int]: task_ids = [] with contextlib.closing(db_conn.cursor()) as cursor: for archive_id in archive_ids: @@ -274,18 +286,17 @@ def get_archives_for_search( def get_task_group_for_job( archive_ids: List[str], task_ids: List[int], - job_id: str, - search_config: SearchConfig, + job: QueryJob, clp_metadata_db_conn_params: Dict[str, any], results_cache_uri: str, ): - search_config_obj = search_config.dict() + job_config_obj = job.job_config().dict() return celery.group( search.s( - job_id=job_id, + job_id=job.id, archive_id=archive_ids[i], task_id=task_ids[i], - search_config_obj=search_config_obj, + search_config_obj=job_config_obj, 
clp_metadata_db_conn_params=clp_metadata_db_conn_params, results_cache_uri=results_cache_uri, ) @@ -293,22 +304,20 @@ def get_task_group_for_job( ) -def dispatch_search_job( +def dispatch_query_job( db_conn, - job: SearchJob, - archives_for_search: List[Dict[str, any]], + job: QueryJob, + archive_ids: List[str], clp_metadata_db_conn_params: Dict[str, any], results_cache_uri: str, ) -> None: global active_jobs - archive_ids = [archive["archive_id"] for archive in archives_for_search] - task_ids = insert_search_tasks_into_db(db_conn, job.id, archive_ids) + task_ids = insert_query_tasks_into_db(db_conn, job.id, archive_ids) task_group = get_task_group_for_job( archive_ids, task_ids, - job.id, - job.search_config, + job, clp_metadata_db_conn_params, results_cache_uri, ) @@ -376,7 +385,7 @@ def handle_pending_search_jobs( ] with contextlib.closing(db_conn_pool.connect()) as db_conn: - for job in fetch_new_search_jobs(db_conn): + for job in fetch_new_query_jobs(db_conn): job_id = str(job["job_id"]) job_type = job["type"] job_config = job["job_config"] @@ -421,14 +430,12 @@ def handle_pending_search_jobs( else: pending_search_jobs.append(new_search_job) active_jobs[job_id] = new_search_job - else: logger.error(f"Unexpected job type: {job_type}, skipping job {job_id}") continue for job in pending_search_jobs: job_id = job.id - if ( job.search_config.network_address is None and len(job.remaining_archives_for_search) > num_archives_to_search_per_sub_job @@ -443,11 +450,15 @@ def handle_pending_search_jobs( archives_for_search = job.remaining_archives_for_search job.remaining_archives_for_search = [] - dispatch_search_job( - db_conn, job, archives_for_search, clp_metadata_db_conn_params, results_cache_uri + archive_ids_for_search = [ + archive["archive_id"] for archive in archives_for_search + ] + + dispatch_query_job( + db_conn, job, archive_ids_for_search, clp_metadata_db_conn_params, results_cache_uri ) logger.info( - f"Dispatched job {job_id} with {len(archives_for_search)} archives to search." + f"Dispatched job {job_id} with {len(archive_ids_for_search)} archives to search." ) start_time = datetime.datetime.now() job.start_time = start_time @@ -495,6 +506,95 @@ def found_max_num_latest_results( return max_timestamp_in_remaining_archives <= min_timestamp_in_top_results +async def handle_returned_search_job( + db_conn, + job: SearchJob, + task_results: Optional[Any], + results_cache_uri: str +) -> None: + global active_jobs + + job_id = job.id + is_reducer_job = job.reducer_handler_msg_queues is not None + new_job_status = QueryJobStatus.RUNNING + for task_result_obj in task_results: + task_result = QueryTaskResult.parse_obj(task_result_obj) + task_id = task_result.task_id + task_status = task_result.status + if not task_status == QueryTaskStatus.SUCCEEDED: + new_job_status = QueryJobStatus.FAILED + logger.error( + f"Search task job-{job_id}-task-{task_id} failed. " + f"Check {task_result.error_log_path} for details." + ) + else: + job.num_archives_searched += 1 + logger.info( + f"Search task job-{job_id}-task-{task_id} succeeded in " + f"{task_result.duration} second(s)." 
+ ) + + if new_job_status != QueryJobStatus.FAILED: + max_num_results = job.search_config.max_num_results + # Check if we've searched all archives + if len(job.remaining_archives_for_search) == 0: + new_job_status = QueryJobStatus.SUCCEEDED + # Check if we've reached max results + elif False == is_reducer_job and max_num_results > 0: + if found_max_num_latest_results( + results_cache_uri, + job_id, + max_num_results, + job.remaining_archives_for_search[0]["end_timestamp"], + ): + new_job_status = QueryJobStatus.SUCCEEDED + if new_job_status == QueryJobStatus.RUNNING: + job.current_sub_job_async_task_result = None + job.state = InternalJobState.WAITING_FOR_DISPATCH + logger.info(f"Job {job_id} waiting for more archives to search.") + set_job_or_task_status( + db_conn, + QUERY_JOBS_TABLE_NAME, + job_id, + QueryJobStatus.RUNNING, + QueryJobStatus.RUNNING, + num_tasks_completed=job.num_archives_searched, + ) + return + + reducer_failed = False + if is_reducer_job: + # Notify reducer that it should have received all results + msg = ReducerHandlerMessage(ReducerHandlerMessageType.SUCCESS) + await job.reducer_handler_msg_queues.put_to_handler(msg) + + msg = await job.reducer_handler_msg_queues.get_from_handler() + if ReducerHandlerMessageType.FAILURE == msg.msg_type: + reducer_failed = True + new_job_status = QueryJobStatus.FAILED + elif ReducerHandlerMessageType.SUCCESS != msg.msg_type: + error_msg = f"Unexpected msg_type: {msg.msg_type.name}" + raise NotImplementedError(error_msg) + + # We set the status regardless of the job's previous status to handle the case where the + # job is cancelled (status = CANCELLING) while we're in this method. + if set_job_or_task_status( + db_conn, + QUERY_JOBS_TABLE_NAME, + job_id, + new_job_status, + num_tasks_completed=job.num_archives_searched, + duration=(datetime.datetime.now() - job.start_time).total_seconds(), + ): + if new_job_status == QueryJobStatus.SUCCEEDED: + logger.info(f"Completed job {job_id}.") + elif reducer_failed: + logger.error(f"Completed job {job_id} with failing reducer.") + else: + logger.info(f"Completed job {job_id} with failing tasks.") + del active_jobs[job_id] + + async def check_job_status_and_update_db(db_conn_pool, results_cache_uri): global active_jobs @@ -503,16 +603,15 @@ async def check_job_status_and_update_db(db_conn_pool, results_cache_uri): id for id, job in active_jobs.items() if InternalJobState.RUNNING == job.state ]: job = active_jobs[job_id] - is_reducer_job = job.reducer_handler_msg_queues is not None - try: returned_results = try_getting_task_result(job.current_sub_job_async_task_result) except Exception as e: logger.error(f"Job `{job_id}` failed: {e}.") # Clean up - if is_reducer_job: - msg = ReducerHandlerMessage(ReducerHandlerMessageType.FAILURE) - await job.reducer_handler_msg_queues.put_to_handler(msg) + if QueryJobType.SEARCH == job.type(): + if job.reducer_handler_msg_queues is not None: + msg = ReducerHandlerMessage(ReducerHandlerMessageType.FAILURE) + await job.reducer_handler_msg_queues.put_to_handler(msg) del active_jobs[job_id] set_job_or_task_status( @@ -527,89 +626,24 @@ async def check_job_status_and_update_db(db_conn_pool, results_cache_uri): if returned_results is None: continue - - new_job_status = QueryJobStatus.RUNNING - for task_result_obj in returned_results: - task_result = QueryTaskResult.parse_obj(task_result_obj) - task_id = task_result.task_id - task_status = task_result.status - if not task_status == QueryTaskStatus.SUCCEEDED: - new_job_status = QueryJobStatus.FAILED - logger.error( - 
f"Search task job-{job_id}-task-{task_id} failed. " - f"Check {task_result.error_log_path} for details." - ) - else: - job.num_archives_searched += 1 - logger.info( - f"Search task job-{job_id}-task-{task_id} succeeded in " - f"{task_result.duration} second(s)." - ) - - if new_job_status != QueryJobStatus.FAILED: - max_num_results = job.search_config.max_num_results - # Check if we've searched all archives - if len(job.remaining_archives_for_search) == 0: - new_job_status = QueryJobStatus.SUCCEEDED - # Check if we've reached max results - elif False == is_reducer_job and max_num_results > 0: - if found_max_num_latest_results( - results_cache_uri, - job_id, - max_num_results, - job.remaining_archives_for_search[0]["end_timestamp"], - ): - new_job_status = QueryJobStatus.SUCCEEDED - if new_job_status == QueryJobStatus.RUNNING: - job.current_sub_job_async_task_result = None - job.state = InternalJobState.WAITING_FOR_DISPATCH - logger.info(f"Job {job_id} waiting for more archives to search.") - set_job_or_task_status( + job_type = job.type() + if QueryJobType.SEARCH == job.type(): + search_job: SearchJob = job + await handle_returned_search_job( db_conn, - QUERY_JOBS_TABLE_NAME, - job_id, - QueryJobStatus.RUNNING, - QueryJobStatus.RUNNING, - num_tasks_completed=job.num_archives_searched, + search_job, + returned_results, + results_cache_uri ) - continue + else: + logger.error(f"Unexpected job type: {job_type}, skipping job {job_id}") + - reducer_failed = False - if is_reducer_job: - # Notify reducer that it should have received all results - msg = ReducerHandlerMessage(ReducerHandlerMessageType.SUCCESS) - await job.reducer_handler_msg_queues.put_to_handler(msg) - - msg = await job.reducer_handler_msg_queues.get_from_handler() - if ReducerHandlerMessageType.FAILURE == msg.msg_type: - reducer_failed = True - new_job_status = QueryJobStatus.FAILED - elif ReducerHandlerMessageType.SUCCESS != msg.msg_type: - error_msg = f"Unexpected msg_type: {msg.msg_type.name}" - raise NotImplementedError(error_msg) - - # We set the status regardless of the job's previous status to handle the case where the - # job is cancelled (status = CANCELLING) while we're in this method. 
- if set_job_or_task_status( - db_conn, - QUERY_JOBS_TABLE_NAME, - job_id, - new_job_status, - num_tasks_completed=job.num_archives_searched, - duration=(datetime.datetime.now() - job.start_time).total_seconds(), - ): - if new_job_status == QueryJobStatus.SUCCEEDED: - logger.info(f"Completed job {job_id}.") - elif reducer_failed: - logger.error(f"Completed job {job_id} with failing reducer.") - else: - logger.info(f"Completed job {job_id} with failing tasks.") - del active_jobs[job_id] async def handle_job_updates(db_conn_pool, results_cache_uri: str, jobs_poll_delay: float): while True: - await handle_cancelling_search_jobs(db_conn_pool) + await handle_cancelling_query_jobs(db_conn_pool) await check_job_status_and_update_db(db_conn_pool, results_cache_uri) await asyncio.sleep(jobs_poll_delay) diff --git a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py index 5939ba0f2..04ff0fc50 100644 --- a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py +++ b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py @@ -4,10 +4,10 @@ from typing import Any, Dict, List, Optional from job_orchestration.scheduler.constants import CompressionTaskStatus, QueryTaskStatus, QueryJobType -from job_orchestration.scheduler.job_config import SearchConfig +from job_orchestration.scheduler.job_config import SearchConfig, QueryConfig, ExtractConfig from job_orchestration.scheduler.query.reducer_handler import ReducerHandlerMessageQueues from pydantic import BaseModel, validator, Field - +from abc import ABC, abstractmethod class CompressionJob(BaseModel): id: int @@ -30,24 +30,37 @@ def valid_status(cls, field): class InternalJobState(Enum): + PENDING = auto() WAITING_FOR_REDUCER = auto() WAITING_FOR_DISPATCH = auto() RUNNING = auto() -class QueryJob(BaseModel): +class QueryJob(BaseModel, ABC): id: str - type: QueryJobType state: InternalJobState start_time: Optional[datetime.datetime] current_sub_job_async_task_result: Optional[Any] - @validator("type") - def valid_type(cls, field): - supported_job = [QueryJobType.SEARCH, QueryJobType.EXTRACT_IR] - if field not in supported_job: - raise ValueError(f'must be one of the following {"|".join(supported_job)}') - return field + @abstractmethod + def type(self) -> QueryJobType: + ... + + @abstractmethod + def job_config(self) -> QueryConfig: + ... 
+ + +class ExtractJob(QueryJob): + extract_config: ExtractConfig + archive_id: str + + def type(self) -> QueryJobType: + return QueryJobType.EXTRACT_IR + + def job_config(self) -> QueryConfig: + return self.extract_config + class SearchJob(QueryJob): search_config: SearchConfig @@ -57,7 +70,11 @@ class SearchJob(QueryJob): reducer_acquisition_task: Optional[asyncio.Task] reducer_handler_msg_queues: Optional[ReducerHandlerMessageQueues] - type: QueryJobType = Field(default=QueryJobType.SEARCH, const=True) + def type(self) -> QueryJobType: + return QueryJobType.SEARCH + + def job_config(self) -> QueryConfig: + return self.search_config class Config: # To allow asyncio.Task and asyncio.Queue arbitrary_types_allowed = True diff --git a/components/webui/imports/api/search/constants.js b/components/webui/imports/api/search/constants.js index ec4c13ad6..2da303915 100644 --- a/components/webui/imports/api/search/constants.js +++ b/components/webui/imports/api/search/constants.js @@ -56,6 +56,20 @@ const isOperationInProgress = (s) => ( (true === isSearchSignalReq(s)) || (true === isSearchSignalQuerying(s)) ); +/* eslint-disable sort-keys */ +let enumQueryType; +/** + * Enum of job type, matching the `QueryJobType` class in + * `job_orchestration.query_scheduler.constants`. + * + * @enum {number} + */ +const QUERY_JOB_TYPE = Object.freeze({ + SEARCH: (enumQueryType = 0), + EXTRACT_IR: ++enumQueryType, + +}); +/* eslint-enable sort-keys */ /* eslint-disable sort-keys */ let enumQueryJobStatus; diff --git a/components/webui/imports/api/search/server/QueryJobsDbManager.js b/components/webui/imports/api/search/server/QueryJobsDbManager.js index 4d3bed94a..266aac523 100644 --- a/components/webui/imports/api/search/server/QueryJobsDbManager.js +++ b/components/webui/imports/api/search/server/QueryJobsDbManager.js @@ -5,6 +5,7 @@ import {sleep} from "/imports/utils/misc"; import { QUERY_JOB_STATUS, QUERY_JOB_STATUS_WAITING_STATES, + QUERY_JOB_TYPE, } from "../constants"; @@ -21,6 +22,7 @@ const JOB_COMPLETION_STATUS_POLL_INTERVAL_MILLIS = 0.5; const QUERY_JOBS_TABLE_COLUMN_NAMES = Object.freeze({ ID: "id", STATUS: "status", + TYPE: "type", JOB_CONFIG: "job_config", }); @@ -52,9 +54,9 @@ class QueryJobsDbManager { async submitSearchJob (searchConfig) { const [queryInsertResults] = await this.#sqlDbConnPool.query( `INSERT INTO ${this.#queryJobsTableName} - (${QUERY_JOBS_TABLE_COLUMN_NAMES.JOB_CONFIG}) - VALUES (?)`, - [Buffer.from(msgpack.encode(searchConfig))], + (${QUERY_JOBS_TABLE_COLUMN_NAMES.JOB_CONFIG}, ${QUERY_JOBS_TABLE_COLUMN_NAMES.TYPE}) + VALUES (?, ?)`, + [Buffer.from(msgpack.encode(searchConfig)), QUERY_JOB_TYPE.SEARCH], ); return queryInsertResults.insertId; From d28681cc3adbe5461ddfdd81083f74cb1d1f221a Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 19 Jun 2024 12:58:32 -0400 Subject: [PATCH 03/28] Fix --- .../scheduler/query/query_scheduler.py | 94 +++++++++---------- .../webui/imports/api/search/constants.js | 29 +++--- 2 files changed, 57 insertions(+), 66 deletions(-) diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index 434a03f7f..9107a5aa4 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -93,9 +93,9 @@ async def release_reducer_for_job(job: SearchJob): 
@exception_default_value(default=[]) def fetch_new_query_jobs(db_conn) -> list: """ - Fetches search jobs with status=PENDING from the database. + Fetches query jobs with status=PENDING from the database. :param db_conn: - :return: The pending search jobs on success. An empty list if an exception occurs while + :return: The pending query jobs on success. An empty list if an exception occurs while interacting with the database. """ with contextlib.closing(db_conn.cursor(dictionary=True)) as db_cursor: @@ -112,7 +112,7 @@ def fetch_new_query_jobs(db_conn) -> list: @exception_default_value(default=[]) -def fetch_cancelling_query_jobs(db_conn) -> list: +def fetch_cancelling_search_jobs(db_conn) -> list: """ Fetches search jobs with status=CANCELLING from the database. :param db_conn: @@ -125,6 +125,7 @@ def fetch_cancelling_query_jobs(db_conn) -> list: SELECT {QUERY_JOBS_TABLE_NAME}.id as job_id FROM {QUERY_JOBS_TABLE_NAME} WHERE {QUERY_JOBS_TABLE_NAME}.status={QueryJobStatus.CANCELLING} + AND {QUERY_JOBS_TABLE_NAME}.type={QueryJobType.SEARCH} """ ) return db_cursor.fetchall() @@ -175,63 +176,52 @@ def set_job_or_task_status( return rval -async def handle_cancelling_query_jobs(db_conn_pool) -> None: +async def handle_cancelling_search_jobs(db_conn_pool) -> None: global active_jobs with contextlib.closing(db_conn_pool.connect()) as db_conn: - cancelling_jobs = fetch_cancelling_query_jobs(db_conn) + cancelling_jobs = fetch_cancelling_search_jobs(db_conn) + for cancelling_job in cancelling_jobs: job_id = str(cancelling_job["job_id"]) - job_type = job.type() - if QueryJobType.SEARCH == job_type: - if job_id in active_jobs: - job = active_jobs.pop(job_id) - cancel_job_except_reducer(job) - # Perform any async tasks last so that it's easier to reason about synchronization - # issues between concurrent tasks - await release_reducer_for_job(job) - else: - continue + if job_id in active_jobs: + job = active_jobs.pop(job_id) + cancel_job_except_reducer(job) + # Perform any async tasks last so that it's easier to reason about synchronization + # issues between concurrent tasks + await release_reducer_for_job(job) + else: + continue - set_job_or_task_status( - db_conn, - QUERY_TASKS_TABLE_NAME, - job_id, - QueryTaskStatus.CANCELLED, - QueryTaskStatus.PENDING, - duration=0, - ) + set_job_or_task_status( + db_conn, + QUERY_TASKS_TABLE_NAME, + job_id, + QueryTaskStatus.CANCELLED, + QueryTaskStatus.PENDING, + duration=0, + ) - set_job_or_task_status( - db_conn, - QUERY_TASKS_TABLE_NAME, - job_id, - QueryTaskStatus.CANCELLED, - QueryTaskStatus.RUNNING, - duration="TIMESTAMPDIFF(MICROSECOND, start_time, NOW())/1000000.0", - ) + set_job_or_task_status( + db_conn, + QUERY_TASKS_TABLE_NAME, + job_id, + QueryTaskStatus.CANCELLED, + QueryTaskStatus.RUNNING, + duration="TIMESTAMPDIFF(MICROSECOND, start_time, NOW())/1000000.0", + ) - if set_job_or_task_status( - db_conn, - QUERY_JOBS_TABLE_NAME, - job_id, - QueryJobStatus.CANCELLED, - QueryJobStatus.CANCELLING, - duration=(datetime.datetime.now() - job.start_time).total_seconds(), - ): - logger.info(f"Cancelled job {job_id}.") - else: - logger.error(f"Failed to cancel job {job_id}.") + if set_job_or_task_status( + db_conn, + QUERY_JOBS_TABLE_NAME, + job_id, + QueryJobStatus.CANCELLED, + QueryJobStatus.CANCELLING, + duration=(datetime.datetime.now() - job.start_time).total_seconds(), + ): + logger.info(f"Cancelled job {job_id}.") else: - logger.error(f"Unexpected job type: {job_type} for cancellation, marking job {job_id} as failed.") - if not set_job_or_task_status( - 
db_conn, - QUERY_JOBS_TABLE_NAME, - job_id, - QueryJobStatus.FAILED, - duration=(datetime.datetime.now() - job.start_time).total_seconds(), - ): - logger.error(f"Failed to mark job {job_id} as failed.") + logger.error(f"Failed to cancel job {job_id}.") @@ -643,7 +633,7 @@ async def check_job_status_and_update_db(db_conn_pool, results_cache_uri): async def handle_job_updates(db_conn_pool, results_cache_uri: str, jobs_poll_delay: float): while True: - await handle_cancelling_query_jobs(db_conn_pool) + await handle_cancelling_search_jobs(db_conn_pool) await check_job_status_and_update_db(db_conn_pool, results_cache_uri) await asyncio.sleep(jobs_poll_delay) diff --git a/components/webui/imports/api/search/constants.js b/components/webui/imports/api/search/constants.js index 2da303915..908367b76 100644 --- a/components/webui/imports/api/search/constants.js +++ b/components/webui/imports/api/search/constants.js @@ -56,20 +56,6 @@ const isOperationInProgress = (s) => ( (true === isSearchSignalReq(s)) || (true === isSearchSignalQuerying(s)) ); -/* eslint-disable sort-keys */ -let enumQueryType; -/** - * Enum of job type, matching the `QueryJobType` class in - * `job_orchestration.query_scheduler.constants`. - * - * @enum {number} - */ -const QUERY_JOB_TYPE = Object.freeze({ - SEARCH: (enumQueryType = 0), - EXTRACT_IR: ++enumQueryType, - -}); -/* eslint-enable sort-keys */ /* eslint-disable sort-keys */ let enumQueryJobStatus; @@ -95,6 +81,20 @@ const QUERY_JOB_STATUS_WAITING_STATES = [ QUERY_JOB_STATUS.CANCELLING, ]; +/* eslint-disable sort-keys */ +let enumQueryType; +/** + * Enum of job type, matching the `QueryJobType` class in + * `job_orchestration.query_scheduler.constants`. + * + * @enum {number} + */ +const QUERY_JOB_TYPE = Object.freeze({ + SEARCH: (enumQueryType = 0), + EXTRACT_IR: ++enumQueryType, +}); +/* eslint-enable sort-keys */ + /** * Enum of Mongo Collection sort orders. 
* @@ -128,6 +128,7 @@ export { MONGO_SORT_ORDER, QUERY_JOB_STATUS, QUERY_JOB_STATUS_WAITING_STATES, + QUERY_JOB_TYPE, SEARCH_MAX_NUM_RESULTS, SEARCH_RESULTS_FIELDS, SEARCH_SIGNAL, From d027279bfae0a4524d99597a832b150782ef9238 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 19 Jun 2024 14:53:13 -0400 Subject: [PATCH 04/28] Linter --- .../job_orchestration/scheduler/constants.py | 3 +- .../job_orchestration/scheduler/job_config.py | 6 +- .../scheduler/query/query_scheduler.py | 55 +++++++++---------- .../scheduler/scheduler_data.py | 19 ++++--- .../api/search/server/QueryJobsDbManager.js | 3 +- 5 files changed, 44 insertions(+), 42 deletions(-) diff --git a/components/job-orchestration/job_orchestration/scheduler/constants.py b/components/job-orchestration/job_orchestration/scheduler/constants.py index 3d813e30f..22e5e34f4 100644 --- a/components/job-orchestration/job_orchestration/scheduler/constants.py +++ b/components/job-orchestration/job_orchestration/scheduler/constants.py @@ -68,6 +68,7 @@ def __str__(self) -> str: def to_str(self) -> str: return str(self.name) + class QueryJobType(IntEnum): SEARCH = 0 EXTRACT_IR = auto() @@ -76,4 +77,4 @@ def __str__(self) -> str: return str(self.value) def to_str(self) -> str: - return str(self.name) \ No newline at end of file + return str(self.name) diff --git a/components/job-orchestration/job_orchestration/scheduler/job_config.py b/components/job-orchestration/job_orchestration/scheduler/job_config.py index 6968b6dad..fda83af6b 100644 --- a/components/job-orchestration/job_orchestration/scheduler/job_config.py +++ b/components/job-orchestration/job_orchestration/scheduler/job_config.py @@ -1,9 +1,10 @@ from __future__ import annotations import typing +from abc import ABC from pydantic import BaseModel, validator -from abc import ABC + class PathsToCompress(BaseModel): file_paths: typing.List[str] @@ -39,8 +40,7 @@ class AggregationConfig(BaseModel): count_by_time_bucket_size: typing.Optional[int] = None # Milliseconds -class QueryConfig(BaseModel, ABC): - ... +class QueryConfig(BaseModel, ABC): ... 
class ExtractConfig(QueryConfig): diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index 9107a5aa4..bbcc14fc3 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -24,7 +24,7 @@ import pathlib import sys from pathlib import Path -from typing import Dict, List, Optional, Any +from typing import Any, Dict, List, Optional import celery import msgpack @@ -40,15 +40,21 @@ from clp_py_utils.decorators import exception_default_value from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.fs_search_task import search -from job_orchestration.scheduler.constants import QueryJobStatus, QueryTaskStatus, QueryJobType -from job_orchestration.scheduler.job_config import SearchConfig, ExtractConfig +from job_orchestration.scheduler.constants import QueryJobStatus, QueryJobType, QueryTaskStatus +from job_orchestration.scheduler.job_config import ExtractConfig, SearchConfig from job_orchestration.scheduler.query.reducer_handler import ( handle_reducer_connection, ReducerHandlerMessage, ReducerHandlerMessageQueues, ReducerHandlerMessageType, ) -from job_orchestration.scheduler.scheduler_data import InternalJobState, QueryTaskResult, SearchJob, QueryJob, ExtractJob +from job_orchestration.scheduler.scheduler_data import ( + ExtractJob, + InternalJobState, + QueryJob, + QueryTaskResult, + SearchJob, +) from pydantic import ValidationError # Setup logging @@ -224,7 +230,6 @@ async def handle_cancelling_search_jobs(db_conn_pool) -> None: logger.error(f"Failed to cancel job {job_id}.") - def insert_query_tasks_into_db(db_conn, job_id, archive_ids: List[str]) -> List[int]: task_ids = [] with contextlib.closing(db_conn.cursor()) as cursor: @@ -371,7 +376,9 @@ def handle_pending_search_jobs( reducer_acquisition_tasks = [] pending_search_jobs = [ - job for job in active_jobs.values() if InternalJobState.WAITING_FOR_DISPATCH == job.state and QueryJobType.SEARCH == job.type + job + for job in active_jobs.values() + if InternalJobState.WAITING_FOR_DISPATCH == job.state and job.type() == QueryJobType.SEARCH ] with contextlib.closing(db_conn_pool.connect()) as db_conn: @@ -440,9 +447,7 @@ def handle_pending_search_jobs( archives_for_search = job.remaining_archives_for_search job.remaining_archives_for_search = [] - archive_ids_for_search = [ - archive["archive_id"] for archive in archives_for_search - ] + archive_ids_for_search = [archive["archive_id"] for archive in archives_for_search] dispatch_query_job( db_conn, job, archive_ids_for_search, clp_metadata_db_conn_params, results_cache_uri @@ -497,10 +502,7 @@ def found_max_num_latest_results( async def handle_returned_search_job( - db_conn, - job: SearchJob, - task_results: Optional[Any], - results_cache_uri: str + db_conn, job: SearchJob, task_results: Optional[Any], results_cache_uri: str ) -> None: global active_jobs @@ -532,10 +534,10 @@ async def handle_returned_search_job( # Check if we've reached max results elif False == is_reducer_job and max_num_results > 0: if found_max_num_latest_results( - results_cache_uri, - job_id, - max_num_results, - job.remaining_archives_for_search[0]["end_timestamp"], + results_cache_uri, + job_id, + max_num_results, + job.remaining_archives_for_search[0]["end_timestamp"], ): new_job_status = QueryJobStatus.SUCCEEDED if new_job_status == 
QueryJobStatus.RUNNING: @@ -569,12 +571,12 @@ async def handle_returned_search_job( # We set the status regardless of the job's previous status to handle the case where the # job is cancelled (status = CANCELLING) while we're in this method. if set_job_or_task_status( - db_conn, - QUERY_JOBS_TABLE_NAME, - job_id, - new_job_status, - num_tasks_completed=job.num_archives_searched, - duration=(datetime.datetime.now() - job.start_time).total_seconds(), + db_conn, + QUERY_JOBS_TABLE_NAME, + job_id, + new_job_status, + num_tasks_completed=job.num_archives_searched, + duration=(datetime.datetime.now() - job.start_time).total_seconds(), ): if new_job_status == QueryJobStatus.SUCCEEDED: logger.info(f"Completed job {job_id}.") @@ -620,17 +622,12 @@ async def check_job_status_and_update_db(db_conn_pool, results_cache_uri): if QueryJobType.SEARCH == job.type(): search_job: SearchJob = job await handle_returned_search_job( - db_conn, - search_job, - returned_results, - results_cache_uri + db_conn, search_job, returned_results, results_cache_uri ) else: logger.error(f"Unexpected job type: {job_type}, skipping job {job_id}") - - async def handle_job_updates(db_conn_pool, results_cache_uri: str, jobs_poll_delay: float): while True: await handle_cancelling_search_jobs(db_conn_pool) diff --git a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py index 04ff0fc50..145f99c56 100644 --- a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py +++ b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py @@ -1,13 +1,18 @@ import asyncio import datetime +from abc import ABC, abstractmethod from enum import auto, Enum from typing import Any, Dict, List, Optional -from job_orchestration.scheduler.constants import CompressionTaskStatus, QueryTaskStatus, QueryJobType -from job_orchestration.scheduler.job_config import SearchConfig, QueryConfig, ExtractConfig +from job_orchestration.scheduler.constants import ( + CompressionTaskStatus, + QueryJobType, + QueryTaskStatus, +) +from job_orchestration.scheduler.job_config import ExtractConfig, QueryConfig, SearchConfig from job_orchestration.scheduler.query.reducer_handler import ReducerHandlerMessageQueues -from pydantic import BaseModel, validator, Field -from abc import ABC, abstractmethod +from pydantic import BaseModel, Field, validator + class CompressionJob(BaseModel): id: int @@ -43,12 +48,10 @@ class QueryJob(BaseModel, ABC): current_sub_job_async_task_result: Optional[Any] @abstractmethod - def type(self) -> QueryJobType: - ... + def type(self) -> QueryJobType: ... @abstractmethod - def job_config(self) -> QueryConfig: - ... + def job_config(self) -> QueryConfig: ... 
class ExtractJob(QueryJob): diff --git a/components/webui/imports/api/search/server/QueryJobsDbManager.js b/components/webui/imports/api/search/server/QueryJobsDbManager.js index 266aac523..68b3f5a9b 100644 --- a/components/webui/imports/api/search/server/QueryJobsDbManager.js +++ b/components/webui/imports/api/search/server/QueryJobsDbManager.js @@ -56,7 +56,8 @@ class QueryJobsDbManager { `INSERT INTO ${this.#queryJobsTableName} (${QUERY_JOBS_TABLE_COLUMN_NAMES.JOB_CONFIG}, ${QUERY_JOBS_TABLE_COLUMN_NAMES.TYPE}) VALUES (?, ?)`, - [Buffer.from(msgpack.encode(searchConfig)), QUERY_JOB_TYPE.SEARCH], + [Buffer.from(msgpack.encode(searchConfig)), + QUERY_JOB_TYPE.SEARCH], ); return queryInsertResults.insertId; From 92e69fd7f4fe55bde677ce5ca45426925b6ac722 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 19 Jun 2024 15:03:39 -0400 Subject: [PATCH 05/28] fixes --- .../clp-py-utils/clp_py_utils/initialize-orchestration-db.py | 2 +- .../job_orchestration/executor/query/fs_search_task.py | 4 ++-- .../job_orchestration/scheduler/query/query_scheduler.py | 2 +- .../job_orchestration/scheduler/scheduler_data.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py index 4899fb85d..1ed727367 100644 --- a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py +++ b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py @@ -97,7 +97,7 @@ def main(argv): f""" CREATE TABLE IF NOT EXISTS `{QUERY_JOBS_TABLE_NAME}` ( `id` INT NOT NULL AUTO_INCREMENT, - `type`INT NOT NULL, + `type` INT NOT NULL, `status` INT NOT NULL DEFAULT '{QueryJobStatus.PENDING}', `creation_time` DATETIME(3) NOT NULL DEFAULT CURRENT_TIMESTAMP(3), `num_tasks` INT NOT NULL DEFAULT '0', diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index ac3d1312f..a17c75a5b 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -113,7 +113,7 @@ def search( self: Task, job_id: str, task_id: int, - search_config_obj: dict, + job_config_obj: dict, archive_id: str, clp_metadata_db_conn_params: dict, results_cache_uri: str, @@ -133,7 +133,7 @@ def search( logger.info(f"Started task for job {job_id}") - search_config = SearchConfig.parse_obj(search_config_obj) + search_config = SearchConfig.parse_obj(job_config_obj) sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) start_time = datetime.datetime.now() diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index bbcc14fc3..67a55a9e0 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -291,7 +291,7 @@ def get_task_group_for_job( job_id=job.id, archive_id=archive_ids[i], task_id=task_ids[i], - search_config_obj=job_config_obj, + job_config_obj=job_config_obj, clp_metadata_db_conn_params=clp_metadata_db_conn_params, results_cache_uri=results_cache_uri, ) diff --git a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py 
b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py index 145f99c56..6f885b62a 100644 --- a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py +++ b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py @@ -11,7 +11,7 @@ ) from job_orchestration.scheduler.job_config import ExtractConfig, QueryConfig, SearchConfig from job_orchestration.scheduler.query.reducer_handler import ReducerHandlerMessageQueues -from pydantic import BaseModel, Field, validator +from pydantic import BaseModel, validator class CompressionJob(BaseModel): From f0ee6862b37df900577e30f12fce90b1f4c7f32c Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 19 Jun 2024 15:22:11 -0400 Subject: [PATCH 06/28] fixes --- .../job_orchestration/scheduler/query/query_scheduler.py | 4 ++-- .../job_orchestration/scheduler/scheduler_data.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index 67a55a9e0..94247cf01 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -365,7 +365,7 @@ async def acquire_reducer_for_job(job: SearchJob): logger.info(f"Got reducer for job {job.id} at {reducer_host}:{reducer_port}") -def handle_pending_search_jobs( +def handle_pending_query_jobs( db_conn_pool, clp_metadata_db_conn_params: Dict[str, any], results_cache_uri: str, @@ -648,7 +648,7 @@ async def handle_jobs( tasks = [handle_updating_task] while True: - reducer_acquisition_tasks = handle_pending_search_jobs( + reducer_acquisition_tasks = handle_pending_query_jobs( db_conn_pool, clp_metadata_db_conn_params, results_cache_uri, diff --git a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py index 6f885b62a..e05abb7e8 100644 --- a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py +++ b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py @@ -35,7 +35,6 @@ def valid_status(cls, field): class InternalJobState(Enum): - PENDING = auto() WAITING_FOR_REDUCER = auto() WAITING_FOR_DISPATCH = auto() RUNNING = auto() From 14100c2545538fee4cdd7ba893f779ff03eb9c6e Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 19 Jun 2024 17:03:28 -0400 Subject: [PATCH 07/28] Initial support for IR task flow in the scheduler --- .../executor/query/celeryconfig.py | 6 +- .../executor/query/extract_ir_task.py | 88 +++++++ .../executor/query/fs_search_task.py | 25 +- .../job_orchestration/executor/query/utils.py | 16 ++ .../scheduler/query/query_scheduler.py | 216 ++++++++++++++++-- 5 files changed, 305 insertions(+), 46 deletions(-) create mode 100644 components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py create mode 100644 components/job-orchestration/job_orchestration/executor/query/utils.py diff --git a/components/job-orchestration/job_orchestration/executor/query/celeryconfig.py b/components/job-orchestration/job_orchestration/executor/query/celeryconfig.py index 4b9949091..994c0bbcf 100644 --- a/components/job-orchestration/job_orchestration/executor/query/celeryconfig.py +++ 
b/components/job-orchestration/job_orchestration/executor/query/celeryconfig.py @@ -2,10 +2,14 @@ from job_orchestration.scheduler.constants import QueueName -imports = "job_orchestration.executor.query.fs_search_task" +imports = ( + "job_orchestration.executor.query.fs_search_task", + "job_orchestration.executor.query.extract_ir_task", +) task_routes = { "job_orchestration.executor.query.fs_search_task.search": QueueName.QUERY, + "job_orchestration.executor.query.extract_ir_task.extract_ir": QueueName.QUERY, } task_create_missing_queues = True diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py new file mode 100644 index 000000000..b5cb61666 --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -0,0 +1,88 @@ +import datetime +import os +import signal +import subprocess +import sys +from contextlib import closing +from pathlib import Path +from typing import Any, Dict + +from celery.app.task import Task +from celery.utils.log import get_task_logger +from clp_py_utils.clp_config import Database, QUERY_TASKS_TABLE_NAME, StorageEngine +from clp_py_utils.clp_logging import set_logging_level +from clp_py_utils.sql_adapter import SQL_Adapter +from job_orchestration.executor.query.celery import app +from job_orchestration.scheduler.job_config import ExtractConfig +from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus +from .utils import update_query_task_metadata + +# Setup logging +logger = get_task_logger(__name__) + +@app.task(bind=True) +def extract_ir( + self: Task, + job_id: str, + task_id: int, + job_config_obj: dict, + archive_id: str, + clp_metadata_db_conn_params: dict, + results_cache_uri: str, +) -> Dict[str, Any]: + clp_home = Path(os.getenv("CLP_HOME")) + archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) + clp_logs_dir = Path(os.getenv("CLP_LOGS_DIR")) + clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) + clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) + + # Setup logging to file + worker_logs_dir = clp_logs_dir / job_id + worker_logs_dir.mkdir(exist_ok=True, parents=True) + set_logging_level(logger, clp_logging_level) + clo_log_path = worker_logs_dir / f"{task_id}-clo.log" + clo_log_file = open(clo_log_path, "w") + + logger.info(f"Started extract IR task for job {job_id}") + + extract_ir_config = ExtractConfig.parse_obj(job_config_obj) + sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) + + start_time = datetime.datetime.now() + search_status = QueryTaskStatus.RUNNING + with closing(sql_adapter.create_connection(True)) as db_conn, closing( + db_conn.cursor(dictionary=True) + ) as db_cursor: + update_query_task_metadata( + db_cursor, task_id, dict(status=search_status, start_time=start_time) + ) + db_conn.commit() + + logger.info(f'Running Placeholder task for job {job_id}') + logger.info(f'Arguments: split_id: {extract_ir_config.file_split_id}, msg_ix: {extract_ir_config.msg_ix}') + + # Mark job succeed + search_status = QueryTaskStatus.SUCCEEDED + + # Close log files + clo_log_file.close() + duration = (datetime.datetime.now() - start_time).total_seconds() + + with closing(sql_adapter.create_connection(True)) as db_conn, closing( + db_conn.cursor(dictionary=True) + ) as db_cursor: + update_query_task_metadata( + db_cursor, task_id, dict(status=search_status, start_time=start_time, duration=duration) + ) + db_conn.commit() + 
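+    # Package the task result for the scheduler; error_log_path is only populated if the task failed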
+ extract_ir_task_result = QueryTaskResult( + status=search_status, + task_id=task_id, + duration=duration, + ) + + if QueryTaskStatus.FAILED == search_status: + extract_ir_task_result.error_log_path = str(clo_log_path) + logger.info(f'Finished Placeholder task for job {job_id}') + return extract_ir_task_result.dict() diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index a17c75a5b..8aab3ff3b 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -15,27 +15,10 @@ from job_orchestration.executor.query.celery import app from job_orchestration.scheduler.job_config import SearchConfig from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus - +from .utils import update_query_task_metadata # Setup logging logger = get_task_logger(__name__) - -def update_search_task_metadata( - db_cursor, - task_id: int, - kv_pairs: Dict[str, Any], -): - if not kv_pairs or len(kv_pairs) == 0: - raise ValueError("No key-value pairs provided to update search task metadata") - - query = f""" - UPDATE {QUERY_TASKS_TABLE_NAME} - SET {', '.join([f'{k}="{v}"' for k, v in kv_pairs.items()])} - WHERE id = {task_id} - """ - db_cursor.execute(query) - - def make_command( storage_engine: str, clp_home: Path, @@ -155,7 +138,7 @@ def search( error_message = f"Error creating search command: {e}" logger.error(error_message) - update_search_task_metadata( + update_query_task_metadata( db_cursor, task_id, dict(status=QueryTaskStatus.FAILED, duration=0, start_time=start_time), @@ -171,7 +154,7 @@ def search( error_log_path=str(clo_log_path), ).dict() - update_search_task_metadata( + update_query_task_metadata( db_cursor, task_id, dict(status=search_status, start_time=start_time) ) db_conn.commit() @@ -219,7 +202,7 @@ def sigterm_handler(_signo, _stack_frame): with closing(sql_adapter.create_connection(True)) as db_conn, closing( db_conn.cursor(dictionary=True) ) as db_cursor: - update_search_task_metadata( + update_query_task_metadata( db_cursor, task_id, dict(status=search_status, start_time=start_time, duration=duration) ) db_conn.commit() diff --git a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py new file mode 100644 index 000000000..f5bf04c39 --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -0,0 +1,16 @@ +from typing import Any, Dict +from clp_py_utils.clp_config import QUERY_TASKS_TABLE_NAME +def update_query_task_metadata( + db_cursor, + task_id: int, + kv_pairs: Dict[str, Any], +): + if not kv_pairs or len(kv_pairs) == 0: + raise ValueError("No key-value pairs provided to update query task metadata") + + query = f""" + UPDATE {QUERY_TASKS_TABLE_NAME} + SET {', '.join([f'{k}="{v}"' for k, v in kv_pairs.items()])} + WHERE id = {task_id} + """ + db_cursor.execute(query) diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index 94247cf01..a19cf9a61 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -40,6 +40,7 @@ from clp_py_utils.decorators import 
exception_default_value from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.fs_search_task import search +from job_orchestration.executor.query.extract_ir_task import extract_ir from job_orchestration.scheduler.constants import QueryJobStatus, QueryJobType, QueryTaskStatus from job_orchestration.scheduler.job_config import ExtractConfig, SearchConfig from job_orchestration.scheduler.query.reducer_handler import ( @@ -278,6 +279,52 @@ def get_archives_for_search( return archives_for_search +def get_archive_and_update_config_for_extraction( + db_conn, + extract_config: ExtractConfig, +) -> Optional[str]: + + orig_file_id = extract_config.orig_file_id + msg_ix = extract_config.msg_ix + + results = get_archive_and_file_split_for_extraction(db_conn, orig_file_id, msg_ix) + if len(results) == 0: + logger.error(f"No file split and archive match with config: {orig_file_id}:{msg_ix}") + return None + elif len(results) > 1: + logger.error(f"Multiple splits match with config: {orig_file_id}:{msg_ix}") + for result in results: + logger.error(f"{result['archive_id']}:{result['id']}") + return None + + file_split_id = results[0]["id"] + archive_id = results[0]["archive_id"] + logger.info(f"archive: {archive_id}, file: {file_split_id}") + extract_config.file_split_id = file_split_id + return archive_id + + +@exception_default_value(default=[]) +def get_archive_and_file_split_for_extraction( + db_conn, + orig_file_id: str, + msg_ix: int, +): + query = f"""SELECT id, archive_id + FROM {CLP_METADATA_TABLE_PREFIX}files WHERE + orig_file_id = '{orig_file_id}' AND + begin_message_ix <= {msg_ix} AND + (begin_message_ix + num_messages) > {msg_ix} + """ + + logger.info(query) + + with contextlib.closing(db_conn.cursor(dictionary=True)) as cursor: + cursor.execute(query) + results = list(cursor.fetchall()) + return results + + def get_task_group_for_job( archive_ids: List[str], task_ids: List[int], @@ -286,17 +333,30 @@ def get_task_group_for_job( results_cache_uri: str, ): job_config_obj = job.job_config().dict() - return celery.group( - search.s( - job_id=job.id, - archive_id=archive_ids[i], - task_id=task_ids[i], - job_config_obj=job_config_obj, - clp_metadata_db_conn_params=clp_metadata_db_conn_params, - results_cache_uri=results_cache_uri, + if job.type() == QueryJobType.SEARCH: + return celery.group( + search.s( + job_id=job.id, + archive_id=archive_ids[i], + task_id=task_ids[i], + job_config_obj=job_config_obj, + clp_metadata_db_conn_params=clp_metadata_db_conn_params, + results_cache_uri=results_cache_uri, + ) + for i in range(len(archive_ids)) + ) + if job.type() == QueryJobType.EXTRACT_IR: + return celery.group( + extract_ir.s( + job_id=job.id, + archive_id=archive_ids[i], + task_id=task_ids[i], + job_config_obj=job_config_obj, + clp_metadata_db_conn_params=clp_metadata_db_conn_params, + results_cache_uri=results_cache_uri, + ) + for i in range(len(archive_ids)) ) - for i in range(len(archive_ids)) - ) def dispatch_query_job( @@ -365,6 +425,30 @@ async def acquire_reducer_for_job(job: SearchJob): logger.info(f"Got reducer for job {job.id} at {reducer_host}:{reducer_port}") +def dispatch_job_and_update_db( + db_conn, + new_job: QueryJob, + target_archives: List[str], + clp_metadata_db_conn_params: Dict[str, any], + results_cache_uri: str, + num_task: int, +) -> None: + dispatch_query_job( + db_conn, new_job, target_archives, clp_metadata_db_conn_params, results_cache_uri + ) + start_time = datetime.datetime.now() + new_job.start_time = start_time + 
set_job_or_task_status( + db_conn, + QUERY_JOBS_TABLE_NAME, + new_job.id, + QueryJobStatus.RUNNING, + QueryJobStatus.PENDING, + start_time=start_time, + num_tasks=num_task, + ) + + def handle_pending_query_jobs( db_conn_pool, clp_metadata_db_conn_params: Dict[str, any], @@ -427,6 +511,46 @@ def handle_pending_query_jobs( else: pending_search_jobs.append(new_search_job) active_jobs[job_id] = new_search_job + + elif QueryJobType.EXTRACT_IR == job_type: + extract_config = ExtractConfig.parse_obj(msgpack.unpackb(job_config)) + archive_id = get_archive_and_update_config_for_extraction(db_conn, extract_config) + if not archive_id: + logger.error(f"Failed to get archive for extraction") + if not set_job_or_task_status( + db_conn, + QUERY_JOBS_TABLE_NAME, + job_id, + QueryJobStatus.FAILED, + QueryJobStatus.PENDING, + start_time=datetime.datetime.now(), + num_tasks=0, + duration=0, + ): + logger.error(f"Failed to set job: {job_id} as failed") + continue + + new_extraction_job = ExtractJob( + id=job_id, + archive_id=archive_id, + extract_config=extract_config, + state=InternalJobState.WAITING_FOR_DISPATCH + ) + target_archive = [new_extraction_job.archive_id] + + dispatch_job_and_update_db( + db_conn, + new_extraction_job, + target_archive, + clp_metadata_db_conn_params, + results_cache_uri, + 1 + ) + active_jobs[new_extraction_job.id] = new_extraction_job + logger.info( + f"Dispatched extraction job {job_id} on archive: {archive_id}" + ) + else: logger.error(f"Unexpected job type: {job_type}, skipping job {job_id}") continue @@ -449,23 +573,17 @@ def handle_pending_query_jobs( archive_ids_for_search = [archive["archive_id"] for archive in archives_for_search] - dispatch_query_job( - db_conn, job, archive_ids_for_search, clp_metadata_db_conn_params, results_cache_uri + dispatch_job_and_update_db( + db_conn, + job, + archive_ids_for_search, + clp_metadata_db_conn_params, + results_cache_uri, + job.num_archives_to_search ) logger.info( f"Dispatched job {job_id} with {len(archive_ids_for_search)} archives to search." ) - start_time = datetime.datetime.now() - job.start_time = start_time - set_job_or_task_status( - db_conn, - QUERY_JOBS_TABLE_NAME, - job_id, - QueryJobStatus.RUNNING, - QueryJobStatus.PENDING, - start_time=start_time, - num_tasks=job.num_archives_to_search, - ) return reducer_acquisition_tasks @@ -587,6 +705,51 @@ async def handle_returned_search_job( del active_jobs[job_id] +async def handle_returned_extract_ir_job( + db_conn, job: SearchJob, task_results: Optional[Any] +) -> None: + global active_jobs + + job_id = job.id + new_job_status = QueryJobStatus.SUCCEEDED + num_task = len(task_results) + if 1 != num_task: + logger.error( + f"Unexpected number of task under extraction job: {job_id}. " + f"expected 1, got {num_task}" + ) + new_job_status = QueryJobStatus.FAILED + else: + task_result = QueryTaskResult.parse_obj(task_results[0]) + task_id = task_result.task_id + if not QueryJobStatus.SUCCEEDED == task_result.status: + logger.error( + f"Extraction task job-{job_id}-task-{task_id} failed. " + f"Check {task_result.error_log_path} for details." + ) + new_job_status = QueryJobStatus.FAILED + else: + logger.info( + f"Extraction task job-{job_id}-task-{task_id} succeeded in " + f"{task_result.duration} second(s)." 
+            )
+
+    if set_job_or_task_status(
+        db_conn,
+        QUERY_JOBS_TABLE_NAME,
+        job_id,
+        new_job_status,
+        QueryJobStatus.RUNNING,
+        num_tasks_completed=num_task,
+        duration=(datetime.datetime.now() - job.start_time).total_seconds(),
+    ):
+        if new_job_status == QueryJobStatus.SUCCEEDED:
+            logger.info(f"Completed job {job_id}.")
+        else:
+            logger.info(f"Completed job {job_id} with failing tasks.")
+    del active_jobs[job_id]
+
+
 async def check_job_status_and_update_db(db_conn_pool, results_cache_uri):
     global active_jobs
 
@@ -619,11 +782,16 @@ async def check_job_status_and_update_db(db_conn_pool, results_cache_uri):
             if returned_results is None:
                 continue
             job_type = job.type()
-            if QueryJobType.SEARCH == job.type():
+            if QueryJobType.SEARCH == job_type:
                 search_job: SearchJob = job
                 await handle_returned_search_job(
                     db_conn, search_job, returned_results, results_cache_uri
                 )
+            elif QueryJobType.EXTRACT_IR == job_type:
+                extract_ir_job: ExtractJob = job
+                await handle_returned_extract_ir_job(
+                    db_conn, extract_ir_job, returned_results
+                )
             else:
                 logger.error(f"Unexpected job type: {job_type}, skipping job {job_id}")

From 133481f7e8c43ae8974617fa2212cc8d0b41e898 Mon Sep 17 00:00:00 2001
From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com>
Date: Wed, 19 Jun 2024 17:40:56 -0400
Subject: [PATCH 08/28] Rename IR extraction config/job classes and remove debug prints

---
 .../executor/query/extract_ir_task.py         |  4 +-
 .../job_orchestration/scheduler/job_config.py |  2 +-
 .../scheduler/query/query_scheduler.py        | 39 +++++++++----------
 .../scheduler/scheduler_data.py               |  8 ++--
 4 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py
index b5cb61666..b17364586 100644
--- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py
+++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py
@@ -13,7 +13,7 @@
 from clp_py_utils.clp_logging import set_logging_level
 from clp_py_utils.sql_adapter import SQL_Adapter
 from job_orchestration.executor.query.celery import app
-from job_orchestration.scheduler.job_config import ExtractConfig
+from job_orchestration.scheduler.job_config import ExtractIrConfig
 from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus
 from .utils import update_query_task_metadata
 
@@ -45,7 +45,7 @@ def extract_ir(
 
     logger.info(f"Started extract IR task for job {job_id}")
 
-    extract_ir_config = ExtractConfig.parse_obj(job_config_obj)
+    extract_ir_config = ExtractIrConfig.parse_obj(job_config_obj)
     sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params))
 
     start_time = datetime.datetime.now()
diff --git a/components/job-orchestration/job_orchestration/scheduler/job_config.py b/components/job-orchestration/job_orchestration/scheduler/job_config.py
index fda83af6b..2a4021f9a 100644
--- a/components/job-orchestration/job_orchestration/scheduler/job_config.py
+++ b/components/job-orchestration/job_orchestration/scheduler/job_config.py
@@ -43,7 +43,7 @@ class AggregationConfig(BaseModel):
 class QueryConfig(BaseModel, ABC): ...
 
-class ExtractConfig(QueryConfig): +class ExtractIrConfig(QueryConfig): orig_file_id: str msg_ix: int file_split_id: typing.Optional[str] = None diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index a19cf9a61..55f96a12e 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -42,7 +42,7 @@ from job_orchestration.executor.query.fs_search_task import search from job_orchestration.executor.query.extract_ir_task import extract_ir from job_orchestration.scheduler.constants import QueryJobStatus, QueryJobType, QueryTaskStatus -from job_orchestration.scheduler.job_config import ExtractConfig, SearchConfig +from job_orchestration.scheduler.job_config import ExtractIrConfig, SearchConfig from job_orchestration.scheduler.query.reducer_handler import ( handle_reducer_connection, ReducerHandlerMessage, @@ -50,7 +50,7 @@ ReducerHandlerMessageType, ) from job_orchestration.scheduler.scheduler_data import ( - ExtractJob, + ExtractIrJob, InternalJobState, QueryJob, QueryTaskResult, @@ -281,11 +281,11 @@ def get_archives_for_search( def get_archive_and_update_config_for_extraction( db_conn, - extract_config: ExtractConfig, + extract_ir_config: ExtractIrConfig, ) -> Optional[str]: - orig_file_id = extract_config.orig_file_id - msg_ix = extract_config.msg_ix + orig_file_id = extract_ir_config.orig_file_id + msg_ix = extract_ir_config.msg_ix results = get_archive_and_file_split_for_extraction(db_conn, orig_file_id, msg_ix) if len(results) == 0: @@ -299,8 +299,7 @@ def get_archive_and_update_config_for_extraction( file_split_id = results[0]["id"] archive_id = results[0]["archive_id"] - logger.info(f"archive: {archive_id}, file: {file_split_id}") - extract_config.file_split_id = file_split_id + extract_ir_config.file_split_id = file_split_id return archive_id @@ -317,8 +316,6 @@ def get_archive_and_file_split_for_extraction( (begin_message_ix + num_messages) > {msg_ix} """ - logger.info(query) - with contextlib.closing(db_conn.cursor(dictionary=True)) as cursor: cursor.execute(query) results = list(cursor.fetchall()) @@ -513,8 +510,8 @@ def handle_pending_query_jobs( active_jobs[job_id] = new_search_job elif QueryJobType.EXTRACT_IR == job_type: - extract_config = ExtractConfig.parse_obj(msgpack.unpackb(job_config)) - archive_id = get_archive_and_update_config_for_extraction(db_conn, extract_config) + extract_ir_config = ExtractIrConfig.parse_obj(msgpack.unpackb(job_config)) + archive_id = get_archive_and_update_config_for_extraction(db_conn, extract_ir_config) if not archive_id: logger.error(f"Failed to get archive for extraction") if not set_job_or_task_status( @@ -530,25 +527,25 @@ def handle_pending_query_jobs( logger.error(f"Failed to set job: {job_id} as failed") continue - new_extraction_job = ExtractJob( + new_extract_ir_job = ExtractIrJob( id=job_id, archive_id=archive_id, - extract_config=extract_config, + extract_ir_config=extract_ir_config, state=InternalJobState.WAITING_FOR_DISPATCH ) - target_archive = [new_extraction_job.archive_id] + target_archive = [new_extract_ir_job.archive_id] dispatch_job_and_update_db( db_conn, - new_extraction_job, + new_extract_ir_job, target_archive, clp_metadata_db_conn_params, results_cache_uri, 1 ) - active_jobs[new_extraction_job.id] = new_extraction_job + active_jobs[new_extract_ir_job.id] = 
new_extract_ir_job logger.info( - f"Dispatched extraction job {job_id} on archive: {archive_id}" + f"Dispatched IR extraction job {job_id} on archive: {archive_id}" ) else: @@ -715,7 +712,7 @@ async def handle_returned_extract_ir_job( num_task = len(task_results) if 1 != num_task: logger.error( - f"Unexpected number of task under extraction job: {job_id}. " + f"Unexpected number of task under IR extraction job: {job_id}. " f"expected 1, got {num_task}" ) new_job_status = QueryJobStatus.FAILED @@ -724,13 +721,13 @@ async def handle_returned_extract_ir_job( task_id = task_result.task_id if not QueryJobStatus.SUCCEEDED == task_result.status: logger.error( - f"Extraction task job-{job_id}-task-{task_id} failed. " + f"IR extraction task job-{job_id}-task-{task_id} failed. " f"Check {task_result.error_log_path} for details." ) new_job_status = QueryJobStatus.FAILED else: logger.info( - f"Extraction task job-{job_id}-task-{task_id} succeeded in " + f"IR extraction task job-{job_id}-task-{task_id} succeeded in " f"{task_result.duration} second(s)." ) @@ -788,7 +785,7 @@ async def check_job_status_and_update_db(db_conn_pool, results_cache_uri): db_conn, search_job, returned_results, results_cache_uri ) elif QueryJobType.EXTRACT_IR == job_type: - extract_ir_job: ExtractJob = job + extract_ir_job: ExtractIrJob = job await handle_returned_extract_ir_job( db_conn, extract_ir_job, returned_results ) diff --git a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py index e05abb7e8..23fc8a603 100644 --- a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py +++ b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py @@ -9,7 +9,7 @@ QueryJobType, QueryTaskStatus, ) -from job_orchestration.scheduler.job_config import ExtractConfig, QueryConfig, SearchConfig +from job_orchestration.scheduler.job_config import ExtractIrConfig, QueryConfig, SearchConfig from job_orchestration.scheduler.query.reducer_handler import ReducerHandlerMessageQueues from pydantic import BaseModel, validator @@ -53,15 +53,15 @@ def type(self) -> QueryJobType: ... def job_config(self) -> QueryConfig: ... 
-class ExtractJob(QueryJob):
-    extract_config: ExtractConfig
+class ExtractIrJob(QueryJob):
+    extract_ir_config: ExtractIrConfig
     archive_id: str
 
     def type(self) -> QueryJobType:
         return QueryJobType.EXTRACT_IR
 
     def job_config(self) -> QueryConfig:
-        return self.extract_config
+        return self.extract_ir_config
 
 
 class SearchJob(QueryJob):

From 7b1dfd5777f74797e7d64147d63e580dd226a10b Mon Sep 17 00:00:00 2001
From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com>
Date: Fri, 21 Jun 2024 10:17:15 -0400
Subject: [PATCH 09/28] First draft of the IR extraction worker implementation

---
 .../clp_package_utils/general.py              | 14 +++
 .../clp_package_utils/scripts/start_clp.py    |  4 +
 .../clp-py-utils/clp_py_utils/clp_config.py   | 29 +++++
 .../executor/query/extract_ir_task.py         | 106 +++++++++++++++---
 .../executor/query/fs_search_task.py          | 11 +-
 .../job_orchestration/executor/query/utils.py | 10 ++
 .../package-template/src/etc/clp-config.yml   |  5 +
 7 files changed, 159 insertions(+), 20 deletions(-)

diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py
index 29c421109..1d46a0ea1 100644
--- a/components/clp-package-utils/clp_package_utils/general.py
+++ b/components/clp-package-utils/clp_package_utils/general.py
@@ -69,6 +69,7 @@ def __init__(self, clp_home: pathlib.Path, docker_clp_home: pathlib.Path):
         self.data_dir: typing.Optional[DockerMount] = None
         self.logs_dir: typing.Optional[DockerMount] = None
         self.archives_output_dir: typing.Optional[DockerMount] = None
+        self.ir_output_dir: typing.Optional[DockerMount] = None
 
 
 def get_clp_home():
@@ -224,6 +225,19 @@ def generate_container_config(clp_config: CLPConfig, clp_home: pathlib.Path):
             container_clp_config.archive_output.directory,
         )
 
+    container_clp_config.ir_output.directory = pathlib.Path("/") / "mnt" / "ir-output"
+    if not is_path_already_mounted(
+        clp_home,
+        CONTAINER_CLP_HOME,
+        clp_config.ir_output.directory,
+        container_clp_config.ir_output.directory,
+    ):
+        docker_mounts.ir_output_dir = DockerMount(
+            DockerMountType.BIND,
+            clp_config.ir_output.directory,
+            container_clp_config.ir_output.directory,
+        )
+
     return container_clp_config, docker_mounts
 
 
diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py
index a798d2112..7901f0f8c 100755
--- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py
+++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py
@@ -578,6 +578,7 @@ def generic_start_worker(
 
     # Create necessary directories
     clp_config.archive_output.directory.mkdir(parents=True, exist_ok=True)
+    clp_config.ir_output.directory.mkdir(parents=True, exist_ok=True)
 
     clp_site_packages_dir = CONTAINER_CLP_HOME / "lib" / "python3" / "site-packages"
     # fmt: off
@@ -601,9 +602,11 @@
         "-e", f"CLP_HOME={CONTAINER_CLP_HOME}",
         "-e", f"CLP_DATA_DIR={container_clp_config.data_directory}",
         "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}",
+        "-e", f"CLP_IR_OUTPUT_DIR={container_clp_config.ir_output.directory}",
         "-e", f"CLP_LOGS_DIR={container_logs_dir}",
         "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}",
         "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}",
+        "-e", f"CLP_IR_COLLECTION={clp_config.results_cache.ir_collection_name}",
         "-u", f"{os.getuid()}:{os.getgid()}",
         "--mount", str(mounts.clp_home),
     ]
@@ -612,6 +615,7 @@
         mounts.data_dir,
         mounts.logs_dir,
         mounts.archives_output_dir,
+        mounts.ir_output_dir,
mounts.input_logs_dir, ] for mount in necessary_mounts: diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index b3410925e..ef75ba10f 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -255,6 +255,7 @@ class ResultsCache(BaseModel): host: str = "localhost" port: int = 27017 db_name: str = "clp-search" + ir_collection_name: str = "clp-ir" @validator("host") def validate_host(cls, field): @@ -268,6 +269,12 @@ def validate_db_name(cls, field): raise ValueError(f"{RESULTS_CACHE_COMPONENT_NAME}.db_name cannot be empty.") return field + @validator("ir_collection_name") + def validate_ir_collection_name(cls, field): + if "" == field: + raise ValueError(f"{RESULTS_CACHE_COMPONENT_NAME}.ir_collection_name cannot be empty.") + return field + def get_uri(self): return f"mongodb://{self.host}:{self.port}/{self.db_name}" @@ -321,6 +328,25 @@ def dump_to_primitive_dict(self): return d +class IrOutput(BaseModel): + directory: pathlib.Path = pathlib.Path("var") / "data" / "ir" + + @validator("directory") + def validate_directory(cls, field): + if "" == field: + raise ValueError("directory can not be empty") + return field + + def make_config_paths_absolute(self, clp_home: pathlib.Path): + self.directory = make_config_path_absolute(clp_home, self.directory) + + def dump_to_primitive_dict(self): + d = self.dict() + # Turn directory (pathlib.Path) into a primitive string + d["directory"] = str(d["directory"]) + return d + + class WebUi(BaseModel): host: str = "localhost" port: int = 4000 @@ -368,6 +394,7 @@ class CLPConfig(BaseModel): credentials_file_path: pathlib.Path = CLP_DEFAULT_CREDENTIALS_FILE_PATH archive_output: ArchiveOutput = ArchiveOutput() + ir_output: IrOutput = IrOutput() data_directory: pathlib.Path = pathlib.Path("var") / "data" logs_directory: pathlib.Path = pathlib.Path("var") / "log" @@ -377,6 +404,7 @@ def make_config_paths_absolute(self, clp_home: pathlib.Path): self.input_logs_directory = make_config_path_absolute(clp_home, self.input_logs_directory) self.credentials_file_path = make_config_path_absolute(clp_home, self.credentials_file_path) self.archive_output.make_config_paths_absolute(clp_home) + self.ir_output.make_config_paths_absolute(clp_home) self.data_directory = make_config_path_absolute(clp_home, self.data_directory) self.logs_directory = make_config_path_absolute(clp_home, self.logs_directory) self._os_release_file_path = make_config_path_absolute(clp_home, self._os_release_file_path) @@ -463,6 +491,7 @@ def load_redis_credentials_from_file(self): def dump_to_primitive_dict(self): d = self.dict() d["archive_output"] = self.archive_output.dump_to_primitive_dict() + d["ir_output"] = self.ir_output.dump_to_primitive_dict() # Turn paths into primitive strings d["input_logs_directory"] = str(self.input_logs_directory) d["credentials_file_path"] = str(self.credentials_file_path) diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py index b17364586..259004523 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -15,11 +15,41 @@ from job_orchestration.executor.query.celery import app from job_orchestration.scheduler.job_config import ExtractIrConfig from job_orchestration.scheduler.scheduler_data 
import QueryTaskResult, QueryTaskStatus -from .utils import update_query_task_metadata +from .utils import update_query_task_metadata, get_logger_file_path # Setup logging logger = get_task_logger(__name__) +def make_command( + storage_engine: str, + clp_home: Path, + archives_dir: Path, + ir_output_dir: Path, + archive_id: str, + extract_ir_config: ExtractIrConfig, + results_cache_uri: str, + results_collection: str, +): + if StorageEngine.CLP == storage_engine: + if not extract_ir_config.file_split_id: + raise ValueError(f"file_split_id not supplied") + command = [ + str(clp_home / "bin" / "clo"), + "i", + str(archives_dir / archive_id), + extract_ir_config.file_split_id, + str(ir_output_dir), + results_cache_uri, + results_collection + ] + if extract_ir_config.target_size is not None: + command.append("--target-size") + command.append(extract_ir_config.target_size) + else: + raise ValueError(f"Unsupported storage engine {storage_engine}") + + return command + @app.task(bind=True) def extract_ir( self: Task, @@ -32,15 +62,16 @@ def extract_ir( ) -> Dict[str, Any]: clp_home = Path(os.getenv("CLP_HOME")) archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) + ir_directory = Path(os.getenv("CLP_IR_OUTPUT_DIR")) clp_logs_dir = Path(os.getenv("CLP_LOGS_DIR")) clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) + ir_collection = str(os.getenv("CLP_IR_COLLECTION")) + # Setup logging to file - worker_logs_dir = clp_logs_dir / job_id - worker_logs_dir.mkdir(exist_ok=True, parents=True) set_logging_level(logger, clp_logging_level) - clo_log_path = worker_logs_dir / f"{task_id}-clo.log" + clo_log_path = get_logger_file_path(clp_logs_dir, job_id, task_id) clo_log_file = open(clo_log_path, "w") logger.info(f"Started extract IR task for job {job_id}") @@ -49,20 +80,67 @@ def extract_ir( sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) start_time = datetime.datetime.now() - search_status = QueryTaskStatus.RUNNING + job_status: QueryTaskStatus with closing(sql_adapter.create_connection(True)) as db_conn, closing( db_conn.cursor(dictionary=True) ) as db_cursor: + try: + extract_ir_command = make_command( + storage_engine=clp_storage_engine, + clp_home=clp_home, + archives_dir=archive_directory, + ir_output_dir=ir_directory, + archive_id=archive_id, + extract_ir_config=extract_ir_config, + results_cache_uri=results_cache_uri, + results_collection=ir_collection, + ) + except ValueError as e: + error_message = f"Error creating extract command: {e}" + logger.error(error_message) + job_status = QueryTaskStatus.FAILED + update_query_task_metadata( + db_cursor, + task_id, + dict(status=job_status, duration=0, start_time=start_time), + ) + db_conn.commit() + clo_log_file.write(error_message) + clo_log_file.close() + + return QueryTaskResult( + task_id=task_id, + status=job_status, + duration=0, + error_log_path=str(clo_log_path), + ).dict() + + job_status = QueryTaskStatus.RUNNING update_query_task_metadata( - db_cursor, task_id, dict(status=search_status, start_time=start_time) + db_cursor, task_id, dict(status=job_status, start_time=start_time) ) db_conn.commit() - logger.info(f'Running Placeholder task for job {job_id}') - logger.info(f'Arguments: split_id: {extract_ir_config.file_split_id}, msg_ix: {extract_ir_config.msg_ix}') + logger.info(f'Running: {" ".join(extract_ir_command)}') + extract_proc = subprocess.Popen( + extract_ir_command, + preexec_fn=os.setpgrp, + close_fds=True, + stdout=clo_log_file, + 
stderr=clo_log_file, + ) - # Mark job succeed - search_status = QueryTaskStatus.SUCCEEDED + logger.info("Waiting for extraction to finish") + # communicate is equivalent to wait in this case, but avoids deadlocks if we switch to piping + # stdout/stderr in the future. + extract_proc.communicate() + return_code = extract_proc.returncode + if 0 != return_code: + job_status = QueryTaskStatus.FAILED + logger.error(f"Failed extraction task for job {job_id} - return_code={return_code}") + else: + job_status = QueryTaskStatus.SUCCEEDED + logger.info(f"Extraction task completed for job {job_id}") # Close log files clo_log_file.close() @@ -72,17 +150,17 @@ def extract_ir( db_conn.cursor(dictionary=True) ) as db_cursor: update_query_task_metadata( - db_cursor, task_id, dict(status=search_status, start_time=start_time, duration=duration) + db_cursor, task_id, dict(status=job_status, start_time=start_time, duration=duration) ) db_conn.commit() extract_ir_task_result = QueryTaskResult( - status=search_status, + status=job_status, task_id=task_id, duration=duration, ) - if QueryTaskStatus.FAILED == search_status: + if QueryTaskStatus.FAILED == job_status: extract_ir_task_result.error_log_path = str(clo_log_path) - logger.info(f'Finished Placeholder task for job {job_id}') + return extract_ir_task_result.dict() diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 8aab3ff3b..7906bf619 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -15,7 +15,7 @@ from job_orchestration.executor.query.celery import app from job_orchestration.scheduler.job_config import SearchConfig from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus -from .utils import update_query_task_metadata +from .utils import update_query_task_metadata, get_logger_file_path # Setup logging logger = get_task_logger(__name__) @@ -29,7 +29,7 @@ def make_command( results_collection: str, ): if StorageEngine.CLP == storage_engine: - command = [str(clp_home / "bin" / "clo"), str(archives_dir / archive_id)] + command = [str(clp_home / "bin" / "clo"), "s", str(archives_dir / archive_id)] if search_config.path_filter is not None: command.append("--file-path") command.append(search_config.path_filter) @@ -108,10 +108,8 @@ def search( clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) # Setup logging to file - worker_logs_dir = clp_logs_dir / job_id - worker_logs_dir.mkdir(exist_ok=True, parents=True) set_logging_level(logger, clp_logging_level) - clo_log_path = worker_logs_dir / f"{task_id}-clo.log" + clo_log_path = get_logger_file_path(clp_logs_dir, job_id, task_id) clo_log_file = open(clo_log_path, "w") logger.info(f"Started task for job {job_id}") @@ -120,7 +118,7 @@ def search( sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) start_time = datetime.datetime.now() - search_status = QueryTaskStatus.RUNNING + search_status: QueryTaskStatus with closing(sql_adapter.create_connection(True)) as db_conn, closing( db_conn.cursor(dictionary=True) ) as db_cursor: @@ -154,6 +152,7 @@ def search( error_log_path=str(clo_log_path), ).dict() + search_status = QueryTaskStatus.RUNNING update_query_task_metadata( db_cursor, task_id, dict(status=search_status, start_time=start_time) ) diff --git 
a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py index f5bf04c39..b19e6f1a6 100644 --- a/components/job-orchestration/job_orchestration/executor/query/utils.py +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -1,4 +1,5 @@ from typing import Any, Dict +from pathlib import Path from clp_py_utils.clp_config import QUERY_TASKS_TABLE_NAME def update_query_task_metadata( db_cursor, @@ -14,3 +15,12 @@ def update_query_task_metadata( WHERE id = {task_id} """ db_cursor.execute(query) + +def get_logger_file_path( + clp_logs_dir: Path, + job_id: str, + task_id: int +) -> Path: + worker_logs_dir = clp_logs_dir / job_id + worker_logs_dir.mkdir(exist_ok=True, parents=True) + return worker_logs_dir / f"{task_id}-clo.log" diff --git a/components/package-template/src/etc/clp-config.yml b/components/package-template/src/etc/clp-config.yml index 740146ab9..84cac8eaa 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -47,6 +47,7 @@ # host: "localhost" # port: 27017 # db_name: "clp-search" +# ir_collection_name: "clp-ir" # #compression_worker: # logging_level: "INFO" @@ -77,6 +78,10 @@ # # How much data CLP should try to fit into each segment within an archive # target_segment_size: 268435456 # 256 MB # +## Where IR should be output to +#ir_output: +# directory: "var/data/ir" +# ## Location where other data (besides archives) are stored. It will be created if ## it doesn't exist. #data_directory: "var/data" From afa26f9218d7fa743c43a07d81ede15a351d32af Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 21 Jun 2024 11:50:07 -0400 Subject: [PATCH 10/28] First Refactoring --- .../executor/query/extract_ir_task.py | 99 +++++++---------- .../executor/query/fs_search_task.py | 101 ++++++++---------- .../job_orchestration/executor/query/utils.py | 62 +++++++---- .../scheduler/query/query_scheduler.py | 36 +++---- .../package-template/src/etc/clp-config.yml | 12 +-- 5 files changed, 149 insertions(+), 161 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py index 259004523..aac2673e6 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -9,17 +9,19 @@ from celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, QUERY_TASKS_TABLE_NAME, StorageEngine +from clp_py_utils.clp_config import Database, StorageEngine from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.scheduler.job_config import ExtractIrConfig from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus -from .utils import update_query_task_metadata, get_logger_file_path + +from .utils import get_logger_file_path, get_task_results, update_query_task_metadata # Setup logging logger = get_task_logger(__name__) + def make_command( storage_engine: str, clp_home: Path, @@ -40,7 +42,7 @@ def make_command( extract_ir_config.file_split_id, str(ir_output_dir), results_cache_uri, - results_collection + results_collection, ] if 
extract_ir_config.target_size is not None: command.append("--target-size") @@ -50,6 +52,7 @@ def make_command( return command + @app.task(bind=True) def extract_ir( self: Task, @@ -81,45 +84,40 @@ def extract_ir( start_time = datetime.datetime.now() job_status: QueryTaskStatus - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - try: - extract_ir_command = make_command( - storage_engine=clp_storage_engine, - clp_home=clp_home, - archives_dir=archive_directory, - ir_output_dir=ir_directory, - archive_id=archive_id, - extract_ir_config=extract_ir_config, - results_cache_uri=results_cache_uri, - results_collection=ir_collection, - ) - except ValueError as e: - error_message = f"Error creating extract command: {e}" - logger.error(error_message) - job_status = QueryTaskStatus.FAILED - update_query_task_metadata( - db_cursor, - task_id, - dict(status=job_status, duration=0, start_time=start_time), - ) - db_conn.commit() - clo_log_file.write(error_message) - clo_log_file.close() - - return QueryTaskResult( - task_id=task_id, - status=job_status, - duration=0, - error_log_path=str(clo_log_path), - ).dict() - - job_status = QueryTaskStatus.RUNNING + try: + extract_ir_command = make_command( + storage_engine=clp_storage_engine, + clp_home=clp_home, + archives_dir=archive_directory, + ir_output_dir=ir_directory, + archive_id=archive_id, + extract_ir_config=extract_ir_config, + results_cache_uri=results_cache_uri, + results_collection=ir_collection, + ) + except ValueError as e: + error_message = f"Error creating extract command: {e}" + logger.error(error_message) + clo_log_file.write(error_message) + + job_status = QueryTaskStatus.FAILED update_query_task_metadata( - db_cursor, task_id, dict(status=job_status, start_time=start_time) + sql_adapter, + task_id, + dict(status=job_status, duration=0, start_time=start_time), ) - db_conn.commit() + clo_log_file.write(error_message) + + clo_log_file.close() + return QueryTaskResult( + task_id=task_id, + status=job_status, + duration=0, + error_log_path=str(clo_log_path), + ).dict() + + job_status = QueryTaskStatus.RUNNING + update_query_task_metadata(sql_adapter, task_id, dict(status=job_status, start_time=start_time)) logger.info(f'Running: {" ".join(extract_ir_command)}') extract_proc = subprocess.Popen( @@ -130,7 +128,6 @@ def extract_ir( stderr=clo_log_file, ) - logger.info("Waiting for extraction to finish") # communicate is equivalent to wait in this case, but avoids deadlocks if we switch to piping # stdout/stderr in the future. 
extract_proc.communicate() @@ -142,25 +139,11 @@ def extract_ir( job_status = QueryTaskStatus.SUCCEEDED logger.info(f"Extraction task completed for job {job_id}") - # Close log files clo_log_file.close() duration = (datetime.datetime.now() - start_time).total_seconds() - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_query_task_metadata( - db_cursor, task_id, dict(status=job_status, start_time=start_time, duration=duration) - ) - db_conn.commit() - - extract_ir_task_result = QueryTaskResult( - status=job_status, - task_id=task_id, - duration=duration, + update_query_task_metadata( + sql_adapter, task_id, dict(status=job_status, start_time=start_time, duration=duration) ) - if QueryTaskStatus.FAILED == job_status: - extract_ir_task_result.error_log_path = str(clo_log_path) - - return extract_ir_task_result.dict() + return get_task_results(task_id, job_status, duration, clo_log_path) diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 7906bf619..fad0af8b0 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -9,16 +9,19 @@ from celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, QUERY_TASKS_TABLE_NAME, StorageEngine +from clp_py_utils.clp_config import Database, StorageEngine from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.scheduler.job_config import SearchConfig from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus -from .utils import update_query_task_metadata, get_logger_file_path + +from .utils import get_logger_file_path, get_task_results, update_query_task_metadata + # Setup logging logger = get_task_logger(__name__) + def make_command( storage_engine: str, clp_home: Path, @@ -118,45 +121,41 @@ def search( sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) start_time = datetime.datetime.now() - search_status: QueryTaskStatus - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - try: - search_command = make_command( - storage_engine=clp_storage_engine, - clp_home=clp_home, - archives_dir=archive_directory, - archive_id=archive_id, - search_config=search_config, - results_cache_uri=results_cache_uri, - results_collection=job_id, - ) - except ValueError as e: - error_message = f"Error creating search command: {e}" - logger.error(error_message) - - update_query_task_metadata( - db_cursor, - task_id, - dict(status=QueryTaskStatus.FAILED, duration=0, start_time=start_time), - ) - db_conn.commit() - clo_log_file.write(error_message) - clo_log_file.close() - - return QueryTaskResult( - task_id=task_id, - status=QueryTaskStatus.FAILED, - duration=0, - error_log_path=str(clo_log_path), - ).dict() - - search_status = QueryTaskStatus.RUNNING + job_status: QueryTaskStatus + try: + search_command = make_command( + storage_engine=clp_storage_engine, + clp_home=clp_home, + archives_dir=archive_directory, + archive_id=archive_id, + search_config=search_config, + results_cache_uri=results_cache_uri, + results_collection=job_id, + ) + except 
ValueError as e: + error_message = f"Error creating search command: {e}" + logger.error(error_message) + clo_log_file.write(error_message) + + job_status = QueryTaskStatus.FAILED update_query_task_metadata( - db_cursor, task_id, dict(status=search_status, start_time=start_time) + sql_adapter, + task_id, + dict(status=job_status, duration=0, start_time=start_time), ) - db_conn.commit() + + clo_log_file.close() + return QueryTaskResult( + task_id=task_id, + status=job_status, + duration=0, + error_log_path=str(clo_log_path), + ).dict() + + search_status = QueryTaskStatus.RUNNING + update_query_task_metadata( + sql_adapter, task_id, dict(status=search_status, start_time=start_time) + ) logger.info(f'Running: {" ".join(search_command)}') search_proc = subprocess.Popen( @@ -188,31 +187,17 @@ def sigterm_handler(_signo, _stack_frame): search_proc.communicate() return_code = search_proc.returncode if 0 != return_code: - search_status = QueryTaskStatus.FAILED + job_status = QueryTaskStatus.FAILED logger.error(f"Failed search task for job {job_id} - return_code={return_code}") else: - search_status = QueryTaskStatus.SUCCEEDED + job_status = QueryTaskStatus.SUCCEEDED logger.info(f"Search task completed for job {job_id}") - # Close log files clo_log_file.close() duration = (datetime.datetime.now() - start_time).total_seconds() - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_query_task_metadata( - db_cursor, task_id, dict(status=search_status, start_time=start_time, duration=duration) - ) - db_conn.commit() - - search_task_result = QueryTaskResult( - status=search_status, - task_id=task_id, - duration=duration, + update_query_task_metadata( + sql_adapter, task_id, dict(status=job_status, start_time=start_time, duration=duration) ) - if QueryTaskStatus.FAILED == search_status: - search_task_result.error_log_path = str(clo_log_path) - - return search_task_result.dict() + return get_task_results(task_id, job_status, duration, clo_log_path) diff --git a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py index b19e6f1a6..0c163591b 100644 --- a/components/job-orchestration/job_orchestration/executor/query/utils.py +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -1,26 +1,48 @@ -from typing import Any, Dict +from contextlib import closing from pathlib import Path +from typing import Any, Dict + from clp_py_utils.clp_config import QUERY_TASKS_TABLE_NAME +from clp_py_utils.sql_adapter import SQL_Adapter +from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus + + +def get_logger_file_path(clp_logs_dir: Path, job_id: str, task_id: int) -> Path: + worker_logs_dir = clp_logs_dir / job_id + worker_logs_dir.mkdir(exist_ok=True, parents=True) + return worker_logs_dir / f"{task_id}-clo.log" + + +def get_task_results( + task_id: int, job_status: QueryTaskStatus, duration: float, clo_log_path: Path +) -> Dict[Any, Any]: + + task_result = QueryTaskResult( + status=job_status, + task_id=task_id, + duration=duration, + ) + + if QueryTaskStatus.FAILED == job_status: + task_result.error_log_path = str(clo_log_path) + + return task_result.dict() + + def update_query_task_metadata( - db_cursor, + sql_adapter: SQL_Adapter, task_id: int, kv_pairs: Dict[str, Any], ): - if not kv_pairs or len(kv_pairs) == 0: - raise ValueError("No key-value pairs provided to update query task metadata") - - 
query = f""" - UPDATE {QUERY_TASKS_TABLE_NAME} - SET {', '.join([f'{k}="{v}"' for k, v in kv_pairs.items()])} - WHERE id = {task_id} - """ - db_cursor.execute(query) - -def get_logger_file_path( - clp_logs_dir: Path, - job_id: str, - task_id: int -) -> Path: - worker_logs_dir = clp_logs_dir / job_id - worker_logs_dir.mkdir(exist_ok=True, parents=True) - return worker_logs_dir / f"{task_id}-clo.log" + with closing(sql_adapter.create_connection(True)) as db_conn, closing( + db_conn.cursor(dictionary=True) + ) as db_cursor: + if not kv_pairs or len(kv_pairs) == 0: + raise ValueError("No key-value pairs provided to update query task metadata") + + query = f""" + UPDATE {QUERY_TASKS_TABLE_NAME} + SET {', '.join([f'{k}="{v}"' for k, v in kv_pairs.items()])} + WHERE id = {task_id} + """ + db_cursor.execute(query) diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index 55f96a12e..6dd74f031 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -39,8 +39,8 @@ from clp_py_utils.core import read_yaml_config_file from clp_py_utils.decorators import exception_default_value from clp_py_utils.sql_adapter import SQL_Adapter -from job_orchestration.executor.query.fs_search_task import search from job_orchestration.executor.query.extract_ir_task import extract_ir +from job_orchestration.executor.query.fs_search_task import search from job_orchestration.scheduler.constants import QueryJobStatus, QueryJobType, QueryTaskStatus from job_orchestration.scheduler.job_config import ExtractIrConfig, SearchConfig from job_orchestration.scheduler.query.reducer_handler import ( @@ -511,18 +511,20 @@ def handle_pending_query_jobs( elif QueryJobType.EXTRACT_IR == job_type: extract_ir_config = ExtractIrConfig.parse_obj(msgpack.unpackb(job_config)) - archive_id = get_archive_and_update_config_for_extraction(db_conn, extract_ir_config) + archive_id = get_archive_and_update_config_for_extraction( + db_conn, extract_ir_config + ) if not archive_id: logger.error(f"Failed to get archive for extraction") if not set_job_or_task_status( - db_conn, - QUERY_JOBS_TABLE_NAME, - job_id, - QueryJobStatus.FAILED, - QueryJobStatus.PENDING, - start_time=datetime.datetime.now(), - num_tasks=0, - duration=0, + db_conn, + QUERY_JOBS_TABLE_NAME, + job_id, + QueryJobStatus.FAILED, + QueryJobStatus.PENDING, + start_time=datetime.datetime.now(), + num_tasks=0, + duration=0, ): logger.error(f"Failed to set job: {job_id} as failed") continue @@ -531,7 +533,7 @@ def handle_pending_query_jobs( id=job_id, archive_id=archive_id, extract_ir_config=extract_ir_config, - state=InternalJobState.WAITING_FOR_DISPATCH + state=InternalJobState.WAITING_FOR_DISPATCH, ) target_archive = [new_extract_ir_job.archive_id] @@ -541,12 +543,10 @@ def handle_pending_query_jobs( target_archive, clp_metadata_db_conn_params, results_cache_uri, - 1 + 1, ) active_jobs[new_extract_ir_job.id] = new_extract_ir_job - logger.info( - f"Dispatched IR extraction job {job_id} on archive: {archive_id}" - ) + logger.info(f"Dispatched IR extraction job {job_id} on archive: {archive_id}") else: logger.error(f"Unexpected job type: {job_type}, skipping job {job_id}") @@ -576,7 +576,7 @@ def handle_pending_query_jobs( archive_ids_for_search, clp_metadata_db_conn_params, results_cache_uri, - job.num_archives_to_search + 
job.num_archives_to_search, ) logger.info( f"Dispatched job {job_id} with {len(archive_ids_for_search)} archives to search." @@ -786,9 +786,7 @@ async def check_job_status_and_update_db(db_conn_pool, results_cache_uri): ) elif QueryJobType.EXTRACT_IR == job_type: extract_ir_job: ExtractIrJob = job - await handle_returned_extract_ir_job( - db_conn, extract_ir_job, returned_results - ) + await handle_returned_extract_ir_job(db_conn, extract_ir_job, returned_results) else: logger.error(f"Unexpected job type: {job_type}, skipping job {job_id}") diff --git a/components/package-template/src/etc/clp-config.yml b/components/package-template/src/etc/clp-config.yml index 84cac8eaa..7b583e1c6 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -10,12 +10,12 @@ #package: # storage_engine: "clp" # -#database: -# type: "mariadb" # "mariadb" or "mysql" -# host: "localhost" -# port: 3306 -# name: "clp-db" -# +database: + type: "mariadb" # "mariadb" or "mysql" + host: "localhost" + port: 4001 + name: "clp-db" + #compression_scheduler: # jobs_poll_delay: 0.1 # seconds # logging_level: "INFO" From e2cdcb1c16f4f576f983b23d47ba7c32b95a6310 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 21 Jun 2024 12:17:24 -0400 Subject: [PATCH 11/28] Refactoring --- .../clp_package_utils/scripts/start_clp.py | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 7901f0f8c..cf5194667 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -8,8 +8,8 @@ import subprocess import sys import time -import typing import uuid +from typing import Any, Dict, List, Optional import yaml from clp_py_utils.clp_config import ( @@ -526,6 +526,8 @@ def start_compression_worker( clp_config.redis.compression_backend_database, num_cpus, mounts, + None, + None, ) @@ -538,6 +540,10 @@ def start_query_worker( ): celery_method = "job_orchestration.executor.query" celery_route = f"{QueueName.QUERY}" + + query_worker_mount = [mounts.ir_output_dir] + query_worker_env = {"CLP_IR_OUTPUT_DIR": container_clp_config.ir_output.directory} + generic_start_worker( QUERY_WORKER_COMPONENT_NAME, instance_id, @@ -549,6 +555,8 @@ def start_query_worker( clp_config.redis.query_backend_database, num_cpus, mounts, + query_worker_env, + query_worker_mount, ) @@ -563,6 +571,8 @@ def generic_start_worker( redis_database: int, num_cpus: int, mounts: CLPDockerMounts, + worker_specific_env: Dict[str, Any], + worker_specific_mount: List[Optional[DockerMount]], ): logger.info(f"Starting {component_name}...") @@ -589,6 +599,7 @@ def generic_start_worker( "-w", str(CONTAINER_CLP_HOME), "--name", container_name, "--log-driver", "local", + "-u", f"{os.getuid()}:{os.getgid()}", "-e", f"PYTHONPATH={clp_site_packages_dir}", "-e", ( f"BROKER_URL=amqp://" @@ -607,21 +618,28 @@ def generic_start_worker( "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", "-e", f"CLP_IR_COLLECTION={clp_config.results_cache.ir_collection_name}", - "-u", f"{os.getuid()}:{os.getgid()}", - "--mount", str(mounts.clp_home), ] + if worker_specific_env: + for env_name, env_value in worker_specific_env.items(): + container_start_cmd.append("-e") + 
container_start_cmd.append(f"{env_name}={env_value}") + # fmt: on necessary_mounts = [ + mounts.clp_home, mounts.data_dir, mounts.logs_dir, mounts.archives_output_dir, - mounts.ir_output_dir, mounts.input_logs_dir, ] + if worker_specific_mount: + necessary_mounts.extend(worker_specific_mount) + for mount in necessary_mounts: - if mount: - container_start_cmd.append("--mount") - container_start_cmd.append(str(mount)) + if not mount: + raise ValueError(f"Required mount configuration is empty: {necessary_mounts}") + container_start_cmd.append("--mount") + container_start_cmd.append(str(mount)) container_start_cmd.append(clp_config.execution_container) worker_cmd = [ @@ -649,8 +667,8 @@ def generic_start_worker( def update_meteor_settings( parent_key_prefix: str, - settings: typing.Dict[str, typing.Any], - updates: typing.Dict[str, typing.Any], + settings: Dict[str, Any], + updates: Dict[str, Any], ): """ Recursively updates the given Meteor settings object with the values from `updates`. From 46431cf2838866706b94f62bda1c09b3df59bcc3 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 21 Jun 2024 12:21:20 -0400 Subject: [PATCH 12/28] refactor log messages and names --- .../executor/query/extract_ir_task.py | 37 ++++++++++--------- .../executor/query/fs_search_task.py | 24 ++++++------ .../job_orchestration/executor/query/utils.py | 3 +- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py index aac2673e6..6efcddd3b 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -16,7 +16,7 @@ from job_orchestration.scheduler.job_config import ExtractIrConfig from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus -from .utils import get_logger_file_path, get_task_results, update_query_task_metadata +from .utils import get_logger_file_path, generate_final_task_results, update_query_task_metadata # Setup logging logger = get_task_logger(__name__) @@ -77,15 +77,15 @@ def extract_ir( clo_log_path = get_logger_file_path(clp_logs_dir, job_id, task_id) clo_log_file = open(clo_log_path, "w") - logger.info(f"Started extract IR task for job {job_id}") + logger.info(f"Started IR extraction task for job {job_id}") extract_ir_config = ExtractIrConfig.parse_obj(job_config_obj) sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) start_time = datetime.datetime.now() - job_status: QueryTaskStatus + task_status: QueryTaskStatus try: - extract_ir_command = make_command( + task_command = make_command( storage_engine=clp_storage_engine, clp_home=clp_home, archives_dir=archive_directory, @@ -96,54 +96,55 @@ def extract_ir( results_collection=ir_collection, ) except ValueError as e: - error_message = f"Error creating extract command: {e}" + error_message = f"Error creating IR extraction command: {e}" logger.error(error_message) clo_log_file.write(error_message) - job_status = QueryTaskStatus.FAILED + task_status = QueryTaskStatus.FAILED update_query_task_metadata( sql_adapter, task_id, - dict(status=job_status, duration=0, start_time=start_time), + dict(status=task_status, duration=0, start_time=start_time), ) clo_log_file.write(error_message) clo_log_file.close() return QueryTaskResult( task_id=task_id, - status=job_status, + 
status=task_status, duration=0, error_log_path=str(clo_log_path), ).dict() - job_status = QueryTaskStatus.RUNNING - update_query_task_metadata(sql_adapter, task_id, dict(status=job_status, start_time=start_time)) + task_status = QueryTaskStatus.RUNNING + update_query_task_metadata(sql_adapter, task_id, dict(status=task_status, start_time=start_time)) - logger.info(f'Running: {" ".join(extract_ir_command)}') + logger.info(f'Running: {" ".join(task_command)}') extract_proc = subprocess.Popen( - extract_ir_command, + task_command, preexec_fn=os.setpgrp, close_fds=True, stdout=clo_log_file, stderr=clo_log_file, ) + logger.info("Waiting for IR extraction to finish") # communicate is equivalent to wait in this case, but avoids deadlocks if we switch to piping # stdout/stderr in the future. extract_proc.communicate() return_code = extract_proc.returncode if 0 != return_code: - job_status = QueryTaskStatus.FAILED - logger.error(f"Failed extraction task for job {job_id} - return_code={return_code}") + task_status = QueryTaskStatus.FAILED + logger.error(f"Failed IR extraction task for job {job_id} - return_code={return_code}") else: - job_status = QueryTaskStatus.SUCCEEDED - logger.info(f"Extraction task completed for job {job_id}") + task_status = QueryTaskStatus.SUCCEEDED + logger.info(f"IR extraction task completed for job {job_id}") clo_log_file.close() duration = (datetime.datetime.now() - start_time).total_seconds() update_query_task_metadata( - sql_adapter, task_id, dict(status=job_status, start_time=start_time, duration=duration) + sql_adapter, task_id, dict(status=task_status, start_time=start_time, duration=duration) ) - return get_task_results(task_id, job_status, duration, clo_log_path) + return generate_final_task_results(task_id, task_status, duration, clo_log_path) diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index fad0af8b0..efaab2809 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -16,7 +16,7 @@ from job_orchestration.scheduler.job_config import SearchConfig from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus -from .utils import get_logger_file_path, get_task_results, update_query_task_metadata +from .utils import get_logger_file_path, generate_final_task_results, update_query_task_metadata # Setup logging logger = get_task_logger(__name__) @@ -121,9 +121,9 @@ def search( sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) start_time = datetime.datetime.now() - job_status: QueryTaskStatus + task_status: QueryTaskStatus try: - search_command = make_command( + task_command = make_command( storage_engine=clp_storage_engine, clp_home=clp_home, archives_dir=archive_directory, @@ -137,17 +137,17 @@ def search( logger.error(error_message) clo_log_file.write(error_message) - job_status = QueryTaskStatus.FAILED + task_status = QueryTaskStatus.FAILED update_query_task_metadata( sql_adapter, task_id, - dict(status=job_status, duration=0, start_time=start_time), + dict(status=task_status, duration=0, start_time=start_time), ) clo_log_file.close() return QueryTaskResult( task_id=task_id, - status=job_status, + status=task_status, duration=0, error_log_path=str(clo_log_path), ).dict() @@ -157,9 +157,9 @@ def search( sql_adapter, task_id, dict(status=search_status, 
start_time=start_time) ) - logger.info(f'Running: {" ".join(search_command)}') + logger.info(f'Running: {" ".join(task_command)}') search_proc = subprocess.Popen( - search_command, + task_command, preexec_fn=os.setpgrp, close_fds=True, stdout=clo_log_file, @@ -187,17 +187,17 @@ def sigterm_handler(_signo, _stack_frame): search_proc.communicate() return_code = search_proc.returncode if 0 != return_code: - job_status = QueryTaskStatus.FAILED + task_status = QueryTaskStatus.FAILED logger.error(f"Failed search task for job {job_id} - return_code={return_code}") else: - job_status = QueryTaskStatus.SUCCEEDED + task_status = QueryTaskStatus.SUCCEEDED logger.info(f"Search task completed for job {job_id}") clo_log_file.close() duration = (datetime.datetime.now() - start_time).total_seconds() update_query_task_metadata( - sql_adapter, task_id, dict(status=job_status, start_time=start_time, duration=duration) + sql_adapter, task_id, dict(status=task_status, start_time=start_time, duration=duration) ) - return get_task_results(task_id, job_status, duration, clo_log_path) + return generate_final_task_results(task_id, task_status, duration, clo_log_path) diff --git a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py index 0c163591b..340b617db 100644 --- a/components/job-orchestration/job_orchestration/executor/query/utils.py +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -13,10 +13,9 @@ def get_logger_file_path(clp_logs_dir: Path, job_id: str, task_id: int) -> Path: return worker_logs_dir / f"{task_id}-clo.log" -def get_task_results( +def generate_final_task_results( task_id: int, job_status: QueryTaskStatus, duration: float, clo_log_path: Path ) -> Dict[Any, Any]: - task_result = QueryTaskResult( status=job_status, task_id=task_id, From 43dede43a05804bc577de0ef0083bcf488704c84 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 21 Jun 2024 12:24:41 -0400 Subject: [PATCH 13/28] remove unused imports --- .../job_orchestration/executor/query/extract_ir_task.py | 3 --- .../job_orchestration/executor/query/fs_search_task.py | 1 - 2 files changed, 4 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py index 6efcddd3b..1885b6a21 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -1,9 +1,6 @@ import datetime import os -import signal import subprocess -import sys -from contextlib import closing from pathlib import Path from typing import Any, Dict diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index efaab2809..b4111323e 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -3,7 +3,6 @@ import signal import subprocess import sys -from contextlib import closing from pathlib import Path from typing import Any, Dict From 57492bb0160117d8f55f1cca0d3b556db6f52745 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 21 Jun 2024 12:53:01 -0400 Subject: [PATCH 14/28] fix --- 
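Note: this patch scopes the IR-related settings to the query worker by removing
`CLP_IR_OUTPUT_DIR` and `CLP_IR_COLLECTION` from the shared worker environment
list and keeping them in `query_worker_env`. Below is a minimal, illustrative
sketch of how a worker-specific environment dict is expected to become
`docker run -e` arguments in `generic_start_worker`; the helper name
`build_worker_env_args` and the example values are assumptions for illustration
only and are not part of this patch.

# Sketch only -- helper name and values are illustrative, not from the patch.
from typing import Dict, List

def build_worker_env_args(worker_specific_env: Dict[str, str]) -> List[str]:
    # Mirror generic_start_worker(): each entry becomes a `-e NAME=VALUE`
    # pair appended to the container start command.
    args: List[str] = []
    for env_name, env_value in worker_specific_env.items():
        args.append("-e")
        args.append(f"{env_name}={env_value}")
    return args

# Example: the query worker's IR settings handled by this patch
# (values shown here are placeholders).
query_worker_env = {
    "CLP_IR_OUTPUT_DIR": "/path/to/var/data/ir",
    "CLP_IR_COLLECTION": "clp-ir",
}
print(build_worker_env_args(query_worker_env))
# ['-e', 'CLP_IR_OUTPUT_DIR=/path/to/var/data/ir', '-e', 'CLP_IR_COLLECTION=clp-ir']
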
.../clp_package_utils/scripts/start_clp.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index cf5194667..842927c98 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -542,7 +542,10 @@ def start_query_worker( celery_route = f"{QueueName.QUERY}" query_worker_mount = [mounts.ir_output_dir] - query_worker_env = {"CLP_IR_OUTPUT_DIR": container_clp_config.ir_output.directory} + query_worker_env = { + "CLP_IR_OUTPUT_DIR": container_clp_config.ir_output.directory, + "CLP_IR_COLLECTION": clp_config.results_cache.ir_collection_name + } generic_start_worker( QUERY_WORKER_COMPONENT_NAME, @@ -613,11 +616,9 @@ def generic_start_worker( "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", "-e", f"CLP_DATA_DIR={container_clp_config.data_directory}", "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}", - "-e", f"CLP_IR_OUTPUT_DIR={container_clp_config.ir_output.directory}", "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", - "-e", f"CLP_IR_COLLECTION={clp_config.results_cache.ir_collection_name}", ] if worker_specific_env: for env_name, env_value in worker_specific_env.items(): From 3a785f8c43080050133b005f61cc6ec8d95094d2 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 21 Jun 2024 17:53:10 -0400 Subject: [PATCH 15/28] Polishing --- .../job_orchestration/executor/query/extract_ir_task.py | 5 ++--- .../job_orchestration/executor/query/fs_search_task.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py index 1885b6a21..88e6022e9 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -103,7 +103,6 @@ def extract_ir( task_id, dict(status=task_status, duration=0, start_time=start_time), ) - clo_log_file.write(error_message) clo_log_file.close() return QueryTaskResult( @@ -132,10 +131,10 @@ def extract_ir( return_code = extract_proc.returncode if 0 != return_code: task_status = QueryTaskStatus.FAILED - logger.error(f"Failed IR extraction task for job {job_id} - return_code={return_code}") + logger.error(f"IR extraction task {task_id} failed for job {job_id} - return_code={return_code}") else: task_status = QueryTaskStatus.SUCCEEDED - logger.info(f"IR extraction task completed for job {job_id}") + logger.info(f"IR extraction task {task_id} completed for job {job_id}") clo_log_file.close() duration = (datetime.datetime.now() - start_time).total_seconds() diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index b4111323e..5027de632 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -187,10 +187,10 @@ def sigterm_handler(_signo, _stack_frame): return_code = search_proc.returncode if 0 != return_code: task_status = 
QueryTaskStatus.FAILED - logger.error(f"Failed search task for job {job_id} - return_code={return_code}") + logger.error(f"Search task {task_id} failed for job {job_id} - return_code={return_code}") else: task_status = QueryTaskStatus.SUCCEEDED - logger.info(f"Search task completed for job {job_id}") + logger.info(f"Search task {task_id} completed for job {job_id}") clo_log_file.close() duration = (datetime.datetime.now() - start_time).total_seconds() From d740476b039a7f73c479351ec490e8d9bf3768ec Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 24 Jun 2024 19:12:40 -0400 Subject: [PATCH 16/28] Fixes --- .../scheduler/query/query_scheduler.py | 4 ++-- .../job_orchestration/scheduler/scheduler_data.py | 2 +- components/package-template/src/etc/clp-config.yml | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index 4b94581af..f5ea42396 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -330,7 +330,7 @@ def get_task_group_for_job( results_cache_uri: str, ): job_config_obj = job.get_config().dict() - if job.type() == QueryJobType.SEARCH_OR_AGGREGATION: + if job.get_type() == QueryJobType.SEARCH_OR_AGGREGATION: return celery.group( search.s( job_id=job.id, @@ -342,7 +342,7 @@ def get_task_group_for_job( ) for i in range(len(archive_ids)) ) - if job.type() == QueryJobType.EXTRACT_IR: + if job.get_type() == QueryJobType.EXTRACT_IR: return celery.group( extract_ir.s( job_id=job.id, diff --git a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py index 91fcec217..6dc83931f 100644 --- a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py +++ b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py @@ -50,7 +50,7 @@ class QueryJob(BaseModel, ABC): def get_type(self) -> QueryJobType: ... @abstractmethod - def get_config(self) -> QueryConfig: ... + def get_config(self) -> QueryJobConfig: ... 
class ExtractIrJob(QueryJob): diff --git a/components/package-template/src/etc/clp-config.yml b/components/package-template/src/etc/clp-config.yml index 7b583e1c6..84cac8eaa 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -10,12 +10,12 @@ #package: # storage_engine: "clp" # -database: - type: "mariadb" # "mariadb" or "mysql" - host: "localhost" - port: 4001 - name: "clp-db" - +#database: +# type: "mariadb" # "mariadb" or "mysql" +# host: "localhost" +# port: 3306 +# name: "clp-db" +# #compression_scheduler: # jobs_poll_delay: 0.1 # seconds # logging_level: "INFO" From fbbf7f501dc6275aeb3f0e98a2ebf046484c87bd Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 24 Jun 2024 19:17:27 -0400 Subject: [PATCH 17/28] Fixes again --- .../job_orchestration/scheduler/job_config.py | 1 - .../job_orchestration/scheduler/query/query_scheduler.py | 9 +++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/components/job-orchestration/job_orchestration/scheduler/job_config.py b/components/job-orchestration/job_orchestration/scheduler/job_config.py index 71659c8c8..f8c81a3ba 100644 --- a/components/job-orchestration/job_orchestration/scheduler/job_config.py +++ b/components/job-orchestration/job_orchestration/scheduler/job_config.py @@ -1,7 +1,6 @@ from __future__ import annotations import typing -from abc import ABC from pydantic import BaseModel, validator diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index f5ea42396..7787cd414 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -330,7 +330,8 @@ def get_task_group_for_job( results_cache_uri: str, ): job_config_obj = job.get_config().dict() - if job.get_type() == QueryJobType.SEARCH_OR_AGGREGATION: + job_type = job.get_type() + if QueryJobType.SEARCH_OR_AGGREGATION == job_type: return celery.group( search.s( job_id=job.id, @@ -342,7 +343,7 @@ def get_task_group_for_job( ) for i in range(len(archive_ids)) ) - if job.get_type() == QueryJobType.EXTRACT_IR: + elif QueryJobType.EXTRACT_IR == job_type: return celery.group( extract_ir.s( job_id=job.id, @@ -354,6 +355,10 @@ def get_task_group_for_job( ) for i in range(len(archive_ids)) ) + else: + error_msg = f"Unexpected job type: {job_type}" + logger.error(error_msg) + raise NotImplementedError(error_msg) def dispatch_query_job( From f00029c0771538182334ae2d0c02ff5e67abe58d Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 24 Jun 2024 19:20:25 -0400 Subject: [PATCH 18/28] linter --- .../clp_package_utils/scripts/start_clp.py | 2 +- .../executor/query/extract_ir_task.py | 10 +++++++--- .../job_orchestration/executor/query/fs_search_task.py | 2 +- .../job_orchestration/scheduler/job_config.py | 1 + .../job_orchestration/scheduler/scheduler_data.py | 6 +++++- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 842927c98..23128ef8b 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -544,7 +544,7 @@ def 
start_query_worker( query_worker_mount = [mounts.ir_output_dir] query_worker_env = { "CLP_IR_OUTPUT_DIR": container_clp_config.ir_output.directory, - "CLP_IR_COLLECTION": clp_config.results_cache.ir_collection_name + "CLP_IR_COLLECTION": clp_config.results_cache.ir_collection_name, } generic_start_worker( diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py index 74bba1fc3..bb5f222fb 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -13,7 +13,7 @@ from job_orchestration.scheduler.job_config import ExtractIrJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus -from .utils import get_logger_file_path, generate_final_task_results, update_query_task_metadata +from .utils import generate_final_task_results, get_logger_file_path, update_query_task_metadata # Setup logging logger = get_task_logger(__name__) @@ -113,7 +113,9 @@ def extract_ir( ).dict() task_status = QueryTaskStatus.RUNNING - update_query_task_metadata(sql_adapter, task_id, dict(status=task_status, start_time=start_time)) + update_query_task_metadata( + sql_adapter, task_id, dict(status=task_status, start_time=start_time) + ) logger.info(f'Running: {" ".join(task_command)}') extract_proc = subprocess.Popen( @@ -131,7 +133,9 @@ def extract_ir( return_code = extract_proc.returncode if 0 != return_code: task_status = QueryTaskStatus.FAILED - logger.error(f"IR extraction task {task_id} failed for job {job_id} - return_code={return_code}") + logger.error( + f"IR extraction task {task_id} failed for job {job_id} - return_code={return_code}" + ) else: task_status = QueryTaskStatus.SUCCEEDED logger.info(f"IR extraction task {task_id} completed for job {job_id}") diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 4a95fbe11..844af24e7 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -15,7 +15,7 @@ from job_orchestration.scheduler.job_config import SearchJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus -from .utils import get_logger_file_path, generate_final_task_results, update_query_task_metadata +from .utils import generate_final_task_results, get_logger_file_path, update_query_task_metadata # Setup logging logger = get_task_logger(__name__) diff --git a/components/job-orchestration/job_orchestration/scheduler/job_config.py b/components/job-orchestration/job_orchestration/scheduler/job_config.py index f8c81a3ba..6ae820713 100644 --- a/components/job-orchestration/job_orchestration/scheduler/job_config.py +++ b/components/job-orchestration/job_orchestration/scheduler/job_config.py @@ -41,6 +41,7 @@ class AggregationConfig(BaseModel): class QueryJobConfig(BaseModel): ... 
+ class ExtractIrJobConfig(QueryJobConfig): orig_file_id: str msg_ix: int diff --git a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py index 6dc83931f..3d4c0d7a7 100644 --- a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py +++ b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py @@ -9,7 +9,11 @@ QueryJobType, QueryTaskStatus, ) -from job_orchestration.scheduler.job_config import ExtractIrJobConfig, QueryJobConfig, SearchJobConfig +from job_orchestration.scheduler.job_config import ( + ExtractIrJobConfig, + QueryJobConfig, + SearchJobConfig, +) from job_orchestration.scheduler.query.reducer_handler import ReducerHandlerMessageQueues from pydantic import BaseModel, validator From df88105a06d2a696d82438be7d4463a25e8b7687 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Tue, 25 Jun 2024 17:39:22 -0400 Subject: [PATCH 19/28] Add configurable option for target_uncompressed_size --- .../clp_package_utils/scripts/start_clp.py | 2 +- components/clp-py-utils/clp_py_utils/clp_config.py | 7 +++++++ components/package-template/src/etc/clp-config.yml | 4 ++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 23128ef8b..5ba5bbe15 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -602,7 +602,6 @@ def generic_start_worker( "-w", str(CONTAINER_CLP_HOME), "--name", container_name, "--log-driver", "local", - "-u", f"{os.getuid()}:{os.getgid()}", "-e", f"PYTHONPATH={clp_site_packages_dir}", "-e", ( f"BROKER_URL=amqp://" @@ -619,6 +618,7 @@ def generic_start_worker( "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", + "-u", f"{os.getuid()}:{os.getgid()}", ] if worker_specific_env: for env_name, env_value in worker_specific_env.items(): diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index ef75ba10f..8ed4b3a8f 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -330,6 +330,7 @@ def dump_to_primitive_dict(self): class IrOutput(BaseModel): directory: pathlib.Path = pathlib.Path("var") / "data" / "ir" + target_uncompressed_size: int = 128 * 1024 * 1024 @validator("directory") def validate_directory(cls, field): @@ -337,6 +338,12 @@ def validate_directory(cls, field): raise ValueError("directory can not be empty") return field + @validator("target_uncompressed_size") + def validate_target_uncompressed_size(cls, field): + if field <= 0: + raise ValueError("target_uncompressed_size must be greater than 0") + return field + def make_config_paths_absolute(self, clp_home: pathlib.Path): self.directory = make_config_path_absolute(clp_home, self.directory) diff --git a/components/package-template/src/etc/clp-config.yml b/components/package-template/src/etc/clp-config.yml index 84cac8eaa..15fbcc2d3 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -82,6 +82,10 @@ #ir_output: # directory: "var/data/ir" # +# # How large each IR chunk should be before 
being +# # split into a new IR chunk +# target_uncompressed_size: 134217728 # 128 MB +# ## Location where other data (besides archives) are stored. It will be created if ## it doesn't exist. #data_directory: "var/data" From bebde5409a1be6477881e2cd4e26fe569a5a4313 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 26 Jun 2024 10:19:55 -0400 Subject: [PATCH 20/28] Apply suggestions from code review Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../executor/query/fs_search_task.py | 2 +- .../scheduler/query/query_scheduler.py | 17 ++++++++--------- .../package-template/src/etc/clp-config.yml | 5 ++--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 844af24e7..169aeb774 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -114,7 +114,7 @@ def search( clo_log_path = get_logger_file_path(clp_logs_dir, job_id, task_id) clo_log_file = open(clo_log_path, "w") - logger.info(f"Started task for job {job_id}") + logger.info(f"Started search task for job {job_id}") search_config = SearchJobConfig.parse_obj(job_config_obj) sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index 7787cd414..d51642a7b 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -283,16 +283,15 @@ def get_archive_and_update_config_for_extraction( db_conn, extract_ir_config: ExtractIrJobConfig, ) -> Optional[str]: - orig_file_id = extract_ir_config.orig_file_id msg_ix = extract_ir_config.msg_ix results = get_archive_and_file_split_for_extraction(db_conn, orig_file_id, msg_ix) if len(results) == 0: - logger.error(f"No file split and archive match with config: {orig_file_id}:{msg_ix}") + logger.error(f"No matching file splits for orig_file_id={orig_file_id}, msg_ix={msg_ix}") return None elif len(results) > 1: - logger.error(f"Multiple splits match with config: {orig_file_id}:{msg_ix}") + logger.error(f"Multiple file splits found for orig_file_id={orig_file_id}, msg_ix={msg_ix}") for result in results: logger.error(f"{result['archive_id']}:{result['id']}") return None @@ -532,7 +531,7 @@ def handle_pending_query_jobs( num_tasks=0, duration=0, ): - logger.error(f"Failed to set job: {job_id} as failed") + logger.error(f"Failed to set job {job_id} as failed") continue new_extract_ir_job = ExtractIrJob( @@ -712,7 +711,7 @@ async def handle_finished_search_job( async def handle_finished_extract_ir_job( - db_conn, job: SearchJob, task_results: Optional[Any] + db_conn, job: ExtractIrJob, task_results: Optional[Any] ) -> None: global active_jobs @@ -721,8 +720,8 @@ async def handle_finished_extract_ir_job( num_task = len(task_results) if 1 != num_task: logger.error( - f"Unexpected number of task under IR extraction job: {job_id}. " - f"expected 1, got {num_task}" + f"Unexpected number of tasks for IR extraction job {job_id}. " + f"Expected 1, got {num_tasks}." 
) new_job_status = QueryJobStatus.FAILED else: @@ -750,9 +749,9 @@ async def handle_finished_extract_ir_job( duration=(datetime.datetime.now() - job.start_time).total_seconds(), ): if new_job_status == QueryJobStatus.SUCCEEDED: - logger.info(f"Completed job {job_id}.") + logger.info(f"Completed IR extraction job {job_id}.") else: - logger.info(f"Completed job {job_id} with failing tasks.") + logger.info(f"Completed IR extraction job {job_id} with failing tasks.") del active_jobs[job_id] diff --git a/components/package-template/src/etc/clp-config.yml b/components/package-template/src/etc/clp-config.yml index 15fbcc2d3..98759a041 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -78,12 +78,11 @@ # # How much data CLP should try to fit into each segment within an archive # target_segment_size: 268435456 # 256 MB # -## Where IR should be output to +## Where CLP IR files should be output #ir_output: # directory: "var/data/ir" # -# # How large each IR chunk should be before being -# # split into a new IR chunk +# # How large each IR file should be before being split into a new IR file # target_uncompressed_size: 134217728 # 128 MB # ## Location where other data (besides archives) are stored. It will be created if From b5d40ea39cbd61b429aa982da9105250bd5d10af Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 26 Jun 2024 10:39:13 -0400 Subject: [PATCH 21/28] First batch of update --- .../clp-py-utils/clp_py_utils/clp_config.py | 4 ++-- .../executor/query/extract_ir_task.py | 19 +++++++++++-------- .../executor/query/fs_search_task.py | 14 +++++++++----- .../job_orchestration/executor/query/utils.py | 10 +++++----- .../job_orchestration/scheduler/job_config.py | 2 +- .../scheduler/query/query_scheduler.py | 15 +++++++-------- .../package-template/src/etc/clp-config.yml | 4 ++-- docs/src/dev-guide/components-webui.md | 2 +- 8 files changed, 38 insertions(+), 32 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 8ed4b3a8f..d8fed9803 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -254,8 +254,8 @@ def validate_upsert_interval(cls, field): class ResultsCache(BaseModel): host: str = "localhost" port: int = 27017 - db_name: str = "clp-search" - ir_collection_name: str = "clp-ir" + db_name: str = "clp-query-results" + ir_collection_name: str = "ir-files" @validator("host") def validate_host(cls, field): diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py index bb5f222fb..c724ab6b5 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -13,8 +13,11 @@ from job_orchestration.scheduler.job_config import ExtractIrJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus -from .utils import generate_final_task_results, get_logger_file_path, update_query_task_metadata - +from job_orchestration.executor.query.utils import ( + generate_final_task_result, + get_task_log_file_path, + update_query_task_metadata, +) # Setup logging logger = get_task_logger(__name__) @@ -23,8 +26,8 @@ def make_command( storage_engine: str, clp_home: Path, archives_dir: 
Path, - ir_output_dir: Path, archive_id: str, + ir_output_dir: Path, extract_ir_config: ExtractIrJobConfig, results_cache_uri: str, results_collection: str, @@ -41,9 +44,9 @@ def make_command( results_cache_uri, results_collection, ] - if extract_ir_config.target_size is not None: + if extract_ir_config.target_uncompressed_size is not None: command.append("--target-size") - command.append(extract_ir_config.target_size) + command.append(extract_ir_config.target_uncompressed_size) else: raise ValueError(f"Unsupported storage engine {storage_engine}") @@ -71,7 +74,7 @@ def extract_ir( # Setup logging to file set_logging_level(logger, clp_logging_level) - clo_log_path = get_logger_file_path(clp_logs_dir, job_id, task_id) + clo_log_path = get_task_log_file_path(clp_logs_dir, job_id, task_id) clo_log_file = open(clo_log_path, "w") logger.info(f"Started IR extraction task for job {job_id}") @@ -86,8 +89,8 @@ def extract_ir( storage_engine=clp_storage_engine, clp_home=clp_home, archives_dir=archive_directory, - ir_output_dir=ir_directory, archive_id=archive_id, + ir_output_dir=ir_directory, extract_ir_config=extract_ir_config, results_cache_uri=results_cache_uri, results_collection=ir_collection, @@ -147,4 +150,4 @@ def extract_ir( sql_adapter, task_id, dict(status=task_status, start_time=start_time, duration=duration) ) - return generate_final_task_results(task_id, task_status, duration, clo_log_path) + return generate_final_task_result(task_id, task_status, duration, clo_log_path) diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 169aeb774..d16295860 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -15,7 +15,11 @@ from job_orchestration.scheduler.job_config import SearchJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus -from .utils import generate_final_task_results, get_logger_file_path, update_query_task_metadata +from job_orchestration.executor.query.utils import ( + generate_final_task_result, + get_task_log_file_path, + update_query_task_metadata, +) # Setup logging logger = get_task_logger(__name__) @@ -111,7 +115,7 @@ def search( # Setup logging to file set_logging_level(logger, clp_logging_level) - clo_log_path = get_logger_file_path(clp_logs_dir, job_id, task_id) + clo_log_path = get_task_log_file_path(clp_logs_dir, job_id, task_id) clo_log_file = open(clo_log_path, "w") logger.info(f"Started search task for job {job_id}") @@ -151,9 +155,9 @@ def search( error_log_path=str(clo_log_path), ).dict() - search_status = QueryTaskStatus.RUNNING + task_status = QueryTaskStatus.RUNNING update_query_task_metadata( - sql_adapter, task_id, dict(status=search_status, start_time=start_time) + sql_adapter, task_id, dict(status=task_status, start_time=start_time) ) logger.info(f'Running: {" ".join(task_command)}') @@ -199,4 +203,4 @@ def sigterm_handler(_signo, _stack_frame): sql_adapter, task_id, dict(status=task_status, start_time=start_time, duration=duration) ) - return generate_final_task_results(task_id, task_status, duration, clo_log_path) + return generate_final_task_result(task_id, task_status, duration, clo_log_path) diff --git a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py index 
340b617db..1d34ef2c0 100644 --- a/components/job-orchestration/job_orchestration/executor/query/utils.py +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -7,22 +7,22 @@ from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus -def get_logger_file_path(clp_logs_dir: Path, job_id: str, task_id: int) -> Path: +def get_task_log_file_path(clp_logs_dir: Path, job_id: str, task_id: int) -> Path: worker_logs_dir = clp_logs_dir / job_id worker_logs_dir.mkdir(exist_ok=True, parents=True) return worker_logs_dir / f"{task_id}-clo.log" -def generate_final_task_results( - task_id: int, job_status: QueryTaskStatus, duration: float, clo_log_path: Path +def generate_final_task_result( + task_id: int, task_status: QueryTaskStatus, duration: float, clo_log_path: Path ) -> Dict[Any, Any]: task_result = QueryTaskResult( - status=job_status, + status=task_status, task_id=task_id, duration=duration, ) - if QueryTaskStatus.FAILED == job_status: + if QueryTaskStatus.FAILED == task_status: task_result.error_log_path = str(clo_log_path) return task_result.dict() diff --git a/components/job-orchestration/job_orchestration/scheduler/job_config.py b/components/job-orchestration/job_orchestration/scheduler/job_config.py index 6ae820713..e90e2ee7f 100644 --- a/components/job-orchestration/job_orchestration/scheduler/job_config.py +++ b/components/job-orchestration/job_orchestration/scheduler/job_config.py @@ -46,7 +46,7 @@ class ExtractIrJobConfig(QueryJobConfig): orig_file_id: str msg_ix: int file_split_id: typing.Optional[str] = None - target_size: typing.Optional[int] = None + target_uncompressed_size: typing.Optional[int] = None class SearchJobConfig(QueryJobConfig): diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index d51642a7b..f31d4ecd4 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -286,7 +286,7 @@ def get_archive_and_update_config_for_extraction( orig_file_id = extract_ir_config.orig_file_id msg_ix = extract_ir_config.msg_ix - results = get_archive_and_file_split_for_extraction(db_conn, orig_file_id, msg_ix) + results = get_archive_and_file_split_ids(db_conn, orig_file_id, msg_ix) if len(results) == 0: logger.error(f"No matching file splits for orig_file_id={orig_file_id}, msg_ix={msg_ix}") return None @@ -303,7 +303,7 @@ def get_archive_and_update_config_for_extraction( @exception_default_value(default=[]) -def get_archive_and_file_split_for_extraction( +def get_archive_and_file_split_ids( db_conn, orig_file_id: str, msg_ix: int, @@ -432,7 +432,7 @@ def dispatch_job_and_update_db( target_archives: List[str], clp_metadata_db_conn_params: Dict[str, any], results_cache_uri: str, - num_task: int, + num_tasks: int, ) -> None: dispatch_query_job( db_conn, new_job, target_archives, clp_metadata_db_conn_params, results_cache_uri @@ -446,7 +446,7 @@ def dispatch_job_and_update_db( QueryJobStatus.RUNNING, QueryJobStatus.PENDING, start_time=start_time, - num_tasks=num_task, + num_tasks=num_tasks, ) @@ -520,7 +520,6 @@ def handle_pending_query_jobs( db_conn, extract_ir_config ) if not archive_id: - logger.error(f"Failed to get archive for extraction") if not set_job_or_task_status( db_conn, QUERY_JOBS_TABLE_NAME, @@ -717,8 +716,8 @@ async def handle_finished_extract_ir_job( job_id = job.id 
new_job_status = QueryJobStatus.SUCCEEDED - num_task = len(task_results) - if 1 != num_task: + num_tasks = len(task_results) + if 1 != num_tasks: logger.error( f"Unexpected number of tasks for IR extraction job {job_id}. " f"Expected 1, got {num_tasks}." @@ -745,7 +744,7 @@ async def handle_finished_extract_ir_job( job_id, new_job_status, QueryJobStatus.RUNNING, - num_tasks_completed=num_task, + num_tasks_completed=num_tasks, duration=(datetime.datetime.now() - job.start_time).total_seconds(), ): if new_job_status == QueryJobStatus.SUCCEEDED: diff --git a/components/package-template/src/etc/clp-config.yml b/components/package-template/src/etc/clp-config.yml index 98759a041..3f658211e 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -46,8 +46,8 @@ #results_cache: # host: "localhost" # port: 27017 -# db_name: "clp-search" -# ir_collection_name: "clp-ir" +# db_name: "clp-query-results" +# ir_collection_name: "ir-files" # #compression_worker: # logging_level: "INFO" diff --git a/docs/src/dev-guide/components-webui.md b/docs/src/dev-guide/components-webui.md index bf2f76f6f..c5a482e12 100644 --- a/docs/src/dev-guide/components-webui.md +++ b/docs/src/dev-guide/components-webui.md @@ -41,7 +41,7 @@ package: ```shell # Please update `` accordingly. - MONGO_URL="mongodb://localhost:27017/clp-search" \ + MONGO_URL="mongodb://localhost:27017/clp-query-results" \ ROOT_URL="http://localhost:4000" \ CLP_DB_USER="clp-user" \ CLP_DB_PASS="" \ From fb680c076e70f57b397ba0ff9f02776d9fb0c275 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 26 Jun 2024 10:58:17 -0400 Subject: [PATCH 22/28] Another batch of change. DOCSTRING IS NOT READY YET --- .../clp_package_utils/general.py | 1 + .../clp-py-utils/clp_py_utils/clp_config.py | 5 ++++ .../scheduler/query/query_scheduler.py | 30 +++++++++++-------- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 1d46a0ea1..ce0f10309 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -405,6 +405,7 @@ def validate_results_cache_config( def validate_worker_config(clp_config: CLPConfig): clp_config.validate_input_logs_dir() clp_config.validate_archive_output_dir() + clp_config.validate_ir_output_dir() def validate_webui_config( diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index d8fed9803..4cef22548 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -430,6 +430,11 @@ def validate_archive_output_dir(self): validate_path_could_be_dir(self.archive_output.directory) except ValueError as ex: raise ValueError(f"archive_output.directory is invalid: {ex}") + def validate_ir_output_dir(self): + try: + validate_path_could_be_dir(self.ir_output.directory) + except ValueError as ex: + raise ValueError(f"ir_output.directory is invalid: {ex}") def validate_data_dir(self): try: diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index f31d4ecd4..2068ee4a6 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ 
b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -24,7 +24,7 @@ import pathlib import sys from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple import celery import msgpack @@ -279,27 +279,24 @@ def get_archives_for_search( return archives_for_search -def get_archive_and_update_config_for_extraction( +def get_archive_and_file_split_ids_for_extraction( db_conn, extract_ir_config: ExtractIrJobConfig, -) -> Optional[str]: +) -> Tuple[Optional[str], Optional[str]]: orig_file_id = extract_ir_config.orig_file_id msg_ix = extract_ir_config.msg_ix results = get_archive_and_file_split_ids(db_conn, orig_file_id, msg_ix) if len(results) == 0: logger.error(f"No matching file splits for orig_file_id={orig_file_id}, msg_ix={msg_ix}") - return None + return None, None elif len(results) > 1: logger.error(f"Multiple file splits found for orig_file_id={orig_file_id}, msg_ix={msg_ix}") for result in results: logger.error(f"{result['archive_id']}:{result['id']}") - return None + return None, None - file_split_id = results[0]["id"] - archive_id = results[0]["archive_id"] - extract_ir_config.file_split_id = file_split_id - return archive_id + return results[0]["archive_id"], results[0]["file_split_id"] @exception_default_value(default=[]) @@ -308,7 +305,15 @@ def get_archive_and_file_split_ids( orig_file_id: str, msg_ix: int, ): - query = f"""SELECT id, archive_id + """ + TBD + :param job: + :param + :param + :return: + """ + + query = f"""SELECT archive_id, id as file_split_id FROM {CLP_METADATA_TABLE_PREFIX}files WHERE orig_file_id = '{orig_file_id}' AND begin_message_ix <= {msg_ix} AND @@ -516,10 +521,10 @@ def handle_pending_query_jobs( elif QueryJobType.EXTRACT_IR == job_type: extract_ir_config = ExtractIrJobConfig.parse_obj(msgpack.unpackb(job_config)) - archive_id = get_archive_and_update_config_for_extraction( + archive_id, file_split_id = get_archive_and_file_split_ids_for_extraction( db_conn, extract_ir_config ) - if not archive_id: + if not archive_id or not file_split_id: if not set_job_or_task_status( db_conn, QUERY_JOBS_TABLE_NAME, @@ -533,6 +538,7 @@ def handle_pending_query_jobs( logger.error(f"Failed to set job {job_id} as failed") continue + extract_ir_config.file_split_id = file_split_id new_extract_ir_job = ExtractIrJob( id=job_id, archive_id=archive_id, From 76e16b33105e3edb83defa8e495fed0f3a6176f6 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:50:59 -0400 Subject: [PATCH 23/28] Refactor common functions --- .../clp-py-utils/clp_py_utils/clp_config.py | 1 + .../executor/query/extract_ir_task.py | 135 +++++++----------- .../executor/query/fs_search_task.py | 131 +++++------------ .../job_orchestration/executor/query/utils.py | 96 ++++++++++++- 4 files changed, 179 insertions(+), 184 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 4cef22548..0c0ce6893 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -430,6 +430,7 @@ def validate_archive_output_dir(self): validate_path_could_be_dir(self.archive_output.directory) except ValueError as ex: raise ValueError(f"archive_output.directory is invalid: {ex}") + def validate_ir_output_dir(self): try: validate_path_could_be_dir(self.ir_output.directory) diff --git 
a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py index c724ab6b5..3b4965099 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -1,8 +1,7 @@ import datetime import os -import subprocess from pathlib import Path -from typing import Any, Dict +from typing import Any, Dict, List, Optional from celery.app.task import Task from celery.utils.log import get_task_logger @@ -10,14 +9,10 @@ from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app +from job_orchestration.executor.query.utils import generic_run_query_task, report_command_creation_failure from job_orchestration.scheduler.job_config import ExtractIrJobConfig -from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus +from job_orchestration.scheduler.scheduler_data import QueryTaskStatus -from job_orchestration.executor.query.utils import ( - generate_final_task_result, - get_task_log_file_path, - update_query_task_metadata, -) # Setup logging logger = get_task_logger(__name__) @@ -30,11 +25,12 @@ def make_command( ir_output_dir: Path, extract_ir_config: ExtractIrJobConfig, results_cache_uri: str, - results_collection: str, -): + ir_collection: str, +) -> Optional[List[str]]: if StorageEngine.CLP == storage_engine: if not extract_ir_config.file_split_id: - raise ValueError(f"file_split_id not supplied") + logger.error("file_split_id not supplied") + return None command = [ str(clp_home / "bin" / "clo"), "i", @@ -42,13 +38,14 @@ def make_command( extract_ir_config.file_split_id, str(ir_output_dir), results_cache_uri, - results_collection, + ir_collection, ] if extract_ir_config.target_uncompressed_size is not None: command.append("--target-size") command.append(extract_ir_config.target_uncompressed_size) else: - raise ValueError(f"Unsupported storage engine {storage_engine}") + logger.error(f"Unsupported storage engine {storage_engine}") + return None return command @@ -63,91 +60,55 @@ def extract_ir( clp_metadata_db_conn_params: dict, results_cache_uri: str, ) -> Dict[str, Any]: - clp_home = Path(os.getenv("CLP_HOME")) - archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - ir_directory = Path(os.getenv("CLP_IR_OUTPUT_DIR")) - clp_logs_dir = Path(os.getenv("CLP_LOGS_DIR")) - clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) - clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) - - ir_collection = str(os.getenv("CLP_IR_COLLECTION")) + # Task name + TASK_NAME = "IR extraction" # Setup logging to file + clp_logs_dir = Path(os.getenv("CLP_LOGS_DIR")) + clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) set_logging_level(logger, clp_logging_level) - clo_log_path = get_task_log_file_path(clp_logs_dir, job_id, task_id) - clo_log_file = open(clo_log_path, "w") - logger.info(f"Started IR extraction task for job {job_id}") - - extract_ir_config = ExtractIrJobConfig.parse_obj(job_config_obj) - sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) + logger.info(f"Started {TASK_NAME} task for job {job_id}") start_time = datetime.datetime.now() task_status: QueryTaskStatus - try: - task_command = make_command( - storage_engine=clp_storage_engine, - clp_home=clp_home, - archives_dir=archive_directory, - archive_id=archive_id, - 
ir_output_dir=ir_directory, - extract_ir_config=extract_ir_config, - results_cache_uri=results_cache_uri, - results_collection=ir_collection, - ) - except ValueError as e: - error_message = f"Error creating IR extraction command: {e}" - logger.error(error_message) - clo_log_file.write(error_message) - - task_status = QueryTaskStatus.FAILED - update_query_task_metadata( - sql_adapter, - task_id, - dict(status=task_status, duration=0, start_time=start_time), - ) - - clo_log_file.close() - return QueryTaskResult( - task_id=task_id, - status=task_status, - duration=0, - error_log_path=str(clo_log_path), - ).dict() + sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) - task_status = QueryTaskStatus.RUNNING - update_query_task_metadata( - sql_adapter, task_id, dict(status=task_status, start_time=start_time) - ) + # Make task_command + clp_home = Path(os.getenv("CLP_HOME")) + archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) + clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) + ir_output_dir = Path(os.getenv("CLP_IR_OUTPUT_DIR")) + ir_collection = str(os.getenv("CLP_IR_COLLECTION")) + extract_ir_config = ExtractIrJobConfig.parse_obj(job_config_obj) - logger.info(f'Running: {" ".join(task_command)}') - extract_proc = subprocess.Popen( - task_command, - preexec_fn=os.setpgrp, - close_fds=True, - stdout=clo_log_file, - stderr=clo_log_file, + task_command = make_command( + storage_engine=clp_storage_engine, + clp_home=clp_home, + archives_dir=archive_directory, + archive_id=archive_id, + ir_output_dir=ir_output_dir, + extract_ir_config=extract_ir_config, + results_cache_uri=results_cache_uri, + ir_collection=ir_collection, ) - logger.info("Waiting for IR extraction to finish") - # communicate is equivalent to wait in this case, but avoids deadlocks if we switch to piping - # stdout/stderr in the future. 
- extract_proc.communicate() - return_code = extract_proc.returncode - if 0 != return_code: - task_status = QueryTaskStatus.FAILED - logger.error( - f"IR extraction task {task_id} failed for job {job_id} - return_code={return_code}" + if not task_command: + return report_command_creation_failure( + sql_adapter=sql_adapter, + logger=logger, + task_name=TASK_NAME, + task_id=task_id, + start_time=start_time, ) - else: - task_status = QueryTaskStatus.SUCCEEDED - logger.info(f"IR extraction task {task_id} completed for job {job_id}") - - clo_log_file.close() - duration = (datetime.datetime.now() - start_time).total_seconds() - update_query_task_metadata( - sql_adapter, task_id, dict(status=task_status, start_time=start_time, duration=duration) + return generic_run_query_task( + sql_adapter=sql_adapter, + logger=logger, + clp_logs_dir=clp_logs_dir, + task_command=task_command, + task_name=TASK_NAME, + job_id=job_id, + task_id=task_id, + start_time=start_time, ) - - return generate_final_task_result(task_id, task_status, duration, clo_log_path) diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index d16295860..9042c1242 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -1,8 +1,5 @@ import datetime import os -import signal -import subprocess -import sys from pathlib import Path from typing import Any, Dict @@ -12,14 +9,9 @@ from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app +from job_orchestration.executor.query.utils import generic_run_query_task, report_command_creation_failure from job_orchestration.scheduler.job_config import SearchJobConfig -from job_orchestration.scheduler.scheduler_data import QueryTaskResult, QueryTaskStatus - -from job_orchestration.executor.query.utils import ( - generate_final_task_result, - get_task_log_file_path, - update_query_task_metadata, -) +from job_orchestration.scheduler.scheduler_data import QueryTaskStatus # Setup logging logger = get_task_logger(__name__) @@ -107,100 +99,53 @@ def search( clp_metadata_db_conn_params: dict, results_cache_uri: str, ) -> Dict[str, Any]: - clp_home = Path(os.getenv("CLP_HOME")) - archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - clp_logs_dir = Path(os.getenv("CLP_LOGS_DIR")) - clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) - clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) + # Task name + TASK_NAME = "search" # Setup logging to file + clp_logs_dir = Path(os.getenv("CLP_LOGS_DIR")) + clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) set_logging_level(logger, clp_logging_level) - clo_log_path = get_task_log_file_path(clp_logs_dir, job_id, task_id) - clo_log_file = open(clo_log_path, "w") - logger.info(f"Started search task for job {job_id}") + logger.info(f"Started {TASK_NAME} task for job {job_id}") - search_config = SearchJobConfig.parse_obj(job_config_obj) - sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) start_time = datetime.datetime.now() task_status: QueryTaskStatus - try: - task_command = make_command( - storage_engine=clp_storage_engine, - clp_home=clp_home, - archives_dir=archive_directory, - archive_id=archive_id, - search_config=search_config, - results_cache_uri=results_cache_uri, - 
results_collection=job_id, - ) - except ValueError as e: - error_message = f"Error creating search command: {e}" - logger.error(error_message) - clo_log_file.write(error_message) - - task_status = QueryTaskStatus.FAILED - update_query_task_metadata( - sql_adapter, - task_id, - dict(status=task_status, duration=0, start_time=start_time), - ) + sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) - clo_log_file.close() - return QueryTaskResult( - task_id=task_id, - status=task_status, - duration=0, - error_log_path=str(clo_log_path), - ).dict() - - task_status = QueryTaskStatus.RUNNING - update_query_task_metadata( - sql_adapter, task_id, dict(status=task_status, start_time=start_time) - ) + # Make task_command + clp_home = Path(os.getenv("CLP_HOME")) + archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) + clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) + search_config = SearchJobConfig.parse_obj(job_config_obj) - logger.info(f'Running: {" ".join(task_command)}') - search_proc = subprocess.Popen( - task_command, - preexec_fn=os.setpgrp, - close_fds=True, - stdout=clo_log_file, - stderr=clo_log_file, + task_command = make_command( + storage_engine=clp_storage_engine, + clp_home=clp_home, + archives_dir=archive_directory, + archive_id=archive_id, + search_config=search_config, + results_cache_uri=results_cache_uri, + results_collection=str(task_id), ) - def sigterm_handler(_signo, _stack_frame): - logger.debug("Entered sigterm handler") - if search_proc.poll() is None: - logger.debug("Trying to kill search process") - # Kill the process group in case the search process also forked - os.killpg(os.getpgid(search_proc.pid), signal.SIGTERM) - os.waitpid(search_proc.pid, 0) - logger.info(f"Cancelling search task.") - # Add 128 to follow convention for exit codes from signals - # https://tldp.org/LDP/abs/html/exitcodes.html#AEN23549 - sys.exit(_signo + 128) - - # Register the function to kill the child process at exit - signal.signal(signal.SIGTERM, sigterm_handler) - - logger.info("Waiting for search to finish") - # communicate is equivalent to wait in this case, but avoids deadlocks if we switch to piping - # stdout/stderr in the future. 
- search_proc.communicate() - return_code = search_proc.returncode - if 0 != return_code: - task_status = QueryTaskStatus.FAILED - logger.error(f"Search task {task_id} failed for job {job_id} - return_code={return_code}") - else: - task_status = QueryTaskStatus.SUCCEEDED - logger.info(f"Search task {task_id} completed for job {job_id}") - - clo_log_file.close() - duration = (datetime.datetime.now() - start_time).total_seconds() + if not task_command: + return report_command_creation_failure( + sql_adapter=sql_adapter, + logger=logger, + task_name=TASK_NAME, + task_id=task_id, + start_time=start_time, + ) - update_query_task_metadata( - sql_adapter, task_id, dict(status=task_status, start_time=start_time, duration=duration) + return generic_run_query_task( + sql_adapter=sql_adapter, + logger=logger, + clp_logs_dir=clp_logs_dir, + task_command=task_command, + task_name=TASK_NAME, + job_id=job_id, + task_id=task_id, + start_time=start_time, ) - - return generate_final_task_result(task_id, task_status, duration, clo_log_path) diff --git a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py index 1d34ef2c0..b27b407a2 100644 --- a/components/job-orchestration/job_orchestration/executor/query/utils.py +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -1,6 +1,12 @@ +import datetime +import os +import signal +import subprocess +import sys from contextlib import closing +from logging import Logger from pathlib import Path -from typing import Any, Dict +from typing import Any, Dict, List from clp_py_utils.clp_config import QUERY_TASKS_TABLE_NAME from clp_py_utils.sql_adapter import SQL_Adapter @@ -13,9 +19,91 @@ def get_task_log_file_path(clp_logs_dir: Path, job_id: str, task_id: int) -> Pat return worker_logs_dir / f"{task_id}-clo.log" -def generate_final_task_result( - task_id: int, task_status: QueryTaskStatus, duration: float, clo_log_path: Path -) -> Dict[Any, Any]: +def report_command_creation_failure( + sql_adapter: SQL_Adapter, + logger: Logger, + task_name: str, + task_id: int, + start_time: datetime.datetime, +): + logger.error(f"Error creating {task_name} command") + task_status = QueryTaskStatus.FAILED + update_query_task_metadata( + sql_adapter, + task_id, + dict(status=task_status, duration=0, start_time=start_time), + ) + + return QueryTaskResult( + task_id=task_id, + status=task_status, + duration=0, + ).dict() + + +def generic_run_query_task( + sql_adapter: SQL_Adapter, + logger: Logger, + clp_logs_dir: Path, + task_command: List[str], + task_name: str, + job_id: str, + task_id: int, + start_time: datetime.datetime, +): + clo_log_path = get_task_log_file_path(clp_logs_dir, job_id, task_id) + clo_log_file = open(clo_log_path, "w") + + task_status = QueryTaskStatus.RUNNING + update_query_task_metadata( + sql_adapter, task_id, dict(status=task_status, start_time=start_time) + ) + + logger.info(f'Running: {" ".join(task_command)}') + task_proc = subprocess.Popen( + task_command, + preexec_fn=os.setpgrp, + close_fds=True, + stdout=clo_log_file, + stderr=clo_log_file, + ) + + def sigterm_handler(_signo, _stack_frame): + logger.debug("Entered sigterm handler") + if task_proc.poll() is None: + logger.debug(f"Trying to kill {task_name} process") + # Kill the process group in case the search process also forked + os.killpg(os.getpgid(task_proc.pid), signal.SIGTERM) + os.waitpid(task_proc.pid, 0) + logger.info(f"Cancelling {task_name} task.") + # Add 128 to follow convention 
for exit codes from signals + # https://tldp.org/LDP/abs/html/exitcodes.html#AEN23549 + sys.exit(_signo + 128) + + # Register the function to kill the child process at exit + signal.signal(signal.SIGTERM, sigterm_handler) + + logger.info(f"Waiting for {task_name} to finish") + # communicate is equivalent to wait in this case, but avoids deadlocks if we switch to piping + # stdout/stderr in the future. + task_proc.communicate() + return_code = task_proc.returncode + if 0 != return_code: + task_status = QueryTaskStatus.FAILED + logger.error( + f"{task_name} task {task_id} failed for job {job_id} - return_code={return_code}" + ) + else: + task_status = QueryTaskStatus.SUCCEEDED + logger.info(f"{task_name} task {task_id} completed for job {job_id}") + + clo_log_file.close() + duration = (datetime.datetime.now() - start_time).total_seconds() + + update_query_task_metadata( + sql_adapter, task_id, dict(status=task_status, start_time=start_time, duration=duration) + ) + task_result = QueryTaskResult( status=task_status, task_id=task_id, From fe3659902f6af375210dabadef4d039b2052c320 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 26 Jun 2024 16:08:41 -0400 Subject: [PATCH 24/28] Add doc string --- .../scheduler/query/query_scheduler.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index 2068ee4a6..d8b7a7481 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -306,11 +306,15 @@ def get_archive_and_file_split_ids( msg_ix: int, ): """ - TBD - :param job: - :param - :param - :return: + Fetches the id of the file split id and the id of the archive containing + the file split based on the following criteria: + - 1. The file split's original file id = orig_file_id + - 2. The file split includes the message with index = msg_ix + :param db_conn: + :param orig_file_id: original file id of the split + :param msg_ix: message index the file split must include + :return: A list of (archive id, file split id) on success. An empty list if + an exception occurs while interacting with the database. 
""" query = f"""SELECT archive_id, id as file_split_id From 0529b377cbe162d25a8d7639be2a56dff79b4297 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 26 Jun 2024 16:17:07 -0400 Subject: [PATCH 25/28] Linter --- .../job_orchestration/executor/query/extract_ir_task.py | 5 ++++- .../job_orchestration/executor/query/fs_search_task.py | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py index 3b4965099..afced3a6e 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -9,7 +9,10 @@ from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app -from job_orchestration.executor.query.utils import generic_run_query_task, report_command_creation_failure +from job_orchestration.executor.query.utils import ( + generic_run_query_task, + report_command_creation_failure, +) from job_orchestration.scheduler.job_config import ExtractIrJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 9042c1242..c273f07d9 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -9,7 +9,10 @@ from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app -from job_orchestration.executor.query.utils import generic_run_query_task, report_command_creation_failure +from job_orchestration.executor.query.utils import ( + generic_run_query_task, + report_command_creation_failure, +) from job_orchestration.scheduler.job_config import SearchJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -109,7 +112,6 @@ def search( logger.info(f"Started {TASK_NAME} task for job {job_id}") - start_time = datetime.datetime.now() task_status: QueryTaskStatus sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) From bd7083f028a91d0fe1dd151a8817470425b51963 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Thu, 27 Jun 2024 20:16:22 -0400 Subject: [PATCH 26/28] Update docstring. --- .../scheduler/query/query_scheduler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py index d8b7a7481..015480662 100644 --- a/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py @@ -306,13 +306,13 @@ def get_archive_and_file_split_ids( msg_ix: int, ): """ - Fetches the id of the file split id and the id of the archive containing - the file split based on the following criteria: - - 1. The file split's original file id = orig_file_id - - 2. 
The file split includes the message with index = msg_ix + Fetches the IDs of the file split and the archive containing the file split based on the + following criteria: + 1. The file split's original file id = `orig_file_id` + 2. The file split includes the message with index = `msg_ix` :param db_conn: - :param orig_file_id: original file id of the split - :param msg_ix: message index the file split must include + :param orig_file_id: Original file id of the split + :param msg_ix: Index of the message that the file split must include :return: A list of (archive id, file split id) on success. An empty list if an exception occurs while interacting with the database. """ From 4a4d5b2ba947c78d949da980c5bc1244bab48caa Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Thu, 27 Jun 2024 20:23:17 -0400 Subject: [PATCH 27/28] Rename generic_run_query_task to run_query_task; Make search_task's make_command exception safe; Fix some PEP violations; Some clean-up. --- .../executor/query/extract_ir_task.py | 14 ++++++------- .../executor/query/fs_search_task.py | 21 +++++++++---------- .../job_orchestration/executor/query/utils.py | 8 +++---- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py index afced3a6e..c74dfab37 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -10,8 +10,8 @@ from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - generic_run_query_task, report_command_creation_failure, + run_query_task, ) from job_orchestration.scheduler.job_config import ExtractIrJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -63,15 +63,14 @@ def extract_ir( clp_metadata_db_conn_params: dict, results_cache_uri: str, ) -> Dict[str, Any]: - # Task name - TASK_NAME = "IR extraction" + task_name = "IR extraction" # Setup logging to file clp_logs_dir = Path(os.getenv("CLP_LOGS_DIR")) clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) set_logging_level(logger, clp_logging_level) - logger.info(f"Started {TASK_NAME} task for job {job_id}") + logger.info(f"Started {task_name} task for job {job_id}") start_time = datetime.datetime.now() task_status: QueryTaskStatus @@ -95,22 +94,21 @@ def extract_ir( results_cache_uri=results_cache_uri, ir_collection=ir_collection, ) - if not task_command: return report_command_creation_failure( sql_adapter=sql_adapter, logger=logger, - task_name=TASK_NAME, + task_name=task_name, task_id=task_id, start_time=start_time, ) - return generic_run_query_task( + return run_query_task( sql_adapter=sql_adapter, logger=logger, clp_logs_dir=clp_logs_dir, task_command=task_command, - task_name=TASK_NAME, + task_name=task_name, job_id=job_id, task_id=task_id, start_time=start_time, diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index c273f07d9..cfebe27c0 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -1,7 +1,7 @@ import datetime import os from 
pathlib import Path -from typing import Any, Dict +from typing import Any, Dict, List, Optional from celery.app.task import Task from celery.utils.log import get_task_logger @@ -10,8 +10,8 @@ from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - generic_run_query_task, report_command_creation_failure, + run_query_task, ) from job_orchestration.scheduler.job_config import SearchJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -28,7 +28,7 @@ def make_command( search_config: SearchJobConfig, results_cache_uri: str, results_collection: str, -): +) -> Optional[List[str]]: if StorageEngine.CLP == storage_engine: command = [str(clp_home / "bin" / "clo"), "s", str(archives_dir / archive_id)] if search_config.path_filter is not None: @@ -43,7 +43,8 @@ def make_command( archive_id, ] else: - raise ValueError(f"Unsupported storage engine {storage_engine}") + logger.error(f"Unsupported storage engine {storage_engine}") + return None command.append(search_config.query_string) if search_config.begin_timestamp is not None: @@ -102,15 +103,14 @@ def search( clp_metadata_db_conn_params: dict, results_cache_uri: str, ) -> Dict[str, Any]: - # Task name - TASK_NAME = "search" + task_name = "search" # Setup logging to file clp_logs_dir = Path(os.getenv("CLP_LOGS_DIR")) clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) set_logging_level(logger, clp_logging_level) - logger.info(f"Started {TASK_NAME} task for job {job_id}") + logger.info(f"Started {task_name} task for job {job_id}") start_time = datetime.datetime.now() task_status: QueryTaskStatus @@ -131,22 +131,21 @@ def search( results_cache_uri=results_cache_uri, results_collection=str(task_id), ) - if not task_command: return report_command_creation_failure( sql_adapter=sql_adapter, logger=logger, - task_name=TASK_NAME, + task_name=task_name, task_id=task_id, start_time=start_time, ) - return generic_run_query_task( + return run_query_task( sql_adapter=sql_adapter, logger=logger, clp_logs_dir=clp_logs_dir, task_command=task_command, - task_name=TASK_NAME, + task_name=task_name, job_id=job_id, task_id=task_id, start_time=start_time, diff --git a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py index b27b407a2..69d22398e 100644 --- a/components/job-orchestration/job_orchestration/executor/query/utils.py +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -41,7 +41,7 @@ def report_command_creation_failure( ).dict() -def generic_run_query_task( +def run_query_task( sql_adapter: SQL_Adapter, logger: Logger, clp_logs_dir: Path, @@ -72,7 +72,7 @@ def sigterm_handler(_signo, _stack_frame): logger.debug("Entered sigterm handler") if task_proc.poll() is None: logger.debug(f"Trying to kill {task_name} process") - # Kill the process group in case the search process also forked + # Kill the process group in case the task process also forked os.killpg(os.getpgid(task_proc.pid), signal.SIGTERM) os.waitpid(task_proc.pid, 0) logger.info(f"Cancelling {task_name} task.") @@ -84,8 +84,8 @@ def sigterm_handler(_signo, _stack_frame): signal.signal(signal.SIGTERM, sigterm_handler) logger.info(f"Waiting for {task_name} to finish") - # communicate is equivalent to wait in this case, but avoids deadlocks if we switch to piping - # stdout/stderr in the future. 
+ # `communicate` is equivalent to `wait` in this case, but avoids deadlocks if we switch to + # piping stdout/stderr in the future. task_proc.communicate() return_code = task_proc.returncode if 0 != return_code: From ba51583b081db50153381b94b7965259756023f3 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 27 Jun 2024 21:09:14 -0400 Subject: [PATCH 28/28] remove str conversion for the envvars since they were already string by default --- .../job_orchestration/executor/query/extract_ir_task.py | 6 +++--- .../job_orchestration/executor/query/fs_search_task.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py index c74dfab37..b04b809f3 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_ir_task.py @@ -67,7 +67,7 @@ def extract_ir( # Setup logging to file clp_logs_dir = Path(os.getenv("CLP_LOGS_DIR")) - clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) + clp_logging_level = os.getenv("CLP_LOGGING_LEVEL") set_logging_level(logger, clp_logging_level) logger.info(f"Started {task_name} task for job {job_id}") @@ -79,9 +79,9 @@ def extract_ir( # Make task_command clp_home = Path(os.getenv("CLP_HOME")) archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) + clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE") ir_output_dir = Path(os.getenv("CLP_IR_OUTPUT_DIR")) - ir_collection = str(os.getenv("CLP_IR_COLLECTION")) + ir_collection = os.getenv("CLP_IR_COLLECTION") extract_ir_config = ExtractIrJobConfig.parse_obj(job_config_obj) task_command = make_command( diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index cfebe27c0..baafca3e2 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -107,7 +107,7 @@ def search( # Setup logging to file clp_logs_dir = Path(os.getenv("CLP_LOGS_DIR")) - clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) + clp_logging_level = os.getenv("CLP_LOGGING_LEVEL") set_logging_level(logger, clp_logging_level) logger.info(f"Started {task_name} task for job {job_id}") @@ -119,7 +119,7 @@ def search( # Make task_command clp_home = Path(os.getenv("CLP_HOME")) archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) + clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE") search_config = SearchJobConfig.parse_obj(job_config_obj) task_command = make_command(