diff --git a/.github/workflows/clp-pr-title-checks.yaml b/.github/workflows/clp-pr-title-checks.yaml index 428e9f21d..1c8ced072 100644 --- a/.github/workflows/clp-pr-title-checks.yaml +++ b/.github/workflows/clp-pr-title-checks.yaml @@ -2,9 +2,16 @@ name: "clp-pr-title-checks" on: pull_request_target: + # NOTE: Workflows triggered by this event give the workflow access to secrets and grant the + # `GITHUB_TOKEN` read/write repository access by default. So we need to ensure: + # - This workflow doesn't inadvertently check out, build, or execute untrusted code from the + # pull request triggered by this event. + # - Each job has `permissions` set to only those necessary. types: ["edited", "opened", "reopened"] branches: ["main"] +permissions: {} + concurrency: group: "${{github.workflow}}-${{github.ref}}" diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 5fae8166f..60f1053f8 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -20,7 +20,9 @@ REDIS_COMPONENT_NAME, REDUCER_COMPONENT_NAME, RESULTS_CACHE_COMPONENT_NAME, + StorageType, WEBUI_COMPONENT_NAME, + WorkerConfig, ) from clp_py_utils.core import ( get_config_value, @@ -239,17 +241,17 @@ def generate_container_config( DockerMountType.BIND, clp_config.logs_directory, container_clp_config.logs_directory ) - container_clp_config.archive_output.directory = pathlib.Path("/") / "mnt" / "archive-output" + container_clp_config.archive_output.set_directory(pathlib.Path("/") / "mnt" / "archive-output") if not is_path_already_mounted( clp_home, CONTAINER_CLP_HOME, - clp_config.archive_output.directory, - container_clp_config.archive_output.directory, + clp_config.archive_output.get_directory(), + container_clp_config.archive_output.get_directory(), ): docker_mounts.archives_output_dir = DockerMount( DockerMountType.BIND, - clp_config.archive_output.directory, - container_clp_config.archive_output.directory, + clp_config.archive_output.get_directory(), + container_clp_config.archive_output.get_directory(), ) container_clp_config.stream_output.directory = pathlib.Path("/") / "mnt" / "stream-output" @@ -268,6 +270,18 @@ def generate_container_config( return container_clp_config, docker_mounts +def generate_worker_config(clp_config: CLPConfig) -> WorkerConfig: + worker_config = WorkerConfig() + worker_config.package = clp_config.package.copy(deep=True) + worker_config.archive_output = clp_config.archive_output.copy(deep=True) + worker_config.data_directory = clp_config.data_directory + + worker_config.stream_output_dir = clp_config.stream_output.directory + worker_config.stream_collection_name = clp_config.results_cache.stream_collection_name + + return worker_config + + def dump_container_config( container_clp_config: CLPConfig, clp_config: CLPConfig, container_name: str ) -> Tuple[pathlib.Path, pathlib.Path]: @@ -482,7 +496,7 @@ def validate_results_cache_config( def validate_worker_config(clp_config: CLPConfig): clp_config.validate_input_logs_dir() - clp_config.validate_archive_output_dir() + clp_config.validate_archive_output_config() clp_config.validate_stream_output_dir() diff --git a/components/clp-package-utils/clp_package_utils/scripts/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/decompress.py index 325f2add6..092c339a6 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/decompress.py +++ 
b/components/clp-package-utils/clp_package_utils/scripts/decompress.py @@ -5,7 +5,7 @@ import sys from typing import Optional -from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.clp_config import CLPConfig, StorageType from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -81,6 +81,11 @@ def handle_extract_file_cmd( if clp_config is None: return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"File extraction is not supported for archive storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.FILE_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( @@ -156,6 +161,13 @@ def handle_extract_stream_cmd( if clp_config is None: return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error( + f"Stream extraction is not supported for archive storage type: {storage_type}." + ) + return -1 + container_name = generate_container_name(str(JobType.IR_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( diff --git a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py index 54d959771..5b9bc6d97 100644 --- a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py +++ b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py @@ -4,6 +4,8 @@ import sys from pathlib import Path +from clp_py_utils.clp_config import StorageType + from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, dump_container_config, @@ -57,6 +59,11 @@ def main(argv): logger.exception("Failed to load config.") return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Archive deletion is not supported for storage type: {storage_type}.") + return -1 + # Validate the input timestamp begin_ts = parsed_args.begin_ts end_ts = parsed_args.end_ts diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py index d16cdcb6f..7e3c7da6e 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py @@ -167,7 +167,7 @@ def validate_and_load_config_file( """ try: clp_config = load_config_file(config_file_path, default_config_file_path, clp_home) - clp_config.validate_archive_output_dir() + clp_config.validate_archive_output_config() clp_config.validate_logs_dir() return clp_config except Exception: @@ -207,7 +207,7 @@ def handle_extract_file_cmd( list_path = parsed_args.files_from logs_dir = clp_config.logs_directory - archives_dir = clp_config.archive_output.directory + archives_dir = clp_config.archive_output.get_directory() # Generate database config file for clp db_config_file_path = logs_dir / f".decompress-db-config-{uuid.uuid4()}.yml" diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py index 735bf299d..c489c3806 100644 --- 
a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py @@ -54,7 +54,7 @@ def main(argv): return -1 database_config = clp_config.database - archives_dir = clp_config.archive_output.directory + archives_dir = clp_config.archive_output.get_directory() if not archives_dir.exists(): logger.error("`archive_output.directory` doesn't exist.") return -1 diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/search.py b/components/clp-package-utils/clp_package_utils/scripts/native/search.py index d166cf35f..d292656a7 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/search.py @@ -83,24 +83,31 @@ def create_and_monitor_job_in_db( logger.error(f"job {job_id} finished with unexpected status: {job_status}") -async def worker_connection_handler(reader: asyncio.StreamReader, writer: asyncio.StreamWriter): - try: - unpacker = msgpack.Unpacker() - while True: - # Read some data from the worker and feed it to msgpack - buf = await reader.read(1024) - if b"" == buf: - # Worker closed - return - unpacker.feed(buf) +def get_worker_connection_handler(raw_output: bool): + async def worker_connection_handler(reader: asyncio.StreamReader, writer: asyncio.StreamWriter): + try: + unpacker = msgpack.Unpacker() + while True: + # Read some data from the worker and feed it to msgpack + buf = await reader.read(1024) + if b"" == buf: + # Worker closed + return + unpacker.feed(buf) - # Print out any messages we can decode in the form of ORIG_PATH: MSG - for unpacked in unpacker: - print(f"{unpacked[2]}: {unpacked[1]}", end="") - except asyncio.CancelledError: - return - finally: - writer.close() + # Print out any messages we can decode in the form of ORIG_PATH: MSG, or simply MSG + # if raw output is enabled. 
+ for unpacked in unpacker: + if raw_output: + print(f"{unpacked[1]}", end="") + else: + print(f"{unpacked[2]}: {unpacked[1]}", end="") + except asyncio.CancelledError: + return + finally: + writer.close() + + return worker_connection_handler async def do_search_without_aggregation( @@ -112,6 +119,7 @@ async def do_search_without_aggregation( end_timestamp: int | None, ignore_case: bool, path_filter: str | None, + raw_output: bool, ): ip_list = socket.gethostbyname_ex(socket.gethostname())[2] if len(ip_list) == 0: @@ -125,7 +133,7 @@ async def do_search_without_aggregation( break server = await asyncio.start_server( - client_connected_cb=worker_connection_handler, + client_connected_cb=get_worker_connection_handler(raw_output), host=host, port=0, family=socket.AF_INET, @@ -184,6 +192,7 @@ async def do_search( path_filter: str | None, do_count_aggregation: bool | None, count_by_time_bucket_size: int | None, + raw_output: bool, ): if do_count_aggregation is None and count_by_time_bucket_size is None: await do_search_without_aggregation( @@ -195,6 +204,7 @@ async def do_search( end_timestamp, ignore_case, path_filter, + raw_output, ) else: await run_function_in_process( @@ -226,12 +236,12 @@ def main(argv): args_parser.add_argument( "--begin-time", type=int, - help="Time range filter lower-bound (inclusive) as milliseconds" " from the UNIX epoch.", + help="Time range filter lower-bound (inclusive) as milliseconds from the UNIX epoch.", ) args_parser.add_argument( "--end-time", type=int, - help="Time range filter upper-bound (inclusive) as milliseconds" " from the UNIX epoch.", + help="Time range filter upper-bound (inclusive) as milliseconds from the UNIX epoch.", ) args_parser.add_argument( "--ignore-case", @@ -250,6 +260,9 @@ def main(argv): type=int, help="Count the number of results in each time span of the given size (ms).", ) + args_parser.add_argument( + "--raw", action="store_true", help="Output the search results as raw logs." + ) parsed_args = args_parser.parse_args(argv[1:]) if ( @@ -281,6 +294,7 @@ def main(argv): parsed_args.file_path, parsed_args.count, parsed_args.count_by_time, + parsed_args.raw, ) ) except asyncio.CancelledError: diff --git a/components/clp-package-utils/clp_package_utils/scripts/search.py b/components/clp-package-utils/clp_package_utils/scripts/search.py index beb7fb0b0..c01fb64b5 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/search.py @@ -7,6 +7,7 @@ import uuid import yaml +from clp_py_utils.clp_config import StorageType from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -41,12 +42,12 @@ def main(argv): args_parser.add_argument( "--begin-time", type=int, - help="Time range filter lower-bound (inclusive) as milliseconds" " from the UNIX epoch.", + help="Time range filter lower-bound (inclusive) as milliseconds from the UNIX epoch.", ) args_parser.add_argument( "--end-time", type=int, - help="Time range filter upper-bound (inclusive) as milliseconds" " from the UNIX epoch.", + help="Time range filter upper-bound (inclusive) as milliseconds from the UNIX epoch.", ) args_parser.add_argument( "--ignore-case", @@ -60,6 +61,9 @@ def main(argv): type=int, help="Count the number of results in each time span of the given size (ms).", ) + args_parser.add_argument( + "--raw", action="store_true", help="Output the search results as raw logs." 
+ ) parsed_args = args_parser.parse_args(argv[1:]) # Validate and load config file @@ -74,6 +78,11 @@ def main(argv): logger.exception("Failed to load config.") return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Search is not supported for archive storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.SEARCH)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) @@ -113,6 +122,8 @@ def main(argv): if parsed_args.count_by_time is not None: search_cmd.append("--count-by-time") search_cmd.append(str(parsed_args.count_by_time)) + if parsed_args.raw: + search_cmd.append("--raw") cmd = container_start_cmd + search_cmd subprocess.run(cmd, check=True) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 8097929f1..6de3174ff 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -29,6 +29,7 @@ REDIS_COMPONENT_NAME, REDUCER_COMPONENT_NAME, RESULTS_CACHE_COMPONENT_NAME, + StorageType, WEBUI_COMPONENT_NAME, ) from job_orchestration.scheduler.constants import QueueName @@ -42,6 +43,7 @@ DockerMount, DockerMountType, generate_container_config, + generate_worker_config, get_clp_home, is_container_exited, is_container_running, @@ -626,6 +628,7 @@ def start_compression_worker( ): celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" + compression_worker_mounts = [mounts.archives_output_dir] generic_start_worker( COMPRESSION_WORKER_COMPONENT_NAME, instance_id, @@ -637,8 +640,7 @@ def start_compression_worker( clp_config.redis.compression_backend_database, num_cpus, mounts, - None, - None, + compression_worker_mounts, ) @@ -652,11 +654,9 @@ def start_query_worker( celery_method = "job_orchestration.executor.query" celery_route = f"{QueueName.QUERY}" - query_worker_mount = [mounts.stream_output_dir] - query_worker_env = { - "CLP_STREAM_OUTPUT_DIR": container_clp_config.stream_output.directory, - "CLP_STREAM_COLLECTION_NAME": clp_config.results_cache.stream_collection_name, - } + query_worker_mounts = [mounts.stream_output_dir] + if clp_config.archive_output.storage.type == StorageType.FS: + query_worker_mounts.append(mounts.archives_output_dir) generic_start_worker( QUERY_WORKER_COMPONENT_NAME, @@ -669,8 +669,7 @@ def start_query_worker( clp_config.redis.query_backend_database, num_cpus, mounts, - query_worker_env, - query_worker_mount, + query_worker_mounts, ) @@ -685,8 +684,7 @@ def generic_start_worker( redis_database: int, num_cpus: int, mounts: CLPDockerMounts, - worker_specific_env: Dict[str, Any], - worker_specific_mount: List[Optional[DockerMount]], + worker_specific_mount: Optional[List[Optional[DockerMount]]], ): logger.info(f"Starting {component_name}...") @@ -694,14 +692,18 @@ def generic_start_worker( if container_exists(container_name): return - validate_worker_config(clp_config) + container_config_filename = f"{container_name}.yml" + container_config_file_path = clp_config.logs_directory / container_config_filename + container_worker_config = generate_worker_config(container_clp_config) + with open(container_config_file_path, "w") as f: + yaml.safe_dump(container_worker_config.dump_to_primitive_dict(), f) logs_dir = clp_config.logs_directory / component_name logs_dir.mkdir(parents=True, exist_ok=True) 
container_logs_dir = container_clp_config.logs_directory / component_name # Create necessary directories - clp_config.archive_output.directory.mkdir(parents=True, exist_ok=True) + clp_config.archive_output.get_directory().mkdir(parents=True, exist_ok=True) clp_config.stream_output.directory.mkdir(parents=True, exist_ok=True) clp_site_packages_dir = CONTAINER_CLP_HOME / "lib" / "python3" / "site-packages" @@ -724,24 +726,17 @@ def generic_start_worker( f"{container_clp_config.redis.host}:{container_clp_config.redis.port}/{redis_database}" ), "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", - "-e", f"CLP_DATA_DIR={container_clp_config.data_directory}", - "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}", + "-e", f"CLP_CONFIG_PATH={container_clp_config.logs_directory / container_config_filename}", "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", - "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", "-u", f"{os.getuid()}:{os.getgid()}", ] - if worker_specific_env: - for env_name, env_value in worker_specific_env.items(): - container_start_cmd.append("-e") - container_start_cmd.append(f"{env_name}={env_value}") - # fmt: on + necessary_mounts = [ mounts.clp_home, mounts.data_dir, mounts.logs_dir, - mounts.archives_output_dir, mounts.input_logs_dir, ] if worker_specific_mount: @@ -1125,6 +1120,12 @@ def main(argv): QUERY_WORKER_COMPONENT_NAME, ): validate_and_load_redis_credentials_file(clp_config, clp_home, True) + if target in ( + ALL_TARGET_NAME, + COMPRESSION_WORKER_COMPONENT_NAME, + QUERY_WORKER_COMPONENT_NAME, + ): + validate_worker_config(clp_config) clp_config.validate_data_dir() clp_config.validate_logs_dir() diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 79a94505d..f59de7647 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -1,10 +1,10 @@ import pathlib -import typing from enum import auto +from typing import Literal, Optional, Union from dotenv import dotenv_values from pydantic import BaseModel, PrivateAttr, validator -from strenum import KebabCaseStrEnum +from strenum import KebabCaseStrEnum, LowercaseStrEnum from .clp_logging import get_valid_logging_level, is_valid_logging_level from .core import ( @@ -48,6 +48,11 @@ class StorageEngine(KebabCaseStrEnum): CLP_S = auto() +class StorageType(LowercaseStrEnum): + FS = auto() + S3 = auto() + + VALID_STORAGE_ENGINES = [storage_engine.value for storage_engine in StorageEngine] @@ -69,12 +74,12 @@ class Database(BaseModel): host: str = "localhost" port: int = 3306 name: str = "clp-db" - ssl_cert: typing.Optional[str] = None + ssl_cert: Optional[str] = None auto_commit: bool = False compress: bool = True - username: typing.Optional[str] = None - password: typing.Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None @validator("type") def validate_database_type(cls, field): @@ -227,7 +232,7 @@ class Redis(BaseModel): query_backend_database: int = 0 compression_backend_database: int = 1 # redis can perform authentication without a username - password: typing.Optional[str] + password: Optional[str] @validator("host") def validate_host(cls, field): @@ -300,12 +305,80 @@ class Queue(BaseModel): host: str = "localhost" port: int = 5672 - username: typing.Optional[str] - password: typing.Optional[str] + username: Optional[str] + password: Optional[str] -class 
ArchiveOutput(BaseModel): +class S3Config(BaseModel): + region_code: str + bucket: str + key_prefix: str + + access_key_id: Optional[str] = None + secret_access_key: Optional[str] = None + + @validator("region_code") + def validate_region_code(cls, field): + if field == "": + raise ValueError("region_code cannot be empty") + return field + + @validator("bucket") + def validate_bucket(cls, field): + if field == "": + raise ValueError("bucket cannot be empty") + return field + + @validator("key_prefix") + def validate_key_prefix(cls, field): + if field == "": + raise ValueError("key_prefix cannot be empty") + if not field.endswith("/"): + raise ValueError('key_prefix must end with "/"') + return field + + +class FsStorage(BaseModel): + type: Literal[StorageType.FS.value] = StorageType.FS.value directory: pathlib.Path = pathlib.Path("var") / "data" / "archives" + + @validator("directory") + def validate_directory(cls, field): + if "" == field: + raise ValueError("directory cannot be empty") + return field + + def make_config_paths_absolute(self, clp_home: pathlib.Path): + self.directory = make_config_path_absolute(clp_home, self.directory) + + def dump_to_primitive_dict(self): + d = self.dict() + d["directory"] = str(d["directory"]) + return d + + +class S3Storage(BaseModel): + type: Literal[StorageType.S3.value] = StorageType.S3.value + staging_directory: pathlib.Path = pathlib.Path("var") / "data" / "staged_archives" + s3_config: S3Config + + @validator("staging_directory") + def validate_staging_directory(cls, field): + if "" == field: + raise ValueError("staging_directory cannot be empty") + return field + + def make_config_paths_absolute(self, clp_home: pathlib.Path): + self.staging_directory = make_config_path_absolute(clp_home, self.staging_directory) + + def dump_to_primitive_dict(self): + d = self.dict() + d["staging_directory"] = str(d["staging_directory"]) + return d + + +class ArchiveOutput(BaseModel): + storage: Union[FsStorage, S3Storage] = FsStorage() target_archive_size: int = 256 * 1024 * 1024 # 256 MB target_dictionaries_size: int = 32 * 1024 * 1024 # 32 MB target_encoded_file_size: int = 256 * 1024 * 1024 # 256 MB @@ -335,13 +408,30 @@ def validate_target_segment_size(cls, field): raise ValueError("target_segment_size must be greater than 0") return field - def make_config_paths_absolute(self, clp_home: pathlib.Path): - self.directory = make_config_path_absolute(clp_home, self.directory) + def set_directory(self, directory: pathlib.Path): + storage_config = self.storage + storage_type = storage_config.type + if StorageType.FS == storage_type: + storage_config.directory = directory + elif StorageType.S3 == storage_type: + storage_config.staging_directory = directory + else: + raise NotImplementedError(f"storage.type {storage_type} is not supported") + + def get_directory(self) -> pathlib.Path: + storage_config = self.storage + storage_type = storage_config.type + if StorageType.FS == storage_config.type: + return storage_config.directory + elif StorageType.S3 == storage_type: + return storage_config.staging_directory + else: + raise NotImplementedError(f"storage.type {storage_type} is not supported") def dump_to_primitive_dict(self): d = self.dict() # Turn directory (pathlib.Path) into a primitive string - d["directory"] = str(d["directory"]) + d["storage"] = self.storage.dump_to_primitive_dict() return d @@ -352,7 +442,7 @@ class StreamOutput(BaseModel): @validator("directory") def validate_directory(cls, field): if "" == field: - raise ValueError("directory can not be 
empty") + raise ValueError("directory cannot be empty") return field @validator("target_uncompressed_size") @@ -408,7 +498,7 @@ def validate_port(cls, field): class CLPConfig(BaseModel): - execution_container: typing.Optional[str] + execution_container: Optional[str] = None input_logs_directory: pathlib.Path = pathlib.Path("/") @@ -436,7 +526,7 @@ class CLPConfig(BaseModel): def make_config_paths_absolute(self, clp_home: pathlib.Path): self.input_logs_directory = make_config_path_absolute(clp_home, self.input_logs_directory) self.credentials_file_path = make_config_path_absolute(clp_home, self.credentials_file_path) - self.archive_output.make_config_paths_absolute(clp_home) + self.archive_output.storage.make_config_paths_absolute(clp_home) self.stream_output.make_config_paths_absolute(clp_home) self.data_directory = make_config_path_absolute(clp_home, self.data_directory) self.logs_directory = make_config_path_absolute(clp_home, self.logs_directory) @@ -451,11 +541,19 @@ def validate_input_logs_dir(self): if not input_logs_dir.is_dir(): raise ValueError(f"input_logs_directory '{input_logs_dir}' is not a directory.") - def validate_archive_output_dir(self): + def validate_archive_output_config(self): + if ( + StorageType.S3 == self.archive_output.storage.type + and StorageEngine.CLP_S != self.package.storage_engine + ): + raise ValueError( + f"archive_output.storage.type = 's3' is only supported with package.storage_engine" + f" = '{StorageEngine.CLP_S}'" + ) try: - validate_path_could_be_dir(self.archive_output.directory) + validate_path_could_be_dir(self.archive_output.get_directory()) except ValueError as ex: - raise ValueError(f"archive_output.directory is invalid: {ex}") + raise ValueError(f"archive_output.storage's directory is invalid: {ex}") def validate_stream_output_dir(self): try: @@ -537,3 +635,23 @@ def dump_to_primitive_dict(self): d["data_directory"] = str(self.data_directory) d["logs_directory"] = str(self.logs_directory) return d + + +class WorkerConfig(BaseModel): + package: Package = Package() + archive_output: ArchiveOutput = ArchiveOutput() + data_directory: pathlib.Path = CLPConfig().data_directory + + # Only needed by query workers. 
+ stream_output_dir: pathlib.Path = StreamOutput().directory + stream_collection_name: str = ResultsCache().stream_collection_name + + def dump_to_primitive_dict(self): + d = self.dict() + d["archive_output"] = self.archive_output.dump_to_primitive_dict() + + # Turn paths into primitive strings + d["data_directory"] = str(self.data_directory) + d["stream_output_dir"] = str(self.stream_output_dir) + + return d diff --git a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py index 1ed727367..2c8133e8a 100644 --- a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py +++ b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py @@ -52,7 +52,7 @@ def main(argv): CREATE TABLE IF NOT EXISTS `{COMPRESSION_JOBS_TABLE_NAME}` ( `id` INT NOT NULL AUTO_INCREMENT, `status` INT NOT NULL DEFAULT '{CompressionJobStatus.PENDING}', - `status_msg` VARCHAR(255) NOT NULL DEFAULT '', + `status_msg` VARCHAR(512) NOT NULL DEFAULT '', `creation_time` DATETIME(3) NOT NULL DEFAULT CURRENT_TIMESTAMP(3), `start_time` DATETIME(3) NULL DEFAULT NULL, `duration` FLOAT NULL DEFAULT NULL, diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py new file mode 100644 index 000000000..03717a445 --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -0,0 +1,51 @@ +from pathlib import Path + +import boto3 +from botocore.config import Config +from botocore.exceptions import ClientError +from result import Err, Ok, Result + +from clp_py_utils.clp_config import S3Config + + +def s3_put( + s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 +) -> Result[bool, str]: + """ + Uploads a local file to an S3 bucket using AWS's PutObject operation. + :param s3_config: S3 configuration specifying the upload destination and credentials. + :param src_file: Local file to upload. + :param dest_file_name: The name for the uploaded file in the S3 bucket. + :param total_max_attempts: Maximum number of retry attempts for the upload. + :return: Result.OK(bool) on success, or Result.Err(str) with the error message otherwise. 
+ """ + if not src_file.exists(): + return Err(f"{src_file} doesn't exist") + if not src_file.is_file(): + return Err(f"{src_file} is not a file") + if src_file.stat().st_size > 5 * 1024 * 1024 * 1024: + return Err(f"{src_file} is larger than the limit (5GiB) for a single PutObject operation.") + + config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) + + my_s3_client = boto3.client( + "s3", + region_name=s3_config.region_code, + aws_access_key_id=s3_config.access_key_id, + aws_secret_access_key=s3_config.secret_access_key, + config=config, + ) + + with open(src_file, "rb") as file_data: + try: + my_s3_client.put_object( + Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name + ) + except ClientError as e: + error_code = e.response["Error"]["Code"] + error_message = e.response["Error"]["Message"] + return Err(f"ClientError: {error_code} - {error_message}") + except Exception as e: + return Err(f"An unexpected error occurred: {e}") + + return Ok(True) diff --git a/components/clp-py-utils/pyproject.toml b/components/clp-py-utils/pyproject.toml index 4e827b926..6d68ceebe 100644 --- a/components/clp-py-utils/pyproject.toml +++ b/components/clp-py-utils/pyproject.toml @@ -10,6 +10,7 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" +boto3 = "^1.35.81" # mariadb version must be compatible with libmariadev installed in runtime env. # See https://mariadb.com/docs/server/connect/programming-languages/python/install/#Dependencies mariadb = "~1.0.11" @@ -19,6 +20,7 @@ python-dotenv = "^1.0.1" python-Levenshtein = "~0.22" sqlalchemy = "~2.0" PyYAML = "^6.0.1" +result = "^0.17.0" StrEnum = "^0.4.15" [build-system] diff --git a/components/core/.clang-format b/components/core/.clang-format index ff65adbae..4d0d3a87c 100644 --- a/components/core/.clang-format +++ b/components/core/.clang-format @@ -4,7 +4,7 @@ IncludeCategories: # NOTE: A header is grouped by first matching regex # Library headers. Update when adding new libraries. # NOTE: clang-format retains leading white-space on a line in violation of the YAML spec. 
- - Regex: "<(absl|antlr4|archive|boost|bsoncxx|catch2|curl|date|fmt|json|log_surgeon|mongocxx\ + - Regex: "<(absl|antlr4|archive|boost|bsoncxx|catch2|curl|date|fmt|json|log_surgeon|lzma|mongocxx\ |msgpack|mysql|openssl|outcome|regex_utils|simdjson|spdlog|sqlite3|string_utils|yaml-cpp|zstd)" Priority: 3 # C system headers diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 265f6cad6..a19e1ff18 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -11,8 +11,11 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # Set general compressor set(GENERAL_COMPRESSOR "zstd" CACHE STRING "The general-purpose compressor used as the 2nd-stage compressor") -set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS passthrough zstd) -if ("${GENERAL_COMPRESSOR}" STREQUAL "passthrough") +set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS lzma passthrough zstd) +if ("${GENERAL_COMPRESSOR}" STREQUAL "lzma") + add_definitions(-DUSE_LZMA_COMPRESSION=1) + message(STATUS "Using Lempel–Ziv–Markov chain Algorithm compression") +elseif ("${GENERAL_COMPRESSOR}" STREQUAL "passthrough") add_definitions(-DUSE_PASSTHROUGH_COMPRESSION=1) message(STATUS "Using passthrough compression") elseif ("${GENERAL_COMPRESSOR}" STREQUAL "zstd") @@ -98,7 +101,7 @@ endif() if(CLP_USE_STATIC_LIBS) set(Boost_USE_STATIC_LIBS ON) endif() -find_package(Boost 1.74 REQUIRED iostreams program_options filesystem system regex) +find_package(Boost 1.81 REQUIRED iostreams program_options filesystem system regex url) if(Boost_FOUND) message(STATUS "Found Boost ${Boost_VERSION}") else() @@ -224,6 +227,21 @@ else() message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for ZStd") endif() +# Find and setup LZMA Library +# TODO: Add a script in ./cmake/Modules to properly import LZMA in find_package()'s module mode +if(CLP_USE_STATIC_LIBS) + set(LIBLZMA_USE_STATIC_LIBS ON) +endif() +find_package(LibLZMA REQUIRED) +if(LIBLZMA_FOUND) + message(STATUS "Found Lzma ${LIBLZMA_VERSION_STRING}") + message(STATUS "Lzma library location: ${LIBLZMA_LIBRARIES}") + message(STATUS "Lzma Include Dir: ${LIBLZMA_INCLUDE_DIRS}") +else() + message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for Lzma") +endif() +include_directories(${LIBLZMA_INCLUDE_DIRS}) + # sqlite dependencies set(sqlite_DYNAMIC_LIBS "dl;m;pthread") include(cmake/Modules/FindLibraryDependencies.cmake) @@ -334,6 +352,8 @@ set(SOURCE_FILES_unitTest src/clp/aws/AwsAuthenticationSigner.cpp src/clp/aws/AwsAuthenticationSigner.hpp src/clp/aws/constants.hpp + src/clp/BoundedReader.cpp + src/clp/BoundedReader.hpp src/clp/BufferedFileReader.cpp src/clp/BufferedFileReader.hpp src/clp/BufferReader.cpp @@ -384,6 +404,8 @@ set(SOURCE_FILES_unitTest src/clp/ffi/ir_stream/decoding_methods.inc src/clp/ffi/ir_stream/encoding_methods.cpp src/clp/ffi/ir_stream/encoding_methods.hpp + src/clp/ffi/ir_stream/IrErrorCode.cpp + src/clp/ffi/ir_stream/IrErrorCode.hpp src/clp/ffi/ir_stream/IrUnitHandlerInterface.hpp src/clp/ffi/ir_stream/IrUnitType.hpp src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp @@ -514,6 +536,9 @@ set(SOURCE_FILES_unitTest src/clp/streaming_compression/Compressor.hpp src/clp/streaming_compression/Constants.hpp src/clp/streaming_compression/Decompressor.hpp + src/clp/streaming_compression/lzma/Compressor.cpp + src/clp/streaming_compression/lzma/Compressor.hpp + src/clp/streaming_compression/lzma/Constants.hpp src/clp/streaming_compression/passthrough/Compressor.cpp src/clp/streaming_compression/passthrough/Compressor.hpp 
src/clp/streaming_compression/passthrough/Decompressor.cpp @@ -550,6 +575,7 @@ set(SOURCE_FILES_unitTest submodules/sqlite3/sqlite3ext.h tests/LogSuppressor.hpp tests/test-Array.cpp + tests/test-BoundedReader.cpp tests/test-BufferedFileReader.cpp tests/test-clp_s-end_to_end.cpp tests/test-EncodedVariableInterpreter.cpp @@ -605,6 +631,7 @@ target_link_libraries(unitTest clp::regex_utils clp::string_utils yaml-cpp::yaml-cpp + ${LIBLZMA_LIBRARIES} ZStd::ZStd ) target_compile_features(unitTest diff --git a/components/core/src/clp/BoundedReader.cpp b/components/core/src/clp/BoundedReader.cpp new file mode 100644 index 000000000..9bca08f71 --- /dev/null +++ b/components/core/src/clp/BoundedReader.cpp @@ -0,0 +1,43 @@ +#include "BoundedReader.hpp" + +#include + +#include "ErrorCode.hpp" + +namespace clp { +auto BoundedReader::try_seek_from_begin(size_t pos) -> ErrorCode { + auto const next_pos = pos > m_bound ? m_bound : pos; + if (auto const rc = m_reader->try_seek_from_begin(next_pos); ErrorCode_Success != rc) { + m_curr_pos = ErrorCode_EndOfFile == rc ? next_pos : m_curr_pos; + return rc; + } + m_curr_pos = next_pos; + if (m_curr_pos >= m_bound) { + return ErrorCode_EndOfFile; + } + return ErrorCode_Success; +} + +auto BoundedReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) + -> ErrorCode { + if (m_curr_pos == m_bound) { + num_bytes_read = 0; + return ErrorCode_EndOfFile; + } + + if ((m_curr_pos + num_bytes_to_read) > m_bound) { + num_bytes_to_read = m_bound - m_curr_pos; + } + + auto const rc = m_reader->try_read(buf, num_bytes_to_read, num_bytes_read); + m_curr_pos += num_bytes_read; + if (ErrorCode_EndOfFile == rc) { + if (0 == num_bytes_read) { + return ErrorCode_EndOfFile; + } + } else if (ErrorCode_Success != rc) { + return rc; + } + return ErrorCode_Success; +} +} // namespace clp diff --git a/components/core/src/clp/BoundedReader.hpp b/components/core/src/clp/BoundedReader.hpp new file mode 100644 index 000000000..cfcb07422 --- /dev/null +++ b/components/core/src/clp/BoundedReader.hpp @@ -0,0 +1,89 @@ +#ifndef CLP_BOUNDEDREADER_HPP +#define CLP_BOUNDEDREADER_HPP + +#include +#include + +#include "ErrorCode.hpp" +#include "ReaderInterface.hpp" + +namespace clp { +/** + * BoundedReader is a ReaderInterface designed to wrap other ReaderInterfaces and prevent users + * from reading or seeking beyond a certain point in the underlying input stream. + * + * This is useful when the underlying input stream is divided into several logical segments and we + * want to prevent a reader for an earlier segment consuming any bytes from a later segment. In + * particular, reading part of a later segment may force the reader for that later segment to seek + * backwards, which can be either inefficient or impossible for certain kinds of input streams. + */ +class BoundedReader : public ReaderInterface { +public: + // Constructor + explicit BoundedReader(ReaderInterface* reader, size_t bound) + : m_reader{reader}, + m_bound{bound} { + if (nullptr == m_reader) { + throw ReaderInterface::OperationFailed(ErrorCode_BadParam, __FILE__, __LINE__); + } + m_curr_pos = m_reader->get_pos(); + if (m_curr_pos > m_bound) { + throw ReaderInterface::OperationFailed(ErrorCode_BadParam, __FILE__, __LINE__); + } + } + + // Methods implementing the ReaderInterface + /** + * Tries to get the current position of the read head in the underlying reader. 
+ * @param pos Returns the position of the underlying reader's head + * @return ErrorCode_Success on success + * @return ErrorCode_errno on failure + */ + [[nodiscard]] auto try_get_pos(size_t& pos) -> ErrorCode override { + return m_reader->try_get_pos(pos); + } + + /** + * Tries to seek to the given position, limited by the bound. + * @param pos + * @return ErrorCode_Success on success + * @return ErrorCode_EndOfFile on EOF or if trying to seek beyond the checkpoint + * @return ErrorCode_errno on failure + */ + [[nodiscard]] auto try_seek_from_begin(size_t pos) -> ErrorCode override; + + /** + * Tries to read up to a given number of bytes from the file, limited by the bound. + * @param buf + * @param num_bytes_to_read The number of bytes to try and read + * @param num_bytes_read The actual number of bytes read + * @return ErrorCode_errno on error + * @return ErrorCode_EndOfFile on EOF or trying to read after hitting checkpoint + * @return ErrorCode_Success on success + */ + [[nodiscard]] auto + try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) -> ErrorCode override; + + /** + * This function is unsupported because BoundedReader can not delegate to a potentially + * efficient implementation in the underlying reader, as the underlying reader's implementation + * will not respect the bound. + * @return ErrorCode_Unsupported + */ + [[nodiscard]] auto try_read_to_delimiter( + [[maybe_unused]] char delim, + [[maybe_unused]] bool keep_delimiter, + [[maybe_unused]] bool append, + [[maybe_unused]] std::string& str + ) -> ErrorCode override { + return ErrorCode_Unsupported; + } + +private: + ReaderInterface* m_reader{nullptr}; + size_t m_bound{}; + size_t m_curr_pos{}; +}; +} // namespace clp + +#endif // CLP_BOUNDEDREADER_HPP diff --git a/components/core/src/clp/StringReader.cpp b/components/core/src/clp/StringReader.cpp index 9fa2c27d3..8dd0a3793 100644 --- a/components/core/src/clp/StringReader.cpp +++ b/components/core/src/clp/StringReader.cpp @@ -41,6 +41,10 @@ ErrorCode StringReader::try_read(char* buf, size_t num_bytes_to_read, size_t& nu } ErrorCode StringReader::try_seek_from_begin(size_t pos) { + if (pos > input_string.size()) { + this->pos = input_string.size(); + return ErrorCode_EndOfFile; + } this->pos = pos; return ErrorCode_Success; } diff --git a/components/core/src/clp/clo/CommandLineArguments.cpp b/components/core/src/clp/clo/CommandLineArguments.cpp index fffc3d783..f0a7f7ecc 100644 --- a/components/core/src/clp/clo/CommandLineArguments.cpp +++ b/components/core/src/clp/clo/CommandLineArguments.cpp @@ -8,8 +8,8 @@ #include +#include "../../reducer/types.hpp" #include "../cli_utils.hpp" -#include "../reducer/types.hpp" #include "../spdlog_with_specializations.hpp" #include "../version.hpp" @@ -181,10 +181,6 @@ auto CommandLineArguments::parse_ir_extraction_arguments( // clang-format off options_ir_extraction .add_options()( - "temp-output-dir", - po::value(&m_ir_temp_output_dir)->value_name("DIR"), - "Temporary output directory for IR chunks while they're being written" - )( "target-size", po::value(&m_ir_target_size)->value_name("SIZE"), "Target size (B) for each IR chunk before a new chunk is created" @@ -287,10 +283,6 @@ auto CommandLineArguments::parse_ir_extraction_arguments( if (m_ir_mongodb_collection.empty()) { throw invalid_argument("COLLECTION not specified or empty."); } - - if (m_ir_temp_output_dir.empty()) { - m_ir_temp_output_dir = m_ir_output_dir; - } return ParsingResult::Success; } diff --git 
a/components/core/src/clp/clo/CommandLineArguments.hpp b/components/core/src/clp/clo/CommandLineArguments.hpp index 9e6d311c3..d84b96a18 100644 --- a/components/core/src/clp/clo/CommandLineArguments.hpp +++ b/components/core/src/clp/clo/CommandLineArguments.hpp @@ -54,10 +54,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { [[nodiscard]] auto get_ir_output_dir() const -> std::string const& { return m_ir_output_dir; } - [[nodiscard]] auto get_ir_temp_output_dir() const -> std::string const& { - return m_ir_temp_output_dir; - } - [[nodiscard]] auto get_ir_mongodb_uri() const -> std::string const& { return m_ir_mongodb_uri; } [[nodiscard]] auto get_ir_mongodb_collection() const -> std::string const& { @@ -187,7 +183,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { std::string m_file_split_id; size_t m_ir_target_size{128ULL * 1024 * 1024}; std::string m_ir_output_dir; - std::string m_ir_temp_output_dir; std::string m_ir_mongodb_uri; std::string m_ir_mongodb_collection; diff --git a/components/core/src/clp/clo/OutputHandler.cpp b/components/core/src/clp/clo/OutputHandler.cpp index bdf1bb1bd..1d92777c5 100644 --- a/components/core/src/clp/clo/OutputHandler.cpp +++ b/components/core/src/clp/clo/OutputHandler.cpp @@ -100,7 +100,7 @@ ErrorCode ResultsCacheOutputHandler::flush() { try { m_results.emplace_back(std::move(bsoncxx::builder::basic::make_document( bsoncxx::builder::basic::kvp( - cResultsCacheKeys::OrigFileId, + cResultsCacheKeys::SearchOutput::OrigFileId, std::move(result.orig_file_id) ), bsoncxx::builder::basic::kvp( diff --git a/components/core/src/clp/clo/clo.cpp b/components/core/src/clp/clo/clo.cpp index f29df0306..d62049e6b 100644 --- a/components/core/src/clp/clo/clo.cpp +++ b/components/core/src/clp/clo/clo.cpp @@ -171,7 +171,7 @@ bool extract_ir(CommandLineArguments const& command_line_args) { string const& orig_file_id, size_t begin_message_ix, size_t end_message_ix, - bool is_last_ir_chunk) { + bool is_last_chunk) { auto dest_ir_file_name = orig_file_id; dest_ir_file_name += "_" + std::to_string(begin_message_ix); dest_ir_file_name += "_" + std::to_string(end_message_ix); @@ -195,13 +195,9 @@ bool extract_ir(CommandLineArguments const& command_line_args) { dest_ir_file_name ), bsoncxx::builder::basic::kvp( - clp::clo::cResultsCacheKeys::OrigFileId, + clp::clo::cResultsCacheKeys::IrOutput::StreamId, orig_file_id ), - bsoncxx::builder::basic::kvp( - clp::clo::cResultsCacheKeys::IrOutput::FileSplitId, - file_split_id - ), bsoncxx::builder::basic::kvp( clp::clo::cResultsCacheKeys::IrOutput::BeginMsgIx, static_cast(begin_message_ix) @@ -211,8 +207,8 @@ bool extract_ir(CommandLineArguments const& command_line_args) { static_cast(end_message_ix) ), bsoncxx::builder::basic::kvp( - clp::clo::cResultsCacheKeys::IrOutput::IsLastIrChunk, - is_last_ir_chunk + clp::clo::cResultsCacheKeys::IrOutput::IsLastChunk, + is_last_chunk ) ))); return true; @@ -224,7 +220,7 @@ bool extract_ir(CommandLineArguments const& command_line_args) { archive_reader, *file_metadata_ix_ptr, command_line_args.get_ir_target_size(), - command_line_args.get_ir_temp_output_dir(), + command_line_args.get_ir_output_dir(), ir_output_handler )) { diff --git a/components/core/src/clp/clo/constants.hpp b/components/core/src/clp/clo/constants.hpp index 86f7313f2..945bde83e 100644 --- a/components/core/src/clp/clo/constants.hpp +++ b/components/core/src/clp/clo/constants.hpp @@ -3,17 +3,16 @@ // NOLINTBEGIN(cppcoreguidelines-avoid-c-arrays, readability-identifier-naming) namespace 
clp::clo::cResultsCacheKeys { -constexpr char OrigFileId[]{"orig_file_id"}; - namespace IrOutput { constexpr char Path[]{"path"}; -constexpr char FileSplitId[]{"file_split_id"}; +constexpr char StreamId[]{"stream_id"}; constexpr char BeginMsgIx[]{"begin_msg_ix"}; constexpr char EndMsgIx[]{"end_msg_ix"}; -constexpr char IsLastIrChunk[]{"is_last_ir_chunk"}; +constexpr char IsLastChunk[]{"is_last_chunk"}; } // namespace IrOutput namespace SearchOutput { +constexpr char OrigFileId[]{"orig_file_id"}; constexpr char OrigFilePath[]{"orig_file_path"}; constexpr char LogEventIx[]{"log_event_ix"}; constexpr char Timestamp[]{"timestamp"}; diff --git a/components/core/src/clp/clp/CommandLineArguments.cpp b/components/core/src/clp/clp/CommandLineArguments.cpp index b1aad7a9e..4b3111267 100644 --- a/components/core/src/clp/clp/CommandLineArguments.cpp +++ b/components/core/src/clp/clp/CommandLineArguments.cpp @@ -255,13 +255,6 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { ->default_value(m_ir_target_size), "Target size (B) for each IR chunk before a new chunk is created" ); - options_ir.add_options()( - "temp-output-dir", - po::value(&m_ir_temp_output_dir) - ->value_name("DIR") - ->default_value(m_ir_temp_output_dir), - "Temporary output directory for IR chunks while they're being written" - ); po::options_description all_ir_options; all_ir_options.add(ir_positional_options); @@ -311,10 +304,6 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { if (m_orig_file_id.empty()) { throw invalid_argument("ORIG_FILE_ID cannot be empty."); } - - if (m_ir_temp_output_dir.empty()) { - m_ir_temp_output_dir = m_output_dir; - } } else if (Command::Compress == m_command) { // Define compression hidden positional options po::options_description compression_positional_options; diff --git a/components/core/src/clp/clp/CommandLineArguments.hpp b/components/core/src/clp/clp/CommandLineArguments.hpp index 7e42a1243..49aa1d866 100644 --- a/components/core/src/clp/clp/CommandLineArguments.hpp +++ b/components/core/src/clp/clp/CommandLineArguments.hpp @@ -38,8 +38,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { std::string const& get_path_prefix_to_remove() const { return m_path_prefix_to_remove; } - std::string const& get_ir_temp_output_dir() const { return m_ir_temp_output_dir; } - std::string const& get_output_dir() const { return m_output_dir; } std::string const& get_schema_file_path() const { return m_schema_file_path; } @@ -96,7 +94,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { size_t m_ir_msg_ix{0}; size_t m_ir_target_size{128ULL * 1024 * 1024}; bool m_sort_input_files; - std::string m_ir_temp_output_dir; std::string m_output_dir; std::string m_schema_file_path; bool m_show_progress; diff --git a/components/core/src/clp/clp/FileDecompressor.hpp b/components/core/src/clp/clp/FileDecompressor.hpp index 932cab7c5..17a8b8e43 100644 --- a/components/core/src/clp/clp/FileDecompressor.hpp +++ b/components/core/src/clp/clp/FileDecompressor.hpp @@ -6,17 +6,17 @@ #include #include +#include "../ErrorCode.hpp" #include "../FileWriter.hpp" #include "../ir/constants.hpp" #include "../ir/LogEventSerializer.hpp" +#include "../ir/types.hpp" #include "../spdlog_with_specializations.hpp" #include "../streaming_archive/MetadataDB.hpp" #include "../streaming_archive/reader/Archive.hpp" #include "../streaming_archive/reader/File.hpp" #include "../streaming_archive/reader/Message.hpp" -#include "ErrorCode.hpp" -#include "ir/types.hpp" -#include 
"Utils.hpp" +#include "../Utils.hpp" namespace clp::clp { /** @@ -39,7 +39,7 @@ class FileDecompressor { * * @tparam IrOutputHandler Function to handle the resulting IR chunks. * Signature: (std::filesystem::path const& ir_file_path, string const& orig_file_id, - * size_t begin_message_ix, size_t end_message_ix, bool is_last_ir_chunk) -> bool; + * size_t begin_message_ix, size_t end_message_ix, bool is_last_chunk) -> bool; * The function returns whether it succeeded. * @param archive_reader * @param file_metadata_ix diff --git a/components/core/src/clp/clp/decompression.cpp b/components/core/src/clp/clp/decompression.cpp index 6b87f6777..ce7cbd5c7 100644 --- a/components/core/src/clp/clp/decompression.cpp +++ b/components/core/src/clp/clp/decompression.cpp @@ -7,12 +7,12 @@ #include "../FileWriter.hpp" #include "../GlobalMySQLMetadataDB.hpp" #include "../GlobalSQLiteMetadataDB.hpp" +#include "../ir/constants.hpp" #include "../spdlog_with_specializations.hpp" #include "../streaming_archive/reader/Archive.hpp" #include "../TraceableException.hpp" #include "../Utils.hpp" #include "FileDecompressor.hpp" -#include "ir/constants.hpp" #include "utils.hpp" using std::cerr; @@ -282,7 +282,7 @@ bool decompress_to_ir(CommandLineArguments& command_line_args) { string const& orig_file_id, size_t begin_message_ix, size_t end_message_ix, - [[maybe_unused]] bool is_last_ir_chunk) { + [[maybe_unused]] bool is_last_chunk) { auto dest_ir_file_name = orig_file_id; dest_ir_file_name += "_" + std::to_string(begin_message_ix); dest_ir_file_name += "_" + std::to_string(end_message_ix); @@ -310,7 +310,7 @@ bool decompress_to_ir(CommandLineArguments& command_line_args) { archive_reader, *file_metadata_ix_ptr, command_line_args.get_ir_target_size(), - command_line_args.get_ir_temp_output_dir(), + command_line_args.get_output_dir(), ir_output_handler )) { diff --git a/components/core/src/clp/clp/utils.cpp b/components/core/src/clp/clp/utils.cpp index 0f05d75ac..123f9a836 100644 --- a/components/core/src/clp/clp/utils.cpp +++ b/components/core/src/clp/clp/utils.cpp @@ -9,9 +9,9 @@ #include "../GlobalMySQLMetadataDB.hpp" #include "../GlobalSQLiteMetadataDB.hpp" #include "../spdlog_with_specializations.hpp" +#include "../streaming_archive/Constants.hpp" +#include "../TraceableException.hpp" #include "../Utils.hpp" -#include "streaming_archive/Constants.hpp" -#include "TraceableException.hpp" using std::string; using std::vector; diff --git a/components/core/src/clp/clp/utils.hpp b/components/core/src/clp/clp/utils.hpp index 0a6918445..47adc50f2 100644 --- a/components/core/src/clp/clp/utils.hpp +++ b/components/core/src/clp/clp/utils.hpp @@ -7,11 +7,11 @@ #include +#include "../ErrorCode.hpp" #include "../GlobalMetadataDB.hpp" #include "../GlobalMetadataDBConfig.hpp" -#include "ErrorCode.hpp" +#include "../TraceableException.hpp" #include "FileToCompress.hpp" -#include "TraceableException.hpp" namespace clp::clp { // Types diff --git a/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp b/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp index a8a8cf617..8e8bb15f5 100644 --- a/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp +++ b/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp @@ -153,6 +153,20 @@ node_type_matches_value_type(SchemaTree::Node::Type type, Value const& value) -> KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs ) -> bool; +/** + * @param node_id_value_pairs + * @param schema_tree + * @return A result containing a bitmap where every bit corresponds to the ID of a node in the + 
* schema tree, and the set bits correspond to the nodes in the subtree defined by all paths from + * the root node to the nodes in `node_id_value_pairs`; or an error code indicating a failure: + * - std::errc::result_out_of_range if a node ID in `node_id_value_pairs` doesn't exist in the + * schema tree. + */ +[[nodiscard]] auto get_schema_subtree_bitmap( + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + SchemaTree const& schema_tree +) -> OUTCOME_V2_NAMESPACE::std_result>; + /** * Inserts the given key-value pair into the JSON object (map). * @param node The schema tree node of the key to insert. @@ -175,6 +189,34 @@ node_type_matches_value_type(SchemaTree::Node::Type type, Value const& value) -> */ [[nodiscard]] auto decode_as_encoded_text_ast(Value const& val) -> std::optional; +/** + * Serializes the given node-ID-value pairs into a `nlohmann::json` object. + * @param schema_tree + * @param node_id_value_pairs + * @param schema_subtree_bitmap + * @return A result containing the serialized JSON object or an error code indicating the failure: + * - std::errc::protocol_error if a value in the log event couldn't be decoded, or it couldn't be + * inserted into a JSON object. + */ +[[nodiscard]] auto serialize_node_id_value_pairs_to_json( + SchemaTree const& schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + vector const& schema_subtree_bitmap +) -> OUTCOME_V2_NAMESPACE::std_result; + +/** + * @param node A non-root schema tree node. + * @param parent_node_id_to_key_names + * @return true if `node`'s key is unique among its sibling nodes with `parent_node_id_to_key_names` + * updated to keep track of this unique key name. + * @return false if a sibling of `node` has the same key. + */ +[[nodiscard]] auto check_key_uniqueness_among_sibling_nodes( + SchemaTree::Node const& node, + std::unordered_map>& + parent_node_id_to_key_names +) -> bool; + auto node_type_matches_value_type(SchemaTree::Node::Type type, Value const& value) -> bool { switch (type) { case SchemaTree::Node::Type::Obj: @@ -202,6 +244,7 @@ auto validate_node_id_value_pairs( try { std::unordered_map> parent_node_id_to_key_names; + std::vector key_duplication_checked_node_id_bitmap(schema_tree.get_size(), false); for (auto const& [node_id, value] : node_id_value_pairs) { auto const& node{schema_tree.get_node(node_id)}; if (node.is_root()) { @@ -226,20 +269,38 @@ auto validate_node_id_value_pairs( return std::errc::operation_not_permitted; } - // We checked that the node isn't the root above, so we can query the underlying ID - // safely without a repeated check. - auto const parent_node_id{node.get_parent_id_unsafe()}; - auto const key_name{node.get_key_name()}; - if (parent_node_id_to_key_names.contains(parent_node_id)) { - auto const [it, new_key_inserted]{ - parent_node_id_to_key_names.at(parent_node_id).emplace(key_name) - }; - if (false == new_key_inserted) { - // The key is duplicated under the same parent + if (false + == check_key_uniqueness_among_sibling_nodes(node, parent_node_id_to_key_names)) + { + return std::errc::protocol_not_supported; + } + + // Iteratively check if there's any key duplication in the node's ancestors until: + // 1. The ancestor has already been checked. We only need to check an ancestor node + // once since if there are key duplications among its siblings, it would've been + // caught when the sibling was first checked (the order in which siblings get checked + // doesn't affect the results). + // 2. We reach the root node. 
+ auto next_ancestor_node_id_to_check{node.get_parent_id_unsafe()}; + while (false == key_duplication_checked_node_id_bitmap[next_ancestor_node_id_to_check]) + { + auto const& node_to_check{schema_tree.get_node(next_ancestor_node_id_to_check)}; + if (node_to_check.is_root()) { + key_duplication_checked_node_id_bitmap[node_to_check.get_id()] = true; + break; + } + + if (false + == check_key_uniqueness_among_sibling_nodes( + node_to_check, + parent_node_id_to_key_names + )) + { return std::errc::protocol_not_supported; } - } else { - parent_node_id_to_key_names.emplace(parent_node_id, std::unordered_set{key_name}); + + key_duplication_checked_node_id_bitmap[next_ancestor_node_id_to_check] = true; + next_ancestor_node_id_to_check = node_to_check.get_parent_id_unsafe(); } } } catch (SchemaTree::OperationFailed const& ex) { @@ -269,6 +330,38 @@ auto is_leaf_node( return true; } +auto get_schema_subtree_bitmap( + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + SchemaTree const& schema_tree +) -> OUTCOME_V2_NAMESPACE::std_result> { + vector schema_subtree_bitmap(schema_tree.get_size(), false); + for (auto const& [node_id, val] : node_id_value_pairs) { + if (node_id >= schema_subtree_bitmap.size()) { + return std::errc::result_out_of_range; + } + schema_subtree_bitmap[node_id] = true; + + // Iteratively mark the parents as true + auto optional_parent_id{schema_tree.get_node(node_id).get_parent_id()}; + while (true) { + // Ideally, we'd use this if statement as the loop condition, but clang-tidy will + // complain about an unchecked `optional` access. + if (false == optional_parent_id.has_value()) { + // Reached the root + break; + } + auto const parent_id{optional_parent_id.value()}; + if (schema_subtree_bitmap[parent_id]) { + // Parent already set by other child + break; + } + schema_subtree_bitmap[parent_id] = true; + optional_parent_id = schema_tree.get_node(parent_id).get_parent_id(); + } + } + return schema_subtree_bitmap; +} + auto insert_kv_pair_into_json_obj( SchemaTree::Node const& node, std::optional const& optional_val, @@ -332,54 +425,13 @@ auto decode_as_encoded_text_ast(Value const& val) -> std::optional { ? val.get_immutable_view().decode_and_unparse() : val.get_immutable_view().decode_and_unparse(); } -} // namespace - -auto KeyValuePairLogEvent::create( - std::shared_ptr schema_tree, - NodeIdValuePairs node_id_value_pairs, - UtcOffset utc_offset -) -> OUTCOME_V2_NAMESPACE::std_result { - if (auto const ret_val{validate_node_id_value_pairs(*schema_tree, node_id_value_pairs)}; - std::errc{} != ret_val) - { - return ret_val; - } - return KeyValuePairLogEvent{std::move(schema_tree), std::move(node_id_value_pairs), utc_offset}; -} - -auto KeyValuePairLogEvent::get_schema_subtree_bitmap( -) const -> OUTCOME_V2_NAMESPACE::std_result> { - auto schema_subtree_bitmap{vector(m_schema_tree->get_size(), false)}; - for (auto const& [node_id, val] : m_node_id_value_pairs) { - if (node_id >= schema_subtree_bitmap.size()) { - return std::errc::result_out_of_range; - } - schema_subtree_bitmap[node_id] = true; - - // Iteratively mark the parents as true - auto optional_parent_id{m_schema_tree->get_node(node_id).get_parent_id()}; - while (true) { - // Ideally, we'd use this if statement as the loop condition, but clang-tidy will - // complain about an unchecked `optional` access. 
- if (false == optional_parent_id.has_value()) { - // Reached the root - break; - } - auto const parent_id{optional_parent_id.value()}; - if (schema_subtree_bitmap[parent_id]) { - // Parent already set by other child - break; - } - schema_subtree_bitmap[parent_id] = true; - optional_parent_id = m_schema_tree->get_node(parent_id).get_parent_id(); - } - } - return schema_subtree_bitmap; -} -auto KeyValuePairLogEvent::serialize_to_json( -) const -> OUTCOME_V2_NAMESPACE::std_result { - if (m_node_id_value_pairs.empty()) { +auto serialize_node_id_value_pairs_to_json( + SchemaTree const& schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + vector const& schema_subtree_bitmap +) -> OUTCOME_V2_NAMESPACE::std_result { + if (node_id_value_pairs.empty()) { return nlohmann::json::object(); } @@ -393,12 +445,6 @@ auto KeyValuePairLogEvent::serialize_to_json( // vector grows). std::stack dfs_stack; - auto const schema_subtree_bitmap_ret{get_schema_subtree_bitmap()}; - if (schema_subtree_bitmap_ret.has_error()) { - return schema_subtree_bitmap_ret.error(); - } - auto const& schema_subtree_bitmap{schema_subtree_bitmap_ret.value()}; - // Traverse the schema tree in DFS order, but only traverse the nodes that are set in // `schema_subtree_bitmap`. // @@ -408,7 +454,7 @@ auto KeyValuePairLogEvent::serialize_to_json( // // On the way up, add the current node's `nlohmann::json::object_t` to the parent's // `nlohmann::json::object_t`. - auto const& root_schema_tree_node{m_schema_tree->get_root()}; + auto const& root_schema_tree_node{schema_tree.get_root()}; auto root_json_obj = nlohmann::json::object_t(); dfs_stack.emplace( @@ -424,13 +470,13 @@ auto KeyValuePairLogEvent::serialize_to_json( continue; } auto const child_schema_tree_node_id{top.get_next_child_schema_tree_node()}; - auto const& child_schema_tree_node{m_schema_tree->get_node(child_schema_tree_node_id)}; - if (m_node_id_value_pairs.contains(child_schema_tree_node_id)) { + auto const& child_schema_tree_node{schema_tree.get_node(child_schema_tree_node_id)}; + if (node_id_value_pairs.contains(child_schema_tree_node_id)) { // Handle leaf node if (false == insert_kv_pair_into_json_obj( child_schema_tree_node, - m_node_id_value_pairs.at(child_schema_tree_node_id), + node_id_value_pairs.at(child_schema_tree_node_id), top.get_json_obj() )) { @@ -452,4 +498,109 @@ auto KeyValuePairLogEvent::serialize_to_json( return root_json_obj; } + +auto check_key_uniqueness_among_sibling_nodes( + SchemaTree::Node const& node, + std::unordered_map>& + parent_node_id_to_key_names +) -> bool { + // The caller checks that the given node is not the root, so we can query the underlying + // parent ID safely without a check. 
+ auto const parent_node_id{node.get_parent_id_unsafe()}; + auto const key_name{node.get_key_name()}; + auto const parent_node_id_to_key_names_it{parent_node_id_to_key_names.find(parent_node_id)}; + if (parent_node_id_to_key_names_it != parent_node_id_to_key_names.end()) { + auto const [it, new_key_inserted]{parent_node_id_to_key_names_it->second.emplace(key_name)}; + if (false == new_key_inserted) { + // The key is duplicated under the same parent + return false; + } + } else { + parent_node_id_to_key_names.emplace(parent_node_id, std::unordered_set{key_name}); + } + return true; +} +} // namespace + +auto KeyValuePairLogEvent::create( + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + NodeIdValuePairs auto_gen_node_id_value_pairs, + NodeIdValuePairs user_gen_node_id_value_pairs, + UtcOffset utc_offset +) -> OUTCOME_V2_NAMESPACE::std_result { + if (nullptr == auto_gen_keys_schema_tree || nullptr == user_gen_keys_schema_tree) { + return std::errc::invalid_argument; + } + + if (auto const ret_val{validate_node_id_value_pairs( + *auto_gen_keys_schema_tree, + auto_gen_node_id_value_pairs + )}; + std::errc{} != ret_val) + { + return ret_val; + } + + if (auto const ret_val{validate_node_id_value_pairs( + *user_gen_keys_schema_tree, + user_gen_node_id_value_pairs + )}; + std::errc{} != ret_val) + { + return ret_val; + } + + return KeyValuePairLogEvent{ + std::move(auto_gen_keys_schema_tree), + std::move(user_gen_keys_schema_tree), + std::move(auto_gen_node_id_value_pairs), + std::move(user_gen_node_id_value_pairs), + utc_offset + }; +} + +auto KeyValuePairLogEvent::get_auto_gen_keys_schema_subtree_bitmap( +) const -> OUTCOME_V2_NAMESPACE::std_result> { + return get_schema_subtree_bitmap(m_auto_gen_node_id_value_pairs, *m_auto_gen_keys_schema_tree); +} + +auto KeyValuePairLogEvent::get_user_gen_keys_schema_subtree_bitmap( +) const -> outcome_v2::std_result> { + return get_schema_subtree_bitmap(m_user_gen_node_id_value_pairs, *m_user_gen_keys_schema_tree); +} + +auto KeyValuePairLogEvent::serialize_to_json( +) const -> OUTCOME_V2_NAMESPACE::std_result> { + auto const auto_gen_keys_schema_subtree_bitmap_result{get_auto_gen_keys_schema_subtree_bitmap() + }; + if (auto_gen_keys_schema_subtree_bitmap_result.has_error()) { + return auto_gen_keys_schema_subtree_bitmap_result.error(); + } + auto serialized_auto_gen_kv_pairs_result{serialize_node_id_value_pairs_to_json( + *m_auto_gen_keys_schema_tree, + m_auto_gen_node_id_value_pairs, + auto_gen_keys_schema_subtree_bitmap_result.value() + )}; + if (serialized_auto_gen_kv_pairs_result.has_error()) { + return serialized_auto_gen_kv_pairs_result.error(); + } + + auto const user_gen_keys_schema_subtree_bitmap_result{get_user_gen_keys_schema_subtree_bitmap() + }; + if (user_gen_keys_schema_subtree_bitmap_result.has_error()) { + return user_gen_keys_schema_subtree_bitmap_result.error(); + } + auto serialized_user_gen_kv_pairs_result{serialize_node_id_value_pairs_to_json( + *m_user_gen_keys_schema_tree, + m_user_gen_node_id_value_pairs, + user_gen_keys_schema_subtree_bitmap_result.value() + )}; + if (serialized_user_gen_kv_pairs_result.has_error()) { + return serialized_user_gen_kv_pairs_result.error(); + } + + return {std::move(serialized_auto_gen_kv_pairs_result.value()), + std::move(serialized_user_gen_kv_pairs_result.value())}; +} } // namespace clp::ffi diff --git a/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp b/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp index f6334d378..2929c7498 100644 --- 
a/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp +++ b/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp @@ -17,10 +17,13 @@ namespace clp::ffi { /** * A log event containing key-value pairs. Each event contains: - * - A collection of node-ID & value pairs, where each pair represents a leaf `SchemaTreeNode` in - * the `SchemaTree`. - * - A reference to the `SchemaTree` - * - The UTC offset of the current log event + * - A reference to the schema tree for auto-generated keys. + * - A reference to the schema tree for user-generated keys. + * - A collection of auto-generated node-ID & value pairs, where each pair represents a leaf + * `SchemaTree::Node` in the schema tree for auto-generated keys. + * - A collection of user-generated node-ID & value pairs, where each pair represents a leaf + * `SchemaTree::Node` in the schema tree for user-generated keys. + * - The UTC offset of the current log event. */ class KeyValuePairLogEvent { public: @@ -29,15 +32,21 @@ class KeyValuePairLogEvent { // Factory functions /** - * @param schema_tree - * @param node_id_value_pairs + * @param auto_gen_keys_schema_tree + * @param user_gen_keys_schema_tree + * @param auto_gen_node_id_value_pairs + * @param user_gen_node_id_value_pairs * @param utc_offset * @return A result containing the key-value pair log event or an error code indicating the - * failure. See `validate_node_id_value_pairs` for the possible error codes. + * failure: + * - std::errc::invalid_argument if any of the given schema tree pointers are null. + * - Forwards `validate_node_id_value_pairs`'s return values. */ [[nodiscard]] static auto create( - std::shared_ptr schema_tree, - NodeIdValuePairs node_id_value_pairs, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + NodeIdValuePairs auto_gen_node_id_value_pairs, + NodeIdValuePairs user_gen_node_id_value_pairs, UtcOffset utc_offset ) -> OUTCOME_V2_NAMESPACE::std_result; @@ -53,51 +62,77 @@ class KeyValuePairLogEvent { ~KeyValuePairLogEvent() = default; // Methods - [[nodiscard]] auto get_schema_tree() const -> SchemaTree const& { return *m_schema_tree; } + [[nodiscard]] auto get_auto_gen_keys_schema_tree() const -> SchemaTree const& { + return *m_auto_gen_keys_schema_tree; + } - [[nodiscard]] auto get_node_id_value_pairs() const -> NodeIdValuePairs const& { - return m_node_id_value_pairs; + [[nodiscard]] auto get_user_gen_keys_schema_tree() const -> SchemaTree const& { + return *m_user_gen_keys_schema_tree; } - [[nodiscard]] auto get_utc_offset() const -> UtcOffset { return m_utc_offset; } + [[nodiscard]] auto get_auto_gen_node_id_value_pairs() const -> NodeIdValuePairs const& { + return m_auto_gen_node_id_value_pairs; + } + + [[nodiscard]] auto get_user_gen_node_id_value_pairs() const -> NodeIdValuePairs const& { + return m_user_gen_node_id_value_pairs; + } /** * @return A result containing a bitmap where every bit corresponds to the ID of a node in the - * schema tree, and the set bits correspond to the nodes in the subtree defined by all paths - * from the root node to the nodes in `node_id_value_pairs`; or an error code indicating a - * failure: - * - std::errc::result_out_of_range if a node ID in `node_id_value_pairs` doesn't exist in the - * schema tree. 
+ * schema tree for auto-generated keys, and the set bits correspond to the nodes in the subtree + * defined by all paths from the root node to the nodes in `m_auto_gen_node_id_value_pairs`; or + * an error code indicating a failure: + * - Forwards `get_schema_subtree_bitmap`'s return values. */ - [[nodiscard]] auto get_schema_subtree_bitmap( + [[nodiscard]] auto get_auto_gen_keys_schema_subtree_bitmap( ) const -> OUTCOME_V2_NAMESPACE::std_result>; /** - * Serializes the log event into a `nlohmann::json` object. - * @return A result containing the serialized JSON object or an error code indicating the - * failure: - * - std::errc::protocol_error if a value in the log event couldn't be decoded or it couldn't be - * inserted into a JSON object. - * - std::errc::result_out_of_range if a node ID in the log event doesn't exist in the schema - * tree. + * @return A result containing a bitmap where every bit corresponds to the ID of a node in the + * schema tree for user-generated keys, and the set bits correspond to the nodes in the subtree + * defined by all paths from the root node to the nodes in `m_user_gen_node_id_value_pairs`; or + * an error code indicating a failure: + * - Forwards `get_schema_subtree_bitmap`'s return values. + */ + [[nodiscard]] auto get_user_gen_keys_schema_subtree_bitmap( + ) const -> OUTCOME_V2_NAMESPACE::std_result>; + + [[nodiscard]] auto get_utc_offset() const -> UtcOffset { return m_utc_offset; } + + /** + * Serializes the log event into `nlohmann::json` objects. + * @return A result containing a pair or an error code indicating the failure: + * - The pair: + * - Serialized auto-generated key-value pairs as a JSON object + * - Serialized user-generated key-value pairs as a JSON object + * - The possible error codes: + * - Forwards `get_auto_gen_keys_schema_subtree_bitmap`'s return values on failure. + * - Forwards `serialize_node_id_value_pairs_to_json`'s return values on failure. 
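 * Example usage (illustrative sketch, assuming `event` is a successfully created
 * `KeyValuePairLogEvent`):
 *   auto const result{event.serialize_to_json()};
 *   if (false == result.has_error()) {
 *       auto const& [auto_gen_json, user_gen_json]{result.value()};
 *   }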
*/ [[nodiscard]] auto serialize_to_json( - ) const -> OUTCOME_V2_NAMESPACE::std_result; + ) const -> OUTCOME_V2_NAMESPACE::std_result>; private: // Constructor KeyValuePairLogEvent( - std::shared_ptr schema_tree, - NodeIdValuePairs node_id_value_pairs, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + NodeIdValuePairs auto_gen_node_id_value_pairs, + NodeIdValuePairs user_gen_node_id_value_pairs, UtcOffset utc_offset ) - : m_schema_tree{std::move(schema_tree)}, - m_node_id_value_pairs{std::move(node_id_value_pairs)}, + : m_auto_gen_keys_schema_tree{std::move(auto_gen_keys_schema_tree)}, + m_user_gen_keys_schema_tree{std::move(user_gen_keys_schema_tree)}, + m_auto_gen_node_id_value_pairs{std::move(auto_gen_node_id_value_pairs)}, + m_user_gen_node_id_value_pairs{std::move(user_gen_node_id_value_pairs)}, m_utc_offset{utc_offset} {} // Variables - std::shared_ptr m_schema_tree; - NodeIdValuePairs m_node_id_value_pairs; + std::shared_ptr m_auto_gen_keys_schema_tree; + std::shared_ptr m_user_gen_keys_schema_tree; + NodeIdValuePairs m_auto_gen_node_id_value_pairs; + NodeIdValuePairs m_user_gen_node_id_value_pairs; UtcOffset m_utc_offset{0}; }; } // namespace clp::ffi diff --git a/components/core/src/clp/ffi/SchemaTree.hpp b/components/core/src/clp/ffi/SchemaTree.hpp index 46494fa71..4efbbf81e 100644 --- a/components/core/src/clp/ffi/SchemaTree.hpp +++ b/components/core/src/clp/ffi/SchemaTree.hpp @@ -128,6 +128,8 @@ class SchemaTree { ~Node() = default; // Methods + [[nodiscard]] auto operator==(Node const& rhs) const -> bool = default; + [[nodiscard]] auto get_id() const -> id_t { return m_id; } [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); } @@ -249,6 +251,10 @@ class SchemaTree { ~SchemaTree() = default; // Methods + [[nodiscard]] auto operator==(SchemaTree const& rhs) const -> bool { + return m_tree_nodes == rhs.m_tree_nodes; + } + [[nodiscard]] auto get_size() const -> size_t { return m_tree_nodes.size(); } [[nodiscard]] auto get_root() const -> Node const& { return m_tree_nodes[cRootId]; } diff --git a/components/core/src/clp/ffi/ir_stream/Deserializer.hpp b/components/core/src/clp/ffi/ir_stream/Deserializer.hpp index 3418a39ae..d31699cd2 100644 --- a/components/core/src/clp/ffi/ir_stream/Deserializer.hpp +++ b/components/core/src/clp/ffi/ir_stream/Deserializer.hpp @@ -115,7 +115,8 @@ class Deserializer { Deserializer(IrUnitHandler ir_unit_handler) : m_ir_unit_handler{std::move(ir_unit_handler)} {} // Variables - std::shared_ptr m_schema_tree{std::make_shared()}; + std::shared_ptr m_auto_gen_keys_schema_tree{std::make_shared()}; + std::shared_ptr m_user_gen_keys_schema_tree{std::make_shared()}; UtcOffset m_utc_offset{0}; IrUnitHandler m_ir_unit_handler; bool m_is_complete{false}; @@ -183,9 +184,13 @@ auto Deserializer::deserialize_next_ir_unit(ReaderInterface& read auto const ir_unit_type{optional_ir_unit_type.value()}; switch (ir_unit_type) { case IrUnitType::LogEvent: { - auto result{ - deserialize_ir_unit_kv_pair_log_event(reader, tag, m_schema_tree, m_utc_offset) - }; + auto result{deserialize_ir_unit_kv_pair_log_event( + reader, + tag, + m_auto_gen_keys_schema_tree, + m_user_gen_keys_schema_tree, + m_utc_offset + )}; if (result.has_error()) { return result.error(); } @@ -207,7 +212,7 @@ auto Deserializer::deserialize_next_ir_unit(ReaderInterface& read } auto const node_locator{result.value()}; - if (m_schema_tree->has_node(node_locator)) { + if (m_user_gen_keys_schema_tree->has_node(node_locator)) { return 
std::errc::protocol_error; } @@ -217,7 +222,7 @@ auto Deserializer::deserialize_next_ir_unit(ReaderInterface& read return ir_error_code_to_errc(err); } - std::ignore = m_schema_tree->insert_node(node_locator); + std::ignore = m_user_gen_keys_schema_tree->insert_node(node_locator); break; } diff --git a/components/core/src/clp/ffi/ir_stream/IrErrorCode.cpp b/components/core/src/clp/ffi/ir_stream/IrErrorCode.cpp new file mode 100644 index 000000000..f9a00ca1e --- /dev/null +++ b/components/core/src/clp/ffi/ir_stream/IrErrorCode.cpp @@ -0,0 +1,26 @@ +#include "IrErrorCode.hpp" + +#include + +using IrErrorCategory = clp::error_handling::ErrorCategory; +using clp::ffi::ir_stream::IrErrorCodeEnum; + +template <> +auto IrErrorCategory::name() const noexcept -> char const* { + return "clp::ffi::ir_stream::IrErrorCode"; +} + +template <> +auto IrErrorCategory::message(IrErrorCodeEnum error_enum) const -> std::string { + switch (error_enum) { + case IrErrorCodeEnum::DecodingMethodFailure: + return "The decoding method failed."; + case IrErrorCodeEnum::EndOfStream: + return "The end-of-stream IR unit has already been consumed."; + case IrErrorCodeEnum::IncompleteStream: + return "The IR stream ended with a truncated IR unit or did not terminate with an " + "end-of-stream IR unit."; + default: + return "Unknown error code enum."; + } +} diff --git a/components/core/src/clp/ffi/ir_stream/IrErrorCode.hpp b/components/core/src/clp/ffi/ir_stream/IrErrorCode.hpp new file mode 100644 index 000000000..8eaad4e16 --- /dev/null +++ b/components/core/src/clp/ffi/ir_stream/IrErrorCode.hpp @@ -0,0 +1,24 @@ +#ifndef CLP_IRERRORCODE_HPP +#define CLP_IRERRORCODE_HPP + +#include + +#include "../../error_handling/ErrorCode.hpp" + +namespace clp::ffi::ir_stream { +/** + * This enum class represents all possible error codes related to serializing or deserializing CLP + * IR streams. 
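 * Example (illustrative sketch; assumes `clp::error_handling::ErrorCode` is convertible to
 * `std::error_code` through the category specialized in IrErrorCode.cpp):
 *   std::error_code const ec{IrErrorCode{IrErrorCodeEnum::EndOfStream}};
 *   // ec.message() == "The end-of-stream IR unit has already been consumed."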
+ */ +enum class IrErrorCodeEnum : uint8_t { + DecodingMethodFailure, + EndOfStream, + IncompleteStream, +}; + +using IrErrorCode = clp::error_handling::ErrorCode; +} // namespace clp::ffi::ir_stream + +CLP_ERROR_HANDLING_MARK_AS_ERROR_CODE_ENUM(clp::ffi::ir_stream::IrErrorCodeEnum); + +#endif // CLP_IRERRORCODE_HPP diff --git a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp index 5e1813a3e..cea4a1b84 100644 --- a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp +++ b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp @@ -551,7 +551,8 @@ auto deserialize_ir_unit_utc_offset_change(ReaderInterface& reader auto deserialize_ir_unit_kv_pair_log_event( ReaderInterface& reader, encoded_tag_t tag, - std::shared_ptr schema_tree, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, UtcOffset utc_offset ) -> OUTCOME_V2_NAMESPACE::std_result { auto const schema_result{deserialize_schema(reader, tag)}; @@ -579,7 +580,9 @@ auto deserialize_ir_unit_kv_pair_log_event( } return KeyValuePairLogEvent::create( - std::move(schema_tree), + std::move(auto_gen_keys_schema_tree), + std::move(user_gen_keys_schema_tree), + {}, std::move(node_id_value_pairs), utc_offset ); diff --git a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp index 68ed4408b..451f627db 100644 --- a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp +++ b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp @@ -57,10 +57,12 @@ namespace clp::ffi::ir_stream { * Deserializes a key-value pair log event IR unit. * @param reader * @param tag - * @param schema_tree Schema tree used to construct the KV-pair log event. + * @param auto_gen_keys_schema_tree Schema tree for auto-generated keys, used to construct the + * KV-pair log event. + * @param user_gen_keys_schema_tree Schema tree for user-generated keys, used to construct the + * KV-pair log event. * @param utc_offset UTC offset used to construct the KV-pair log event. - * @return A result containing the deserialized log event or an error code indicating the - * failure: + * @return A result containing the deserialized log event or an error code indicating the failure: * - std::errc::result_out_of_range if the IR stream is truncated. * - std::errc::protocol_error if the IR stream is corrupted. 
* - std::errc::protocol_not_supported if the IR stream contains an unsupported metadata format @@ -72,7 +74,8 @@ namespace clp::ffi::ir_stream { [[nodiscard]] auto deserialize_ir_unit_kv_pair_log_event( ReaderInterface& reader, encoded_tag_t tag, - std::shared_ptr schema_tree, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, UtcOffset utc_offset ) -> OUTCOME_V2_NAMESPACE::std_result; } // namespace clp::ffi::ir_stream diff --git a/components/core/src/clp/ir/EncodedTextAst.cpp b/components/core/src/clp/ir/EncodedTextAst.cpp index f0ee4d493..72a8f2729 100644 --- a/components/core/src/clp/ir/EncodedTextAst.cpp +++ b/components/core/src/clp/ir/EncodedTextAst.cpp @@ -5,7 +5,7 @@ #include #include "../ffi/encoding_methods.hpp" -#include "ffi/ir_stream/decoding_methods.hpp" +#include "../ffi/ir_stream/decoding_methods.hpp" using clp::ffi::decode_float_var; using clp::ffi::decode_integer_var; diff --git a/components/core/src/clp/ir/LogEvent.hpp b/components/core/src/clp/ir/LogEvent.hpp index 4a3ef7567..e2d4b310e 100644 --- a/components/core/src/clp/ir/LogEvent.hpp +++ b/components/core/src/clp/ir/LogEvent.hpp @@ -5,8 +5,8 @@ #include #include +#include "../time_types.hpp" #include "EncodedTextAst.hpp" -#include "time_types.hpp" #include "types.hpp" namespace clp::ir { diff --git a/components/core/src/clp/streaming_compression/Constants.hpp b/components/core/src/clp/streaming_compression/Constants.hpp index 4649c2e98..080f3a20b 100644 --- a/components/core/src/clp/streaming_compression/Constants.hpp +++ b/components/core/src/clp/streaming_compression/Constants.hpp @@ -7,6 +7,7 @@ namespace clp::streaming_compression { enum class CompressorType : uint8_t { ZSTD = 0x10, + LZMA = 0x20, Passthrough = 0xFF, }; } // namespace clp::streaming_compression diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp new file mode 100644 index 000000000..34c1a0e2b --- /dev/null +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -0,0 +1,203 @@ +#include "Compressor.hpp" + +#include +#include +#include +#include + +#include +#include + +#include "../../ErrorCode.hpp" +#include "../../FileWriter.hpp" +#include "../../TraceableException.hpp" +#include "../../type_utils.hpp" + +namespace clp::streaming_compression::lzma { +auto Compressor::open(FileWriter& file_writer) -> void { + if (nullptr != m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } + + m_lzma_stream.detach_input(); + if (false + == m_lzma_stream.attach_output( + m_compressed_stream_block_buffer.data(), + m_compressed_stream_block_buffer.size() + )) + { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + m_compressed_stream_file_writer = &file_writer; + m_uncompressed_stream_pos = 0; +} + +auto Compressor::close() -> void { + if (nullptr == m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + if (m_lzma_stream.avail_in() > 0) { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + flush_lzma(LZMA_FINISH); + m_lzma_stream.end_and_detach_output(); + m_compressed_stream_file_writer = nullptr; +} + +auto Compressor::write(char const* data, size_t data_length) -> void { + if (nullptr == m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + if (false + == m_lzma_stream + 
                    .attach_input(clp::size_checked_pointer_cast<uint8_t const>(data), data_length))
+    {
+        throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__);
+    }
+    encode_lzma();
+    m_lzma_stream.detach_input();
+    m_uncompressed_stream_pos += data_length;
+}
+
+auto Compressor::flush() -> void {
+    if (nullptr == m_compressed_stream_file_writer) {
+        throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__);
+    }
+    flush_lzma(LZMA_SYNC_FLUSH);
+}
+
+auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode {
+    if (nullptr == m_compressed_stream_file_writer) {
+        return ErrorCode_NotInit;
+    }
+    pos = m_uncompressed_stream_pos;
+    return ErrorCode_Success;
+}
+
+auto Compressor::encode_lzma() -> void {
+    while (m_lzma_stream.avail_in() > 0) {
+        if (0 == m_lzma_stream.avail_out()) {
+            flush_stream_output_block_buffer();
+        }
+        auto const rc = m_lzma_stream.lzma_code(LZMA_RUN);
+        switch (rc) {
+            case LZMA_OK:
+                break;
+            case LZMA_BUF_ERROR:
+                SPDLOG_ERROR("LZMA compressor input stream is corrupt. No encoding "
+                             "progress can be made.");
+                throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__);
+            default:
+                SPDLOG_ERROR(
+                        "lzma_code() returned an unexpected value - {}.",
+                        static_cast<int>(rc)
+                );
+                throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__);
+        }
+    }
+}
+
+auto Compressor::flush_lzma(lzma_action flush_action) -> void {
+    if (false == LzmaStream::is_flush_action(flush_action)) {
+        SPDLOG_ERROR(
+                "lzma_code() supplied with invalid flush action - {}.",
+                static_cast<int>(flush_action)
+        );
+        throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__);
+    }
+
+    bool flushed{false};
+    while (false == flushed) {
+        if (0 == m_lzma_stream.avail_out()) {
+            flush_stream_output_block_buffer();
+        }
+        auto const rc = m_lzma_stream.lzma_code(flush_action);
+        switch (rc) {
+            case LZMA_OK:
+                break;
+            case LZMA_STREAM_END:
+                // NOTE: flush may not have completed if a multithreaded encoder is using action
+                // LZMA_FULL_BARRIER. For now, we skip this check.
+                flushed = true;
+                break;
+            case LZMA_BUF_ERROR:
+                // NOTE: this can happen if we are using LZMA_FULL_FLUSH or LZMA_FULL_BARRIER. These
+                // two actions keep encoding input data alongside flushing buffered encoded data.
+                SPDLOG_ERROR("LZMA compressor input stream is corrupt.
No encoding " + "progress can be made."); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + default: + SPDLOG_ERROR( + "lzma_code() returned an unexpected value - {}.", + static_cast(rc) + ); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + } + flush_stream_output_block_buffer(); +} + +auto Compressor::flush_stream_output_block_buffer() -> void { + if (cCompressedStreamBlockBufferSize == m_lzma_stream.avail_out()) { + return; + } + m_compressed_stream_file_writer->write( + clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), + cCompressedStreamBlockBufferSize - m_lzma_stream.avail_out() + ); + if (false + == m_lzma_stream.attach_output( + m_compressed_stream_block_buffer.data(), + m_compressed_stream_block_buffer.size() + )) + { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } +} + +Compressor::LzmaStream::LzmaStream(int compression_level, size_t dict_size, lzma_check check) { + lzma_options_lzma options; + if (0 != lzma_lzma_preset(&options, compression_level)) { + SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } + options.dict_size = dict_size; + std::array filters{{ + {.id = LZMA_FILTER_LZMA2, .options = &options}, + {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, + }}; + + auto const rc = lzma_stream_encoder(&m_stream, filters.data(), check); + if (LZMA_OK == rc) { + return; + } + + char const* msg{nullptr}; + switch (rc) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_OPTIONS_ERROR: + msg = "Specified preset is not supported"; + break; + + case LZMA_UNSUPPORTED_CHECK: + msg = "Specified integrity check is not supported"; + break; + + case LZMA_PROG_ERROR: + msg = "Input arguments are not sane"; + break; + + default: + msg = "Unknown error"; + break; + } + + SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(rc)); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); +} +} // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp new file mode 100644 index 000000000..de665eaf6 --- /dev/null +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -0,0 +1,230 @@ +#ifndef CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP +#define CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP + +#include +#include + +#include + +#include "../../Array.hpp" +#include "../../ErrorCode.hpp" +#include "../../FileWriter.hpp" +#include "../../TraceableException.hpp" +#include "../Compressor.hpp" +#include "Constants.hpp" + +namespace clp::streaming_compression::lzma { +/** + * Implements a LZMA compressor that compresses byte input data to a file. 
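 * Example usage (illustrative sketch, assuming `file_writer` is an already-open `clp::FileWriter`
 * and `buf` is a `std::string` holding the bytes to compress):
 *   clp::streaming_compression::lzma::Compressor compressor;
 *   compressor.open(file_writer);
 *   compressor.write(buf.data(), buf.size());
 *   compressor.close();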
+ */ +class Compressor : public ::clp::streaming_compression::Compressor { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + + // Methods + [[nodiscard]] auto what() const noexcept -> char const* override { + return "streaming_compression::lzma::Compressor operation failed"; + } + }; + + // Constructors + Compressor() : Compressor{cDefaultCompressionLevel, cDefaultDictionarySize, LZMA_CHECK_CRC64} {} + + Compressor(int compression_level, size_t dict_size, lzma_check check) + : m_lzma_stream{compression_level, dict_size, check} {} + + // Destructor + ~Compressor() override = default; + + // Delete copy constructor and assignment operator + Compressor(Compressor const&) = delete; + auto operator=(Compressor const&) -> Compressor& = delete; + + // Default move constructor and assignment operator + Compressor(Compressor&&) noexcept = default; + auto operator=(Compressor&&) noexcept -> Compressor& = default; + + // Methods implementing the WriterInterface + /** + * Writes the given data to the compressor + * @param data + * @param data_length + */ + auto write(char const* data, size_t data_length) -> void override; + + /** + * Writes any internally buffered data to file and ends the current frame + * + * Forces all the encoded data buffered by LZMA to be available at output + */ + auto flush() -> void override; + + /** + * Tries to get the current position of the write head + * @param pos Position of the write head + * @return ErrorCode_NotInit if the compressor is not open + * @return ErrorCode_Success on success + */ + auto try_get_pos(size_t& pos) const -> ErrorCode override; + + // Methods implementing the Compressor interface + /** + * Closes the compressor + */ + auto close() -> void override; + + /** + * Open the compression stream for encoding to the file_writer. + * + * @param file_writer + */ + auto open(FileWriter& file_writer) -> void override; + +private: + /** + * Wrapper class around lzma_stream providing easier usage. + */ + class LzmaStream { + public: + /** + * Initializes an LZMA compression encoder and its streams. + * + * @param compression_level Compression preset level in the range [0-9] where the higher + * numbers use increasingly more memory for greater compression ratios. + * @param dict_size Max amount of recently processed uncompressed bytes to keep in the + * memory. + * @param check Type of check to verify the integrity of the uncompressed data. + * LZMA_CHECK_CRC64 is the default in the xz command line tool. If the .xz file needs to be + * decompressed with XZ-Embedded, use LZMA_CHECK_CRC32 instead. + * + * @throw `OperationFailed` `ErrorCode_BadParam` if the LZMA options are invalid or the + * encoder fails to initialize. + */ + LzmaStream(int compression_level, size_t dict_size, lzma_check check); + + // Destructor + ~LzmaStream() = default; + + // Delete copy constructor and assignment operator + LzmaStream(LzmaStream const&) = delete; + auto operator=(LzmaStream const&) -> LzmaStream& = delete; + + // Default move constructor and assignment operator + LzmaStream(LzmaStream&&) noexcept = default; + auto operator=(LzmaStream&&) noexcept -> LzmaStream& = default; + + /** + * Attaches a pre-allocated block buffer to the encoder's input stream. + * + * @return false if the data buffer is null. + * @return true on success. 
+ */ + [[nodiscard]] auto attach_input(uint8_t const* data_ptr, size_t data_length) -> bool { + if (nullptr == data_ptr) { + return false; + } + m_stream.next_in = data_ptr; + m_stream.avail_in = data_length; + return true; + } + + /** + * Attaches a pre-allocated block buffer to the encoder's output stream. + * + * @return false if the data buffer is null or empty. + * @return true on success. + */ + [[nodiscard]] auto attach_output(uint8_t* data_ptr, size_t data_length) -> bool { + if (nullptr == data_ptr || 0 == data_length) { + return false; + } + m_stream.next_out = data_ptr; + m_stream.avail_out = data_length; + return true; + } + + [[nodiscard]] auto avail_in() const -> size_t { return m_stream.avail_in; } + + [[nodiscard]] auto avail_out() const -> size_t { return m_stream.avail_out; } + + /** + * Unset the internal fields of the encoder's input stream. + */ + auto detach_input() -> void { + m_stream.next_in = nullptr; + m_stream.avail_in = 0; + } + + /** + * End the LZMA stream and unset the internal fields of the encoder's output stream. + */ + auto end_and_detach_output() -> void { + lzma_end(&m_stream); + m_stream.next_out = nullptr; + m_stream.avail_out = 0; + } + + [[nodiscard]] static auto is_flush_action(lzma_action action) -> bool { + return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action + || LZMA_FULL_BARRIER == action || LZMA_FINISH == action; + } + + [[nodiscard]] auto lzma_code(lzma_action action) -> lzma_ret { + return ::lzma_code(&m_stream, action); + } + + private: + lzma_stream m_stream = LZMA_STREAM_INIT; + }; + + static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB + + /** + * Invokes lzma_code() repeatedly with LZMA_RUN until the input is exhausted + * + * At the end of the workflow, the last bytes of encoded data may still be buffered in the LZMA + * stream and thus not immediately available at the output block buffer. + * + * Assumes input stream and output block buffer are both in valid states. + * @throw `OperationFailed` if LZMA returns an unexpected error value + */ + auto encode_lzma() -> void; + + /** + * Invokes lzma_code() repeatedly with the given flushing action until all encoded data is made + * available at the output block buffer + * + * Once flushing starts, the workflow action needs to stay the same until flushing is signaled + * complete by LZMA (aka LZMA_STREAM_END is reached). + * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 + * + * Assumes input stream and output block buffer are both in valid states. + * @param flush_action + * @throw `OperationFailed` if the provided action is not an LZMA flush + * action, or if LZMA returns an unexpected error value + */ + auto flush_lzma(lzma_action flush_action) -> void; + + /** + * Flushes the current compressed data in the output block buffer to the output file handler. + * + * Also resets the output block buffer to receive new data. 
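 * For example, with the 4096-byte block buffer, a call made while `m_lzma_stream.avail_out()` is
 * 1000 writes the first 3096 bytes of the buffer to the file writer and then re-attaches the full
 * buffer as the encoder's output stream.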
+ */ + auto flush_stream_output_block_buffer() -> void; + + // Variables + FileWriter* m_compressed_stream_file_writer{nullptr}; + + // Compressed stream variables + Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; + LzmaStream m_lzma_stream; + size_t m_uncompressed_stream_pos{0}; +}; +} // namespace clp::streaming_compression::lzma + +#endif // CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP diff --git a/components/core/src/clp/streaming_compression/lzma/Constants.hpp b/components/core/src/clp/streaming_compression/lzma/Constants.hpp new file mode 100644 index 000000000..4e261187a --- /dev/null +++ b/components/core/src/clp/streaming_compression/lzma/Constants.hpp @@ -0,0 +1,15 @@ +#ifndef STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP +#define STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP + +#include + +#include + +namespace clp::streaming_compression::lzma { +constexpr int cDefaultCompressionLevel{3}; +constexpr int cMinCompressionLevel{0}; +constexpr int cMaxCompressionLevel{9}; +constexpr uint32_t cDefaultDictionarySize{LZMA_DICT_SIZE_DEFAULT}; +} // namespace clp::streaming_compression::lzma + +#endif // STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP diff --git a/components/core/src/clp_s/ArchiveWriter.hpp b/components/core/src/clp_s/ArchiveWriter.hpp index 3b13f4426..82a0122bc 100644 --- a/components/core/src/clp_s/ArchiveWriter.hpp +++ b/components/core/src/clp_s/ArchiveWriter.hpp @@ -122,9 +122,9 @@ class ArchiveWriter { * @return the epoch time corresponding to the string timestamp */ epochtime_t ingest_timestamp_entry( - std::string const& key, + std::string_view key, int32_t node_id, - std::string const& timestamp, + std::string_view timestamp, uint64_t& pattern_id ) { return m_timestamp_dict.ingest_entry(key, node_id, timestamp, pattern_id); @@ -136,11 +136,11 @@ class ArchiveWriter { * @param node_id * @param timestamp */ - void ingest_timestamp_entry(std::string const& key, int32_t node_id, double timestamp) { + void ingest_timestamp_entry(std::string_view key, int32_t node_id, double timestamp) { m_timestamp_dict.ingest_entry(key, node_id, timestamp); } - void ingest_timestamp_entry(std::string const& key, int32_t node_id, int64_t timestamp) { + void ingest_timestamp_entry(std::string_view key, int32_t node_id, int64_t timestamp) { m_timestamp_dict.ingest_entry(key, node_id, timestamp); } diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index 1656a5d59..9ca0c947e 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -8,11 +8,35 @@ set( ../clp/database_utils.hpp ../clp/Defs.h ../clp/ErrorCode.hpp + ../clp/ffi/ir_stream/decoding_methods.cpp + ../clp/ffi/ir_stream/decoding_methods.hpp + ../clp/ffi/ir_stream/Deserializer.hpp + ../clp/ffi/ir_stream/encoding_methods.cpp + ../clp/ffi/ir_stream/encoding_methods.hpp + ../clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp + ../clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp + ../clp/ffi/ir_stream/Serializer.cpp + ../clp/ffi/ir_stream/Serializer.hpp + ../clp/ffi/ir_stream/utils.cpp + ../clp/ffi/ir_stream/utils.hpp + ../clp/ffi/KeyValuePairLogEvent.cpp + ../clp/ffi/KeyValuePairLogEvent.hpp + ../clp/ffi/SchemaTree.cpp + ../clp/ffi/SchemaTree.hpp + ../clp/ffi/utils.cpp + ../clp/ffi/utils.hpp + ../clp/ffi/Value.hpp + ../clp/FileDescriptor.cpp + ../clp/FileDescriptor.hpp ../clp/GlobalMetadataDB.hpp ../clp/GlobalMetadataDBConfig.cpp ../clp/GlobalMetadataDBConfig.hpp ../clp/GlobalMySQLMetadataDB.cpp 
../clp/GlobalMySQLMetadataDB.hpp + ../clp/ir/EncodedTextAst.cpp + ../clp/ir/EncodedTextAst.hpp + ../clp/ir/parsing.cpp + ../clp/ir/parsing.hpp ../clp/MySQLDB.cpp ../clp/MySQLDB.hpp ../clp/MySQLParamBindings.cpp @@ -23,9 +47,16 @@ set( ../clp/networking/socket_utils.hpp ../clp/ReaderInterface.cpp ../clp/ReaderInterface.hpp + ../clp/ReadOnlyMemoryMappedFile.cpp + ../clp/ReadOnlyMemoryMappedFile.hpp ../clp/streaming_archive/ArchiveMetadata.cpp ../clp/streaming_archive/ArchiveMetadata.hpp + ../clp/streaming_compression/zstd/Decompressor.cpp + ../clp/streaming_compression/zstd/Decompressor.hpp ../clp/TraceableException.hpp + ../clp/time_types.hpp + ../clp/utf8_utils.cpp + ../clp/utf8_utils.hpp ../clp/WriterInterface.cpp ../clp/WriterInterface.hpp ) diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 99539b627..4218d9d60 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -6,9 +6,9 @@ #include #include "../clp/cli_utils.hpp" +#include "../clp/type_utils.hpp" #include "../reducer/types.hpp" #include "FileReader.hpp" -#include "type_utils.hpp" namespace po = boost::program_options; @@ -148,6 +148,9 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { po::options_description compression_options("Compression options"); std::string metadata_db_config_file_path; std::string input_path_list_file_path; + constexpr std::string_view cJsonFileType{"json"}; + constexpr std::string_view cKeyValueIrFileType{"kv-ir"}; + std::string file_type{cJsonFileType}; // clang-format off compression_options.add_options()( "compression-level", @@ -202,6 +205,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "disable-log-order", po::bool_switch(&m_disable_log_order), "Do not record log order at ingestion time." 
+ )( + "file-type", + po::value(&file_type)->value_name("FILE_TYPE")->default_value(file_type), + "The type of file being compressed (json or kv-ir)" ); // clang-format on @@ -255,6 +262,30 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { throw std::invalid_argument("No input paths specified."); } + if (cJsonFileType == file_type) { + m_file_type = FileType::Json; + } else if (cKeyValueIrFileType == file_type) { + m_file_type = FileType::KeyValueIr; + if (m_structurize_arrays) { + SPDLOG_ERROR( + "Invalid combination of arguments; --file-type {} and " + "--structurize-arrays can't be used together", + cKeyValueIrFileType + ); + return ParsingResult::Failure; + } + if (false == m_timestamp_key.empty()) { + SPDLOG_ERROR( + "Invalid combination of arguments; --file-type {} and " + "--timestamp-key can't be used together", + cKeyValueIrFileType + ); + return ParsingResult::Failure; + } + } else { + throw std::invalid_argument("Unknown FILE_TYPE: " + file_type); + } + // Parse and validate global metadata DB config if (false == metadata_db_config_file_path.empty()) { clp::GlobalMetadataDBConfig metadata_db_config; diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index a87e9b6bd..47c244646 100644 --- a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -36,6 +36,11 @@ class CommandLineArguments { Stdout, }; + enum class FileType : uint8_t { + Json = 0, + KeyValueIr + }; + // Constructors explicit CommandLineArguments(std::string const& program_name) : m_program_name(program_name) {} @@ -116,6 +121,8 @@ class CommandLineArguments { bool get_record_log_order() const { return false == m_disable_log_order; } + [[nodiscard]] auto get_file_type() const -> FileType { return m_file_type; } + private: // Methods /** @@ -184,6 +191,7 @@ class CommandLineArguments { size_t m_target_ordered_chunk_size{}; size_t m_minimum_table_size{1ULL * 1024 * 1024}; // 1 MB bool m_disable_log_order{false}; + FileType m_file_type{FileType::Json}; // Metadata db variables std::optional m_metadata_db_config; diff --git a/components/core/src/clp_s/JsonConstructor.cpp b/components/core/src/clp_s/JsonConstructor.cpp index 95e3fa2c5..8886f2074 100644 --- a/components/core/src/clp_s/JsonConstructor.cpp +++ b/components/core/src/clp_s/JsonConstructor.cpp @@ -122,7 +122,7 @@ void JsonConstructor::construct_in_order() { new_file_path.filename() ), bsoncxx::builder::basic::kvp( - constants::results_cache::decompression::cOrigFileId, + constants::results_cache::decompression::cStreamId, m_option.archive_id ), bsoncxx::builder::basic::kvp( @@ -134,7 +134,7 @@ void JsonConstructor::construct_in_order() { last_idx ), bsoncxx::builder::basic::kvp( - constants::results_cache::decompression::cIsLastIrChunk, + constants::results_cache::decompression::cIsLastChunk, false == open_new_writer ) ))); diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index d14a221b3..c917b1f09 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -1,15 +1,78 @@ #include "JsonParser.hpp" +#include #include +#include #include +#include +#include +#include #include #include +#include "../clp/ffi/ir_stream/decoding_methods.hpp" +#include "../clp/ffi/ir_stream/Deserializer.hpp" +#include "../clp/ffi/ir_stream/IrUnitType.hpp" +#include "../clp/ffi/KeyValuePairLogEvent.hpp" +#include "../clp/ffi/SchemaTree.hpp" +#include 
"../clp/ffi/utils.hpp" +#include "../clp/ffi/Value.hpp" +#include "../clp/ir/EncodedTextAst.hpp" +#include "../clp/streaming_compression/zstd/Decompressor.hpp" +#include "../clp/time_types.hpp" #include "archive_constants.hpp" +#include "ErrorCode.hpp" #include "JsonFileIterator.hpp" +using clp::ffi::ir_stream::Deserializer; +using clp::ffi::ir_stream::IRErrorCode; +using clp::ffi::KeyValuePairLogEvent; +using clp::UtcOffset; + namespace clp_s { +/** + * Class that implements `clp::ffi::ir_stream::IrUnitHandlerInterface` for Key-Value IR compression. + */ +class IrUnitHandler { +public: + [[nodiscard]] auto handle_log_event(KeyValuePairLogEvent&& log_event) -> IRErrorCode { + m_deserialized_log_event.emplace(std::move(log_event)); + return IRErrorCode::IRErrorCode_Success; + } + + [[nodiscard]] static auto handle_utc_offset_change( + [[maybe_unused]] UtcOffset utc_offset_old, + [[maybe_unused]] UtcOffset utc_offset_new + ) -> IRErrorCode { + return IRErrorCode::IRErrorCode_Decode_Error; + } + + [[nodiscard]] auto handle_schema_tree_node_insertion( + [[maybe_unused]] clp::ffi::SchemaTree::NodeLocator schema_tree_node_locator + ) -> IRErrorCode { + return IRErrorCode::IRErrorCode_Success; + } + + [[nodiscard]] auto handle_end_of_stream() -> IRErrorCode { + m_is_complete = true; + return IRErrorCode::IRErrorCode_Success; + } + + [[nodiscard]] auto get_deserialized_log_event( + ) const -> std::optional const& { + return m_deserialized_log_event; + } + + void clear() { m_is_complete = false; } + + [[nodiscard]] auto is_complete() const -> bool { return m_is_complete; } + +private: + std::optional m_deserialized_log_event; + bool m_is_complete{false}; +}; + JsonParser::JsonParser(JsonParserOption const& option) : m_num_messages(0), m_target_encoded_size(option.target_encoded_size), @@ -557,6 +620,309 @@ int32_t JsonParser::add_metadata_field(std::string_view const field_name, NodeTy return m_archive_writer->add_node(metadata_subtree_id, type, field_name); } +auto JsonParser::get_archive_node_type( + clp::ffi::SchemaTree const& tree, + std::pair> const& kv_pair +) -> NodeType { + clp::ffi::SchemaTree::Node const& tree_node = tree.get_node(kv_pair.first); + clp::ffi::SchemaTree::Node::Type const ir_node_type = tree_node.get_type(); + bool const node_has_value = kv_pair.second.has_value(); + clp::ffi::Value node_value{}; + if (node_has_value) { + node_value = kv_pair.second.value(); + } + switch (ir_node_type) { + case clp::ffi::SchemaTree::Node::Type::Int: + return NodeType::Integer; + case clp::ffi::SchemaTree::Node::Type::Float: + return NodeType::Float; + case clp::ffi::SchemaTree::Node::Type::Bool: + return NodeType::Boolean; + case clp::ffi::SchemaTree::Node::Type::UnstructuredArray: + return NodeType::UnstructuredArray; + case clp::ffi::SchemaTree::Node::Type::Str: + if (node_value.is()) { + return NodeType::VarString; + } + return NodeType::ClpString; + case clp::ffi::SchemaTree::Node::Type::Obj: + if (node_has_value && node_value.is_null()) { + return NodeType::NullValue; + } + return NodeType::Object; + default: + throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); + } +} + +auto JsonParser::add_node_to_archive_and_translations( + uint32_t ir_node_id, + clp::ffi::SchemaTree::Node const& ir_node_to_add, + NodeType archive_node_type, + int32_t parent_node_id +) -> int { + auto validated_escaped_key + = clp::ffi::validate_and_escape_utf8_string(ir_node_to_add.get_key_name()); + std::string node_key; + if (validated_escaped_key.has_value()) { + node_key = 
validated_escaped_key.value(); + } else { + SPDLOG_ERROR("Key is not UTF-8 compliant: \"{}\"", ir_node_to_add.get_key_name()); + throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); + } + int const curr_node_archive_id + = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key); + + m_ir_node_to_archive_node_id_mapping.emplace( + std::make_pair(ir_node_id, archive_node_type), + curr_node_archive_id + ); + return curr_node_archive_id; +} + +auto JsonParser::get_archive_node_id( + uint32_t ir_node_id, + NodeType archive_node_type, + clp::ffi::SchemaTree const& ir_tree +) -> int { + int curr_node_archive_id{constants::cRootNodeId}; + auto flat_map_location + = m_ir_node_to_archive_node_id_mapping.find(std::pair{ir_node_id, archive_node_type}); + + if (m_ir_node_to_archive_node_id_mapping.end() != flat_map_location) { + return flat_map_location->second; + } + + std::vector ir_id_stack; + ir_id_stack.push_back(ir_node_id); + int32_t next_parent_archive_id{constants::cRootNodeId}; + NodeType next_node_type = archive_node_type; + + while (true) { + auto const& curr_node = ir_tree.get_node(ir_id_stack.back()); + auto parent_of_curr_node_id = curr_node.get_parent_id(); + if (parent_of_curr_node_id.has_value()) { + ir_id_stack.push_back(parent_of_curr_node_id.value()); + next_node_type = NodeType::Object; + } else { + next_parent_archive_id = constants::cRootNodeId; + break; + } + + flat_map_location = m_ir_node_to_archive_node_id_mapping.find( + std::pair{ir_id_stack.back(), next_node_type} + ); + if (m_ir_node_to_archive_node_id_mapping.end() != flat_map_location) { + curr_node_archive_id = flat_map_location->second; + next_parent_archive_id = flat_map_location->second; + ir_id_stack.pop_back(); + break; + } + } + + while (false == ir_id_stack.empty()) { + auto const& curr_node = ir_tree.get_node(ir_id_stack.back()); + if (1 == ir_id_stack.size()) { + curr_node_archive_id = add_node_to_archive_and_translations( + ir_id_stack.back(), + curr_node, + archive_node_type, + next_parent_archive_id + ); + } else { + curr_node_archive_id = add_node_to_archive_and_translations( + ir_id_stack.back(), + curr_node, + NodeType::Object, + next_parent_archive_id + ); + } + next_parent_archive_id = curr_node_archive_id; + ir_id_stack.pop_back(); + } + return curr_node_archive_id; +} + +void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv) { + clp::ffi::SchemaTree const& tree = kv.get_user_gen_keys_schema_tree(); + for (auto const& pair : kv.get_user_gen_node_id_value_pairs()) { + NodeType const archive_node_type = get_archive_node_type(tree, pair); + auto const node_id = get_archive_node_id(pair.first, archive_node_type, tree); + + switch (archive_node_type) { + case NodeType::Integer: { + auto const i64_value + = pair.second.value().get_immutable_view(); + m_current_parsed_message.add_value(node_id, i64_value); + } break; + case NodeType::Float: { + auto const d_value + = pair.second.value().get_immutable_view(); + m_current_parsed_message.add_value(node_id, d_value); + } break; + case NodeType::Boolean: { + auto const b_value + = pair.second.value().get_immutable_view(); + m_current_parsed_message.add_value(node_id, b_value); + } break; + case NodeType::VarString: { + auto const validated_escaped_string = clp::ffi::validate_and_escape_utf8_string( + pair.second.value().get_immutable_view() + ); + std::string str; + if (validated_escaped_string.has_value()) { + str = validated_escaped_string.value(); + } else { + SPDLOG_ERROR( + "String is not utf8 compliant: \"{}\"", + 
pair.second.value().get_immutable_view() + ); + throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); + } + m_current_parsed_message.add_value(node_id, str); + } break; + case NodeType::ClpString: { + std::string encoded_str; + std::string decoded_value; + if (pair.second.value().is()) { + decoded_value = pair.second.value() + .get_immutable_view() + .decode_and_unparse() + .value(); + + } else { + decoded_value = pair.second.value() + .get_immutable_view() + .decode_and_unparse() + .value(); + } + auto const validated_escaped_encoded_string + = clp::ffi::validate_and_escape_utf8_string(decoded_value.c_str()); + if (validated_escaped_encoded_string.has_value()) { + encoded_str = validated_escaped_encoded_string.value(); + } else { + SPDLOG_ERROR("Encoded string is not utf8 compliant: \"{}\"", decoded_value); + throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); + } + m_current_parsed_message.add_value(node_id, encoded_str); + } break; + case NodeType::UnstructuredArray: { + std::string array_str; + if (pair.second.value().is()) { + array_str = pair.second.value() + .get_immutable_view() + .decode_and_unparse() + .value(); + } else { + array_str = pair.second.value() + .get_immutable_view() + .decode_and_unparse() + .value(); + } + m_current_parsed_message.add_value(node_id, array_str); + break; + } + default: + // Don't need to add value for obj or null + break; + } + m_current_schema.insert_ordered(node_id); + } + + int32_t const current_schema_id = m_archive_writer->add_schema(m_current_schema); + m_current_parsed_message.set_id(current_schema_id); + m_archive_writer->append_message(current_schema_id, m_current_schema, m_current_parsed_message); +} + +auto JsonParser::parse_from_ir() -> bool { + for (auto& file_path : m_file_paths) { + clp::streaming_compression::zstd::Decompressor decompressor; + size_t curr_pos{}; + size_t last_pos{}; + decompressor.open(file_path); + + auto deserializer_result{Deserializer::create(decompressor, IrUnitHandler{}) + }; + if (deserializer_result.has_error()) { + decompressor.close(); + m_archive_writer->close(); + return false; + } + auto& deserializer = deserializer_result.value(); + auto& ir_unit_handler{deserializer.get_ir_unit_handler()}; + + int32_t log_event_idx_node_id{}; + auto add_log_event_idx_node = [&]() { + if (m_record_log_order) { + log_event_idx_node_id + = add_metadata_field(constants::cLogEventIdxName, NodeType::Integer); + } + }; + add_log_event_idx_node(); + while (true) { + auto const kv_log_event_result{deserializer.deserialize_next_ir_unit(decompressor)}; + + if (kv_log_event_result.has_error()) { + m_archive_writer->close(); + decompressor.close(); + return false; + } + if (kv_log_event_result.value() == clp::ffi::ir_stream::IrUnitType::EndOfStream) { + break; + } + if (kv_log_event_result.value() == clp::ffi::ir_stream::IrUnitType::LogEvent) { + auto const kv_log_event = &(ir_unit_handler.get_deserialized_log_event().value()); + + m_current_schema.clear(); + + // Add log_event_idx field to metadata for record + if (m_record_log_order) { + m_current_parsed_message.add_value( + log_event_idx_node_id, + m_archive_writer->get_next_log_event_id() + ); + m_current_schema.insert_ordered(log_event_idx_node_id); + } + + try { + parse_kv_log_event(*kv_log_event); + } catch (std::exception const& e) { + SPDLOG_ERROR("Encountered error while parsing a kv log event - {}", e.what()); + m_archive_writer->close(); + decompressor.close(); + return false; + } + + if (m_archive_writer->get_data_size() >= 
m_target_encoded_size) { + m_ir_node_to_archive_node_id_mapping.clear(); + decompressor.try_get_pos(curr_pos); + m_archive_writer->increment_uncompressed_size(curr_pos - last_pos); + last_pos = curr_pos; + split_archive(); + add_log_event_idx_node(); + } + + ir_unit_handler.clear(); + m_current_parsed_message.clear(); + + } else if (kv_log_event_result.value() + == clp::ffi::ir_stream::IrUnitType::SchemaTreeNodeInsertion) + { + continue; + } else { + m_archive_writer->close(); + decompressor.close(); + return false; + } + } + m_ir_node_to_archive_node_id_mapping.clear(); + decompressor.try_get_pos(curr_pos); + m_archive_writer->increment_uncompressed_size(curr_pos - last_pos); + decompressor.close(); + } + return true; +} + void JsonParser::store() { m_archive_writer->close(); } diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index bfd423c22..a89c746c7 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -1,17 +1,25 @@ #ifndef CLP_S_JSONPARSER_HPP #define CLP_S_JSONPARSER_HPP +#include #include +#include #include #include +#include #include #include +#include #include #include +#include "../clp/ffi/KeyValuePairLogEvent.hpp" +#include "../clp/ffi/SchemaTree.hpp" +#include "../clp/ffi/Value.hpp" #include "../clp/GlobalMySQLMetadataDB.hpp" #include "ArchiveWriter.hpp" +#include "CommandLineArguments.hpp" #include "DictionaryWriter.hpp" #include "FileReader.hpp" #include "FileWriter.hpp" @@ -25,10 +33,12 @@ #include "ZstdCompressor.hpp" using namespace simdjson; +using clp::ffi::KeyValuePairLogEvent; namespace clp_s { struct JsonParserOption { std::vector file_paths; + CommandLineArguments::FileType input_file_type{CommandLineArguments::FileType::Json}; std::string timestamp_key; std::string archives_dir; size_t target_encoded_size{}; @@ -63,6 +73,12 @@ class JsonParser { */ [[nodiscard]] bool parse(); + /** + * Parses the Key Value IR Stream and stores the data in the archive. + * @return whether the IR Stream was parsed successfully + */ + [[nodiscard]] auto parse_from_ir() -> bool; + /** * Writes the metadata and archive data to disk. */ @@ -78,6 +94,51 @@ class JsonParser { */ void parse_line(ondemand::value line, int32_t parent_node_id, std::string const& key); + /** + * Determines the archive node type based on the IR node type and value. + * @param ir_node_type schema node type from the IR stream + * @param node_has_value Boolean that says whether or not the node has value. + * @param node_value The IR schema node value if the node has value + * @return The NodeType that should be used for the archive node + */ + static auto get_archive_node_type( + clp::ffi::SchemaTree const& tree, + std::pair> const& + kv_pair + ) -> NodeType; + + /** + * Adds new schema node to archive and adds translation for IR node ID and NodeType to mapping + * @param ir_node_id ID of the IR node + * @param ir_node_to_add IR Schema Node that is being translated to archive + * @param archive_node_type Type of the archive node + * @param parent_node_id ID of the parent of the IR node + */ + auto add_node_to_archive_and_translations( + uint32_t ir_node_id, + clp::ffi::SchemaTree::Node const& ir_node_to_add, + NodeType archive_node_type, + int32_t parent_node_id + ) -> int; + + /** + * Gets the archive node ID for an IR node. 
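 * For example (illustrative): with the IR path root -> "a" -> "b" -> "c", a request for "c", and
 * only "a" already translated, the lookup walks up from "c" until it reaches "a", then unwinds,
 * adding "b" to the archive as an Object node and "c" as `archive_node_type`, caching each new
 * (IR node ID, node type) -> archive node ID translation along the way.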
+ * @param ir_node_id ID of the IR node + * @param archive_node_type Type of the archive node + * @param ir_tree The IR schema tree + */ + auto get_archive_node_id( + uint32_t ir_node_id, + NodeType archive_node_type, + clp::ffi::SchemaTree const& ir_tree + ) -> int; + + /** + * Parses a Key Value Log Event. + * @param kv the Key Value Log Event + */ + void parse_kv_log_event(KeyValuePairLogEvent const& kv); + /** * Parses an array within a JSON line * @param line the JSON array @@ -121,6 +182,9 @@ class JsonParser { size_t m_max_document_size; bool m_structurize_arrays{false}; bool m_record_log_order{true}; + + absl::flat_hash_map, int32_t> + m_ir_node_to_archive_node_id_mapping; }; } // namespace clp_s diff --git a/components/core/src/clp_s/ParsedMessage.hpp b/components/core/src/clp_s/ParsedMessage.hpp index c843e2b7b..c1b6d7a35 100644 --- a/components/core/src/clp_s/ParsedMessage.hpp +++ b/components/core/src/clp_s/ParsedMessage.hpp @@ -1,8 +1,10 @@ #ifndef CLP_S_PARSEDMESSAGE_HPP #define CLP_S_PARSEDMESSAGE_HPP +#include #include #include +#include #include #include @@ -34,6 +36,10 @@ class ParsedMessage { m_message.emplace(node_id, value); } + inline void add_value(int32_t node_id, std::string_view value) { + m_message.emplace(node_id, std::string{value}); + } + /** * Adds a timestamp value and its encoding to the message for a given MST node ID. * @param node_id @@ -55,6 +61,10 @@ class ParsedMessage { m_unordered_message.emplace_back(value); } + inline void add_unordered_value(std::string_view value) { + m_unordered_message.emplace_back(std::string{value}); + } + /** * Clears the message */ diff --git a/components/core/src/clp_s/TimestampDictionaryWriter.cpp b/components/core/src/clp_s/TimestampDictionaryWriter.cpp index 39e66a6af..952bc36db 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.cpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.cpp @@ -1,6 +1,8 @@ #include "TimestampDictionaryWriter.hpp" +#include #include +#include #include "Utils.hpp" @@ -42,9 +44,9 @@ uint64_t TimestampDictionaryWriter::get_pattern_id(TimestampPattern const* patte } epochtime_t TimestampDictionaryWriter::ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, - std::string const& timestamp, + std::string_view timestamp, uint64_t& pattern_id ) { epochtime_t ret; @@ -88,7 +90,7 @@ epochtime_t TimestampDictionaryWriter::ingest_entry( } void TimestampDictionaryWriter::ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, double timestamp ) { @@ -103,7 +105,7 @@ void TimestampDictionaryWriter::ingest_entry( } void TimestampDictionaryWriter::ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, int64_t timestamp ) { diff --git a/components/core/src/clp_s/TimestampDictionaryWriter.hpp b/components/core/src/clp_s/TimestampDictionaryWriter.hpp index 29288fd48..7c214a39e 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.hpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.hpp @@ -1,9 +1,11 @@ #ifndef CLP_S_TIMESTAMPDICTIONARYWRITER_HPP #define CLP_S_TIMESTAMPDICTIONARYWRITER_HPP +#include #include #include #include +#include #include #include @@ -47,9 +49,9 @@ class TimestampDictionaryWriter { * @return the epoch time corresponding to the string timestamp */ epochtime_t ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, - std::string const& timestamp, + std::string_view timestamp, uint64_t& pattern_id ); @@ -59,9 +61,9 @@ class TimestampDictionaryWriter 
{ * @param node_id * @param timestamp */ - void ingest_entry(std::string const& key, int32_t node_id, double timestamp); + void ingest_entry(std::string_view key, int32_t node_id, double timestamp); - void ingest_entry(std::string const& key, int32_t node_id, int64_t timestamp); + void ingest_entry(std::string_view key, int32_t node_id, int64_t timestamp); /** * TODO: guarantee epoch milliseconds. The current clp-s approach to encoding timestamps and diff --git a/components/core/src/clp_s/TimestampEntry.hpp b/components/core/src/clp_s/TimestampEntry.hpp index 326ed9d73..47a26fd9e 100644 --- a/components/core/src/clp_s/TimestampEntry.hpp +++ b/components/core/src/clp_s/TimestampEntry.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -43,7 +44,7 @@ class TimestampEntry { m_epoch_start(cEpochTimeMax), m_epoch_end(cEpochTimeMin) {} - TimestampEntry(std::string const& key_name) + TimestampEntry(std::string_view key_name) : m_encoding(UnkownTimestampEncoding), m_epoch_start_double(cDoubleEpochTimeMax), m_epoch_end_double(cDoubleEpochTimeMin), diff --git a/components/core/src/clp_s/TimestampPattern.cpp b/components/core/src/clp_s/TimestampPattern.cpp index 4ddb5648e..11fab3480 100644 --- a/components/core/src/clp_s/TimestampPattern.cpp +++ b/components/core/src/clp_s/TimestampPattern.cpp @@ -4,6 +4,8 @@ #include #include +#include +#include #include #include @@ -12,6 +14,7 @@ using clp::string_utils::convert_string_to_int; using std::string; +using std::string_view; using std::to_string; using std::vector; @@ -71,7 +74,7 @@ append_padded_value_notz(int value, char padding_character, size_t max_length, s * @return true if conversion succeeds, false otherwise */ static bool convert_string_to_number( - string const& str, + string_view str, size_t begin_ix, size_t end_ix, char padding_character, @@ -89,7 +92,7 @@ static bool convert_string_to_number( * @return true if conversion succeeds, false otherwise */ static bool convert_string_to_number_notz( - string const& str, + string_view str, size_t max_digits, size_t begin_ix, size_t& end_ix, @@ -125,7 +128,7 @@ append_padded_value_notz(int value, char padding_character, size_t max_length, s } static bool convert_string_to_number( - string const& str, + string_view str, size_t begin_ix, size_t end_ix, char padding_character, @@ -154,7 +157,7 @@ static bool convert_string_to_number( } static bool convert_string_to_number_notz( - string const& str, + string_view str, size_t max_digits, size_t begin_ix, size_t& end_ix, @@ -306,7 +309,7 @@ void TimestampPattern::init() { } TimestampPattern const* TimestampPattern::search_known_ts_patterns( - string const& line, + string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos @@ -342,7 +345,7 @@ void TimestampPattern::clear() { } bool TimestampPattern::parse_timestamp( - string const& line, + string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos @@ -827,23 +830,20 @@ bool TimestampPattern::parse_timestamp( } auto dot_position = line.find('.'); auto nanosecond_start = dot_position + 1; - if (std::string::npos == dot_position || 0 == dot_position + if (string::npos == dot_position || 0 == dot_position || cNanosecondDigits != (line.length() - nanosecond_start)) { return false; } - auto timestamp_view = std::string_view(line); - if (false - == convert_string_to_int(timestamp_view.substr(0, dot_position), timestamp)) - { + if (false == convert_string_to_int(line.substr(0, dot_position), timestamp)) { return false; } 
epochtime_t timestamp_nanoseconds; if (false == convert_string_to_int( - timestamp_view.substr(nanosecond_start, cNanosecondDigits), + line.substr(nanosecond_start, cNanosecondDigits), timestamp_nanoseconds )) { @@ -1070,14 +1070,14 @@ void TimestampPattern::insert_formatted_timestamp(epochtime_t timestamp, string& case 'E': // UNIX epoch milliseconds // Note: this timestamp format is required to make up the entire timestamp, so // this is safe - new_msg = std::to_string(timestamp); + new_msg = to_string(timestamp); break; case 'F': { // Nanosecond precision floating point UNIX epoch timestamp constexpr auto cNanosecondDigits = 9; // Note: this timestamp format is required to make up the entire timestamp, so // this is safe - new_msg = std::to_string(timestamp); + new_msg = to_string(timestamp); new_msg.insert(new_msg.end() - cNanosecondDigits, '.'); break; } diff --git a/components/core/src/clp_s/TimestampPattern.hpp b/components/core/src/clp_s/TimestampPattern.hpp index 9219d33bb..278bb82e1 100644 --- a/components/core/src/clp_s/TimestampPattern.hpp +++ b/components/core/src/clp_s/TimestampPattern.hpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include "Defs.hpp" @@ -83,7 +85,7 @@ class TimestampPattern { * @return pointer to the timestamp pattern if found, nullptr otherwise */ static TimestampPattern const* search_known_ts_patterns( - std::string const& line, + std::string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos @@ -121,7 +123,7 @@ class TimestampPattern { * @return true if parsed successfully, false otherwise */ bool parse_timestamp( - std::string const& line, + std::string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos diff --git a/components/core/src/clp_s/archive_constants.hpp b/components/core/src/clp_s/archive_constants.hpp index b76af2944..6dd7b6928 100644 --- a/components/core/src/clp_s/archive_constants.hpp +++ b/components/core/src/clp_s/archive_constants.hpp @@ -29,10 +29,10 @@ constexpr char cLogEventIdxName[] = "log_event_idx"; namespace results_cache::decompression { constexpr char cPath[]{"path"}; -constexpr char cOrigFileId[]{"orig_file_id"}; +constexpr char cStreamId[]{"stream_id"}; constexpr char cBeginMsgIx[]{"begin_msg_ix"}; constexpr char cEndMsgIx[]{"end_msg_ix"}; -constexpr char cIsLastIrChunk[]{"is_last_ir_chunk"}; +constexpr char cIsLastChunk[]{"is_last_chunk"}; } // namespace results_cache::decompression namespace results_cache::search { diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index b76683caf..0f7b5643a 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -88,6 +88,7 @@ bool compress(CommandLineArguments const& command_line_arguments) { clp_s::JsonParserOption option{}; option.file_paths = command_line_arguments.get_file_paths(); + option.input_file_type = command_line_arguments.get_file_type(); option.archives_dir = archives_dir.string(); option.target_encoded_size = command_line_arguments.get_target_encoded_size(); option.max_document_size = command_line_arguments.get_max_document_size(); @@ -113,9 +114,16 @@ bool compress(CommandLineArguments const& command_line_arguments) { } clp_s::JsonParser parser(option); - if (false == parser.parse()) { - SPDLOG_ERROR("Encountered error while parsing input"); - return false; + if (CommandLineArguments::FileType::KeyValueIr == option.input_file_type) { + if (false == parser.parse_from_ir()) { + 
SPDLOG_ERROR("Encountered error while parsing input"); + return false; + } + } else { + if (false == parser.parse()) { + SPDLOG_ERROR("Encountered error while parsing input"); + return false; + } } parser.store(); return true; diff --git a/components/core/src/clp_s/search/StringLiteral.hpp b/components/core/src/clp_s/search/StringLiteral.hpp index 4ac6b9f2f..67c902a29 100644 --- a/components/core/src/clp_s/search/StringLiteral.hpp +++ b/components/core/src/clp_s/search/StringLiteral.hpp @@ -4,6 +4,7 @@ #include #include +#include "../Utils.hpp" #include "Literal.hpp" namespace clp_s::search { @@ -68,19 +69,8 @@ class StringLiteral : public Literal { m_string_type = LiteralType::VarStringT; } - // If '?' and '*' are not escaped, we add LiteralType::ClpStringT to m_string_type - bool escape = false; - for (char const c : m_v) { - if ('\\' == c) { - escape = !escape; - } else if ('?' == c || '*' == c) { - if (false == escape) { - m_string_type |= LiteralType::ClpStringT; - break; - } - } else { - escape = false; - } + if (StringUtils::has_unescaped_wildcards(m_v)) { + m_string_type |= LiteralType::ClpStringT; } } }; diff --git a/components/core/src/reducer/reducer_server.cpp b/components/core/src/reducer/reducer_server.cpp index ab35b7396..a243c763c 100644 --- a/components/core/src/reducer/reducer_server.cpp +++ b/components/core/src/reducer/reducer_server.cpp @@ -121,7 +121,7 @@ void PeriodicUpsertTask::operator()([[maybe_unused]] boost::system::error_code c } auto& upsert_timer = m_server_ctx->get_upsert_timer(); - upsert_timer.expires_from_now(std::chrono::milliseconds(m_server_ctx->get_upsert_interval())); + upsert_timer.expires_after(std::chrono::milliseconds(m_server_ctx->get_upsert_interval())); upsert_timer.async_wait(PeriodicUpsertTask(m_server_ctx)); } @@ -205,9 +205,8 @@ void SchedulerUpdateListenerTask::operator()( if (m_server_ctx->is_timeline_aggregation()) { auto& upsert_timer = m_server_ctx->get_upsert_timer(); - upsert_timer.expires_from_now( - std::chrono::milliseconds(m_server_ctx->get_upsert_interval()) - ); + upsert_timer.expires_after(std::chrono::milliseconds(m_server_ctx->get_upsert_interval() + )); upsert_timer.async_wait(PeriodicUpsertTask(m_server_ctx)); } diff --git a/components/core/tests/test-BoundedReader.cpp b/components/core/tests/test-BoundedReader.cpp new file mode 100644 index 000000000..9d1a9d2c0 --- /dev/null +++ b/components/core/tests/test-BoundedReader.cpp @@ -0,0 +1,99 @@ +#include +#include +#include +#include + +#include + +#include "../src/clp/BoundedReader.hpp" +#include "../src/clp/ErrorCode.hpp" +#include "../src/clp/StringReader.hpp" + +TEST_CASE("Test Bounded Reader", "[BoundedReader]") { + constexpr std::string_view cTestString{"0123456789"}; + + SECTION("BoundedReader does not support try_read_to_delimiter") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, cTestString.size()}; + std::string tmp; + REQUIRE(clp::ErrorCode_Unsupported + == bounded_reader.try_read_to_delimiter('0', false, false, tmp)); + } + + SECTION("BoundedReader does not allow reads beyond end of underlying stream.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, cTestString.size() + 1}; + std::array buf{}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_read(buf.data(), cTestString.size() + 1, num_bytes_read); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(num_bytes_read == 
cTestString.size()); + REQUIRE(cTestString.size() == string_reader.get_pos()); + REQUIRE(cTestString.size() == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does not allow reads beyond checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, 1}; + std::array buf{}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_read(buf.data(), cTestString.size(), num_bytes_read); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(1 == num_bytes_read); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + rc = bounded_reader.try_read(buf.data(), 1, num_bytes_read); + REQUIRE(clp::ErrorCode_EndOfFile == rc); + REQUIRE(0 == num_bytes_read); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does allow reads before checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, 1}; + char buf{}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_read(&buf, 1, num_bytes_read); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(1 == num_bytes_read); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does not allow seeks beyond end of underlying stream.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, cTestString.size() + 1}; + auto rc = bounded_reader.try_seek_from_begin(cTestString.size() + 1); + REQUIRE(clp::ErrorCode_EndOfFile == rc); + REQUIRE(cTestString.size() == string_reader.get_pos()); + REQUIRE(cTestString.size() == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does not allow seeks beyond checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, 1}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_seek_from_begin(cTestString.size()); + REQUIRE(clp::ErrorCode_EndOfFile == rc); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does allow seeks before checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, 2}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_seek_from_begin(1); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } +} diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 0fbae9e3a..9f0df9306 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include @@ -15,6 +17,7 @@ #include "../src/clp/ReadOnlyMemoryMappedFile.hpp" #include "../src/clp/streaming_compression/Compressor.hpp" #include "../src/clp/streaming_compression/Decompressor.hpp" +#include "../src/clp/streaming_compression/lzma/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" #include "../src/clp/streaming_compression/zstd/Compressor.hpp" @@ -25,56 +28,48 @@ using clp::ErrorCode_Success; using clp::FileWriter; 
using clp::streaming_compression::Compressor; using clp::streaming_compression::Decompressor; +using std::string; +using std::string_view; -TEST_CASE("StreamingCompression", "[StreamingCompression]") { - // Initialize constants - constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB - constexpr auto cCompressionChunkSizes = std::to_array( - {cBufferSize / 100, - cBufferSize / 50, - cBufferSize / 25, - cBufferSize / 10, - cBufferSize / 5, - cBufferSize / 2, - cBufferSize} - ); - constexpr size_t cAlphabetLength{26}; - std::string const compressed_file_path{"test_streaming_compressed_file.bin"}; - - // Initialize compression devices - std::unique_ptr compressor; - std::unique_ptr decompressor; - - SECTION("ZStd single phase compression") { - compressor = std::make_unique(); - decompressor = std::make_unique(); - } - - SECTION("Passthrough compression") { - compressor = std::make_unique(); - decompressor = std::make_unique(); - } +namespace { +constexpr string_view cCompressedFilePath{"test_streaming_compressed_file.bin"}; +constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB +constexpr auto cCompressionChunkSizes = std::to_array( + {0, + cBufferSize / 100, + cBufferSize / 50, + cBufferSize / 25, + cBufferSize / 10, + cBufferSize / 5, + cBufferSize / 2, + cBufferSize} +); - // Initialize buffers - Array uncompressed_buffer{cBufferSize}; - for (size_t i{0}; i < cBufferSize; ++i) { - uncompressed_buffer.at(i) = static_cast(('a' + (i % cAlphabetLength))); - } +auto compress(std::unique_ptr compressor, char const* src) -> void; - Array decompressed_buffer{cBufferSize}; +auto decompress_and_compare( + std::unique_ptr decompressor, + Array const& uncompressed_buffer, + Array& decompressed_buffer +) -> void; - // Compress +auto compress(std::unique_ptr compressor, char const* src) -> void { FileWriter file_writer; - file_writer.open(compressed_file_path, FileWriter::OpenMode::CREATE_FOR_WRITING); + file_writer.open(string(cCompressedFilePath), FileWriter::OpenMode::CREATE_FOR_WRITING); compressor->open(file_writer); for (auto const chunk_size : cCompressionChunkSizes) { - compressor->write(uncompressed_buffer.data(), chunk_size); + compressor->write(src, chunk_size); } compressor->close(); file_writer.close(); +} - // Decompress and compare - clp::ReadOnlyMemoryMappedFile const memory_mapped_compressed_file{compressed_file_path}; +auto decompress_and_compare( + std::unique_ptr decompressor, + Array const& uncompressed_buffer, + Array& decompressed_buffer +) -> void { + clp::ReadOnlyMemoryMappedFile const memory_mapped_compressed_file{string(cCompressedFilePath)}; auto const compressed_file_view{memory_mapped_compressed_file.get_view()}; decompressor->open(compressed_file_view.data(), compressed_file_view.size()); @@ -98,7 +93,6 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { num_uncompressed_bytes += chunk_size; } - // Sanity check REQUIRE( (std::accumulate( cCompressionChunkSizes.cbegin(), @@ -107,7 +101,39 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { ) == num_uncompressed_bytes) ); +} +} // namespace + +TEST_CASE("StreamingCompression", "[StreamingCompression]") { + constexpr size_t cAlphabetLength{26}; + + std::unique_ptr compressor; + std::unique_ptr decompressor; + + Array decompressed_buffer{cBufferSize}; + Array uncompressed_buffer{cBufferSize}; + for (size_t i{0}; i < cBufferSize; ++i) { + uncompressed_buffer.at(i) = static_cast(('a' + (i % cAlphabetLength))); + } + + SECTION("ZStd single phase compression") { + compressor = std::make_unique(); 
+ compress(std::move(compressor), uncompressed_buffer.data()); + decompressor = std::make_unique(); + decompress_and_compare(std::move(decompressor), uncompressed_buffer, decompressed_buffer); + } + + SECTION("Passthrough compression") { + compressor = std::make_unique(); + compress(std::move(compressor), uncompressed_buffer.data()); + decompressor = std::make_unique(); + decompress_and_compare(std::move(decompressor), uncompressed_buffer, decompressed_buffer); + } + + SECTION("LZMA compression") { + compressor = std::make_unique(); + compress(std::move(compressor), uncompressed_buffer.data()); + } - // Cleanup - boost::filesystem::remove(compressed_file_path); + boost::filesystem::remove(string(cCompressedFilePath)); } diff --git a/components/core/tests/test-error_handling.cpp b/components/core/tests/test-error_handling.cpp index 2d640ed57..44327c833 100644 --- a/components/core/tests/test-error_handling.cpp +++ b/components/core/tests/test-error_handling.cpp @@ -9,6 +9,7 @@ #include #include "../src/clp/error_handling/ErrorCode.hpp" +#include "../src/clp/ffi/ir_stream/IrErrorCode.hpp" using clp::error_handling::ErrorCategory; using clp::error_handling::ErrorCode; @@ -139,3 +140,17 @@ TEST_CASE("test_error_code_implementation", "[error_handling][ErrorCode]") { REQUIRE((AlwaysSuccessErrorCode{AlwaysSuccessErrorCodeEnum::Success} != success_error_code)); REQUIRE((BinaryErrorCode{BinaryErrorCodeEnum::Success} != always_success_error_code)); } + +TEST_CASE("test_ir_error_code", "[error_handling][ErrorCode][IrErrorCode]") { + using clp::ffi::ir_stream::IrErrorCode; + using clp::ffi::ir_stream::IrErrorCodeEnum; + + auto assert_error_code_matches_error_code_enum = [](IrErrorCodeEnum error_code_enum) -> bool { + std::error_code const error_code{IrErrorCode{error_code_enum}}; + return error_code == IrErrorCode{error_code_enum}; + }; + + REQUIRE(assert_error_code_matches_error_code_enum(IrErrorCodeEnum::DecodingMethodFailure)); + REQUIRE(assert_error_code_matches_error_code_enum(IrErrorCodeEnum::EndOfStream)); + REQUIRE(assert_error_code_matches_error_code_enum(IrErrorCodeEnum::IncompleteStream)); +} diff --git a/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp b/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp index 5b8ad82cd..8f76a2f1a 100644 --- a/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp +++ b/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp @@ -87,9 +87,13 @@ auto test_ir_unit_handler_interface(clp::ffi::ir_stream::IrUnitHandlerInterface auto test_ir_unit_handler_interface(clp::ffi::ir_stream::IrUnitHandlerInterface auto& handler ) -> void { - auto test_log_event_result{ - KeyValuePairLogEvent::create(std::make_shared(), {}, cTestUtcOffset) - }; + auto test_log_event_result{KeyValuePairLogEvent::create( + std::make_shared(), + std::make_shared(), + {}, + {}, + cTestUtcOffset + )}; REQUIRE( (false == test_log_event_result.has_error() && IRErrorCode::IRErrorCode_Success @@ -127,7 +131,7 @@ TEMPLATE_TEST_CASE( REQUIRE( (optional_log_event.has_value() && optional_log_event.value().get_utc_offset() == cTestUtcOffset - && optional_log_event.value().get_node_id_value_pairs().empty()) + && optional_log_event.value().get_user_gen_node_id_value_pairs().empty()) ); auto const& optional_schema_tree_locator{handler.get_schema_tree_node_locator()}; REQUIRE( diff --git a/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp b/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp index 2e9cfb691..9ffee4f68 100644 --- 
a/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp +++ b/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp @@ -11,6 +11,7 @@ #include #include +#include #include "../src/clp/ffi/encoding_methods.hpp" #include "../src/clp/ffi/KeyValuePairLogEvent.hpp" @@ -81,6 +82,25 @@ auto insert_invalid_node_id_value_pairs_with_node_type_errors( KeyValuePairLogEvent::NodeIdValuePairs& invalid_node_id_value_pairs ) -> void; +/** + * Asserts that `KeyValuePairLogEvent` creation fails with the expected error code. + * @param auto_gen_keys_schema_tree + * @param user_gen_keys_schema_tree + * @param auto_gen_node_id_value_pairs + * @param user_gen_node_id_value_pairs + * @param utc_offset + * @param expected_error_code + * @return Whether the assertion succeeded. + */ +[[nodiscard]] auto assert_kv_pair_log_event_creation_failure( + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs auto_gen_node_id_value_pairs, + KeyValuePairLogEvent::NodeIdValuePairs user_gen_node_id_value_pairs, + UtcOffset utc_offset, + std::errc expected_error_code +) -> bool; + template requires(std::is_same_v || std::is_same_v) @@ -197,6 +217,24 @@ auto insert_invalid_node_id_value_pairs_with_node_type_errors( invalid_node_id_value_pairs.emplace(node_id, Value{}); } } + +auto assert_kv_pair_log_event_creation_failure( + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs auto_gen_node_id_value_pairs, + KeyValuePairLogEvent::NodeIdValuePairs user_gen_node_id_value_pairs, + UtcOffset utc_offset, + std::errc expected_error_code +) -> bool { + auto const result{KeyValuePairLogEvent::create( + std::move(auto_gen_keys_schema_tree), + std::move(user_gen_keys_schema_tree), + std::move(auto_gen_node_id_value_pairs), + std::move(user_gen_node_id_value_pairs), + utc_offset + )}; + return result.has_error() && result.error() == expected_error_code; +} } // namespace TEST_CASE("ffi_Value_basic", "[ffi][Value]") { @@ -250,22 +288,23 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { * | * |------------> <1:a:Obj> * | | - * |--> <2:a:Int> |--> <3:b:Obj> - * | - * |------------> <4:c:Obj> - * | | - * |--> <5:d:Str> |--> <7:a:UnstructuredArray> - * | | - * |--> <6:d:Bool> |--> <8:d:Str> - * | | - * |--> <10:e:Obj> |--> <9:d:Float> - * | - * |--> <11:f:Obj> + * |--> <2:b:Int> |--> <3:b:Obj> + * | | | + * |--> <12:a:Int> | |------------> <4:c:Obj> + * | | | + * | |--> <5:d:Str> |--> <7:a:UnstructuredArray> + * | | | + * | |--> <6:d:Bool> |--> <8:d:Str> + * | | | + * | |--> <10:e:Obj> |--> <9:d:Float> + * | | + * |--> <13:b:Bool> |--> <11:f:Obj> */ - auto const schema_tree{std::make_shared()}; + auto const auto_gen_keys_schema_tree{std::make_shared()}; + auto const user_gen_keys_schema_tree{std::make_shared()}; std::vector const locators{ {SchemaTree::cRootId, "a", SchemaTree::Node::Type::Obj}, - {SchemaTree::cRootId, "a", SchemaTree::Node::Type::Int}, + {SchemaTree::cRootId, "b", SchemaTree::Node::Type::Int}, {1, "b", SchemaTree::Node::Type::Obj}, {3, "c", SchemaTree::Node::Type::Obj}, {3, "d", SchemaTree::Node::Type::Str}, @@ -274,63 +313,88 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { {4, "d", SchemaTree::Node::Type::Str}, {4, "d", SchemaTree::Node::Type::Float}, {3, "e", SchemaTree::Node::Type::Obj}, - {4, "f", SchemaTree::Node::Type::Obj} + {4, "f", SchemaTree::Node::Type::Obj}, + {SchemaTree::cRootId, "a", SchemaTree::Node::Type::Int}, + {1, "b", 
SchemaTree::Node::Type::Bool} }; for (auto const& locator : locators) { - REQUIRE_NOTHROW(schema_tree->insert_node(locator)); + REQUIRE_NOTHROW(auto_gen_keys_schema_tree->insert_node(locator)); + REQUIRE_NOTHROW(user_gen_keys_schema_tree->insert_node(locator)); } + REQUIRE((*auto_gen_keys_schema_tree == *user_gen_keys_schema_tree)); + SECTION("Test empty ID-value pairs") { - KeyValuePairLogEvent::NodeIdValuePairs node_id_value_pairs; auto const result{KeyValuePairLogEvent::create( - schema_tree, - std::move(node_id_value_pairs), + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + {}, + {}, UtcOffset{0} )}; REQUIRE_FALSE(result.has_error()); } + SECTION("Test schema tree pointers being null") { + REQUIRE(assert_kv_pair_log_event_creation_failure( + nullptr, + user_gen_keys_schema_tree, + {}, + {}, + UtcOffset{0}, + std::errc::invalid_argument + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + nullptr, + {}, + {}, + UtcOffset{0}, + std::errc::invalid_argument + )); + } + SECTION("Test mismatched types") { KeyValuePairLogEvent::NodeIdValuePairs invalid_node_id_value_pairs; // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) // Int: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 2, invalid_node_id_value_pairs ); // Float: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 9, invalid_node_id_value_pairs ); // Bool: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 6, invalid_node_id_value_pairs ); // Str: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 5, invalid_node_id_value_pairs ); // UnstructuredArray: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 7, invalid_node_id_value_pairs ); // Obj: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 3, invalid_node_id_value_pairs ); @@ -343,26 +407,37 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { } else { node_id_value_pair_to_test.emplace(node_id, std::nullopt); } - auto const result{KeyValuePairLogEvent::create( - schema_tree, - std::move(node_id_value_pair_to_test), - UtcOffset{0} - )}; - REQUIRE(result.has_error()); - auto const& err{result.error()}; - REQUIRE((std::errc::protocol_error == err)); + + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + node_id_value_pair_to_test, + {}, + UtcOffset{0}, + std::errc::protocol_error + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + {}, + node_id_value_pair_to_test, + UtcOffset{0}, + std::errc::protocol_error + )); } } SECTION("Test valid ID-value pairs") { - KeyValuePairLogEvent::NodeIdValuePairs node_id_value_pairs; + constexpr std::string_view cJsonArrayToEncode{"[\"a\", 1, 0.1, null]"}; + constexpr std::string_view cStaticText{"Test"}; + KeyValuePairLogEvent::NodeIdValuePairs valid_node_id_value_pairs; /* * The sub schema tree of `node_id_value_pairs`: * <0:root:Obj> * | * |------------> <1:a:Obj> * | | - * |--> <2:a:Int> |--> <3:b:Obj> + * |--> <2:b:Int> |--> <3:b:Obj> * | * |------------> <4:c:Obj> * | | @@ -375,77 +450,206 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { * |--> <11:f:Obj> */ // 
NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - node_id_value_pairs.emplace(2, Value{static_cast(0)}); - node_id_value_pairs.emplace(5, Value{string{"Test"}}); - node_id_value_pairs.emplace( + valid_node_id_value_pairs.emplace(2, Value{static_cast(0)}); + valid_node_id_value_pairs.emplace(5, Value{string{cStaticText}}); + valid_node_id_value_pairs.emplace( 8, Value{get_encoded_text_ast(cStringToEncode)} ); - node_id_value_pairs.emplace( + valid_node_id_value_pairs.emplace( 7, - Value{get_encoded_text_ast(cStringToEncode)} + Value{get_encoded_text_ast(cJsonArrayToEncode)} ); - node_id_value_pairs.emplace(10, Value{}); - node_id_value_pairs.emplace(11, std::nullopt); + valid_node_id_value_pairs.emplace(10, Value{}); + valid_node_id_value_pairs.emplace(11, std::nullopt); // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; + auto const result{KeyValuePairLogEvent::create( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0} + )}; REQUIRE_FALSE(result.has_error()); - SECTION("Test duplicated key conflict on node #3") { - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - node_id_value_pairs.emplace(6, Value{static_cast(false)}); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) + SECTION("Test JSON serialization") { + nlohmann::json const subtree_rooted_at_node_4 + = {{"a", nlohmann::json::parse(cJsonArrayToEncode)}, + {"d", cStringToEncode}, + {"f", nlohmann::json::object_t()}}; + nlohmann::json const subtree_rooted_at_node_3 + = {{"c", subtree_rooted_at_node_4}, {"d", cStaticText}, {"e", nullptr}}; + nlohmann::json const expected = { + {"a", {{"b", subtree_rooted_at_node_3}}}, + {"b", 0}, }; - REQUIRE(result.has_error()); - REQUIRE((std::errc::protocol_not_supported == result.error())); + + auto const& kv_pair_log_event{result.value()}; + auto const serialized_json_result{kv_pair_log_event.serialize_to_json()}; + REQUIRE_FALSE(serialized_json_result.has_error()); + auto const& [serialized_auto_gen_kv_pairs, serialized_user_gen_kv_pairs]{ + serialized_json_result.value() + }; + REQUIRE((serialized_auto_gen_kv_pairs == expected)); + REQUIRE((serialized_user_gen_kv_pairs == expected)); } - SECTION("Test duplicated key conflict on node #4") { + SECTION("Test duplicated key conflict under node #3") { + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - node_id_value_pairs.emplace(9, Value{static_cast(0.0)}); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; - REQUIRE(result.has_error()); - REQUIRE((std::errc::protocol_not_supported == result.error())); + invalid_node_id_value_pairs.emplace(6, Value{static_cast(false)}); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + } + + SECTION("Test duplicated key conflict under node #4") { 
+ auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + invalid_node_id_value_pairs.emplace(9, Value{static_cast(0.0)}); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + } + + SECTION("Test duplicated keys among siblings of node #1") { + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + invalid_node_id_value_pairs.emplace(12, static_cast(0)); + // Node #12 has the same key as its sibling node #1 + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + } + + SECTION("Test duplicated keys among siblings of node #3") { + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + invalid_node_id_value_pairs.emplace(13, false); + // Node #13 has the same key as its sibling node #3 + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); } SECTION("Test invalid sub-tree on node #3") { - node_id_value_pairs.emplace(3, std::nullopt); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + invalid_node_id_value_pairs.emplace(3, std::nullopt); // Node #3 is empty, but its descendants appear in the sub schema tree (node #5 & #10) - REQUIRE(result.has_error()); - REQUIRE((std::errc::operation_not_permitted == result.error())); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::operation_not_permitted + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::operation_not_permitted + )); } SECTION("Test invalid sub-tree on node #4") { - node_id_value_pairs.emplace(4, Value{}); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + invalid_node_id_value_pairs.emplace(4, Value{}); // Node #4 is null, but its descendants 
appear in the sub schema tree (node #5 & #10) - REQUIRE(result.has_error()); - REQUIRE((std::errc::operation_not_permitted == result.error())); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::operation_not_permitted + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::operation_not_permitted + )); } } SECTION("Test out-of-bound node ID") { KeyValuePairLogEvent::NodeIdValuePairs node_id_value_pairs_out_of_bound; node_id_value_pairs_out_of_bound.emplace( - static_cast(schema_tree->get_size()), + static_cast(user_gen_keys_schema_tree->get_size()), Value{} ); - auto const out_of_bound_result{KeyValuePairLogEvent::create( - schema_tree, - std::move(node_id_value_pairs_out_of_bound), - UtcOffset{0} - )}; - REQUIRE(out_of_bound_result.has_error()); - REQUIRE((std::errc::operation_not_permitted == out_of_bound_result.error())); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + node_id_value_pairs_out_of_bound, + {}, + UtcOffset{0}, + std::errc::operation_not_permitted + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + {}, + node_id_value_pairs_out_of_bound, + UtcOffset{0}, + std::errc::operation_not_permitted + )); } } diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index 1ee1e3542..347dadb7a 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -1246,12 +1246,14 @@ TEMPLATE_TEST_CASE( auto const& deserialized_log_event{deserialized_log_events.at(idx)}; auto const num_leaves_in_json_obj{count_num_leaves(expect)}; - auto const num_kv_pairs{deserialized_log_event.get_node_id_value_pairs().size()}; + auto const num_kv_pairs{deserialized_log_event.get_user_gen_node_id_value_pairs().size()}; REQUIRE((num_leaves_in_json_obj == num_kv_pairs)); auto const serialized_json_result{deserialized_log_event.serialize_to_json()}; REQUIRE_FALSE(serialized_json_result.has_error()); - REQUIRE((expect == serialized_json_result.value())); + auto const& [auto_generated, user_generated]{serialized_json_result.value()}; + REQUIRE(auto_generated.empty()); + REQUIRE((expect == user_generated)); } auto const eof_result{deserializer.deserialize_next_ir_unit(reader)}; diff --git a/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh index f2965f9fd..e6b6b3579 100755 --- a/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh @@ -10,7 +10,7 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" lib_install_scripts_dir="${script_dir}/.." # NOTE: The remaining installation scripts depend on boost, so we install it beforehand. 
-"${lib_install_scripts_dir}/install-boost.sh" 1.76.0 +"${lib_install_scripts_dir}/install-boost.sh" 1.87.0 "${lib_install_scripts_dir}/fmtlib.sh" 8.0.1 "${lib_install_scripts_dir}/spdlog.sh" 1.9.2 diff --git a/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh index 66ea4ac4f..c51a521c1 100755 --- a/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh @@ -18,4 +18,5 @@ dnf install -y \ libzstd-devel \ make \ mariadb-connector-c-devel \ - openssl-devel + openssl-devel \ + xz-devel diff --git a/components/core/tools/scripts/lib_install/install-boost.sh b/components/core/tools/scripts/lib_install/install-boost.sh index 9e5f9a1c5..2733e9886 100755 --- a/components/core/tools/scripts/lib_install/install-boost.sh +++ b/components/core/tools/scripts/lib_install/install-boost.sh @@ -29,12 +29,12 @@ cd $temp_dir # Download source tar_filename=boost_${version_with_underscores}.tar.gz -curl -fsSL https://boostorg.jfrog.io/artifactory/main/release/${version}/source/${tar_filename} -o ${tar_filename} +curl -fsSL https://archives.boost.io/release/${version}/source/${tar_filename} -o ${tar_filename} tar xzf ${tar_filename} cd boost_${version_with_underscores} # Build -./bootstrap.sh --with-libraries=filesystem,iostreams,program_options,regex,system +./bootstrap.sh --with-libraries=filesystem,iostreams,program_options,regex,system,url ./b2 -j${num_cpus} # Install diff --git a/components/core/tools/scripts/lib_install/liblzma.sh b/components/core/tools/scripts/lib_install/liblzma.sh new file mode 100755 index 000000000..a73ff79b9 --- /dev/null +++ b/components/core/tools/scripts/lib_install/liblzma.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash + +# Exit on any error +set -e + +# Error on undefined variable +set -u + +# Dependencies: +# - curl +# - make +# - gcc +# NOTE: Dependencies should be installed outside the script to allow the script to be largely distro-agnostic + +for cmd in curl make gcc; do + if ! $cmd --version >/dev/null 2>&1; then + echo "Error: Required dependency '$cmd' not found" + exit 1 + fi +done + +cUsage="Usage: ${BASH_SOURCE[0]} [ <.deb output directory>]" +if [ "$#" -lt 1 ] ; then + echo $cUsage + exit +fi +version=$1 + +package_name=liblzma +temp_dir=/tmp/${package_name}-installation +deb_output_dir=${temp_dir} +if [[ "$#" -gt 1 ]] ; then + deb_output_dir="$(readlink -f "$2")" + if [ ! -d ${deb_output_dir} ] ; then + echo "${deb_output_dir} does not exist or is not a directory" + exit + fi +fi + +# Note: we won't check if the package already exists + +# Get number of cpu cores +num_cpus=$(grep -c ^processor /proc/cpuinfo) + +# Download +mkdir -p $temp_dir +cd $temp_dir +extracted_dir=${temp_dir}/xz-${version} +if [ ! -e ${extracted_dir} ] ; then + tar_filename=xz-${version}.tar.gz + if [ ! 
-e ${tar_filename} ] ; then + curl -fsSL https://github.com/tukaani-project/xz/releases/download/v${version}/${tar_filename} -o ${tar_filename} + fi + tar -xf ${tar_filename} +fi + +# Build +cd ${extracted_dir} +mkdir build +cd build +cmake -DCMAKE_POSITION_INDEPENDENT_CODE=TRUE ../ +make -j${num_cpus} +make install liblzma + +# Clean up +rm -rf $temp_dir diff --git a/components/core/tools/scripts/lib_install/macos/install-all.sh b/components/core/tools/scripts/lib_install/macos/install-all.sh index 97e41903d..cb24dd054 100755 --- a/components/core/tools/scripts/lib_install/macos/install-all.sh +++ b/components/core/tools/scripts/lib_install/macos/install-all.sh @@ -21,6 +21,7 @@ brew install \ mongo-cxx-driver \ msgpack-cxx \ spdlog \ + xz \ zstd # Install pkg-config if it isn't already installed diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh index 1e21314cc..839f6d3c3 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh @@ -10,10 +10,11 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" lib_install_scripts_dir=$script_dir/.. # NOTE: boost must be installed first since the remaining packages depend on it -"$lib_install_scripts_dir"/install-boost.sh 1.74.0 +"$lib_install_scripts_dir"/install-boost.sh 1.87.0 "$lib_install_scripts_dir"/fmtlib.sh 8.0.1 "$lib_install_scripts_dir"/libarchive.sh 3.5.1 +"$lib_install_scripts_dir"/liblzma.sh 5.4.6 "$lib_install_scripts_dir"/lz4.sh 1.8.2 "$lib_install_scripts_dir"/mongocxx.sh 3.10.2 "$lib_install_scripts_dir"/msgpack.sh 7.0.0 diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh index 8997ffe01..3ea3b3ed5 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh @@ -20,6 +20,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ jq \ libcurl4 \ libcurl4-openssl-dev \ + liblzma-dev \ libmariadb-dev \ libssl-dev \ make \ diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh index 7799c9ba5..839f6d3c3 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh @@ -9,8 +9,12 @@ set -u script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" lib_install_scripts_dir=$script_dir/.. 
+# NOTE: boost must be installed first since the remaining packages depend on it +"$lib_install_scripts_dir"/install-boost.sh 1.87.0 + "$lib_install_scripts_dir"/fmtlib.sh 8.0.1 "$lib_install_scripts_dir"/libarchive.sh 3.5.1 +"$lib_install_scripts_dir"/liblzma.sh 5.4.6 "$lib_install_scripts_dir"/lz4.sh 1.8.2 "$lib_install_scripts_dir"/mongocxx.sh 3.10.2 "$lib_install_scripts_dir"/msgpack.sh 7.0.0 diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh index 9ed6b9b10..ea055ffdf 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh @@ -15,11 +15,9 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ build-essential \ git \ jq \ - libboost-filesystem-dev \ - libboost-iostreams-dev \ - libboost-program-options-dev \ libcurl4 \ libcurl4-openssl-dev \ + liblzma-dev \ libmariadb-dev \ libssl-dev \ openjdk-11-jdk \ diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index ce88ad185..593c07bd7 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -4,6 +4,7 @@ import pathlib import subprocess from contextlib import closing +from typing import Any, Dict, Optional import yaml from celery.app.task import Task @@ -12,9 +13,14 @@ COMPRESSION_JOBS_TABLE_NAME, COMPRESSION_TASKS_TABLE_NAME, Database, + S3Config, StorageEngine, + StorageType, + WorkerConfig, ) from clp_py_utils.clp_logging import set_logging_level +from clp_py_utils.core import read_yaml_config_file +from clp_py_utils.s3_utils import s3_put from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.celery import app from job_orchestration.scheduler.constants import CompressionTaskStatus @@ -108,6 +114,7 @@ def make_clp_s_command( archive_output_dir: pathlib.Path, clp_config: ClpIoConfig, db_config_file_path: pathlib.Path, + enable_s3_write: bool, ): # fmt: off compression_cmd = [ @@ -120,6 +127,9 @@ def make_clp_s_command( ] # fmt: on + if enable_s3_write: + compression_cmd.append("--single-file-archive") + if clp_config.input.timestamp_key is not None: compression_cmd.append("--timestamp-key") compression_cmd.append(clp_config.input.timestamp_key) @@ -128,10 +138,9 @@ def make_clp_s_command( def run_clp( + worker_config: WorkerConfig, clp_config: ClpIoConfig, clp_home: pathlib.Path, - data_dir: pathlib.Path, - archive_output_dir: pathlib.Path, logs_dir: pathlib.Path, job_id: int, task_id: int, @@ -143,10 +152,9 @@ def run_clp( """ Compresses files from an FS into archives on an FS + :param worker_config: WorkerConfig :param clp_config: ClpIoConfig :param clp_home: - :param data_dir: - :param archive_output_dir: :param logs_dir: :param job_id: :param task_id: @@ -156,16 +164,31 @@ def run_clp( :param clp_metadata_db_connection_config :return: tuple -- (whether compression was successful, output messages) """ - clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) - instance_id_str = f"compression-job-{job_id}-task-{task_id}" + clp_storage_engine = worker_config.package.storage_engine + data_dir = worker_config.data_directory + archive_output_dir = 
worker_config.archive_output.get_directory() + # Generate database config file for clp db_config_file_path = data_dir / f"{instance_id_str}-db-config.yml" db_config_file = open(db_config_file_path, "w") yaml.safe_dump(clp_metadata_db_connection_config, db_config_file) db_config_file.close() + # Get s3 config + s3_config: S3Config + enable_s3_write = False + storage_type = worker_config.archive_output.storage.type + if StorageType.S3 == storage_type: + if StorageEngine.CLP_S != clp_storage_engine: + error_msg = f"S3 storage is not supported for storage engine: {clp_storage_engine}." + logger.error(error_msg) + return False, {"error_message": error_msg} + + s3_config = worker_config.archive_output.storage.s3_config + enable_s3_write = True + if StorageEngine.CLP == clp_storage_engine: compression_cmd = make_clp_command( clp_home=clp_home, @@ -179,6 +202,7 @@ def run_clp( archive_output_dir=archive_output_dir, clp_config=clp_config, db_config_file_path=db_config_file_path, + enable_s3_write=enable_s3_write, ) else: logger.error(f"Unsupported storage engine {clp_storage_engine}") @@ -212,48 +236,62 @@ def run_clp( # Compute the total amount of data compressed last_archive_stats = None + last_line_decoded = False total_uncompressed_size = 0 total_compressed_size = 0 - while True: + + # Handle job metadata update and s3 write if enabled + s3_error = None + while not last_line_decoded: + stats: Optional[Dict[str, Any]] = None + line = proc.stdout.readline() if not line: - break - stats = json.loads(line.decode("ascii")) - if last_archive_stats is not None and stats["id"] != last_archive_stats["id"]: - # We've started a new archive so add the previous archive's last - # reported size to the total - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_job_metadata_and_tags( - db_cursor, - job_id, - clp_metadata_db_connection_config["table_prefix"], - tag_ids, - last_archive_stats, - ) - db_conn.commit() + last_line_decoded = True + else: + stats = json.loads(line.decode("ascii")) + + if last_archive_stats is not None and ( + None is stats or stats["id"] != last_archive_stats["id"] + ): + if enable_s3_write: + archive_id = last_archive_stats["id"] + archive_path = archive_output_dir / archive_id + + if s3_error is None: + logger.info(f"Uploading archive {archive_id} to S3...") + result = s3_put(s3_config, archive_path, archive_id) + + if result.is_err(): + logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") + s3_error = result.err_value + # NOTE: It's possible `proc` finishes before we call `terminate` on it, in + # which case the process will still return success. 
+ proc.terminate() + else: + logger.info(f"Finished uploading archive {archive_id} to S3.") + + archive_path.unlink() + + if s3_error is None: + # We've started a new archive so add the previous archive's last reported size to + # the total + total_uncompressed_size += last_archive_stats["uncompressed_size"] + total_compressed_size += last_archive_stats["size"] + with closing(sql_adapter.create_connection(True)) as db_conn, closing( + db_conn.cursor(dictionary=True) + ) as db_cursor: + update_job_metadata_and_tags( + db_cursor, + job_id, + clp_metadata_db_connection_config["table_prefix"], + tag_ids, + last_archive_stats, + ) + db_conn.commit() last_archive_stats = stats - if last_archive_stats is not None: - # Add the last archive's last reported size - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_job_metadata_and_tags( - db_cursor, - job_id, - clp_metadata_db_connection_config["table_prefix"], - tag_ids, - last_archive_stats, - ) - db_conn.commit() - # Wait for compression to finish return_code = proc.wait() if 0 != return_code: @@ -274,10 +312,16 @@ def run_clp( "total_uncompressed_size": total_uncompressed_size, "total_compressed_size": total_compressed_size, } - if compression_successful: + + if compression_successful and s3_error is None: return CompressionTaskStatus.SUCCEEDED, worker_output else: - worker_output["error_message"] = f"See logs {stderr_log_path}" + error_msgs = [] + if compression_successful is False: + error_msgs.append(f"See logs {stderr_log_path}") + if s3_error is not None: + error_msgs.append(s3_error) + worker_output["error_message"] = "\n".join(error_msgs) return CompressionTaskStatus.FAILED, worker_output @@ -291,15 +335,28 @@ def compress( paths_to_compress_json: str, clp_metadata_db_connection_config, ): - clp_home_str = os.getenv("CLP_HOME") - data_dir_str = os.getenv("CLP_DATA_DIR") - archive_output_dir_str = os.getenv("CLP_ARCHIVE_OUTPUT_DIR") - logs_dir_str = os.getenv("CLP_LOGS_DIR") + clp_home = pathlib.Path(os.getenv("CLP_HOME")) # Set logging level + logs_dir = pathlib.Path(os.getenv("CLP_LOGS_DIR")) clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) set_logging_level(logger, clp_logging_level) + # Load configuration + try: + worker_config = WorkerConfig.parse_obj( + read_yaml_config_file(pathlib.Path(os.getenv("CLP_CONFIG_PATH"))) + ) + except Exception as ex: + error_msg = "Failed to load worker config" + logger.exception(error_msg) + return CompressionTaskResult( + task_id=task_id, + status=CompressionTaskStatus.FAILED, + duration=0, + error_message=error_msg, + ) + clp_io_config = ClpIoConfig.parse_raw(clp_io_config_json) paths_to_compress = PathsToCompress.parse_raw(paths_to_compress_json) @@ -308,11 +365,10 @@ def compress( start_time = datetime.datetime.now() logger.info(f"[job_id={job_id} task_id={task_id}] COMPRESSION STARTED.") compression_task_status, worker_output = run_clp( + worker_config, clp_io_config, - pathlib.Path(clp_home_str), - pathlib.Path(data_dir_str), - pathlib.Path(archive_output_dir_str), - pathlib.Path(logs_dir_str), + clp_home, + logs_dir, job_id, task_id, tag_ids, diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index 423ebb757..58ae43450 100644 --- 
a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -5,14 +5,15 @@ from celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, StorageEngine +from clp_py_utils.clp_config import Database, StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - report_command_creation_failure, + report_task_failure, run_query_task, ) +from job_orchestration.executor.utils import load_worker_config from job_orchestration.scheduler.job_config import ExtractIrJobConfig, ExtractJsonJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -21,15 +22,17 @@ def make_command( - storage_engine: str, clp_home: Path, - archives_dir: Path, + worker_config: WorkerConfig, archive_id: str, - stream_output_dir: Path, job_config: dict, results_cache_uri: str, - stream_collection_name: str, ) -> Optional[List[str]]: + storage_engine = worker_config.package.storage_engine + archives_dir = worker_config.archive_output.get_directory() + stream_output_dir = worker_config.stream_output_dir + stream_collection_name = worker_config.stream_collection_name + if StorageEngine.CLP == storage_engine: logger.info("Starting IR extraction") extract_ir_config = ExtractIrJobConfig.parse_obj(job_config) @@ -97,28 +100,38 @@ def extract_stream( task_status: QueryTaskStatus sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) + # Load configuration + clp_config_path = Path(os.getenv("CLP_CONFIG_PATH")) + worker_config = load_worker_config(clp_config_path, logger) + if worker_config is None: + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + + if worker_config.archive_output.storage.type == StorageType.S3: + logger.error(f"Stream extraction is not supported for the S3 storage type") + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + # Make task_command clp_home = Path(os.getenv("CLP_HOME")) - archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE") - stream_output_dir = Path(os.getenv("CLP_STREAM_OUTPUT_DIR")) - stream_collection_name = os.getenv("CLP_STREAM_COLLECTION_NAME") task_command = make_command( - storage_engine=clp_storage_engine, clp_home=clp_home, - archives_dir=archive_directory, + worker_config=worker_config, archive_id=archive_id, - stream_output_dir=stream_output_dir, job_config=job_config, results_cache_uri=results_cache_uri, - stream_collection_name=stream_collection_name, ) if not task_command: - return report_command_creation_failure( + logger.error(f"Error creating {task_name} command") + return report_task_failure( sql_adapter=sql_adapter, - logger=logger, - task_name=task_name, task_id=task_id, start_time=start_time, ) diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 598bfdcfc..7cf7b330f 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -5,14 +5,15 @@ from 
celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, StorageEngine +from clp_py_utils.clp_config import Database, StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - report_command_creation_failure, + report_task_failure, run_query_task, ) +from job_orchestration.executor.utils import load_worker_config from job_orchestration.scheduler.job_config import SearchJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -21,14 +22,16 @@ def make_command( - storage_engine: str, clp_home: Path, - archives_dir: Path, + worker_config: WorkerConfig, archive_id: str, search_config: SearchJobConfig, results_cache_uri: str, results_collection: str, ) -> Optional[List[str]]: + storage_engine = worker_config.package.storage_engine + archives_dir = worker_config.archive_output.get_directory() + if StorageEngine.CLP == storage_engine: command = [str(clp_home / "bin" / "clo"), "s", str(archives_dir / archive_id)] if search_config.path_filter is not None: @@ -116,26 +119,40 @@ def search( task_status: QueryTaskStatus sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) + # Load configuration + clp_config_path = Path(os.getenv("CLP_CONFIG_PATH")) + worker_config = load_worker_config(clp_config_path, logger) + if worker_config is None: + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + + if worker_config.archive_output.storage.type == StorageType.S3: + logger.error(f"Search is not supported for the S3 storage type") + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + # Make task_command clp_home = Path(os.getenv("CLP_HOME")) - archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE") search_config = SearchJobConfig.parse_obj(job_config) task_command = make_command( - storage_engine=clp_storage_engine, clp_home=clp_home, - archives_dir=archive_directory, + worker_config=worker_config, archive_id=archive_id, search_config=search_config, results_cache_uri=results_cache_uri, results_collection=job_id, ) if not task_command: - return report_command_creation_failure( + logger.error(f"Error creating {task_name} command") + return report_task_failure( sql_adapter=sql_adapter, - logger=logger, - task_name=task_name, task_id=task_id, start_time=start_time, ) diff --git a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py index 69d22398e..523abbe00 100644 --- a/components/job-orchestration/job_orchestration/executor/query/utils.py +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -19,14 +19,11 @@ def get_task_log_file_path(clp_logs_dir: Path, job_id: str, task_id: int) -> Pat return worker_logs_dir / f"{task_id}-clo.log" -def report_command_creation_failure( +def report_task_failure( sql_adapter: SQL_Adapter, - logger: Logger, - task_name: str, task_id: int, start_time: datetime.datetime, ): - logger.error(f"Error creating {task_name} command") task_status = QueryTaskStatus.FAILED update_query_task_metadata( sql_adapter, diff --git a/components/job-orchestration/job_orchestration/executor/utils.py 
b/components/job-orchestration/job_orchestration/executor/utils.py new file mode 100644 index 000000000..47ea702ae --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/utils.py @@ -0,0 +1,23 @@ +from logging import Logger +from pathlib import Path +from typing import Optional + +from clp_py_utils.clp_config import WorkerConfig +from clp_py_utils.core import read_yaml_config_file + + +def load_worker_config( + config_path: Path, + logger: Logger, +) -> Optional[WorkerConfig]: + """ + Loads a WorkerConfig object from the specified configuration file. + :param config_path: Path to the configuration file. + :param logger: Logger instance for reporting errors if loading fails. + :return: The loaded WorkerConfig object on success, None otherwise. + """ + try: + return WorkerConfig.parse_obj(read_yaml_config_file(config_path)) + except Exception: + logger.exception("Failed to load worker config") + return None diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index 62b7a27fc..bd793686b 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -53,13 +53,14 @@ def update_compression_task_metadata(db_cursor, task_id, kv): logger.error("Must specify at least one field to update") raise ValueError - field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + field_set_expressions = [f"{k} = %s" for k in kv.keys()] query = f""" - UPDATE {COMPRESSION_TASKS_TABLE_NAME} - SET {", ".join(field_set_expressions)} - WHERE id={task_id} + UPDATE {COMPRESSION_TASKS_TABLE_NAME} + SET {", ".join(field_set_expressions)} + WHERE id = %s """ - db_cursor.execute(query) + values = list(kv.values()) + [task_id] + db_cursor.execute(query, values) def update_compression_job_metadata(db_cursor, job_id, kv): @@ -67,13 +68,14 @@ def update_compression_job_metadata(db_cursor, job_id, kv): logger.error("Must specify at least one field to update") raise ValueError - field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + field_set_expressions = [f"{k} = %s" for k in kv.keys()] query = f""" - UPDATE {COMPRESSION_JOBS_TABLE_NAME} - SET {", ".join(field_set_expressions)} - WHERE id={job_id} + UPDATE {COMPRESSION_JOBS_TABLE_NAME} + SET {", ".join(field_set_expressions)} + WHERE id = %s """ - db_cursor.execute(query) + values = list(kv.values()) + [job_id] + db_cursor.execute(query, values) def search_and_schedule_new_tasks(db_conn, db_cursor, clp_metadata_db_connection_config): diff --git a/components/log-viewer-webui/client/src/api/query.js b/components/log-viewer-webui/client/src/api/query.js index eda1db21c..f48f610a1 100644 --- a/components/log-viewer-webui/client/src/api/query.js +++ b/components/log-viewer-webui/client/src/api/query.js @@ -2,22 +2,11 @@ import axios from "axios"; /** - * @typedef {object} ExtractIrResp + * @typedef {object} ExtractStreamResp + * @property {string} stream_id * @property {number} begin_msg_ix * @property {number} end_msg_ix - * @property {string} file_split_id - * @property {boolean} is_last_ir_chunk - * @property {string} orig_file_id - * @property {string} path - * @property {string} _id - */ - -/** - * @typedef {object} ExtractJsonResp - * @property {number} begin_msg_ix - * @property {number} end_msg_ix - * @property {boolean} is_last_ir_chunk - * @property {string} 
orig_file_id + * @property {boolean} is_last_chunk + * @property {string} path + * @property {string} _id */ @@ -30,7 +19,7 @@ import axios from "axios"; * @param {string} streamId * @param {number} logEventIdx * @param {Function} onUploadProgress Callback to handle upload progress events. - * @return {Promise>} + * @return {Promise>} */ const submitExtractStreamJob = async (extractJobType, streamId, logEventIdx, onUploadProgress) => { return await axios.post( diff --git a/components/log-viewer-webui/server/src/DbManager.js b/components/log-viewer-webui/server/src/DbManager.js index e1ec00812..fc48ba5e8 100644 --- a/components/log-viewer-webui/server/src/DbManager.js +++ b/components/log-viewer-webui/server/src/DbManager.js @@ -171,7 +171,7 @@ class DbManager { */ async getExtractedStreamFileMetadata (streamId, logEventIdx) { return await this.#streamFilesCollection.findOne({ - orig_file_id: streamId, + stream_id: streamId, begin_msg_ix: {$lte: logEventIdx}, end_msg_ix: {$gt: logEventIdx}, }); diff --git a/components/package-template/src/etc/clp-config.yml b/components/package-template/src/etc/clp-config.yml index f19b93463..22b03b889 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -66,7 +66,9 @@ # ## Where archives should be output to #archive_output: -# directory: "var/data/archives" +# storage: +# type: "fs" +# directory: "var/data/archives" # # # How much data CLP should try to compress into each archive # target_archive_size: 268435456 # 256 MB diff --git a/docs/requirements.txt b/docs/requirements.txt index 84466dcae..dd8ca3593 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,10 +1,6 @@ -myst-parser>=2.0.0 -# Locked to avoid pydata/pydata-sphinx-theme#1676 until its fix is released in a version above -# 0.15.2 -pydata-sphinx-theme==0.14.4 -# Locked to avoid the following issue until a fix is released: -# https://github.com/sphinx-doc/sphinx/issues/13002 -sphinx==8.0.2 -sphinx_design>=0.5.0 +myst-parser>=4.0.0 +pydata-sphinx-theme>=0.16.0 +sphinx>=8.1.3 +sphinx_design>=0.6.1 sphinx-copybutton>=0.5.2 -sphinxcontrib-mermaid>=0.9.2 +sphinxcontrib-mermaid>=1.0.0 diff --git a/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md b/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md index 654b9bf5a..1bc90910a 100644 --- a/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md +++ b/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md @@ -10,7 +10,7 @@ Before you run any commands below, you should review the scripts to ensure they won't install any dependencies or apply any configurations that you don't expect. ::: -To install all dependencies, run: +To install all dependencies, run the following with elevated privileges: :::{note} The packages built from source ([install-packages-from-source.sh][src-install-script]) are installed diff --git a/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md b/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md index 53ee0ecbd..776c2d43e 100644 --- a/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md +++ b/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md @@ -10,7 +10,7 @@ Before you run any commands below, you should review the scripts to ensure they won't install any dependencies or apply any configurations that you don't expect.
::: -To install all dependencies, run: +To install all dependencies, run the following with elevated privileges: ```shell components/core/tools/scripts/lib_install/ubuntu-focal/install-all.sh diff --git a/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md b/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md index 186098446..2e5d4eb3c 100644 --- a/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md +++ b/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md @@ -10,7 +10,7 @@ Before you run any commands below, you should review the scripts to ensure they won't install any dependencies or apply any configurations that you don't expect. ::: -To install all dependencies, run: +To install all dependencies, run the following with elevated privileges: ```shell components/core/tools/scripts/lib_install/ubuntu-jammy/install-all.sh
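Separately, the scheduler change earlier in this patch (in `update_compression_task_metadata` and `update_compression_job_metadata`) replaces string-interpolated `SET` clauses with `%s` placeholders, the standard parameterized-query pattern for MySQL-style drivers. A minimal sketch of the idea, using a hypothetical `build_update` helper and table name:

```python
# Minimal sketch of the parameterized-UPDATE pattern: only trusted, code-defined
# identifiers (the table and column names) are interpolated into the SQL text;
# all values are bound by the driver via %s placeholders.
from typing import Any, Dict, List, Tuple


def build_update(table_name: str, row_id: int, kv: Dict[str, Any]) -> Tuple[str, List[Any]]:
    """Returns (query, values) suitable for `cursor.execute(query, values)`."""
    if not kv:
        raise ValueError("Must specify at least one field to update")
    assignments = ", ".join(f"{column} = %s" for column in kv)
    query = f"UPDATE {table_name} SET {assignments} WHERE id = %s"
    values = list(kv.values()) + [row_id]
    return query, values


# Hypothetical usage:
# query, values = build_update("compression_jobs", 42, {"status": "SUCCEEDED"})
# db_cursor.execute(query, values)
```

Binding values this way avoids the quoting and injection hazards of the old `f'{k}="{v}"'` formatting while leaving the query structure unchanged.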