From 9bb40d155bc2f04142043ec276c3492b68339b38 Mon Sep 17 00:00:00 2001
From: James Meneghello
Date: Wed, 6 Nov 2024 07:30:03 +0800
Subject: [PATCH] refactor: Optimise metadata discovery for large databases
 (#528)

Overrides the SDK functions to instead use the `get_multi_*` functions from
the SQLAlchemy Inspector. On our database of ~120 tables, this reduces the
discovery runtime from 10-12 minutes to about 30 seconds.

- Closes #284
---
 tap_postgres/client.py | 137 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
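
The speed-up comes from the Inspector's batched reflection API (available from
SQLAlchemy 2.0), which the new `discover_catalog_entries` calls once per schema
instead of once per table. A minimal standalone sketch of those calls, for
reference; the DSN and schema name below are illustrative and not part of this
change:

    import sqlalchemy as sa

    # Illustrative connection string; any reachable Postgres database works.
    engine = sa.create_engine("postgresql+psycopg2://user:pass@localhost/example")
    inspector = sa.inspect(engine)

    # Each get_multi_* call reflects every table in the schema in one pass,
    # instead of per-table get_columns()/get_pk_constraint()/get_indexes() calls.
    columns = inspector.get_multi_columns(schema="public")
    pk_constraints = inspector.get_multi_pk_constraint(schema="public")
    indexes = inspector.get_multi_indexes(schema="public")

    # Results are keyed by (schema, table) tuples, which is what
    # discover_catalog_entry_optimized() looks up via table_key.
    for (schema, table), column_defs in columns.items():
        print(schema, table, [col["name"] for col in column_defs])
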
diff --git a/tap_postgres/client.py b/tap_postgres/client.py
index a6ae564..42c3ef9 100644
--- a/tap_postgres/client.py
+++ b/tap_postgres/client.py
@@ -20,6 +20,8 @@
 import sqlalchemy.types
 from psycopg2 import extras
 from singer_sdk import SQLConnector, SQLStream
+from singer_sdk import typing as th
+from singer_sdk._singerlib import CatalogEntry, MetadataMapping, Schema
 from singer_sdk.connectors.sql import SQLToJSONSchema
 from singer_sdk.helpers._state import increment_state
 from singer_sdk.helpers._typing import TypeConformanceLevel
@@ -32,6 +34,12 @@
     from singer_sdk.helpers.types import Context
     from sqlalchemy.dialects import postgresql
     from sqlalchemy.engine import Engine
+    from sqlalchemy.engine.interfaces import (  # type: ignore[attr-defined]
+        ReflectedColumn,
+        ReflectedIndex,
+        ReflectedPrimaryKeyConstraint,
+        TableKey,
+    )
     from sqlalchemy.engine.reflection import Inspector
@@ -183,6 +191,135 @@ def get_schema_names(self, engine: Engine, inspected: Inspector) -> list[str]:
             return self.config["filter_schemas"]
         return super().get_schema_names(engine, inspected)
 
+    # Uses information_schema for speed.
+    def discover_catalog_entry_optimized(  # noqa: PLR0913
+        self,
+        engine: Engine,
+        inspected: Inspector,
+        schema_name: str,
+        table_name: str,
+        is_view: bool,
+        table_data: dict[TableKey, list[ReflectedColumn]],
+        pk_data: dict[TableKey, ReflectedPrimaryKeyConstraint],
+        index_data: dict[TableKey, list[ReflectedIndex]],
+    ) -> CatalogEntry:
+        """Create `CatalogEntry` object for the given table or a view.
+
+        Args:
+            engine: SQLAlchemy engine
+            inspected: SQLAlchemy inspector instance for engine
+            schema_name: Schema name to inspect
+            table_name: Name of the table or a view
+            is_view: Flag whether this object is a view, returned by `get_object_names`
+            table_data: Cached inspector data for the relevant tables
+            pk_data: Cached inspector data for the relevant primary keys
+            index_data: Cached inspector data for the relevant indexes
+
+        Returns:
+            `CatalogEntry` object for the given table or a view
+        """
+        # Initialize unique stream name
+        unique_stream_id = f"{schema_name}-{table_name}"
+        table_key = (schema_name, table_name)
+
+        # Detect key properties
+        possible_primary_keys: list[list[str]] = []
+        pk_def = pk_data.get(table_key, {})
+        if pk_def and "constrained_columns" in pk_def:
+            possible_primary_keys.append(pk_def["constrained_columns"])
+
+        # An element of the columns list is ``None`` if it's an expression and is
+        # returned in the ``expressions`` list of the reflected index.
+        possible_primary_keys.extend(
+            index_def["column_names"]
+            for index_def in index_data.get(table_key, [])
+            if index_def.get("unique", False)
+        )
+
+        key_properties = next(iter(possible_primary_keys), None)
+
+        # Initialize columns list
+        table_schema = th.PropertiesList()
+        for column_def in table_data.get(table_key, []):
+            column_name = column_def["name"]
+            is_nullable = column_def.get("nullable", False)
+            jsonschema_type: dict = self.to_jsonschema_type(column_def["type"])
+            table_schema.append(
+                th.Property(
+                    name=column_name,
+                    wrapped=th.CustomType(jsonschema_type),
+                    nullable=is_nullable,
+                    required=column_name in key_properties if key_properties else False,
+                ),
+            )
+        schema = table_schema.to_dict()
+
+        # Initialize available replication methods
+        addl_replication_methods: list[str] = [""]  # By default an empty list.
+        # Notes regarding replication methods:
+        # - 'INCREMENTAL' replication must be enabled by the user by specifying
+        #   a replication_key value.
+        # - 'LOG_BASED' replication must be enabled by the developer, according
+        #   to source-specific implementation capabilities.
+        replication_method = next(reversed(["FULL_TABLE", *addl_replication_methods]))
+
+        # Create the catalog entry object
+        return CatalogEntry(
+            tap_stream_id=unique_stream_id,
+            stream=unique_stream_id,
+            table=table_name,
+            key_properties=key_properties,
+            schema=Schema.from_dict(schema),
+            is_view=is_view,
+            replication_method=replication_method,
+            metadata=MetadataMapping.get_standard_metadata(
+                schema_name=schema_name,
+                schema=schema,
+                replication_method=replication_method,
+                key_properties=key_properties,
+                valid_replication_keys=None,  # Must be defined by user
+            ),
+            database=None,  # Expects single-database context
+            row_count=None,
+            stream_alias=None,
+            replication_key=None,  # Must be defined by user
+        )
+
+    def discover_catalog_entries(self) -> list[dict]:
+        """Return a list of catalog entries from discovery.
+
+        Returns:
+            The discovered catalog entries as a list.
+        """
+        result: list[dict] = []
+        engine = self._engine
+        inspected = sa.inspect(engine)
+        for schema_name in self.get_schema_names(engine, inspected):
+            # Use get_multi_* data here instead of pulling per-table
+            table_data = inspected.get_multi_columns(schema=schema_name)
+            pk_data = inspected.get_multi_pk_constraint(schema=schema_name)
+            index_data = inspected.get_multi_indexes(schema=schema_name)
+
+            # Iterate through each table and view
+            for table_name, is_view in self.get_object_names(
+                engine,
+                inspected,
+                schema_name,
+            ):
+                catalog_entry = self.discover_catalog_entry_optimized(
+                    engine,
+                    inspected,
+                    schema_name,
+                    table_name,
+                    is_view,
+                    table_data,
+                    pk_data,
+                    index_data,
+                )
+                result.append(catalog_entry.to_dict())
+
+        return result
+
 
 class PostgresStream(SQLStream):
     """Stream class for Postgres streams."""